In [63]:
import pandas as pd
from datasets import Dataset
from transformers import (BertTokenizer, BertForSequenceClassification,
            AutoModelForSequenceClassification,
            AdamW, get_scheduler, DataCollatorWithPadding, AutoModel, 
            AutoTokenizer)
from torch.utils.data import DataLoader
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report
from tqdm.auto import tqdm

from collections import Counter
import time

In [64]:
data = pd.read_csv('/kaggle/input/fhd-data/data/incidents_train.csv', index_col=0)
valid = pd.read_csv('/kaggle/input/fhd-data/data/incidents_dev.csv', index_col=0)

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['title'], padding='max_length', max_length=512, truncation=True)



In [65]:
def predict(texts, model_path, tokenizer_path="/kaggle/input/bert_tokenizer2/pytorch/default/1"):
    # Load the saved tokenizer and the saved model
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) # moved to Auto
    #model = BertForSequenceClassification.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Tokenize the input texts
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    # Move inputs to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Set the model to evaluation mode
    model.eval()

    # Make predictions
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

    return predictions.cpu().numpy()

def compute_score(hazards_true, products_true, hazards_pred, products_pred):
  f1_hazards = f1_score(
    hazards_true,
    hazards_pred,
    average='macro'
  )

  # compute f1 for products:
  f1_products = f1_score(
    products_true[hazards_pred == hazards_true],
    products_pred[hazards_pred == hazards_true],
    average='macro'
  )

  return (f1_hazards + f1_products) / 2.

def compute_class_weights_as_list(sample_counts):
    """
    Compute class weights based on sample counts for each class.
    
    Args:
        sample_counts (list or array): List of sample counts for each class.
    
    Returns:
        list: List of class weights corresponding to each class.
    """
    total_samples = sum(sample_counts)
    num_classes = len(sample_counts)
    
    class_weights = torch.tensor([
        total_samples / (num_classes * count)
        for count in sample_counts
    ]).to('cuda')
    return class_weights

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        """
        Args:
            alpha: Tensor of shape (num_classes,) specifying weight for each class, or a scalar.
                   If None, no class weights are applied.
            gamma: Focusing parameter (default: 2.0).
            reduction: Specifies the reduction to apply to the output: 'none' | 'mean' | 'sum'.
        """
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, labels):
        """
        Args:
            logits: Tensor of shape (batch_size, num_classes). Raw model outputs.
            labels: Tensor of shape (batch_size,). True class indices (not one-hot).
        """
        # Compute softmax probabilities
        probs = F.softmax(logits, dim=1)  # Shape: (batch_size, num_classes)

        # Select the probabilities corresponding to the true class
        # Shape: (batch_size,)
        true_probs = probs[torch.arange(labels.size(0)), labels]

        # Compute the focal loss
        focal_weight = (1 - true_probs) ** self.gamma
        log_probs = torch.log(true_probs + 1e-9)  # Add epsilon for numerical stability
        loss = -focal_weight * log_probs

        # Apply alpha weighting if provided
        if self.alpha is not None:
            if isinstance(self.alpha, torch.Tensor):  # Class-specific alpha
                alpha_t = self.alpha[labels]
            else:  # Scalar alpha
                alpha_t = self.alpha
            loss *= alpha_t

        # Reduction: none | mean | sum
        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        else:  # 'none'
            return loss


In [66]:
import torch
from torch import nn

class CustomLoss(nn.Module):
    def __init__(self, whichloss='crossentropy', class_count=None):
        super(CustomLoss, self).__init__()
        self.class_count = torch.tensor(class_count).to('cuda')
        self.class_weights = compute_class_weights_as_list(self.class_count)
        self.whichloss = whichloss
        self.num_classes = len(class_count)

        # for focalloss
        self.gamma = 2.0
        self.alpha = None
        self.reduction = 'mean'

        # for classbalancedloss
        self.beta = 0.99

        # for equalizationloss
        self.suppression_factor = 1.5

        # for ldam loss
        #self.cls_num_list = torch.tensor(cls_num_list, dtype=torch.float)
        self.max_margin = 0.5
        #self.weight = weight
        #self.reduction = reduction
        self.margins = self.max_margin / torch.sqrt(self.class_count)
        self.margins = self.margins.to(torch.float)
        

    def forward(self, logits, labels):
        if self.whichloss == 'softmax':
            loss = nn.functional.cross_entropy(logits, labels, weight=None)
        elif self.whichloss == 'wsoftmax':
            loss = nn.functional.cross_entropy(logits, labels, weight=self.class_weights)
        elif self.whichloss == 'focalloss':
            # Compute softmax probabilities
            probs = F.softmax(logits, dim=1)  # Shape: (batch_size, num_classes)
            # Select the probabilities corresponding to the true class
            # Shape: (batch_size,)
            true_probs = probs[torch.arange(labels.size(0)), labels]
            # Compute the focal loss
            focal_weight = (1 - true_probs) ** self.gamma
            log_probs = torch.log(true_probs + 1e-9)  # Add epsilon for numerical stability
            loss = -focal_weight * log_probs
            # Apply alpha weighting if provided
            if self.alpha is not None:
                if isinstance(self.alpha, torch.Tensor):  # Class-specific alpha
                    alpha_t = self.alpha[labels]
                else:  # Scalar alpha
                    alpha_t = self.alpha
                loss *= alpha_t
            if self.reduction == 'mean':
                return loss.mean()
            elif self.reduction == 'sum':
                return loss.sum()
            else:  # 'none'
                return loss
        elif self.whichloss == 'classbalancedloss':
            effective_num = 1.0 - torch.pow(self.beta, self.class_count)
            weights = (1.0 - self.beta) / (effective_num + 1e-8)
            weights = weights / weights.sum()  # Normalize weights
    
            # Convert targets to one-hot encoding
            one_hot_targets = F.one_hot(labels, num_classes=self.num_classes).float()
    
            # Apply softmax to logits
            probs = F.softmax(logits, dim=1)
    
            # Compute class-balanced cross-entropy loss
            weighted_loss = -weights * one_hot_targets * torch.log(probs + 1e-8)
            loss = weighted_loss.sum(dim=1)
    
            # Apply reduction
            if self.reduction == 'mean':
                loss = loss.mean()
            elif self.reduction == 'sum':
                loss = loss.sum()

        elif self.whichloss == 'balancedsoftmax':
            log_class_counts = torch.log(self.class_count.float() + 1e-8)  # Avoid log(0)
            # Adjust logits by subtracting log class counts
            adjusted_logits = logits - log_class_counts
            # Compute the balanced softmax probabilities
            balanced_probs = F.log_softmax(adjusted_logits, dim=1)
            # Gather the log probabilities of the true classes
            log_probs = balanced_probs[torch.arange(logits.size(0)), labels]
            # Compute the loss
            loss = -log_probs
            # Apply reduction
            if self.reduction == 'mean':
                loss = loss.mean()
            elif self.reduction == 'sum':
                loss = loss.sum()
        elif self.whichloss == 'equalizationloss':
            one_hot_targets = F.one_hot(labels, num_classes=self.num_classes).float()
    
            # Compute probabilities with softmax
            probs = F.softmax(logits, dim=1)
    
            # Suppression weights for negative samples
            effective_num = torch.pow(self.class_count.float(), self.suppression_factor)
            weights = (1.0 / effective_num).to(logits.device)
    
            # Broadcast weights to match batch size and one-hot targets
            weight_matrix = one_hot_targets + (1 - one_hot_targets) * weights.unsqueeze(0)
    
            # Compute weighted cross-entropy loss
            ce_loss = -one_hot_targets * torch.log(probs + 1e-8)
            suppressed_loss = ce_loss * weight_matrix
    
            # Sum over classes and apply reduction
            loss = suppressed_loss.sum(dim=1)
            if self.reduction == 'mean':
                loss = loss.mean()
            elif self.reduction == 'sum':
                loss = loss.sum()
        elif self.whichloss == 'ldamloss':
            batch_size, num_classes = logits.size()
            
            # Create a margin matrix
            margins = torch.zeros_like(logits)
            margins[torch.arange(batch_size), labels] = self.margins[labels]
            
            # Adjust logits with the margin
            adjusted_logits = logits - margins
            
            # Compute cross-entropy loss
            loss = F.cross_entropy(adjusted_logits, labels, weight=self.class_weights, reduction=self.reduction)
        else:
            print("NOT IMPLEMENTED ERROR")
        return loss

# Instantiate your loss


In [68]:
for label in tqdm(['hazard-category', 'product-category', 'hazard', 'product']):
    label_encoder = LabelEncoder()
    data[f'{label}_label'] = label_encoder.fit_transform(data[label])

for label in tqdm(['hazard-category', 'product-category', 'hazard', 'product']):
    #label_encoder = LabelEncoder()
    data['label'] = data[f'{label}_label']

    # Data preprocessing
    train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)
    train_df, test_df = train_df.iloc[:100,:], test_df
    
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)
    print(train_dataset)
    
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='max_length', max_length=512)
    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)
    test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)

    model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(data[label].unique()) , output_hidden_states=False)
    #model = AutoModel.from_pretrained('bert-base-uncased')
    model.to('cuda')  # Move model to GPU if available

    # training
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    num_epochs = 4
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )
    dd=dict(Counter(data['label'].values))
    class_count = [dd[i] for i in range(len(dd))]
    
    custom_loss_fn = CustomLoss(whichloss = 'ldamloss',
                               class_count = class_count)

    model.train()
    progress_bar = tqdm(range(num_training_steps))
    print("training starting ..")
    total_loss_list = []
    
    for epoch in tqdm(range(num_epochs), desc="TRAIN"):
        curr_ep_loss = 0
        t1 = time.time()
        for batch in train_dataloader:
            inputs = {k: v.to('cuda') for k, v in batch.items() if k not in ['labels']}  # Move batch to GPU if available
            labels = {k: v.to('cuda') for k, v in batch.items() if k in ['labels']}
            outputs = model(**inputs)
            #print(outputs.last_hidden_state.shape, outputs.pooler_output.shape)
            #loss = outputs.loss
            logits = outputs.logits  # Raw logits from model
            # Compute custom loss
            loss = custom_loss_fn(logits, **labels)
            curr_ep_loss += loss.item()
            
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
        t2 = time.time()
        print(f"Epoch {epoch + 1}, Loss: {curr_ep_loss:.4f} | Time : {(t2-t1):.4f} seconds")

    # assess model
    model.eval()
    total_predictions = []
    with torch.no_grad():
        for batch in test_dataloader:
            inputs = {k: v.to('cuda') for k, v in batch.items() if k not in ['labels']}  # Move batch to GPU if available
            labels = {k: v.to('cuda') for k, v in batch.items() if k in ['labels']}
            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=-1)
            total_predictions.extend([p.item() for p in predictions])

    predicted_labels = label_encoder.inverse_transform(total_predictions)
    gold_labels = label_encoder.inverse_transform(test_df.label.values)
    print(classification_report(gold_labels, predicted_labels, zero_division=0))

    model.save_pretrained(f"bert_{label}")
    break
    

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/1017 [00:00<?, ? examples/s]

Dataset({
    features: ['year', 'month', 'day', 'country', 'title', 'text', 'hazard-category', 'product-category', 'hazard', 'product', 'hazard-category_label', 'product-category_label', 'hazard_label', 'product_label', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/52 [00:00<?, ?it/s]

training starting ..


TRAIN:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 1, Loss: 29.9738 | Time : 10.2226 seconds
Epoch 2, Loss: 26.2441 | Time : 10.8286 seconds
Epoch 3, Loss: 24.6166 | Time : 11.1994 seconds
Epoch 4, Loss: 22.8788 | Time : 11.2598 seconds
                                     precision    recall  f1-score   support

             Catfishes (freshwater)       0.44      0.89      0.59       377
                    Dried pork meat       0.86      0.09      0.16       339
              Fishes not identified       0.00      0.00      0.00        68
                 Groupers (generic)       0.00      0.00      0.00         5
           Not classified pork meat       0.00      0.00      0.00       111
         Pangas catfishes (generic)       0.18      0.56      0.27        68
Precooked cooked pork meat products       0.00      0.00      0.00         1
 Torpedo-shaped catfishes (generic)       0.00      0.00      0.00        10
                      Veggie Burger       0.00      0.00      0.00        27
                    adobo seasoning  

In [22]:
{k:v for k,v in batch.items() if k not in ['labels']}

{'input_ids': tensor([[  101, 10651,  1015,  1024, 28390,  2080, 17722,  1037,  4989,  1997,
           2785,  2121,  3688,  2138,  1997,  1996,  2825,  3739,  1997, 11840,
           8411,   102,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0],
         [  101,  2088,  2740,  3688,  1010, 11775,  1012,  3314,  2035, 24395,
           9131,  2006,  6151,  8586,  8017,  2098,  6501,  2035,  2121,  6914,
           1999,  6892, 11263,  2884,  4487,  5397,  4588,   102,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,   

In [None]:
##### PREDICTIONS #####

# prediction ST1
valid_predictions_category = {}
for label in tqdm(['hazard-category', 'product-category']):
  # Decode predictions back to string labels
  label_encoder = LabelEncoder()
  label_encoder.fit(data[label])
  valid_predictions_category[label] = predict(valid.title.to_list(), f'bert_{label}')
  valid_predictions_category[label] = label_encoder.inverse_transform(valid_predictions_category[label])

# save predictions
solution = pd.DataFrame({'hazard-category': valid_predictions_category['hazard-category'], 'product-category': valid_predictions_category['product-category']})
solution.to_csv('/kaggle/working/submission_bert_st1.csv', index=False)
print("submission ST1 created!")

# prediction ST2
valid_predictions = {}
for label in tqdm(['hazard', 'product']):
  # Decode predictions back to string labels
  label_encoder = LabelEncoder()
  label_encoder.fit(data[label])
  valid_predictions[label] = predict(valid.title.to_list(), f'bert_{label}')
  valid_predictions[label] = label_encoder.inverse_transform(valid_predictions[label])