In [47]:
import pandas as pd
from datasets import Dataset
from transformers import (BertTokenizer, BertForSequenceClassification, 
            AdamW, get_scheduler, DataCollatorWithPadding, AutoModel, 
            AutoTokenizer)
from torch.utils.data import DataLoader
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report
from tqdm.auto import tqdm

import time

In [9]:
data = pd.read_csv('/kaggle/input/fhd-data/data/incidents_train.csv', index_col=0)
valid = pd.read_csv('/kaggle/input/fhd-data/data/incidents_dev.csv', index_col=0)

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['title'], padding=True, truncation=True)



In [12]:
def predict(texts, model_path, tokenizer_path="/kaggle/input/bert_tokenizer2/pytorch/default/1"):
    # Load the saved tokenizer and the saved model
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) # moved to Auto
    #model = BertForSequenceClassification.from_pretrained(model_path)
    model = AutoModel.from_pretrained(model_path)
    

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Tokenize the input texts
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    # Move inputs to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Set the model to evaluation mode
    model.eval()

    # Make predictions
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

    return predictions.cpu().numpy()

def compute_score(hazards_true, products_true, hazards_pred, products_pred):
  f1_hazards = f1_score(
    hazards_true,
    hazards_pred,
    average='macro'
  )

  # compute f1 for products:
  f1_products = f1_score(
    products_true[hazards_pred == hazards_true],
    products_pred[hazards_pred == hazards_true],
    average='macro'
  )

  return (f1_hazards + f1_products) / 2.

In [45]:
import torch
from torch import nn

class CustomLoss(nn.Module):
    def __init__(self, whichloss='crossentropy', weights=None):
        super(CustomLoss, self).__init__()
        self.class_weights = weights

    def forward(self, logits, labels):
        # Apply sigmoid to logits to convert to probabilities
        #probs = torch.sigmoid(logits)
        #print(logits.shape, labels.shape)
        # Calculate the weighted binary cross-entropy loss
        #loss = -(
        #    self.pos_weight * labels * torch.log(probs + 1e-9) + 
        #    (1 - labels) * torch.log(1 - probs + 1e-9)
        #)
        if whichloss == 'crossentropy':
            loss = nn.functional.cross_entropy(logits, labels, weight=self.class_weights)
        else:
            print("NOT IMPLEMENTED ERROR")
        return loss

# Instantiate your loss
custom_loss_fn = CustomLoss()

In [48]:
for label in tqdm(['hazard-category', 'product-category', 'hazard', 'product']):
    label_encoder = LabelEncoder()
    data['label'] = label_encoder.fit_transform(data[label])

    # Data preprocessing
    train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True, max_length=16)
    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)
    test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)


    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(data[label].unique()) , output_hidden_states=False)
    #model = AutoModel.from_pretrained('bert-base-uncased')
    model.to('cuda')  # Move model to GPU if available

    # training
    optimizer = AdamW(model.parameters(), lr=5e-5)
    num_epochs = 1
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    model.train()
    progress_bar = tqdm(range(num_training_steps))
    print("training starting ..")
    total_loss_list = []
    
    for epoch in tqdm(range(num_epochs), desc="TRAIN"):
        curr_ep_loss = 0
        t1 = time.time()
        for batch in train_dataloader:
            inputs = {k: v.to('cuda') for k, v in batch.items() if k not in ['labels']}  # Move batch to GPU if available
            labels = {k: v.to('cuda') for k, v in batch.items() if k in ['labels']}
            outputs = model(**inputs)
            #print(outputs.last_hidden_state.shape, outputs.pooler_output.shape)
            #loss = outputs.loss
            logits = outputs.logits  # Raw logits from model
            # Compute custom loss
            loss = custom_loss_fn(logits, **labels)
            curr_ep_loss += loss.item()
            
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
        t2 = time.time()
        print(f"Epoch {epoch + 1}, Loss: {curr_ep_loss:.4f} | Time : {(t2-t1):.4f} seconds")

    # assess model
    model.eval()
    total_predictions = []
    with torch.no_grad():
        for batch in test_dataloader:
            inputs = {k: v.to('cuda') for k, v in batch.items() if k not in ['labels']}  # Move batch to GPU if available
            labels = {k: v.to('cuda') for k, v in batch.items() if k in ['labels']}
            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=-1)
            total_predictions.extend([p.item() for p in predictions])

    predicted_labels = label_encoder.inverse_transform(total_predictions)
    gold_labels = label_encoder.inverse_transform(test_df.label.values)
    print(classification_report(gold_labels, predicted_labels, zero_division=0))

    model.save_pretrained(f"bert_{label}")
    break

  0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/4065 [00:00<?, ? examples/s]

Map:   0%|          | 0/1017 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/509 [00:00<?, ?it/s]

training starting ..


TRAIN:   0%|          | 0/1 [00:00<?, ?it/s]



Epoch 1, Loss: 415.8496 | Time : 62.6634 seconds
                                precision    recall  f1-score   support

                     allergens       0.82      0.94      0.88       377
                    biological       0.83      0.93      0.88       339
                      chemical       0.80      0.59      0.68        68
food additives and flavourings       0.00      0.00      0.00         5
                foreign bodies       0.90      0.68      0.77       111
                         fraud       0.80      0.57      0.67        68
                     migration       0.00      0.00      0.00         1
          organoleptic aspects       1.00      0.10      0.18        10
                  other hazard       0.59      0.37      0.45        27
              packaging defect       0.00      0.00      0.00        11

                      accuracy                           0.82      1017
                     macro avg       0.57      0.42      0.45      1017
             

In [22]:
{k:v for k,v in batch.items() if k not in ['labels']}

{'input_ids': tensor([[  101, 10651,  1015,  1024, 28390,  2080, 17722,  1037,  4989,  1997,
           2785,  2121,  3688,  2138,  1997,  1996,  2825,  3739,  1997, 11840,
           8411,   102,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0],
         [  101,  2088,  2740,  3688,  1010, 11775,  1012,  3314,  2035, 24395,
           9131,  2006,  6151,  8586,  8017,  2098,  6501,  2035,  2121,  6914,
           1999,  6892, 11263,  2884,  4487,  5397,  4588,   102,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,   

In [None]:
##### PREDICTIONS #####

# prediction ST1
valid_predictions_category = {}
for label in tqdm(['hazard-category', 'product-category']):
  # Decode predictions back to string labels
  label_encoder = LabelEncoder()
  label_encoder.fit(data[label])
  valid_predictions_category[label] = predict(valid.title.to_list(), f'bert_{label}')
  valid_predictions_category[label] = label_encoder.inverse_transform(valid_predictions_category[label])

# save predictions
solution = pd.DataFrame({'hazard-category': valid_predictions_category['hazard-category'], 'product-category': valid_predictions_category['product-category']})
solution.to_csv('/kaggle/working/submission_bert_st1.csv', index=False)
print("submission ST1 created!")

# prediction ST2
valid_predictions = {}
for label in tqdm(['hazard', 'product']):
  # Decode predictions back to string labels
  label_encoder = LabelEncoder()
  label_encoder.fit(data[label])
  valid_predictions[label] = predict(valid.title.to_list(), f'bert_{label}')
  valid_predictions[label] = label_encoder.inverse_transform(valid_predictions[label])