#My Kaggle ID: abhiramkrishnam

In [1]:
!pip install transformers



In [4]:
!gdown --id '1LABaYT-2gWthtNnW7PKlG9pM8Mh3NvuA' --output DATA.zip
!unzip DATA.zip

Downloading...
From: https://drive.google.com/uc?id=1LABaYT-2gWthtNnW7PKlG9pM8Mh3NvuA
To: /content/DATA.zip
100% 1.89M/1.89M [00:00<00:00, 17.0MB/s]
Archive:  DATA.zip
   creating: data/
  inflating: data/data_test.csv      
  inflating: data/data_train.csv     


#Training part

In [2]:
import os
import pandas as pd
import torch
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset

os.environ["WANDB_MODE"] = "disabled"

###Load Dataset and model

In [5]:
# Load the dataset
df = pd.read_csv('data/data_train.csv')

# Concatenate context, question, and answer columns for BERT input
df['input_text'] = df['context'] + " [SEP] " + df['question'] + " [SEP] " + df['answer0'] + " [SEP] " + df['answer1'] + " [SEP] " + df['answer2']

# Split into training and validation sets (80-20 split)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Utilities

In [6]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, is_test=False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len
        self.is_test = is_test

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        inputs = self.tokenizer(row['input_text'], max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt")

        if self.is_test:
            return {
                'input_ids': inputs['input_ids'].squeeze(),
                'attention_mask': inputs['attention_mask'].squeeze()
            }
        else:
            label = torch.tensor(row['label'], dtype=torch.long)
            return {
                'input_ids': inputs['input_ids'].squeeze(),
                'attention_mask': inputs['attention_mask'].squeeze(),
                'labels': label
            }

# Define accuracy metric function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {'accuracy': accuracy_score(labels, predictions)}

# Data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Create training and validation datasets
train_dataset = CustomDataset(train_df, tokenizer, max_len=32)  # Reduced max_len for faster processing
val_dataset = CustomDataset(val_df, tokenizer, max_len=32)

###Params

In [7]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=0,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",
    report_to="none",
    fp16=True
)



In [8]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Ensure all tensors in the model are contiguous to avoid ValueError during saving
for param in model.parameters():
    if not param.is_contiguous():
        param.data = param.data.contiguous()

# Train the model
trainer.train()

# Evaluate the model on the validation set
eval_results = trainer.evaluate()
print("Validation accuracy:", eval_results['eval_accuracy'])

# Save the model
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.108084,0.314488
2,1.107100,1.103559,0.314488
3,1.103500,1.102803,0.327444
4,1.094500,1.146166,0.310365
5,0.953700,1.469408,0.342167
6,0.613100,1.78524,0.3298
7,0.613100,2.105581,0.3351


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.108084,0.314488
2,1.107100,1.103559,0.314488
3,1.103500,1.102803,0.327444
4,1.094500,1.146166,0.310365
5,0.953700,1.469408,0.342167
6,0.613100,1.78524,0.3298
7,0.613100,2.105581,0.3351
8,0.398400,2.525935,0.343345
9,0.273600,2.824151,0.3404
10,0.217700,3.048904,0.343934


Validation accuracy: 0.34393404004711425


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')

#Testing part

In [24]:
# Load the saved model for inference
loaded_model = BertForSequenceClassification.from_pretrained("./saved_model")
loaded_tokenizer = BertTokenizer.from_pretrained("./saved_model")

new_df = pd.read_csv('data/data_test.csv')
new_df['input_text'] = new_df['context'] + " [SEP] " + new_df['question'] + " [SEP] " + new_df['answer0'] + " [SEP] " + new_df['answer1'] + " [SEP] " + new_df['answer2']

predict_dataset = CustomDataset(new_df, loaded_tokenizer, max_len=32, is_test=True)


# Initialize Trainer for prediction
predict_trainer = Trainer(model=loaded_model)

# Make predictions
predictions = predict_trainer.predict(predict_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=-1)

prediction_df = pd.DataFrame({
    'id': new_df['id'],
    'label': predicted_labels
})

prediction_df.to_csv('prediction.csv', index=False)

A ConfigError was raised whilst setting the number of model parameters in Weights & Biases config.
