In [None]:

!pip install transformers
!pip install gdown



In [None]:
!gdown --id '1LABaYT-2gWthtNnW7PKlG9pM8Mh3NvuA' --output DATA.zip
!unzip -o DATA.zip -d data

Downloading...
From: https://drive.google.com/uc?id=1LABaYT-2gWthtNnW7PKlG9pM8Mh3NvuA
To: /content/DATA.zip
100% 1.89M/1.89M [00:00<00:00, 128MB/s]
Archive:  DATA.zip
   creating: data/data/
  inflating: data/data/data_test.csv  
  inflating: data/data/data_train.csv  


In [None]:
import os
import pandas as pd
import torch
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from transformers import (
    BertTokenizer,
    BertForMultipleChoice,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from torch.utils.data import Dataset
import random
import re

In [None]:
def set_seed(seed):
    """Seed Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices).

    Args:
        seed: Integer seed handed to every random number generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

set_seed(42)
# Keep Weights & Biases from trying to log during Trainer runs.
os.environ["WANDB_MODE"] = "disabled"

# Prefer the Kaggle dataset mount when it exists; otherwise fall back to the
# directory the download cell unzips into (data/data/), so the notebook also
# runs outside Kaggle.
KAGGLE_DATA_DIR = '/kaggle/input/dl-assignment-last/data_last'
LOCAL_DATA_DIR = os.path.join('data', 'data')
DATA_DIR = KAGGLE_DATA_DIR if os.path.isdir(KAGGLE_DATA_DIR) else LOCAL_DATA_DIR

train_df = pd.read_csv(os.path.join(DATA_DIR, 'data_train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'data_test.csv'))
print("Training Data Sample:")
print(train_df.head())
print("\nTest Data Sample:")
print(test_df.head())

In [None]:
def clean_text(text):
    """Collapse every run of whitespace into one space and trim both ends.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
# Normalise whitespace in every text column of both splits. Values are passed
# through str() first, so non-string cells are cleaned as their string form.
text_columns = ['context', 'question', 'answer0', 'answer1', 'answer2']
for frame in (train_df, test_df):
    for col in text_columns:
        frame[col] = frame[col].apply(lambda value: clean_text(str(value)))
def prepare_multiple_choice_inputs(df, is_test=False):
    """Build one text sequence per answer choice for every row of `df`.

    Each row yields three strings of the form
    ``context [SEP] question [SEP] answerN`` (N = 0, 1, 2).

    Args:
        df: DataFrame with string columns 'context', 'question', 'answer0',
            'answer1', 'answer2', and optionally 'label'.
        is_test: When True, labels are never read and only inputs are returned.

    Returns:
        ``(input_texts, labels)`` when `is_test` is False and `df` has a
        'label' column; rows whose label cannot be parsed as a number are
        dropped from both lists. Otherwise just ``input_texts``.
    """
    rows = zip(df['context'], df['question'], df['answer0'], df['answer1'], df['answer2'])
    input_texts = []
    for context, question, a0, a1, a2 in rows:
        prefix = context + " [SEP] " + question + " [SEP] "
        input_texts.append([prefix + answer for answer in (a0, a1, a2)])

    if not is_test and 'label' in df.columns:
        labels = pd.to_numeric(df['label'], errors='coerce')
        # Use a positional boolean mask: indexing the Series with a running
        # counter is a label lookup and breaks when df's index is not 0..n-1
        # (e.g. after filtering or shuffling).
        keep = labels.notna().to_numpy()
        input_texts = [choices for choices, ok in zip(input_texts, keep) if ok]
        labels = labels[keep].astype(int).tolist()
        return input_texts, labels
    return input_texts
# Build labelled examples, then hold out 20% (stratified by label) for validation.
all_inputs, all_labels = prepare_multiple_choice_inputs(train_df, is_test=False)
split = train_test_split(
    all_inputs,
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=all_labels,
)
train_inputs, val_inputs, train_labels, val_labels = split

# The test split has no labels, so only the input texts come back.
test_inputs = prepare_multiple_choice_inputs(test_df, is_test=True)
class MultipleChoiceDataset(Dataset):
    """Torch dataset that tokenizes multiple-choice examples on access.

    Args:
        inputs: List of examples; each example is a list of choice strings.
        labels: Optional list of integer labels aligned with `inputs`.
        tokenizer: Callable tokenizer; it is invoked with the list of choice
            strings and must return a mapping with 'input_ids' and
            'attention_mask' tensors.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, inputs, labels=None, tokenizer=None, max_length=256):
        self.inputs = inputs
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Tokenize example `idx` and return its tensors.

        Returns:
            Dict with 'input_ids' and 'attention_mask' exactly as produced by
            the tokenizer for the list of choices (with a Hugging Face
            tokenizer: one row per choice, `max_length` columns), plus a
            scalar long 'labels' tensor when labels were provided.
        """
        encoding = self.tokenizer(
            self.inputs[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Keep the leading choice dimension untouched. A squeeze(0) here is a
        # no-op for several choices but would drop the choice axis for an
        # example with a single choice, corrupting the batch shape.
        item = {
            'input_ids': encoding['input_ids'],
            'attention_mask': encoding['attention_mask']
        }

        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)

        return item


In [None]:
# Pretrained checkpoint shared by the tokenizer and the model.
model_name = 'bert-large-uncased'

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMultipleChoice.from_pretrained(model_name, num_labels=3)

# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# All three splits share the same tokenizer and sequence length.
dataset_options = dict(tokenizer=tokenizer, max_length=256)
train_dataset = MultipleChoiceDataset(train_inputs, train_labels, **dataset_options)
val_dataset = MultipleChoiceDataset(val_inputs, val_labels, **dataset_options)
test_dataset = MultipleChoiceDataset(test_inputs, **dataset_options)



def compute_metrics(eval_pred):
    """Score predictions by accuracy.

    Args:
        eval_pred: Pair ``(logits, labels)``; the highest-scoring entry along
            axis 1 of `logits` is taken as the prediction for each example.

    Returns:
        Dict with a single 'accuracy' entry.
    """
    scores, gold = eval_pred
    chosen = np.argmax(scores, axis=1)
    accuracy = accuracy_score(gold, chosen)
    return {'accuracy': accuracy}
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases; pick whichever field the installed version defines so the cell
# runs on both old and new versions.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # 4 samples per step x 8 accumulation steps = 32 samples per optimizer update (per device).
    gradient_accumulation_steps=8,
    # NOTE(review): with 32 samples per update and 3 epochs, 1000 warmup steps may
    # exceed the total number of optimizer steps on a small dataset, in which case
    # the learning rate never reaches its peak - confirm against the dataset size.
    warmup_steps=1000,
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_dir='./logs',
    logging_steps=100,
    # Evaluate and checkpoint on the same schedule, as required for
    # load_best_model_at_end.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    fp16=True,
    save_total_limit=3,
    seed=42,
    **{eval_strategy_key: "epoch"}
)

In [None]:
# NOTE(review): MultipleChoiceDataset already pads every sequence to max_length,
# so this collator presumably has no extra padding to add - confirm.
data_collator = DataCollatorWithPadding(tokenizer, padding=True)

In [None]:
# Stop training if no improvement for 2 evaluations
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)
trainer = Trainer(**trainer_options)

In [None]:
# Run fine-tuning; the value returned by train() is shown as the cell output.
trainer.train()

In [None]:
# Score the current model on the validation split.
eval_results = trainer.evaluate()
validation_accuracy = eval_results['eval_accuracy']
print("Validation Accuracy:", validation_accuracy)

# Persist the model weights and the tokenizer files side by side.
save_dir = "./saved_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


In [None]:
class PredictionTrainer(Trainer):
    """Trainer whose prediction step runs a plain forward pass and reports only logits."""

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """Run `model` on one batch without gradients.

        Args:
            model: The model to evaluate.
            inputs: Mapping of input names to tensors; every tensor is moved
                to `model.device` before the forward pass.
            prediction_loss_only: Accepted for interface compatibility; unused.
            ignore_keys: Accepted for interface compatibility; unused.

        Returns:
            ``(None, logits, None)``. Trainer unpacks the result of this hook
            as ``(loss, logits, labels)``, so the logits must occupy the
            second slot; placing them first would hand them to the loss
            handling and leave the predictions empty.
        """
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
        return None, logits, None
# A separate trainer instance is used purely for inference on the test split.
predict_trainer = PredictionTrainer(
    data_collator=data_collator,
    tokenizer=tokenizer,
    model=model,
)
predictions = predict_trainer.predict(test_dataset)

# The highest-scoring choice along axis 1 becomes the predicted label.
test_logits = predictions.predictions
predicted_labels = np.argmax(test_logits, axis=1)

# Pair each test id with its predicted label and write the submission file.
submission_columns = {
    'id': test_df['id'],
    'label': predicted_labels,
}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv('prediction_improved.csv', index=False)
print("\nPredictions Sample:")
print(submission_df.head())