In [None]:
# Work from the project folder so the relative "./Data/..." paths used by the loading cell resolve.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — no mount call is visible in this notebook.
%cd /content/drive/MyDrive/NLP/semevaltask9

/content/drive/MyDrive/NLP/semevaltask9


In [None]:
! pip install accelerate

Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.2


In [None]:
import torch
import numpy as np
import pandas as pd
import random
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, BertForMultipleChoice

In [None]:
# Single seed shared by every random number generator used in this notebook.
GLOBAL_SEED = 255

# Seed NumPy, Python's `random`, and PyTorch with the same value.
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
torch.manual_seed(GLOBAL_SEED)
# Ask PyTorch to use deterministic implementations of its operations.
torch.use_deterministic_algorithms(True)
# NOTE(review): presumably the cuBLAS workspace setting that deterministic mode needs on CUDA;
# it has to be in the environment before the first CUDA matmul — confirm against the PyTorch reproducibility notes.
%env CUBLAS_WORKSPACE_CONFIG=:4096:8

# Make cuDNN deterministic and turn off its benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


env: CUBLAS_WORKSPACE_CONFIG=:4096:8


In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Baselines:

## Import train and test datasets

In [None]:
def _load_frame(path):
    """Read a pickled ``.npy`` record list from ``path`` and return it as a DataFrame.

    Args:
        path: Location of the ``.npy`` file, relative to the current working directory.

    Returns:
        A ``pandas.DataFrame`` built from the loaded array's ``tolist()`` output.
    """
    # NOTE(review): allow_pickle=True unpickles arbitrary Python objects; only use
    # it on files from a trusted source.
    return pd.DataFrame(np.load(path, allow_pickle=True).tolist())


# Sentence-puzzle (SP) splits and the gold answers for the SP test split.
SP_train = _load_frame("./Data/SP_train.npy")
SP_test = _load_frame("./Data/SP_test.npy")
SP_test_answer = _load_frame("./Data/SP_test_answer.npy")

# Word-puzzle (WP) splits and the gold answers for the WP test split.
WP_train = _load_frame("./Data/WP_train.npy")
WP_test = _load_frame("./Data/WP_test.npy")
WP_test_answer = _load_frame("./Data/WP_test_answer.npy")

# Files named after each puzzle type without a train/test suffix.
SP = _load_frame("./Data/sentence_puzzle.npy")
WP = _load_frame("./Data/word_puzzle.npy")

In [None]:
# Attach the gold answer (column 1 of the answer table) to every test row.
# Selecting by SP_test's own index keeps the label-based lookup of the previous
# row-by-row loop (a missing row still raises KeyError), while the single
# assignment stores true integers and avoids iterating with iterrows().
SP_test['label'] = SP_test_answer.loc[SP_test.index, 1].map(int)

In [None]:
# Attach the gold answer (column 1 of the answer table) to every test row.
# Selecting by WP_test's own index keeps the label-based lookup of the previous
# row-by-row loop (a missing row still raises KeyError), while the single
# assignment stores true integers and avoids iterating with iterrows().
WP_test['label'] = WP_test_answer.loc[WP_test.index, 1].map(int)

## Dataset Class

In [None]:
# Define a custom dataset class for multiple-choice questions
class BrainTeaser(Dataset):
    """Multiple-choice dataset that pairs one question with each of its four choices.

    Args:
        data: DataFrame with ``question``, ``choice_list`` and ``label`` columns.
        tokenizer: Callable tokenizer that accepts two parallel lists of strings
            plus ``return_tensors``, ``padding``, ``truncation`` and ``max_length``.
        max_length: Length every encoded question/choice pair is padded or
            truncated to.
    """

    # Number of answer options encoded per question.
    NUM_CHOICES = 4

    def __init__(self, data, tokenizer, max_length = 512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of questions in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode the ``idx``-th question against each choice.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` as produced by the
            tokenizer for the four question/choice pairs, and ``labels`` as a
            scalar ``torch.long`` tensor.
        """
        row = self.data.iloc[idx]
        choices = row['choice_list']
        question = row['question']

        # Use the tokenizer handed to this dataset rather than whatever global
        # happens to be called `tokenizer`, and truncate so an over-long pair
        # cannot exceed max_length and break batching.
        tokenized_inputs = self.tokenizer(
            [question] * self.NUM_CHOICES,
            [choices[i] for i in range(self.NUM_CHOICES)],
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )

        return {
            'input_ids': tokenized_inputs['input_ids'],
            'attention_mask': tokenized_inputs['attention_mask'],
            # int() accepts labels stored as int, float or numeric string.
            'labels': torch.tensor(int(row['label']), dtype=torch.long)
        }

# Import BERT

In [None]:
# Fetch the pretrained tokenizer and the multiple-choice BERT model,
# then move the model onto the selected device.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertForMultipleChoice.from_pretrained("bert-base-uncased")
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Release cached, currently unused CUDA memory before running inference.
torch.cuda.empty_cache()

In [None]:
# Define a function to perform inference
def inference(model, dataloader, device=None):
    """Predict one choice index per example by taking the arg-max logit.

    Args:
        model: Module whose forward call accepts ``input_ids`` and
            ``attention_mask`` and returns an object with a ``logits`` attribute.
        dataloader: Iterable of batches, each a dict holding ``input_ids`` and
            ``attention_mask`` tensors.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has none), so the function
            does not depend on a notebook-level global.

    Returns:
        list[int]: Predicted choice index for every example, in iteration order.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits

            # Choose the predicted label (choice with the highest logit)
            predictions.extend(torch.argmax(logits, dim=1).tolist())

    return predictions

In [None]:
# Build the evaluation datasets (the DataLoaders are created in the next cell)
sp_test_dataset, wp_test_dataset = (
    BrainTeaser(frame, tokenizer) for frame in (SP_test, WP_test)
)

In [None]:
# Wrap each evaluation dataset in a DataLoader that yields four puzzles per batch
sp_test_dataloader, wp_test_dataloader = (
    DataLoader(dataset, batch_size=4)
    for dataset in (sp_test_dataset, wp_test_dataset)
)

In [None]:
# Score the model on the sentence-puzzle test split before any fine-tuning
predictions = inference(model, sp_test_dataloader)
gold_labels = list(map(int, SP_test_answer[1].tolist()))
accuracy = 100 * accuracy_score(gold_labels, predictions)

print("SP:", accuracy)

SP: 39.166666666666664


In [None]:
# Emit one predicted choice index per line
for predicted_label in predictions:
    print(predicted_label)

3
2
2
1
3
0
2
1
1
0
1
2
0
0
0
0
1
1
0
0
1
0
2
0
0
2
0
2
1
2
1
1
3
1
0
1
3
2
1
1
1
1
0
0
1
0
1
2
0
0
2
0
1
2
0
0
0
3
1
1
0
1
2
1
1
0
3
2
2
1
2
1
1
2
1
2
0
0
2
2
0
2
3
0
2
1
1
1
2
1
2
1
0
2
1
0
2
2
1
0
1
0
0
0
1
2
1
1
0
0
0
1
1
2
2
2
1
2
2
1


In [None]:
# Score the model on the word-puzzle test split
predictions = inference(model, wp_test_dataloader)
gold_labels = list(map(int, WP_test_answer[1].tolist()))
accuracy = 100 * accuracy_score(gold_labels, predictions)

print("WP:", accuracy)

WP: 42.70833333333333


In [None]:
# Emit one predicted choice index per line
for predicted_label in predictions:
    print(predicted_label)

1
1
2
2
1
0
0
2
2
2
1
0
0
0
0
0
1
2
2
2
0
2
2
2
0
0
1
2
2
2
1
1
2
2
0
2
0
2
0
1
1
0
0
2
1
2
1
0
1
1
1
0
0
1
1
1
1
2
0
1
0
2
1
2
0
0
0
1
1
0
1
1
0
2
1
2
1
2
1
2
2
2
2
0
0
0
0
1
0
2
1
0
1
0
2
1


## Training


### SP

In [None]:
# Build the sentence-puzzle train and test datasets
sp_train_dataset, sp_test_dataset = (
    BrainTeaser(frame, tokenizer) for frame in (SP_train, SP_test)
)

In [None]:
# Fine-tuning hyperparameters consumed by the TrainingArguments cell
epochs, batch = 2, 1              # passes over the training set, examples per device per step
lr, weight_decay = 1e-5, 0.01     # optimizer learning rate and weight-decay strength
logging_steps = 100               # optimizer steps between log entries

In [None]:
from transformers import TrainingArguments, Trainer

%cd /content/

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=epochs,              # total number of training epochs
    per_device_train_batch_size=batch,  # batch size per device during training
    learning_rate = lr,
    weight_decay=weight_decay,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=logging_steps,
    evaluation_strategy = "steps",
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

# Metric callback handed to the Trainer
def compute_metrics(eval_pred):
    """Return ``{"accuracy": ...}`` for a ``(logits, labels)`` evaluation pair.

    The predicted class of each row is the index of its largest logit; accuracy
    is the fraction of rows whose predicted class equals the label.
    """
    logits, labels = eval_pred
    hits = np.argmax(logits, axis=1) == labels
    return {"accuracy": hits.astype(np.float32).mean().item()}

# Wire the model, datasets and metric callback into a Trainer.
# NOTE(review): the test split is used as the evaluation set, so best-checkpoint
# selection sees test labels — confirm this is intended.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=sp_test_dataset,
    train_dataset=sp_train_dataset,
)

# Fine-tune; the returned TrainOutput is shown as the cell's result
trainer.train()


/content


Step,Training Loss,Validation Loss,Accuracy
100,1.2961,1.171351,0.541667
200,1.0687,1.008927,0.591667
300,0.8149,0.955668,0.583333
400,0.6808,0.883855,0.633333
500,0.5874,0.829286,0.675
600,0.3868,0.794042,0.641667
700,0.3241,0.751192,0.708333
800,0.3121,0.715847,0.733333
900,0.265,0.735169,0.7
1000,0.1828,0.744776,0.708333


TrainOutput(global_step=1014, training_loss=0.5882043133120565, metrics={'train_runtime': 584.0998, 'train_samples_per_second': 1.736, 'train_steps_per_second': 1.736, 'total_flos': 1067168858775552.0, 'train_loss': 0.5882043133120565, 'epoch': 2.0})

### Predict labels

In [None]:
# Run the fine-tuned model over the sentence-puzzle test split
predictions_output = trainer.predict(sp_test_dataset)

# Raw per-choice scores, one row per test example
predictions = predictions_output.predictions

# Index of the highest-scoring choice in every row
predicted_classes = predictions.argmax(axis=1)

print(predicted_classes)

[0 2 1 3 3 0 0 3 0 1 1 0 2 0 0 1 0 2 2 0 0 1 1 0 3 3 0 2 3 0 3 2 0 2 2 2 3
 0 0 3 1 1 0 3 3 2 2 2 0 2 0 0 3 1 3 0 2 0 1 1 3 1 1 1 2 3 2 1 2 0 3 0 3 2
 1 2 1 0 1 0 0 2 3 0 2 1 3 3 3 2 3 0 3 0 2 3 1 2 0 2 1 0 0 2 2 2 3 2 3 3 1
 0 1 3 2 3 2 0 1 2]


In [None]:
# Emit one predicted choice index per line
for predicted_label in predicted_classes:
    print(predicted_label)

0
2
1
3
3
0
0
3
0
1
1
0
2
0
0
1
0
2
2
0
0
1
1
0
3
3
0
2
3
0
3
2
0
2
2
2
3
0
0
3
1
1
0
3
3
2
2
2
0
2
0
0
3
1
3
0
2
0
1
1
3
1
1
1
2
3
2
1
2
0
3
0
3
2
1
2
1
0
1
0
0
2
3
0
2
1
3
3
3
2
3
0
3
0
2
3
1
2
0
2
1
0
0
2
2
2
3
2
3
3
1
0
1
3
2
3
2
0
1
2


### WP

In [None]:
# Start the word-puzzle run from a fresh pretrained tokenizer and model,
# then move the model onto the selected device.
tokenizer2 = AutoTokenizer.from_pretrained("bert-base-uncased")
model2 = BertForMultipleChoice.from_pretrained("bert-base-uncased")
model2 = model2.to(device)

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Build the word-puzzle train and test datasets
wp_train_dataset, wp_test_dataset = (
    BrainTeaser(frame, tokenizer2) for frame in (WP_train, WP_test)
)

In [None]:
# Fine-tuning hyperparameters consumed by the TrainingArguments cell
epochs, batch = 2, 1              # passes over the training set, examples per device per step
lr, weight_decay = 1e-5, 0.01     # optimizer learning rate and weight-decay strength
logging_steps = 100               # optimizer steps between log entries

In [None]:
from transformers import TrainingArguments, Trainer

%cd /content/

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=epochs,              # total number of training epochs
    per_device_train_batch_size=batch,  # batch size per device during training
    learning_rate = lr,
    weight_decay=weight_decay,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=logging_steps,
    evaluation_strategy = "steps",
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

# Metric callback handed to the Trainer
def compute_metrics(eval_pred):
    """Return ``{"accuracy": ...}`` for a ``(logits, labels)`` evaluation pair.

    The predicted class of each row is the index of its largest logit; accuracy
    is the fraction of rows whose predicted class equals the label.
    """
    logits, labels = eval_pred
    hits = np.argmax(logits, axis=1) == labels
    return {"accuracy": hits.astype(np.float32).mean().item()}

# Wire the model, datasets and metric callback into a Trainer.
# NOTE(review): the test split is used as the evaluation set, so best-checkpoint
# selection sees test labels — confirm this is intended.
trainer = Trainer(
    args=training_args,
    model=model2,
    compute_metrics=compute_metrics,
    eval_dataset=wp_test_dataset,
    train_dataset=wp_train_dataset,
)

# Fine-tune; the returned TrainOutput is shown as the cell's result
trainer.train()


/content


Step,Training Loss,Validation Loss,Accuracy
100,1.2508,1.12221,0.40625
200,1.1755,1.05052,0.59375
300,1.0897,1.018901,0.53125
400,1.0607,0.951734,0.59375
500,0.9297,0.938362,0.604167
600,0.7802,0.948041,0.59375
700,0.6385,0.949525,0.614583


TrainOutput(global_step=792, training_loss=0.9474091818838408, metrics={'train_runtime': 422.7325, 'train_samples_per_second': 1.874, 'train_steps_per_second': 1.874, 'total_flos': 833528339398656.0, 'train_loss': 0.9474091818838408, 'epoch': 2.0})

### Predict labels

In [None]:
# Run the fine-tuned model over the word-puzzle test split
predictions_output = trainer.predict(wp_test_dataset)

# Raw per-choice scores, one row per test example
predictions = predictions_output.predictions

# Index of the highest-scoring choice in every row
predicted_classes = predictions.argmax(axis=1)

print(predicted_classes)

[1 1 0 2 1 0 0 0 1 0 1 2 0 2 0 2 1 0 2 2 0 1 0 0 1 1 1 2 2 2 0 0 2 0 1 2 0
 2 0 0 0 0 2 0 0 1 0 0 1 0 1 2 2 2 0 1 2 2 0 2 1 1 2 1 0 1 2 0 0 2 1 2 0 1
 0 2 2 0 2 0 0 1 0 1 0 0 2 1 1 1 2 1 2 2 2 2]


In [None]:
# Emit one predicted choice index per line
for predicted_label in predicted_classes:
    print(predicted_label)

1
1
0
2
1
0
0
0
1
0
1
2
0
2
0
2
1
0
2
2
0
1
0
0
1
1
1
2
2
2
0
0
2
0
1
2
0
2
0
0
0
0
2
0
0
1
0
0
1
0
1
2
2
2
0
1
2
2
0
2
1
1
2
1
0
1
2
0
0
2
1
2
0
1
0
2
2
0
2
0
0
1
0
1
0
0
2
1
1
1
2
1
2
2
2
2
