In [1]:
"""
RoBerta transformer for classification finetuning for counterfactual classification
process is compute heavy so I can only run this on Free TPU in colab notebooks
careful with all the filepaths
"""
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.metrics import f1_score
import torch
import os
!pip install sentencepiece
import sentencepiece
from tqdm import tqdm



In [5]:
# Load the training and test dataset
prefix_path = "/content/gdrive/MyDrive/Colab/subtask-1/"
train_path = prefix_path + "train.csv"
test_path = prefix_path + "test.csv"
train_data = pd.read_csv(train_path, sep=',')
test_data = pd.read_csv(test_path, sep=',')
# Define input data and labels for training
train_sentences = train_data['sentence'].values
train_labels = train_data['gold_label'].values
test_sentences = test_data['sentence'].values
test_labels = test_data['gold_label'].values

# Tokenize the input sentences
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model_name = "roberta-base"
roberta_model = RobertaForSequenceClassification.from_pretrained(model_name)
tokenizer = RobertaTokenizer.from_pretrained(model_name)

train_tokenized_inputs = tokenizer(train_sentences.tolist(), padding=True, truncation=True, return_tensors="np")
test_tokenized_inputs = tokenizer(test_sentences.tolist(), padding=True, truncation=True, return_tensors="np")
train_labels = torch.tensor(train_labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [11]:
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(train_tokenized_inputs['input_ids']),
    torch.tensor(train_tokenized_inputs['attention_mask']),
    torch.tensor(train_labels)
)
test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(test_tokenized_inputs['input_ids']),
    torch.tensor(test_tokenized_inputs['attention_mask']),
    torch.tensor(test_labels)
)

  torch.tensor(train_labels)


In [7]:
# Define model parameters
num_labels = 2  # Assuming binary classification
batch_size = 32
epochs = 5
learning_rate = 2e-5

# Create DataLoader for training
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Freeze all layers except the last one
for param in roberta_model.base_model.parameters():
    param.requires_grad = False

# Make sure the last layer is set to require gradients
for param in roberta_model.roberta.encoder.layer[-1].parameters():
    param.requires_grad = True

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
roberta_model.to(device)

# Define optimizer and loss function
optimizer = AdamW(roberta_model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()



In [8]:
# Training loop
losses = []
for epoch in range(epochs):
    roberta_model.train()
    total_loss = 0.0
    tqdm_iterator = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}', unit='batch')

    for batch in tqdm_iterator:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = roberta_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    tqdm_iterator.close()
    average_loss = total_loss / len(train_dataloader)
    losses.append(average_loss)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {average_loss}")

# Plot the loss
plt.plot(losses, label='Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss over Epochs')
plt.legend()
plt.show()

Epoch 1/5, Loss: 0.2548892955874672
Epoch 2/5, Loss: 0.11815759841883577
Epoch 3/5, Loss: 0.10224549632510331
Epoch 4/5, Loss: 0.08784553565368679
Epoch 5/5, Loss: 0.08158122913005787


In [10]:
# Save the fine-tuned model
output_model_path = "/content/gdrive/MyDrive/Colab/subtask-1/fine_tuned_model"
roberta_model.save_pretrained(output_model_path)

In [12]:
# Load the fine-tuned model, turn true to do load model
if False:
  roberta_model = RobertaForSequenceClassification.from_pretrained(output_model_path)
  roberta_model.to(device)

# Create DataLoader for testing
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Evaluate the model on the test set
roberta_model.eval()
test_predictions = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = roberta_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)
        test_predictions.extend(predictions.cpu().numpy())

# Export test predictions to CSV
test_output_path = "/content/gdrive/MyDrive/Colab/subtask-1/test_predictions.csv"
test_data['predicted_label'] = test_predictions
test_data.to_csv(test_output_path, index=False)

# Compute F1 score on the test set
test_f1 = f1_score(test_labels, test_predictions)
print(f"F1 Score on Test Set: {test_f1}")

F1 Score on Test Set: 0.8664310954063604


In [13]:
# Evaluate the model on the training set
train_predictions = []

with torch.no_grad():
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = roberta_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)
        train_predictions.extend(predictions.cpu().numpy())

# Compute F1 score on the training set
train_f1 = f1_score(train_labels, train_predictions)
print(f"F1 Score on Training Set: {train_f1}")

F1 Score on Training Set: 0.10124777183600712
