<a href="https://colab.research.google.com/github/ashagedo/MSCI-641-project/blob/main/Task_1_RoBERTa_tuned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import io
import os

import pandas as pd
import torch
from sklearn.metrics import accuracy_score, f1_score, classification_report
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm  # Import tqdm for progress tracking
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_scheduler
from transformers import get_linear_schedule_with_warmup


In [None]:
# Upload the training split through the Colab file picker.
from google.colab import files

# files.upload() returns a mapping keyed by the uploaded file names.
uploaded_files = files.upload()

# Fail right here when the dialog was dismissed: otherwise `train_file` would
# be undefined (or silently keep the text from an earlier run of this cell).
if not uploaded_files:
    raise ValueError('No file was uploaded; the training .jsonl file is required.')

# Read each uploaded file back as text. If several files were selected, only
# the content of the last one is kept in `train_file`.
for filename in uploaded_files.keys():
    print('Uploaded file:', filename)
    # JSON Lines text is UTF-8; do not rely on the platform default encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        train_file = file.read()

Saving train.jsonl to train.jsonl
Uploaded file: train.jsonl


In [None]:
# Upload the validation split through the Colab file picker.
from google.colab import files

# files.upload() returns a mapping keyed by the uploaded file names.
uploaded_files = files.upload()

# Fail right here when the dialog was dismissed: otherwise `val_file` would
# be undefined (or silently keep the text from an earlier run of this cell).
if not uploaded_files:
    raise ValueError('No file was uploaded; the validation .jsonl file is required.')

# Read each uploaded file back as text. If several files were selected, only
# the content of the last one is kept in `val_file`.
for filename in uploaded_files.keys():
    print('Uploaded file:', filename)
    # JSON Lines text is UTF-8; do not rely on the platform default encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        val_file = file.read()

Saving val.jsonl to val.jsonl
Uploaded file: val.jsonl


In [None]:
# Upload the test split through the Colab file picker.
from google.colab import files

# files.upload() returns a mapping keyed by the uploaded file names.
uploaded_files = files.upload()

# Fail right here when the dialog was dismissed: otherwise `test_file` would
# be undefined (or silently keep the text from an earlier run of this cell).
if not uploaded_files:
    raise ValueError('No file was uploaded; the test .jsonl file is required.')

# Read each uploaded file back as text. If several files were selected, only
# the content of the last one is kept in `test_file`.
for filename in uploaded_files.keys():
    print('Uploaded file:', filename)
    # JSON Lines text is UTF-8; do not rely on the platform default encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        test_file = file.read()

Saving test.jsonl to test.jsonl
Uploaded file: test.jsonl


In [None]:
# Load data into DataFrames.
# train_file / val_file / test_file hold the raw JSON Lines *text* of each
# split (not a path). Passing a literal JSON string to pd.read_json is
# deprecated, so each string is wrapped in a file-like StringIO buffer.
train_data = pd.read_json(io.StringIO(train_file), lines=True)
val_data = pd.read_json(io.StringIO(val_file), lines=True)
test_data = pd.read_json(io.StringIO(test_file), lines=True)

In [None]:
# Initialize models
# Use the GPU when one is available; the model and the loss weights below are
# placed on this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# RoBERTa tokenizer and sequence classifier
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)  # 3 output classes
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
# NOTE(review): num_training_steps is hard-coded. A linear schedule reaches a
# learning rate of zero after this many scheduler.step() calls, so it has to
# match the number of optimizer updates the training loop really performs.
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=10, num_training_steps=100)

# Training-set class counts, indexed by label id. The label encoding used for
# the targets is phrase=0, passage=1, multi=2, and the training set contains
# 1367 'phrase', 1274 'passage' and 559 'multi' examples, so the list must be
# in that order for each weight to land on the right class.
class_freq = [1367, 1274, 559]
inv_freq = [1 / freq for freq in class_freq]
total_classes = len(class_freq)
# Inverse-frequency weights: rarer classes contribute more to the loss. They
# live on the same device as the model so the loss can be applied to its logits.
class_weights = (torch.tensor(inv_freq, dtype=torch.float32) / total_classes).to(device)
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)

batch_size = 32


In [None]:
# Build the tokenized training and validation datasets

# Integer ids assigned to the spoiler-type tags
label_map = {'phrase': 0, 'passage': 1, 'multi': 2}


def _add_concatenated_text(frame):
    """Add a 'concatenated_text' column to `frame` (modified in place).

    Non-list cells of 'postText' and 'targetParagraphs' are replaced by empty
    lists, then both lists are space-joined and glued with a single space.
    """
    for column in ('postText', 'targetParagraphs'):
        frame[column] = frame[column].apply(lambda cell: cell if isinstance(cell, list) else [])
    frame['concatenated_text'] = frame.apply(
        lambda record: ' '.join(record['postText']) + ' ' + ' '.join(record['targetParagraphs']),
        axis=1,
    )


def _encode_split(frame):
    """Tokenize `frame` and return (encoding, label tensor, TensorDataset).

    The label of a row is the id of the first entry of its 'tags' value.
    """
    encoding = tokenizer(frame['concatenated_text'].tolist(),
                         padding=True, truncation=True, return_tensors='pt')
    first_tags = frame['tags'].apply(lambda tags: tags[0]).tolist()
    label_ids = torch.tensor([label_map[tag] for tag in first_tags], dtype=torch.long)
    dataset = TensorDataset(encoding['input_ids'], encoding['attention_mask'], label_ids)
    return encoding, label_ids, dataset


# Text preparation for both splits
_add_concatenated_text(train_data)
_add_concatenated_text(val_data)

# Tokenization and tensor datasets
tokenized_train_inputs, train_labels, train_dataset = _encode_split(train_data)
tokenized_val_inputs, val_labels, val_dataset = _encode_split(val_data)


In [None]:
# Batch iterators for the model: training batches are reshuffled on every
# pass, validation batches keep the dataset order.
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=batch_size)

In [None]:
## Model Training

num_epochs = 5
accumulation_steps = 4  # Accumulate gradients over 4 batches

# Rebuild the linear schedule so that it spans exactly the optimizer updates
# performed below. With a horizon shorter than the real number of updates the
# learning rate reaches zero early and the remaining updates change nothing.
num_batches = len(train_dataloader)
updates_per_epoch = -(-num_batches // accumulation_steps)  # ceiling division
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=10,
    num_training_steps=num_epochs * updates_per_epoch,
)

# Train with the class-weighted loss instead of the model's built-in
# unweighted loss; the weights must sit on the same device as the logits.
criterion.to(device)

model.train()
optimizer.zero_grad()  # start from clean gradients even when the cell is re-run

for epoch in range(num_epochs):
    total_loss = 0.0
    print(f'Epoch {epoch + 1}')
    for step, batch in enumerate(tqdm(train_dataloader, desc="Training", leave=False)):
        input_ids, attention_mask, labels = batch

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # No `labels=` here: the loss is computed explicitly from the logits.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        total_loss += loss.item()
        loss = loss / accumulation_steps  # Scale the loss
        loss.backward()

        # Update after every full accumulation group, and also after the last
        # batch so a trailing partial group is applied instead of leaking its
        # gradients into the next epoch.
        is_last_batch = (step + 1) == num_batches
        if (step + 1) % accumulation_steps == 0 or is_last_batch:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    # max(..., 1) keeps the report from dividing by zero on an empty loader.
    print(f'Epoch {epoch + 1}, Average Training Loss: {total_loss / max(num_batches, 1):.4f}')

In [None]:
# Save the fine-tuned model, then load it back from the same location as
# `tuned_model`, which is the object used for all evaluation below.
# NOTE(review): the path is under /content/drive - presumably Google Drive is
# expected to be mounted beforehand; no mount call is visible here, confirm.
checkpoint_path = '/content/drive/MyDrive/Task1/roberta_task1_batch32_epoch.5_lr.1e-4_paragraphs.bin'
model.save_pretrained(checkpoint_path)
tuned_model = RobertaForSequenceClassification.from_pretrained(checkpoint_path)


In [None]:
# Evaluate on Training Set

# Switch the reloaded fine-tuned model to inference mode on the active device
tuned_model.eval()
tuned_model.to(device)

predictions, true_labels = [], []
with torch.no_grad():  # no gradients are needed for scoring
    for input_ids, attention_mask, labels in train_dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        logits = tuned_model(input_ids, attention_mask=attention_mask).logits
        # Predicted class = index of the largest logit in each row
        predicted = logits.max(dim=1).indices
        predictions.extend(predicted.cpu().tolist())
        true_labels.extend(labels.cpu().tolist())

# Metrics over the whole training set (labels and predictions are collected
# in the same batch order, so shuffling does not affect them)
train_accuracy = accuracy_score(true_labels, predictions)
train_f1 = f1_score(true_labels, predictions, average='weighted')
train_report = classification_report(true_labels, predictions, digits=4)

print(f'Training Accuracy: {train_accuracy:.4f}')
print(f'Training F1 Score: {train_f1:.4f}')
print('Training Report:')
print(train_report)

Training Accuracy: 0.5156
Training F1 Score: 0.4997
Training Report:
              precision    recall  f1-score   support

           0     0.5098    0.7249    0.5986      1367
           1     0.5052    0.3799    0.4337      1274
           2     0.5872    0.3131    0.4084       559

    accuracy                         0.5156      3200
   macro avg     0.5341    0.4726    0.4802      3200
weighted avg     0.5215    0.5156    0.4997      3200



In [None]:
# Evaluate on Validation Set

# Switch the reloaded fine-tuned model to inference mode on the active device
tuned_model.eval()
tuned_model.to(device)

predictions, true_labels = [], []
with torch.no_grad():  # no gradients are needed for scoring
    for input_ids, attention_mask, labels in val_dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        logits = tuned_model(input_ids, attention_mask=attention_mask).logits
        # Predicted class = index of the largest logit in each row
        predicted = logits.max(dim=1).indices
        predictions.extend(predicted.cpu().tolist())
        true_labels.extend(labels.cpu().tolist())

# Metrics over the whole validation set
val_accuracy = accuracy_score(true_labels, predictions)
val_f1 = f1_score(true_labels, predictions, average='weighted')
val_report = classification_report(true_labels, predictions, digits=4)

print(f'Validation Accuracy: {val_accuracy:.4f}')
print(f'Validation F1 Score: {val_f1:.4f}')
print('Validation Report:')
print(val_report)

Validation Accuracy: 0.4550
Validation F1 Score: 0.4292
Validation Report:
              precision    recall  f1-score   support

           0     0.4475    0.7099    0.5489       162
           1     0.4348    0.3247    0.3717       154
           2     0.6071    0.2024    0.3036        84

    accuracy                         0.4550       400
   macro avg     0.4965    0.4123    0.4081       400
weighted avg     0.4761    0.4550    0.4292       400



In [None]:
# Evaluate on Test Set

# All predictions come from the reloaded fine-tuned model.
tuned_model.eval()
tuned_model.to(device)

# Build the model input text the same way as for the train/validation splits:
# non-list cells become empty lists, then post text and paragraphs are joined.
test_data['postText'] = test_data['postText'].apply(lambda x: x if isinstance(x, list) else [])
test_data['targetParagraphs'] = test_data['targetParagraphs'].apply(lambda x: x if isinstance(x, list) else [])
test_data['concatenated_text'] = test_data.apply(lambda row: ' '.join(row['postText']) + ' ' + ' '.join(row['targetParagraphs']), axis=1)

# Class index -> spoiler type name (phrase=0, passage=1, multi=2). A lookup
# table raises on an unexpected index instead of silently reusing the label
# of the previous row.
id_to_label = {0: 'phrase', 1: 'passage', 2: 'multi'}

predictions = []
# Inference only: disabling autograd avoids building a graph for every row.
with torch.no_grad():
    for _, row in test_data.iterrows():
        inputs = tokenizer(row['concatenated_text'], padding=True, truncation=True, return_tensors='pt')
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = tuned_model(**inputs)
        logits = outputs.logits
        # One row per call, so the logits hold a single example.
        predicted_label_idx = logits.argmax(dim=1).item()
        predictions.append(id_to_label[predicted_label_idx])

# NOTE(review): this file name says batch8 / epoch.2 while the checkpoint saved
# above is named batch32 / epoch.5 - confirm which name is intended.
output_file = r'/content/drive/MyDrive/Task1/roberta_task1_batch8_epoch.2_lr.1e-4_paragraphs.bin.csv'
test_data['spoilerType'] = predictions
test_data[['id', 'spoilerType']].to_csv(output_file, index=False)