# Evaluate RoBERTa Model
This notebook uses the fine-tuned RoBERTa models to make predictions on the test set.


To run this notebook successfully, you may need an online computing resource. In my case, I use Google Colab with a GPU to run the RoBERTa model.

In order to get the access and download required data, we'll need to mount our google drive to the colab environment.

One can skip the following block, if not using google colab.

In [1]:
#### Skip this block if not using google colab. ####

import os
import sys

from google.colab import drive

# Mount Google Drive inside the Colab VM.
drive.mount('/content/drive')

# TODO: Enter the foldername in your Drive where this repo is located.
FOLDERNAME = "Academics/DATA512/Project/llm-roberta-sentiment" # <--- ENTER FOLDERNAME HERE
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Resolve the repo location on the mounted Drive once, then:
#   * add it to sys.path so the interpreter can load python files from it, and
#   * make it the working directory so the relative paths used below
#     ("data", "models", "results") resolve inside it.
repo_dir = '/content/drive/MyDrive/{}'.format(FOLDERNAME)
sys.path.append(repo_dir)
os.chdir(repo_dir)


Mounted at /content/drive


## 0. Settings

Load library

In [2]:
import os

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import RobertaForSequenceClassification, RobertaTokenizer


In [None]:
# Set device: use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create necessary directories (relative to the current working directory).
# exist_ok=True makes this a no-op for directories that already exist.
# `os` comes from the notebook's import cell, so this also works when the
# optional Colab cell has been skipped.
for directory in ("data", "models", "results"):
    os.makedirs(directory, exist_ok=True)

## 1. Predictions

### RoBERTa model (10k training; 5 Class)

Load tokenized data and models

In [4]:
# Test split for the 10k 5-class model; it must provide the 'rating',
# 'input_ids' and 'attention_mask' columns read by create_dataloader below.
# Original note: the test data is the same for all of the 5-class models.
test = pd.read_parquet("data/test_10k_5.parquet")

# Directory of the fine-tuned checkpoint; the classifier and its tokenizer
# are both loaded from it.
model_path = "models/sentiment_model_10k_5"
tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path)

In [7]:
# Similar to the create_dataloader function in helper_finetune_bert.ipynb; redefined here for batch processing.
def create_dataloader(data, batch_size=32):
    """Wrap a tokenized dataframe in a DataLoader for batched evaluation.

    Args:
        data: DataFrame with a 'rating' column (class labels) and
            'input_ids' / 'attention_mask' columns whose cells are
            equal-length integer sequences (lists or arrays).
        batch_size: Number of rows per batch.

    Returns:
        A DataLoader yielding (input_ids, attention_mask, labels) tuples in
        the same row order as `data` (no shuffling). Labels are torch.long.

    Raises:
        ValueError: if the sequences within a column have different lengths.
    """

    def _stack_column(column):
        # Convert through NumPy instead of passing the Series to torch.tensor:
        # torch reads sequences positionally (series[0]), which breaks when the
        # index is not 0..n-1, and is slow when every cell is its own array.
        cells = column.to_numpy()
        if len(cells) == 0:
            # np.stack rejects an empty input; keep an empty frame usable.
            return torch.empty((0, 0), dtype=torch.long)
        return torch.from_numpy(np.stack(cells))

    labels = torch.tensor(data['rating'].to_numpy(), dtype=torch.long)
    input_ids = _stack_column(data['input_ids'])
    attention_mask = _stack_column(data['attention_mask'])
    dataset = TensorDataset(input_ids, attention_mask, labels)

    return DataLoader(dataset, batch_size=batch_size)

# Batch the test dataframe loaded above for inference (default batch_size=32).
test_dataloader = create_dataloader(test)

Make predictions

In [None]:
# Run the fine-tuned model over every test batch and keep the predicted class per row.
model.to(device)
model.eval()  # inference mode (disables dropout)

batch_preds = []
with torch.no_grad():  # no gradients are needed for prediction
    for batch in test_dataloader:
        # Each batch is (input_ids, attention_mask, labels); the labels are not
        # needed to predict, so only the model inputs are moved to the device.
        b_input_ids, b_attention_mask = (t.to(device) for t in batch[:2])
        outputs = model(b_input_ids, attention_mask=b_attention_mask)
        batch_preds.append(torch.argmax(outputs.logits, dim=1))

# Concatenate once at the end. Seeding with an empty float tensor and calling
# torch.cat per batch re-copied all earlier predictions each step and promoted
# the class ids to floats.
preds = torch.cat(batch_preds) if batch_preds else torch.empty(0, dtype=torch.long)

# change to np.array (integer class ids)
preds_np = preds.cpu().numpy()
# add predictions to test dataframe
test['pred_10k_5'] = preds_np

Save data

In [None]:
# Write the test dataframe, including the prediction column added so far, to CSV without the index.
test.to_csv('results/test_5_bert_all.csv', index=False)

### RoBERTa model (50k training; 5 Class)

In [None]:
# Fine-tuned 50k / 5-class checkpoint. Checkpoints are read from "models/",
# the directory created in the Settings section and used for the 10k model.
model_path = "models/sentiment_model_50k_5"

model = RobertaForSequenceClassification.from_pretrained(model_path)
tokenizer = RobertaTokenizer.from_pretrained(model_path)

# Test split for this model. Original note: the test data is the same for all
# of the 5-class models.
tokenized_test = pd.read_parquet("data/test_50k_5.parquet")
test_dataloader = create_dataloader(tokenized_test)

In [None]:
# Run the fine-tuned model over every test batch and keep the predicted class per row.
model.to(device)
model.eval()  # inference mode (disables dropout)

batch_preds = []
with torch.no_grad():  # no gradients are needed for prediction
    for batch in test_dataloader:
        # Each batch is (input_ids, attention_mask, labels); the labels are not
        # needed to predict, so only the model inputs are moved to the device.
        b_input_ids, b_attention_mask = (t.to(device) for t in batch[:2])
        outputs = model(b_input_ids, attention_mask=b_attention_mask)
        batch_preds.append(torch.argmax(outputs.logits, dim=1))

# Concatenate once at the end. Seeding with an empty float tensor and calling
# torch.cat per batch re-copied all earlier predictions each step and promoted
# the class ids to floats.
preds = torch.cat(batch_preds) if batch_preds else torch.empty(0, dtype=torch.long)
preds_np = preds.cpu().numpy()

# NOTE(review): the predictions come from `tokenized_test` but are stored on
# `test`; this assumes both hold the same rows in the same order — confirm.
test['pred_50k_5'] = preds_np

In [None]:
# Checkpoint the 5-class results collected so far. Re-reading the CSV at this
# point would replace `test` with the copy saved before 'pred_50k_5' was added,
# silently dropping that column from the final results file.
test.to_csv('results/test_5_bert_all.csv', index=False)

### RoBERTa model (100k training; 5 Class)

In [None]:
# Fine-tuned 100k / 5-class checkpoint. Checkpoints are read from "models/",
# the directory created in the Settings section and used for the 10k model.
model_path = "models/sentiment_model_100k_5"

model = RobertaForSequenceClassification.from_pretrained(model_path)
tokenizer = RobertaTokenizer.from_pretrained(model_path)

# Test split for this model. Original note: the test data is the same for all
# of the 5-class models.
tokenized_test = pd.read_parquet("data/test_100k_5.parquet")
test_dataloader = create_dataloader(tokenized_test)

In [None]:
# Run the fine-tuned model over every test batch and keep the predicted class per row.
model.to(device)
model.eval()  # inference mode (disables dropout)

batch_preds = []
with torch.no_grad():  # no gradients are needed for prediction
    for batch in test_dataloader:
        # Each batch is (input_ids, attention_mask, labels); the labels are not
        # needed to predict, so only the model inputs are moved to the device.
        b_input_ids, b_attention_mask = (t.to(device) for t in batch[:2])
        outputs = model(b_input_ids, attention_mask=b_attention_mask)
        batch_preds.append(torch.argmax(outputs.logits, dim=1))

# Concatenate once at the end. Seeding with an empty float tensor and calling
# torch.cat per batch re-copied all earlier predictions each step and promoted
# the class ids to floats.
preds = torch.cat(batch_preds) if batch_preds else torch.empty(0, dtype=torch.long)
preds_np = preds.cpu().numpy()

# NOTE(review): the predictions come from `tokenized_test` but are stored on
# `test`; this assumes both hold the same rows in the same order — confirm.
test['pred_100k_5'] = preds_np

In [None]:
# Write the test dataframe with the 5-class prediction columns it currently holds to CSV, without the index.
test.to_csv('results/test_5_bert_all.csv', index= False)

### RoBERTa model (10k training; 2 Class)

In [None]:
# Replace `test` with the 2-class test split read from parquet.
# NOTE(review): the next cell reads the same file again, so this cell looks redundant.
test = pd.read_parquet("data/test_10k_2.parquet")

In [None]:
# Fine-tuned 10k / 2-class checkpoint. Checkpoints are read from "models/",
# the directory created in the Settings section and used for the first model.
model_path = "models/sentiment_model_10k_2"

model = RobertaForSequenceClassification.from_pretrained(model_path)
tokenizer = RobertaTokenizer.from_pretrained(model_path)

# 2-class test split; `test` also collects the 2-class prediction columns below.
test = pd.read_parquet("data/test_10k_2.parquet")
test_dataloader = create_dataloader(test)


In [None]:
# Run the fine-tuned model over every test batch and keep the predicted class per row.
model.to(device)
model.eval()  # inference mode (disables dropout)

batch_preds = []
with torch.no_grad():  # no gradients are needed for prediction
    for batch in test_dataloader:
        # Each batch is (input_ids, attention_mask, labels); the labels are not
        # needed to predict, so only the model inputs are moved to the device.
        b_input_ids, b_attention_mask = (t.to(device) for t in batch[:2])
        outputs = model(b_input_ids, attention_mask=b_attention_mask)
        batch_preds.append(torch.argmax(outputs.logits, dim=1))

# Concatenate once at the end. Seeding with an empty float tensor and calling
# torch.cat per batch re-copied all earlier predictions each step and promoted
# the class ids to floats.
preds = torch.cat(batch_preds) if batch_preds else torch.empty(0, dtype=torch.long)
preds_np = preds.cpu().numpy()

# add predictions to test dataframe
test['pred_10k_2'] = preds_np


In [None]:
# Write the 2-class test dataframe, including the prediction column added so far, to CSV without the index.
test.to_csv('results/test_2_bert_all.csv', index= False)

### RoBERTa model (50k training; 2 Class)

In [None]:
# Fine-tuned 50k / 2-class checkpoint. Checkpoints are read from "models/",
# the directory created in the Settings section and used for the first model.
model_path = "models/sentiment_model_50k_2"

model = RobertaForSequenceClassification.from_pretrained(model_path)
tokenizer = RobertaTokenizer.from_pretrained(model_path)

# Test split used to build the evaluation batches for this model.
tokenized_test = pd.read_parquet("data/test_50k_2.parquet")
test_dataloader = create_dataloader(tokenized_test)

In [None]:
# Run the fine-tuned model over every test batch and keep the predicted class per row.
model.to(device)
model.eval()  # inference mode (disables dropout)

batch_preds = []
with torch.no_grad():  # no gradients are needed for prediction
    for batch in test_dataloader:
        # Each batch is (input_ids, attention_mask, labels); the labels are not
        # needed to predict, so only the model inputs are moved to the device.
        b_input_ids, b_attention_mask = (t.to(device) for t in batch[:2])
        outputs = model(b_input_ids, attention_mask=b_attention_mask)
        batch_preds.append(torch.argmax(outputs.logits, dim=1))

# Concatenate once at the end. Seeding with an empty float tensor and calling
# torch.cat per batch re-copied all earlier predictions each step and promoted
# the class ids to floats.
preds = torch.cat(batch_preds) if batch_preds else torch.empty(0, dtype=torch.long)
preds_np = preds.cpu().numpy()

# NOTE(review): the predictions come from `tokenized_test` but are stored on
# `test`; this assumes both hold the same rows in the same order — confirm.
test['pred_50k_2'] = preds_np

In [None]:
# Write the test dataframe with the 2-class prediction columns it currently holds to CSV, without the index.
test.to_csv('results/test_2_bert_all.csv', index= False)