# AI6103 Deep Learning & Applications Project

## Preparation

In [None]:
# from google.colab import drive
# drive.mount("/content/drive")

In [None]:
# !nvidia-smi

In [None]:
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
import requests
import json
import zipfile
import time
from datetime import datetime
import pytz
import io
import os
import re
import transformers
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [None]:
# path setting
# PROJECT_PATH = "./drive/MyDrive/DL_Project"
PROJECT_PATH = os.getcwd()
DATA_PATH = "data"
TRAIN_DATA_PATH = "semeval-2020-task-7-dataset/subtask-1/train.csv"
TEST_DATA_PATH = "semeval-2020-task-7-dataset/subtask-1/test.csv"
VAL_DATA_PATH = "semeval-2020-task-7-dataset/subtask-1/dev.csv"
FIG_PATH = "figures"
MODEL_PATH = "models"
RES_CSV_PATH = "res_csvs"
CURR_MODEL_PATH = ""

# generate folders if not exist
# (exist_ok=True makes this safe to re-run and avoids the check-then-create race)
for folder in (DATA_PATH, FIG_PATH, MODEL_PATH, RES_CSV_PATH):
    os.makedirs(os.path.join(PROJECT_PATH, folder), exist_ok=True)

# reproducibility: seed the generators that drive data shuffling and the
# random initialisation of the regression head
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# model settings
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MAX_LEN = 256                     # token length every sentence is padded/truncated to
MODEL_NAME = "bert-base-uncased"
EPOCH_NUM = 20
LR = 1e-4
BATCH_SIZE = 32
HUMOR_THRESHOLD = 0.94            # scores above this value are counted as "humorous"

## Dataset Humicroedit

In [None]:
# download dataset
# url references: https://huggingface.co/datasets/humicroedit

data_url = "https://cs.rochester.edu/u/nhossain/semeval-2020-task-7-dataset.zip"
data_path = os.path.join(PROJECT_PATH, DATA_PATH)

# Only hit the network when one of the expected CSV files is missing, so that
# "Restart & Run All" does not re-download the archive every time.
expected_files = [
    os.path.join(data_path, split_path)
    for split_path in (TRAIN_DATA_PATH, TEST_DATA_PATH, VAL_DATA_PATH)
]
if not all(os.path.exists(csv_file) for csv_file in expected_files):
    response = requests.get(data_url, timeout=60)
    # Fail loudly on HTTP errors instead of handing an error page to ZipFile.
    response.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
        zip_file.extractall(path=data_path)

In [None]:
# Read the three dataset splits into dataframes.
#
# Sizes as noted by the notebook author (check with len() after loading):
#   train set:      9652 rows
#   test set:       3042 rows
#   validation set: 2419 rows
# columns: id | original | edit | grades | meanGrade

csv_root = os.path.join(PROJECT_PATH, DATA_PATH)
train_df, test_df, val_df = (
    pd.read_csv(os.path.join(csv_root, split_path))
    for split_path in (TRAIN_DATA_PATH, TEST_DATA_PATH, VAL_DATA_PATH)
)

# Hide the real test grades from the model: keep them in "originalGrade"
# and overwrite "meanGrade" with zeros.
test_df["originalGrade"] = test_df["meanGrade"]
test_df["meanGrade"] = [0] * len(test_df)

display(train_df.head(5))
# display(test_df)
# display(val_df)


In [None]:
# define dataset for data loader

class HumicroeditDataset(Dataset):
    """Torch dataset that yields tokenized *edited* headlines and their grade.

    Each row of ``df`` is expected to provide the columns ``original`` (a
    headline in which the word to replace is marked as ``<word/>``), ``edit``
    (the replacement word) and ``meanGrade`` (the regression target).

    Args:
        df: dataframe holding the columns described above.
        tokenizer: object exposing ``encode_plus`` (e.g. a BERT tokenizer).
        max_len: length every sentence is padded/truncated to. When omitted,
            the notebook-level ``MAX_LEN`` constant is used.
    """

    # Matches the whole "<word/>" marker (non-greedy, so it stops at the first "/>").
    _EDIT_SPAN = re.compile(r"<.+?/>")

    def __init__(self, df, tokenizer, max_len=None):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = MAX_LEN if max_len is None else max_len

    def __len__(self):
        return len(self.df)

    @classmethod
    def _apply_edit(cls, original, edit):
        """Return ``original`` with its ``<word/>`` marker replaced by ``edit``."""
        # A function replacement keeps ``edit`` literal: backslashes or "\1" in
        # the edit word must not be interpreted as regex escape sequences.
        return cls._EDIT_SPAN.sub(lambda _match: edit, original)

    def __getitem__(self, idx):
        # Positional access so the dataset also works on filtered/re-indexed frames.
        original = str(self.df["original"].iloc[idx])
        edit = str(self.df["edit"].iloc[idx])
        label = self.df["meanGrade"].iloc[idx]

        # generate the complete new sentence as text
        text = self._apply_edit(original, edit)

        # tokenization
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer returns (1, max_len) tensors; flatten to (max_len,) so
        # the DataLoader can stack them into (batch, max_len).
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "token_type_ids": encoding["token_type_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(label, dtype=torch.float)
        }


## Model Setting and Training

In [None]:
# apply BERT as tokenizer and model
# (num_labels=1 puts a single-output regression head on top of BERT)
bert_tokenizer = transformers.BertTokenizer.from_pretrained(MODEL_NAME)
bert_model = transformers.BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1)
bert_model = bert_model.to(device)

# Freeze most of BERT: only the parameters whose name mentions the last two
# encoder layers, the pooler or the classifier head stay trainable. Everything
# else (including the embeddings) is frozen.
trainable_name_parts = ("layer.10", "layer.11", "pooler", "classifier")
for name, param in bert_model.named_parameters():
    if not any(part in name for part in trainable_name_parts):
        param.requires_grad = False

# generate data loader based on dataset
train_ds = HumicroeditDataset(train_df, bert_tokenizer)
test_ds = HumicroeditDataset(test_df, bert_tokenizer)
val_ds = HumicroeditDataset(val_df, bert_tokenizer)
train_dl = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_dl = DataLoader(dataset=test_ds, batch_size=BATCH_SIZE, shuffle=False)
val_dl = DataLoader(dataset=val_ds, batch_size=BATCH_SIZE, shuffle=False)

# apply optimizer
# transformers.AdamW is deprecated and missing from recent releases, so use the
# PyTorch implementation. eps and weight_decay are set explicitly to the old
# transformers defaults so the optimisation behaves as before.
optimizer = torch.optim.AdamW(
    [param for param in bert_model.parameters() if param.requires_grad],
    lr=LR,
    eps=1e-6,
    weight_decay=0.0,
)

In [None]:
# Per-epoch history, one entry appended per epoch.
train_loss = []
val_loss = []
train_acc = []
val_acc = []
val_loss_min = float("inf")

# get current time in format for naming
# ("-" instead of ":" between the time fields: colons are not valid in Windows file names)
time_zone = pytz.timezone("Asia/Singapore")
curr_time = datetime.now(time_zone).strftime("%Y-%m-%d-%H-%M-%S")
file_name = str(curr_time) + (".pt")
CURR_MODEL_PATH = os.path.join(PROJECT_PATH, MODEL_PATH, file_name)


def _run_epoch(model, data_loader, optimizer=None):
    """Run one full pass over ``data_loader``.

    When ``optimizer`` is given the model is put in training mode and its
    weights are updated after every batch; otherwise the model is evaluated
    without gradient tracking.

    Returns:
        (mean_loss, true_scores, predicted_scores) where ``mean_loss`` is the
        loss averaged over *samples* and the two lists hold one float per sample.
    """
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum = 0.0
    sample_count = 0
    true_scores = []
    predicted_scores = []

    with torch.set_grad_enabled(is_training):
        for batch in tqdm(data_loader):
            input_ids = batch["input_ids"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            label = batch["label"].to(device)

            if is_training:
                optimizer.zero_grad()

            output = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=label
            )
            loss, logits = output[:2]

            if is_training:
                loss.backward()
                optimizer.step()

            # The model returns the batch *mean*; weight it by the batch size so
            # that dividing by the sample count gives a true per-sample average
            # (the last batch is usually smaller than the others).
            batch_len = label.size(0)
            loss_sum += loss.item() * batch_len
            sample_count += batch_len

            true_scores += label.tolist()
            predicted_scores += logits.detach().reshape(-1).tolist()

    return loss_sum / max(sample_count, 1), true_scores, predicted_scores


def _to_humor_labels(scores):
    """Binarise scores: 1 when above HUMOR_THRESHOLD, else 0."""
    return [1 if float(score) > HUMOR_THRESHOLD else 0 for score in scores]


for epoch in range(EPOCH_NUM):

    start_time = time.time()

    # training, then evaluating on the validation split
    curr_train_loss, train_true, train_pred = _run_epoch(bert_model, train_dl, optimizer)
    curr_val_loss, val_true, val_pred = _run_epoch(bert_model, val_dl)
    train_loss.append(curr_train_loss)
    val_loss.append(curr_val_loss)

    # save the model if optimized
    if curr_val_loss < val_loss_min:
        val_loss_min = curr_val_loss
        torch.save(bert_model.state_dict(), CURR_MODEL_PATH)
        print(f"\n++++++ Model optimized at epoch {epoch+1:02}! ++++++\n")

    # calculate training and validation accuracy for this epoch
    curr_train_acc = accuracy_score(_to_humor_labels(train_true), _to_humor_labels(train_pred))
    curr_val_acc = accuracy_score(_to_humor_labels(val_true), _to_humor_labels(val_pred))
    train_acc.append(curr_train_acc)
    val_acc.append(curr_val_acc)

    # record training and evaluation time
    mins_interval, secs_interval = divmod(int(time.time() - start_time), 60)

    print(f"Epoch: {epoch+1:02} | Epoch Time: {mins_interval}m {secs_interval}s")
    print(f"\tTrain Loss: {curr_train_loss:.5f} | Train Acc: {curr_train_acc*100:.5f}%")
    print(f"\t Val. Loss: {curr_val_loss:.5f} | Val. Acc: {curr_val_acc*100:.5f}%\n")


In [None]:
# plot graph

# Take the x-axis from the recorded history rather than from EPOCH_NUM, so the
# plot still works when training was interrupted or EPOCH_NUM was edited since.
epochs = range(1, len(train_loss) + 1)
max_loss = max(max(train_loss), max(val_loss))

# plot loss (left axis)
fig, ax1 = plt.subplots()
ax1.set_xlabel("Epoch")
ax1.set_ylabel("Loss")
ax1.set_ylim([0, max_loss+0.02])
plt1 = ax1.plot(epochs, train_loss, 'yo-', label='Training Loss')
plt2 = ax1.plot(epochs, val_loss, 'ro-', label='Validation Loss')

# plot accuracy (right axis, sharing the epoch axis)
ax2 = ax1.twinx()
ax2.set_ylabel("Accuracy")
ax2.set_ylim([0,1])
plt3 = ax2.plot(epochs, train_acc, 'bo-', label='Training Accuracy')
plt4 = ax2.plot(epochs, val_acc, 'co-', label='Validation Accuracy')

# one combined legend for the lines of both axes
plts = plt1 + plt2 + plt3 + plt4
labs = [p.get_label() for p in plts]
ax2.legend(plts, labs, loc=0)
# fig.tight_layout()
fig_name = "Training and Validation Metrics"
plt.title(fig_name)
# the figure is stored next to the checkpoint name: <timestamp>.png
plt.savefig(os.path.join(PROJECT_PATH, FIG_PATH, file_name.replace(".pt", ".png")))
plt.show()

## Testing

In [None]:
# testing
res = []
# map_location lets a checkpoint written on GPU be loaded on a CPU-only machine
bert_model.load_state_dict(torch.load(CURR_MODEL_PATH, map_location=device))
bert_model.eval()

with torch.no_grad():

    # load data
    for test_data in tqdm(test_dl):
        input_ids = test_data["input_ids"].to(device)
        token_type_ids = test_data["token_type_ids"].to(device)
        attention_mask = test_data["attention_mask"].to(device)

        # apply model
        # No labels are passed, so no loss is computed and the first output
        # element is the logits.
        output = bert_model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask
        )
        logits = output[0]
        res.extend(round(score, 5) for score in logits.reshape(-1).tolist())

# The real grades are kept in "originalGrade" until this cell has run once;
# afterwards they are back in "meanGrade". Supporting both keeps the cell re-runnable.
grade_column = "originalGrade" if "originalGrade" in test_df.columns else "meanGrade"
true_grades = test_df[grade_column]

# calculate RMSE loss
rmse = np.sqrt(np.mean((np.array(true_grades.tolist()) - np.array(res)) ** 2))

# add prediction and humor result in test dataframe
test_df = (
    test_df
    .assign(
        meanGrade=true_grades.tolist(),
        predictedScores=res,
        ifHumorOriginal=lambda frame: (frame["meanGrade"].astype(float) > HUMOR_THRESHOLD).astype(int),
        ifHumorPredicted=lambda frame: (frame["predictedScores"].astype(float) > HUMOR_THRESHOLD).astype(int),
    )
    .drop(columns="originalGrade", errors="ignore")
)

display(test_df)
test_df.to_csv(os.path.join(PROJECT_PATH, RES_CSV_PATH,
                         file_name.replace(".pt", ".csv")), index=False)

# calculate metrics
if_humor_true = test_df["ifHumorOriginal"].tolist()
if_humor_predicted = test_df["ifHumorPredicted"].tolist()
precision, recall, f1, support = precision_recall_fscore_support(
    if_humor_true, if_humor_predicted, average='binary')

print("Test Results:")
print(f"RMSE Loss: {rmse:.5f}")
print(f"Precision: {precision:.5f}")
print(f"Recall: {recall:.5f}")
print(f"F1-measure: {f1:.5f}")
