# IRE Project phase 2 (sadness)
## Fine tune transformer for emotion intensity regression

## Setup

In [None]:
# Install the Hugging Face `transformers` package; the tokenizer and encoder used below are loaded from it.
# NOTE(review): the version is unpinned, so a fresh run may pull a different release than the one that produced the recorded outputs.
!pip install transformers



In [None]:
# Importing the libraries needed
import pandas as pd
import numpy as np
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer
import re
from scipy.stats import pearsonr

In [None]:
# Select the compute device: use the GPU when torch can see one, otherwise the CPU.

from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print(device)

cuda


## Data handling

In [None]:
!wget http://www.saifmohammad.com/WebDocs/AIT-2018/AIT2018-DATA/EI-reg/English/EI-reg-En-train.zip
!wget http://saifmohammad.com/WebDocs/AIT-2018/AIT2018-DATA/EI-reg/English/2018-EI-reg-En-dev.zip
!unzip -q /content/EI-reg-En-train.zip
!unzip -q /content/2018-EI-reg-En-dev.zip

--2021-11-18 06:21:18--  http://www.saifmohammad.com/WebDocs/AIT-2018/AIT2018-DATA/EI-reg/English/EI-reg-En-train.zip
Resolving www.saifmohammad.com (www.saifmohammad.com)... 192.185.17.122
Connecting to www.saifmohammad.com (www.saifmohammad.com)|192.185.17.122|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 356461 (348K) [application/zip]
Saving to: ‘EI-reg-En-train.zip’


2021-11-18 06:21:18 (2.79 MB/s) - ‘EI-reg-En-train.zip’ saved [356461/356461]

--2021-11-18 06:21:18--  http://saifmohammad.com/WebDocs/AIT-2018/AIT2018-DATA/EI-reg/English/2018-EI-reg-En-dev.zip
Resolving saifmohammad.com (saifmohammad.com)... 192.185.17.122
Connecting to saifmohammad.com (saifmohammad.com)|192.185.17.122|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 83779 (82K) [application/zip]
Saving to: ‘2018-EI-reg-En-dev.zip’


2021-11-18 06:21:19 (1.26 MB/s) - ‘2018-EI-reg-En-dev.zip’ saved [83779/83779]



In [None]:
# Defining some key variables that will be used later on in the training
# MAX_LEN is handed to the Triage datasets as the tokenizer's max_length (padding + truncation target).
MAX_LEN = 64
# Batch size used for training_loader (via train_params).
TRAIN_BATCH_SIZE = 2
# Batch size used for testing_loader (via test_params).
VALID_BATCH_SIZE = 1
# Number of times the driver loop calls train().
EPOCHS = 10
# Learning rate passed to the Adam optimizer.
LEARNING_RATE = 1e-05
from transformers import AutoTokenizer
# Tokenizer matching the 'roberta-base' checkpoint that the model class loads.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [None]:
def pre_process(tweet):
  """Normalise a raw tweet before tokenisation.

  Steps, in order: lower-case, drop non-ASCII characters, strip links,
  e-mail addresses and @-mentions, drop '#' signs (the tag word is kept),
  turn literal backslash-n / backslash-t sequences into spaces, and
  collapse runs of whitespace.

  Args:
    tweet: Raw tweet text (str).

  Returns:
    The cleaned, lower-cased tweet with single spaces between words.
  """
  tweet = tweet.lower()
  tweet = tweet.encode("ascii", "ignore").decode() # removes emoticons and non-English characters
  # Links are removed first so an '@' inside a URL disappears with the URL
  # instead of being treated as a mention.
  tweet = re.sub(r"htt(p|ps)\S+", " ", tweet) # removes links in the tweet
  # E-mails are removed before mentions so the mention pattern cannot eat the
  # '@domain' part of an address and leave the rest behind.
  tweet = re.sub(r'[a-z0-9._%-]+@[a-z0-9.-]+\.[a-z]{2,4}', " ", tweet) # removes email
  # No trailing-whitespace requirement: mentions at the end of the tweet or
  # followed by punctuation ("@user:", "@user!") are removed as well.
  tweet = re.sub(r"@[a-z0-9_]+", " ", tweet) # removes username mentions
  tweet = re.sub(r"#", "", tweet)
  # Matches the two-character sequences backslash+n / backslash+t, not real
  # control characters (those are handled by the split() below).
  tweet = re.sub(r"\\n|\\t", " ", tweet)
  tweet = " ".join(tweet.split()) # removing multiple spaces between words
  return tweet

In [None]:
def load_dataset(emotion):
  """Load the EI-reg train and dev files for one emotion.

  Reads ``EI-reg-En-{emotion}-train.txt`` and ``2018-EI-reg-En-{emotion}-dev.txt``
  (tab-separated) from the working directory, cleans every tweet with
  ``pre_process`` and reshapes the "Intensity Score" column to ``(-1, 1)``.

  Args:
    emotion: Emotion name used to build the two file names.

  Returns:
    Tuple ``(train_sentences, y_train, test_sentences, y_test)``: the
    sentence entries are lists of cleaned strings, the ``y`` entries are
    numpy column vectors. The "test" pair comes from the dev file.
  """
  train_frame = pd.read_csv(f"EI-reg-En-{emotion}-train.txt", delimiter='\t')
  dev_frame = pd.read_csv(f"2018-EI-reg-En-{emotion}-dev.txt", delimiter='\t')

  def split_columns(frame):
    # Cleaned tweets as a list, scores as an (n, 1) column vector.
    cleaned = [pre_process(text) for text in frame["Tweet"].tolist()]
    scores = frame["Intensity Score"].to_numpy().reshape((-1, 1))
    return cleaned, scores

  train_sentences, y_train = split_columns(train_frame)
  test_sentences, y_test = split_columns(dev_frame)

  return train_sentences, y_train, test_sentences, y_test

In [None]:
class Triage(Dataset):
    """Map-style dataset that tokenises one sentence per item and pairs it with its target."""

    def __init__(self, X_train, y_train, tokenizer, max_len):
        """Store the inputs.

        Args:
            X_train: Indexable collection of sentences.
            y_train: Indexable collection of targets, aligned with ``X_train``.
            tokenizer: Object exposing ``encode_plus``.
            max_len: Value passed to the tokenizer as ``max_length``.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.len = len(X_train)

    def __len__(self):
        """Number of sentences captured at construction time."""
        return self.len

    def __getitem__(self, index):
        """Tokenise sentence ``index`` and return its tensors.

        Returns:
            Dict with ``'ids'`` and ``'mask'`` (long tensors built from the
            tokenizer's ``input_ids`` / ``attention_mask``) and ``'targets'``
            (float tensor built from ``y_train[index]``).
        """
        encoded = self.tokenizer.encode_plus(
            self.X_train[index],
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )

        sample = {}
        sample['ids'] = torch.tensor(encoded['input_ids'], dtype=torch.long)
        sample['mask'] = torch.tensor(encoded['attention_mask'], dtype=torch.long)
        sample['targets'] = torch.tensor(self.y_train[index], dtype=torch.float)
        return sample

In [None]:
# Keyword arguments for the two DataLoaders: the training loader shuffles,
# the evaluation loader keeps the dataset order; both load in the main process.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)

test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

## Network

In [None]:
# Creating the customized model: a dense layer with ReLU, dropout, and a final single-output dense layer on top of RoBERTa produce the regression output.
from transformers import AutoModel
class RobertaBERTClass(torch.nn.Module):
    """'roberta-base' encoder followed by a small regression head.

    The head is Linear(768, 768) -> ReLU -> Dropout(0.3) -> Linear(768, 1),
    applied to the vector at sequence position 0 of the encoder's first output.
    """

    def __init__(self):
        super().__init__()
        self.l1 = AutoModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        """Return one score per input row, shape (batch, 1)."""
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, sliced at sequence position 0.
        first_token = encoder_out[0][:, 0]
        hidden = torch.relu(self.pre_classifier(first_token))
        return self.classifier(self.dropout(hidden))

In [None]:
# Mean-squared-error objective between predicted and gold intensity; train() reads this global.
loss_function = torch.nn.MSELoss()

## Training

In [None]:
# Defining the training function: one epoch of fine-tuning the RoBERTa model on training_loader, followed by a Pearson-r evaluation on testing_loader

def train(epoch):
    """Run one training epoch, then score the model on the evaluation loader.

    Relies on notebook-level globals: ``model``, ``optimizer``,
    ``loss_function``, ``training_loader``, ``testing_loader`` and ``device``.

    Args:
        epoch: Index of the current epoch. Not used in the computation; kept
            so existing callers keep working.

    Returns:
        None. Prints the mean training loss per batch and the Pearson
        correlation between evaluation targets and predictions.
    """
    tr_loss = 0
    nb_tr_steps = 0
    model.train()
    for data in training_loader:
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        nb_tr_steps += 1

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss/nb_tr_steps
    print(f"Training Loss Epoch: {epoch_loss}")

    y_true = []
    y_pred = []

    model.eval()
    with torch.no_grad():
        for data in testing_loader:
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)

            outputs = model(ids, mask)

            # Flatten and extend rather than calling .item(), so the evaluation
            # works for any batch size instead of only batch_size == 1.
            y_true.extend(data['targets'].reshape(-1).tolist())
            y_pred.extend(outputs.cpu().reshape(-1).tolist())

    print('pearsonr score:', pearsonr(y_true, y_pred)[0])

In [None]:
# emotions = ["anger", "fear", "joy", "sadness"]
# Training pool: the train splits of anger, fear and joy (their dev splits are discarded).
X_train_a, y_train_a, _, _ = load_dataset("anger")
X_train_f, y_train_f, _, _ = load_dataset("fear")
X_train_j, y_train_j, _, _ = load_dataset("joy")

# Sentences are Python lists (concatenated with +); targets are numpy column vectors.
X_train = X_train_a + X_train_f + X_train_j
y_train = np.concatenate((y_train_a, y_train_f, y_train_j))
# NOTE(review): the first two return values of load_dataset are the *train* split, so the
# evaluation set here is the sadness train file and the sadness dev file is discarded —
# confirm this is intended.
X_test, y_test, _, _ = load_dataset("sadness")

training_set = Triage(X_train, y_train, tokenizer, MAX_LEN)
testing_set = Triage(X_test, y_test, tokenizer, MAX_LEN)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

# train() picks up model, optimizer and both loaders as globals.
model = RobertaBERTClass()
model.to(device)
optimizer = torch.optim.Adam(params= model.parameters(), lr=LEARNING_RATE)

# Each call runs one training epoch and prints the loss and the Pearson r on testing_loader.
for epoch in range(EPOCHS):
    print('Epoch:', epoch)
    train(epoch)

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: 0
Training Loss Epoch: 0.04891765859558562
pearsonr score: 0.33466460429743394
Epoch: 1
Training Loss Epoch: 0.033915344304603315
pearsonr score: 0.46922418872267235
Epoch: 2
Training Loss Epoch: 0.025245279555103858
pearsonr score: 0.5505417326710543
Epoch: 3
Training Loss Epoch: 0.019796560491301535
pearsonr score: 0.4986912975705796
