# IRE Project
## Fine-tune a transformer for emotion intensity regression

## Setup

In [None]:
# NOTE: the saved output of this cell shows transformers 4.12.4 being installed;
# install that version if the recorded results need to be reproduced exactly.
!pip install transformers

Collecting transformers
  Downloading transformers-4.12.4-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 46.1 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 35.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 35.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

In [None]:
# Importing the libraries needed
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer
import re
from scipy.stats import pearsonr

In [None]:
# Pick the compute device: the GPU when CUDA is available, the CPU otherwise

from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print(device)

cuda


## Data handling

In [None]:
!wget http://www.saifmohammad.com/WebDocs/AIT-2018/AIT2018-DATA/EI-reg/English/EI-reg-En-train.zip
!wget http://saifmohammad.com/WebDocs/AIT-2018/AIT2018-DATA/EI-reg/English/2018-EI-reg-En-dev.zip
!unzip -q /content/EI-reg-En-train.zip
!unzip -q /content/2018-EI-reg-En-dev.zip

--2021-11-17 16:34:04--  http://www.saifmohammad.com/WebDocs/AIT-2018/AIT2018-DATA/EI-reg/English/EI-reg-En-train.zip
Resolving www.saifmohammad.com (www.saifmohammad.com)... 192.185.17.122
Connecting to www.saifmohammad.com (www.saifmohammad.com)|192.185.17.122|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 356461 (348K) [application/zip]
Saving to: ‘EI-reg-En-train.zip’


2021-11-17 16:34:05 (2.79 MB/s) - ‘EI-reg-En-train.zip’ saved [356461/356461]

--2021-11-17 16:34:05--  http://saifmohammad.com/WebDocs/AIT-2018/AIT2018-DATA/EI-reg/English/2018-EI-reg-En-dev.zip
Resolving saifmohammad.com (saifmohammad.com)... 192.185.17.122
Connecting to saifmohammad.com (saifmohammad.com)|192.185.17.122|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 83779 (82K) [application/zip]
Saving to: ‘2018-EI-reg-En-dev.zip’


2021-11-17 16:34:05 (1.34 MB/s) - ‘2018-EI-reg-En-dev.zip’ saved [83779/83779]



In [None]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 64  # max_length given to the tokenizer; inputs are padded/truncated to it
TRAIN_BATCH_SIZE = 2  # batch size of the training DataLoader
VALID_BATCH_SIZE = 1  # batch size of the evaluation DataLoader
EPOCHS = 10  # number of training epochs run for each emotion
LEARNING_RATE = 1e-05  # learning rate passed to the Adam optimizer
from transformers import AutoTokenizer
# Tokenizer for the same 'roberta-base' checkpoint that the model class loads
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [None]:
def pre_process(tweet):
  """Normalise a raw tweet before tokenisation.

  Steps, in order: lower-case, drop non-ASCII characters, remove links,
  remove e-mail addresses, remove @username mentions, strip '#' signs
  (keeping the hashtag word), replace literal backslash-n / backslash-t
  sequences with a space, and collapse runs of whitespace.

  Args:
    tweet: The raw tweet text (str).

  Returns:
    The cleaned, lower-cased, single-spaced tweet (str).
  """
  tweet = tweet.lower()
  tweet = tweet.encode("ascii", "ignore").decode() # removes emoticons and non-English characters
  # Links and e-mail addresses are removed before mentions so that the mention
  # pattern below cannot strip the "@domain" part out of an e-mail or URL.
  tweet = re.sub(r"htt(p|ps)\S+", " ", tweet) # removes links in the tweet
  tweet = re.sub(r'[a-z0-9._%-]+@[a-z0-9.-]+\.[a-z]{2,4}', " ", tweet) # removes email
  # No trailing whitespace is required, so a mention at the very end of the
  # tweet or directly followed by punctuation ("@user:", "@user!") is removed too.
  tweet = re.sub(r"@[a-z0-9_]+", " ", tweet) # removes username mentions
  tweet = re.sub(r"#", "", tweet)
  # The pattern matches the two-character sequences backslash+n and backslash+t
  # (escaped newlines/tabs stored as text), not real control characters; real
  # ones are handled by the whitespace collapse below.
  tweet = re.sub(r"\\n|\\t", " ", tweet)
  tweet = " ".join(tweet.split()) # removing multiple spaces between words
  return tweet

In [None]:
def load_dataset(emotion):
  """Load the train and dev files for one emotion.

  Reads the tab-separated files "EI-reg-En-{emotion}-train.txt" and
  "2018-EI-reg-En-{emotion}-dev.txt" from the working directory.

  Args:
    emotion: Emotion name used to build both file names.

  Returns:
    A 4-tuple (train_sentences, y_train, test_sentences, y_test): the
    sentence entries are lists of pre-processed tweets, the y entries are
    the "Intensity Score" columns reshaped to one column per row.
  """
  train_frame = pd.read_csv(f"EI-reg-En-{emotion}-train.txt", delimiter='\t')
  dev_frame = pd.read_csv(f"2018-EI-reg-En-{emotion}-dev.txt", delimiter='\t')

  def split_columns(frame):
    # Cleaned tweet texts plus the score column as an (n, 1) array.
    texts = [pre_process(raw) for raw in frame["Tweet"].tolist()]
    scores = frame["Intensity Score"].to_numpy().reshape((-1, 1))
    return texts, scores

  train_sentences, y_train = split_columns(train_frame)
  test_sentences, y_test = split_columns(dev_frame)

  return train_sentences, y_train, test_sentences, y_test

In [None]:
class Triage(Dataset):
    """Dataset pairing texts with regression targets, tokenised on access.

    Each item is a dict with 'ids' and 'mask' (long tensors built from the
    tokenizer's 'input_ids' and 'attention_mask') and 'targets' (a float
    tensor built from the matching entry of ``y_train``).
    """

    def __init__(self, X_train, y_train, tokenizer, max_len):
        # The length is captured once at construction time.
        self.len = len(X_train)
        self.X_train = X_train
        self.y_train = y_train
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        text = self.X_train[index]
        # Pad or truncate every text to exactly max_len tokens.
        encoded = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )

        item = {}
        item['ids'] = torch.tensor(encoded['input_ids'], dtype=torch.long)
        item['mask'] = torch.tensor(encoded['attention_mask'], dtype=torch.long)
        item['targets'] = torch.tensor(self.y_train[index], dtype=torch.float)
        return item

In [None]:
# Keyword arguments for the two DataLoaders: training batches are shuffled,
# evaluation batches keep the dataset order; both load in the main process.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)

test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

## Network

In [None]:
# Custom model: the RoBERTa encoder followed by a dense layer, ReLU, dropout and a final linear layer that outputs a single regression value.
from transformers import AutoModel
class RobertaBERTClass(torch.nn.Module):
    """'roberta-base' encoder with a small regression head.

    The head is Linear(768, 768) -> ReLU -> Dropout(0.3) -> Linear(768, 1),
    so the model emits one value per input sequence.
    """

    def __init__(self):
        super().__init__()
        self.l1 = AutoModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        """Return the head's output for a batch of token ids and attention masks."""
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # Element 0 of the encoder output, sliced at index 0 of its second axis
        # (presumably the first-token representation of each sequence).
        first_token = encoder_out[0][:, 0]
        hidden = torch.nn.functional.relu(self.pre_classifier(first_token))
        return self.classifier(self.dropout(hidden))

In [None]:
# Mean squared error loss; train() applies it to the model outputs and the targets
loss_function = torch.nn.MSELoss()

## Training

In [None]:
# Training function: one pass over the training loader to fine-tune the Roberta
# model, followed by an evaluation pass over the testing loader

def train(epoch):
    """Train for one epoch, then report the Pearson correlation on the eval data.

    Uses the notebook-level globals ``model``, ``optimizer``, ``loss_function``,
    ``training_loader``, ``testing_loader`` and ``device``.

    Args:
        epoch: Index of the current epoch. It is not used inside the function;
            the caller prints it before calling.

    Returns:
        None. The mean training loss over the epoch's batches and the Pearson
        correlation between targets and predictions are printed.
    """
    tr_loss = 0
    nb_tr_steps = 0
    model.train()
    for data in training_loader:
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        nb_tr_steps += 1

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss/nb_tr_steps
    print(f"Training Loss Epoch: {epoch_loss}")

    y_true = []
    y_pred = []

    model.eval()
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for data in testing_loader:
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)

            outputs = model(ids, mask)

            # Flatten each batch into plain Python floats so that evaluation
            # works for any batch size, not only batches of one element.
            y_true.extend(data['targets'].reshape(-1).tolist())
            y_pred.extend(outputs.reshape(-1).cpu().tolist())

    print('pearsonr score:', pearsonr(y_true, y_pred)[0])

    return

In [None]:
# Fine-tune and evaluate a freshly initialised model for each emotion in turn.
emotions = ["anger", "fear", "joy", "sadness"]
for emotion in emotions:
    X_train, y_train, X_test, y_test = load_dataset(emotion)

    # train() picks up the loaders, the model and the optimizer as globals,
    # so they are rebound here for every emotion.
    training_set = Triage(X_train, y_train, tokenizer, MAX_LEN)
    training_loader = DataLoader(training_set, **train_params)
    testing_set = Triage(X_test, y_test, tokenizer, MAX_LEN)
    testing_loader = DataLoader(testing_set, **test_params)

    model = RobertaBERTClass().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    print('Emotion:',emotion)
    for epoch in range(EPOCHS):
        print('Epoch:', epoch)
        train(epoch)

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Emotion: anger
Epoch: 0
Training Loss Epoch: 0.037027944917708555
pearsonr score: 0.6767465403333082
Epoch: 1
Training Loss Epoch: 0.020892540611656896
pearsonr score: 0.744374939589239
Epoch: 2
Training Loss Epoch: 0.01401805850382484
pearsonr score: 0.760340341307687
Epoch: 3
Training Loss Epoch: 0.01026329652842852
pearsonr score: 0.7510148155727353
Epoch: 4
Training Loss Epoch: 0.008908548531191786
pearsonr score: 0.7672725815552498
Epoch: 5
Training Loss Epoch: 0.007436108863674964
pearsonr score: 0.7457795608569991
Epoch: 6
Training Loss Epoch: 0.007244821868076846
pearsonr score: 0.7600541571272473
Epoch: 7
Training Loss Epoch: 0.006213362731976626
pearsonr score: 0.7635638285392756
Epoch: 8
Training Loss Epoch: 0.005262330832540869
pearsonr score: 0.745554331237757
Epoch: 9
Training Loss Epoch: 0.005044262643435471
pearsonr score: 0.7513589887091526


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Emotion: fear
Epoch: 0
Training Loss Epoch: 0.04248621777927834
pearsonr score: 0.6991593362446962
Epoch: 1
Training Loss Epoch: 0.02094532748140712
pearsonr score: 0.7463048256528823
Epoch: 2
Training Loss Epoch: 0.014520830395424546
pearsonr score: 0.7641490566910616
Epoch: 3
Training Loss Epoch: 0.010632872796995022
pearsonr score: 0.7586884571412602
Epoch: 4
Training Loss Epoch: 0.008805987076447733
pearsonr score: 0.7729240850204034
Epoch: 5
Training Loss Epoch: 0.008298468806494302
pearsonr score: 0.7687308183878574
Epoch: 6
Training Loss Epoch: 0.0067231519028305
pearsonr score: 0.7803460048769076
Epoch: 7
Training Loss Epoch: 0.0064840514102432805
pearsonr score: 0.774187519369565
Epoch: 8
Training Loss Epoch: 0.005243140920032788
pearsonr score: 0.7816408773230094
Epoch: 9
Training Loss Epoch: 0.00499811656421271
pearsonr score: 0.7772892119762227


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Emotion: joy
Epoch: 0
Training Loss Epoch: 0.04686517936611516
pearsonr score: 0.7045144760528985
Epoch: 1
Training Loss Epoch: 0.02298535769617695
pearsonr score: 0.7438224123413527
Epoch: 2
Training Loss Epoch: 0.014868501194553007
pearsonr score: 0.7783749786217272
Epoch: 3
Training Loss Epoch: 0.01258472773471894
pearsonr score: 0.7600624167240915
Epoch: 4
Training Loss Epoch: 0.010425925081883836
pearsonr score: 0.7444072023650843
Epoch: 5
Training Loss Epoch: 0.00815145386795983
pearsonr score: 0.7659402800269606
Epoch: 6
Training Loss Epoch: 0.006951049632897422
pearsonr score: 0.7808140084170445
Epoch: 7
Training Loss Epoch: 0.00600501670032433
pearsonr score: 0.7840009814253028
Epoch: 8
Training Loss Epoch: 0.005711888642685151
pearsonr score: 0.7595647559137244
Epoch: 9
Training Loss Epoch: 0.005468646461158591
pearsonr score: 0.7619530641103363


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Emotion: sadness
Epoch: 0
Training Loss Epoch: 0.04678489191483546
pearsonr score: 0.700523058415776
Epoch: 1
Training Loss Epoch: 0.025228501668041733
pearsonr score: 0.745727318657319
Epoch: 2
Training Loss Epoch: 0.017256404074275703
pearsonr score: 0.7579760334554233
Epoch: 3
Training Loss Epoch: 0.01160706568263775
pearsonr score: 0.7490202401875211
Epoch: 4
Training Loss Epoch: 0.010043497596479297
pearsonr score: 0.7402873784489484
Epoch: 5
Training Loss Epoch: 0.008263451371266833
pearsonr score: 0.7728306010744168
Epoch: 6
Training Loss Epoch: 0.006687503341862499
pearsonr score: 0.7650431269568193
Epoch: 7
Training Loss Epoch: 0.0069835296633463505
pearsonr score: 0.7642767081954431
Epoch: 8
Training Loss Epoch: 0.0053330503716812186
pearsonr score: 0.7563185600667061
Epoch: 9
Training Loss Epoch: 0.00449591542557442
pearsonr score: 0.7647093608408225
