
# Explanation of the task:

This is the classic Emotion Recognition Classification task. Given a conversation, involving 2 or more parties, for each message/utterance, we want to predict an emotion related to it.


Consider the example:

**Person A**: "Hello! I am very happy" (happiness)

**Person B**: "Why? I am very angry"   (anger)

## First model idea:
- inputs: sequence of utterances, sequence of emotions.
- For each: Linear Layers
- Fusion model

- Loss: cross-entropy



# Preprocessing:

Consider each conversation as just a sequence of words:
$$
[[utt, utt, \cdots], \cdots ] \longrightarrow [[word, word , \cdots], \cdots]
$$

Here, we add a separator token "sep". It indicates where one utterance ends and the next one starts.

## Weight update:
 - Make the updatable weights constant and then update it dynamically
 - train it once and then update it dynamically in the forward pass
 - cache weight matrices and train them individually -> this does not train properly (super slow convergence)


In [None]:
! pip install nltk
! pip install torch
! pip install pandas
! pip install gensim
! pip install datasets
! pip install matplotlib
! pip install tqdm
! pip install torchinfo

# eventually include tensorboard



In [None]:
# ML resources:
import torch
import os
import matplotlib.pyplot as plt
import gensim
from nltk.tokenize import TweetTokenizer

from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from collections import Counter

#others:
import pandas as pd
import numpy as np



In [None]:
! wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
! unzip wiki-news-300d-1M.vec.zip
! rm wiki-news-300d-1M.vec.zip

In [None]:
# Load the word vectors from the text-format (binary=False) word2vec file
# "wiki-news-300d-1M.vec" into a gensim KeyedVectors object.
encoder_model = gensim.models.KeyedVectors.load_word2vec_format("wiki-news-300d-1M.vec",binary = False)

In [None]:
# Build the vocabulary (with three special tokens in front) and the matching
# embedding matrix from the pretrained vectors.
import copy
unk_token, pad_token, sep_token = '<unk>', '<pad>','<sep>'
embedding_vectors = torch.from_numpy(encoder_model.vectors)
pretrained_vocab = copy.deepcopy(encoder_model.index_to_key)
# The special tokens take indices 0 (<pad>), 1 (<unk>) and 2 (<sep>); every
# pretrained word is therefore shifted by three positions.
pretrained_vocab[:0] = [pad_token, unk_token,sep_token]

stoi = {word: i for i, word in enumerate(pretrained_vocab)}
itos = {i: word for i, word in enumerate(pretrained_vocab)}

# Prepend one embedding row per special token, in the same order as the
# vocabulary above, so that row i of the matrix is the vector of itos[i].
# (Re-assigning the result three times would keep only the last row and
# misalign every word index with its vector.)
pretrained_embeddings = torch.cat((
    torch.zeros(1, embedding_vectors.shape[1]),   # <pad> -> all zeros
    torch.ones(1, embedding_vectors.shape[1]),    # <unk> -> all ones
    -torch.ones(1, embedding_vectors.shape[1]),   # <sep> -> all minus ones
    embedding_vectors,
))

In [None]:
from nltk.tokenize import TweetTokenizer
from tqdm.notebook import tqdm

max_size = 50
## Inputs are processed in a way very similar to the template that was shared
tok = TweetTokenizer()
def tokenize_text_extend_emotions(text,emotion,stoi):
  """Tokenize one utterance and repeat its emotion label once per token.

  text    : utterance as a string, split with the module-level `tok`.
  emotion : label of the utterance, copied for every produced token.
  stoi    : mapping word -> index; must contain '<unk>' whenever a word of
            the utterance is missing from it.

  Returns (token_ids, labels), two lists of the same length.
  """
  words = tok.tokenize(text)
  token_ids = []
  for word in words:
    # unknown words fall back to the '<unk>' entry
    key = word if word in stoi else '<unk>'
    token_ids.append(stoi[key])
  labels = [emotion] * len(words)
  return token_ids, labels

def concat_utt(dialog, emotions, stoi, max_size=max_size + 1):
  """Flatten a dialog into one fixed-length token sequence with per-token labels.

  dialog   : list of utterance strings.
  emotions : one emotion label per utterance (paired with `dialog` via zip).
  stoi     : mapping word -> index; needs the '<sep>' and '<pad>' entries.
  max_size : exact length of both returned lists. The default is one more
             than the module-level `max_size`, leaving room for the
             one-step input/target shift applied by `get_target`.

  Every utterance except the last one is followed by a '<sep>' token that
  carries the label of the utterance it closes. Longer sequences are
  truncated; shorter ones are padded with '<pad>' tokens and label 0.

  Returns (token_ids, labels), both of length `max_size`.
  """
  pairs = list(zip(dialog, emotions))
  last_position = len(pairs) - 1
  dialog_flat = []
  emotions_extended = []
  for position, (utterance, label) in enumerate(pairs):
    token_ids, token_labels = tokenize_text_extend_emotions(utterance, label, stoi)
    dialog_flat.extend(token_ids)
    emotions_extended.extend(token_labels)
    if position < last_position:
      dialog_flat.append(stoi["<sep>"])
      # Take the label from the input rather than from the tokenized labels:
      # an utterance that tokenizes to nothing has no token label to copy.
      emotions_extended.append(label)
  if len(dialog_flat) > max_size: ## Must cut
    dialog_flat = dialog_flat[:max_size]
    emotions_extended = emotions_extended[:max_size]
  else: ## Must add padding
    missing = max_size - len(dialog_flat)
    dialog_flat += [stoi["<pad>"]] * missing
    emotions_extended += [0] * missing
  return dialog_flat,emotions_extended

def adjust_emotion_labels(Y): # Necessary, because we will use 0 as padding for both the text and emotions.
  """Return a copy of Y with every emotion label increased by one.

  Y : list of label sequences (one sequence per dialog).

  The input is left untouched: new inner lists are built instead of
  incrementing the caller's lists in place, so calling this twice on the
  same data does not shift the labels twice.
  """
  return [[label + 1 for label in labels] for labels in Y]

def preprocess_data(X,Y): ## list of lists of utterances : [[string]] -> list of lists of token ids : [[int]]
  """Turn raw dialogs and labels into flattened, fixed-length sequences.

  X : list of dialogs, each a list of utterance strings.
  Y : list of label sequences, one label per utterance.

  Labels are first shifted by one (0 is reserved for padding), then each
  dialog is flattened with `concat_utt` using the module-level `stoi`.

  Returns (X_processed, Y_processed): token-id lists and per-token labels.
  """
  X_processed = []
  Y_processed = []
  Y = adjust_emotion_labels(Y) # adds 1 to all emotions (<pad_token> = 0)
  for i in tqdm(range(len(X))):
    # Flatten each dialog once and keep both results, instead of
    # tokenizing it a second time just to read the labels.
    tokens, labels = concat_utt(X[i],Y[i],stoi)
    X_processed.append(tokens)
    Y_processed.append(labels)
  return X_processed, Y_processed

def get_target(X,Y):
  """Split every sequence into a model input and a one-step-shifted target.

  For each sequence s in X (tokens) and in Y (labels), the input is s
  without its last element and the target is s without its first element.

  Returns (text_input, text_target, emotion_input, emotion_target).
  """
  text_input = [tokens[:-1] for tokens in X]
  text_target = [tokens[1:] for tokens in X]
  emotion_input = [labels[:-1] for labels in Y]
  emotion_target = [labels[1:] for labels in Y]
  return text_input, text_target, emotion_input, emotion_target




## Check the following example:
## A toy two-utterance dialog is flattened with concat_utt and both results
## (token ids and per-token labels, padded to the default length) are printed.
dialog_example = ["hello, I am a I robot!","I am greek"]
emotions_example = [1,2] ## random emotions...

flatten_dialog, flatten_emotions = concat_utt(dialog_example,emotions_example,stoi)
print(f"{dialog_example} becomes {flatten_dialog}")
print(f"{emotions_example} becomes {flatten_emotions}")


## DataLoader not implemented yet:

In [None]:
## Modify this after changing the preprocessing.
class DailyDialogDataset(Dataset):
  """Dataset over four parallel sequences of preprocessed Daily Dialog data.

  texts / emotions               : model inputs (token ids, emotion labels)
  target_texts / target_emotions : targets used for the loss computation

  Indexing returns a dict with one numpy array per field.
  """

  def __init__(self, texts, emotions,target_texts,target_emotions):
    self.texts, self.emotions = texts, emotions
    self.target_texts, self.target_emotions = target_texts, target_emotions

  def __len__(self):
    # the number of samples is given by the text inputs
    return len(self.texts)

  def __getitem__(self, idx):
    fields = (
      ('texts', self.texts),
      ('emotions', self.emotions),
      ('target_texts', self.target_texts),
      ('target_emotions', self.target_emotions),
    )
    return {key: np.array(values[idx]) for key, values in fields}



Here we will extract the data:

In [None]:
# Load the 'daily_dialog' dataset and pull the dialogs and emotion labels
# out of its train / test / validation splits.
data = load_dataset('daily_dialog')

X_train = data['train']['dialog']
Y_train = data['train']['emotion']

X_test = data['test']['dialog']
Y_test = data['test']['emotion']

X_val = data['validation']['dialog']
Y_val = data['validation']['emotion']
# Debug output: largest raw emotion label found in the training split
# (0 if no label is greater than 0).
mx = 0
for i in Y_train:
  for j in i:
    if j > mx:
      mx = j
print(mx)

In [None]:
# Preprocess every split: flatten/pad the dialogs, then derive the
# one-step-shifted targets. The names X_*/Y_* are rebound at each step, so
# the statement order matters.
X_train, Y_train = preprocess_data(X_train,Y_train)
# Debug output: largest label after preprocessing (labels were shifted by one).
mx = 0
for i in Y_train:
  for j in i:
    if j > mx:
      mx = j
print(mx)
X_train,X_train_target, Y_train, Y_train_target = get_target(X_train,Y_train)
# Debug output: largest label among the emotion inputs after the shift.
mx = 0
for i in Y_train:
  for j in i:
    if j > mx:
      mx = j
print(mx)
X_test, Y_test = preprocess_data(X_test,Y_test)
X_test,X_test_target, Y_test, Y_test_target = get_target(X_test,Y_test)

X_val, Y_val = preprocess_data(X_val,Y_val)
X_val,X_val_target, Y_val, Y_val_target = get_target(X_val,Y_val)



# Every token sequence must have exactly as many entries as its label sequence.
for d, e in zip(X_train,Y_train): ## Just checking if nothing wrong happened
  assert(len(d) == len(e))
for d, e in zip(X_test,Y_test): ## Just checking if nothing wrong happened
  assert(len(d) == len(e))
for d, e in zip(X_val,Y_val): ## Just checking if nothing wrong happened
  assert(len(d) == len(e))

In [None]:
batch_size = 5

# Debug output: largest emotion label among the training inputs.
mx = 0
for i in Y_train:
  for j in i:
    if j > mx:
      mx = j
print(mx)
# Every split pairs the inputs with their one-step-shifted targets. The test
# and validation sets must be built like the training set: feeding the
# targets as inputs would hand the model the labels it is asked to predict.
train_data = DailyDialogDataset(X_train,Y_train,X_train_target,Y_train_target)
test_data = DailyDialogDataset(X_test,Y_test,X_test_target,Y_test_target)
val_data = DailyDialogDataset(X_val,Y_val,X_val_target,Y_val_target)

train_loader = DataLoader(train_data, batch_size=batch_size,shuffle = True,)
test_loader = DataLoader(test_data, batch_size=batch_size,shuffle = True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle = True)


# The model:
## Architecture
- 2 input channels: word encoding, emotion encoding
- dynamically updated weights: $w_1, w_2 = w_1 + w_2, w_1$ (Not implemented yet)
### For each channel:
   - 3 sequential Linear layers
- fusion linear layer through concatenation
- 2 output channels which contain a linear layer each





In [None]:
import torch.nn as nn
import torch.nn.functional as F
class SimpleModel(nn.Module):
  """Two-channel model predicting the next token and the next emotion.

  Each channel (words, emotions) embeds its indices and passes them through
  three linear layers. Both channels are concatenated and fused by a linear
  layer; each output head then sees the fused representation concatenated
  with its own channel.

  Parameters
  ----------
  emo_dim   : size of the learned emotion embedding.
  n_emotion : number of emotion classes, not counting the padding label 0.
  n_vocab   : size of the token vocabulary (output size of the token head).

  The text embedding is built from the module-level `pretrained_embeddings`
  and is frozen.
  """

  def __init__(self, emo_dim, n_emotion, n_vocab):
    super(SimpleModel,self).__init__()
    self.embedding_layer_text = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)
    # +1 because label 0 is reserved for padding
    self.embedding_layer_emotion = nn.Embedding(n_emotion+1, emo_dim)
    # Read the word dimension from the embedding instead of hard-coding 300.
    word_dim = self.embedding_layer_text.embedding_dim

    ## Channel for utterances/words:
    self.Linear_utt1 = nn.Linear(word_dim,80)
    self.Linear_utt2 = nn.Linear(80,80)
    self.Linear_utt3 = nn.Linear(80,80)

    ## Channel for emotions:
    self.Linear_emo1 = nn.Linear(emo_dim,80)
    self.Linear_emo2 = nn.Linear(80,80)
    self.Linear_emo3 = nn.Linear(80,80)

    ## fusion by concatenation and Linear layer (80 + 80 -> 300):
    self.Linear_fus = nn.Linear(160,300)

    ## Output heads: fused features (300) concatenated with one channel (80)
    self.Linear_utt_final = nn.Linear(380, n_vocab)
    # The emotion labels span 0..n_emotion (0 = padding), exactly like the
    # indices accepted by the emotion embedding, so the head needs
    # n_emotion + 1 classes; otherwise the largest label is out of range.
    self.Linear_emo_final = nn.Linear(380, n_emotion + 1)

    # Kept for callers that want probabilities; it normalizes over the class
    # axis (the last one), not over the batch.
    self.softmax = nn.Softmax(dim=-1)

  def forward(self, text, emotion):
    """Return the unnormalized scores (logits) for tokens and emotions.

    text, emotion : integer index tensors; they must have the same shape so
                    that the two channels can be concatenated.

    Returns (pred_token, pred_emotion) with shapes `text.shape + (n_vocab,)`
    and `emotion.shape + (n_emotion + 1,)`. Logits are returned because
    `nn.CrossEntropyLoss` applies the softmax itself; use `self.softmax` on
    the outputs to obtain probabilities.
    """
    ## word channel
    x = self.embedding_layer_text(text)
    x = self.Linear_utt1(x)
    x = self.Linear_utt2(x)
    x = self.Linear_utt3(x)

    ## emotion channel
    y = self.embedding_layer_emotion(emotion)
    y = self.Linear_emo1(y)
    y = self.Linear_emo2(y)
    y = self.Linear_emo3(y)

    ## fusion of both channels
    z = self.Linear_fus(torch.cat((x,y),-1))

    ## each head combines the fused features with its own channel
    w = torch.cat((z,x),-1)
    v = torch.cat((z,y),-1)
    pred_token = self.Linear_utt_final(w)
    pred_emotion = self.Linear_emo_final(v)
    return pred_token, pred_emotion



In [None]:
import torch.optim as optim
from tqdm.notebook import tqdm
def activate_gpu(force_cpu=False): # check if gpu available ; code taken from template
    """Pick the device to run on and return its name.

    force_cpu : when True, return "cpu" without probing any accelerator
                (nothing is printed in that case).

    Returns 'cuda' when CUDA is available, otherwise 'mps' when the MPS
    backend exists and is available, otherwise 'cpu'. The selected device is
    printed.
    """
    if force_cpu:
        return "cpu"
    if torch.cuda.is_available(): # for both Nvidia and AMD GPUs
        print('DEVICE = ', torch.cuda.get_device_name(0))
        return 'cuda'
    # Not every torch build exposes torch.backends.mps, so look it up safely.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available(): # for mac ARM chipset
        print('DEVICE = ', "mps" )
        return 'mps'
    # cpu only
    print('DEVICE = ', 'CPU')
    return 'cpu'

## normal train function
def train(model, epochs, device):
  """Train `model` on the module-level `train_loader` with Adam.

  model  : module whose call returns (token_scores, emotion_scores), with
           the class axis last.
  epochs : number of passes over `train_loader`.
  device : device the model and every batch are moved to.

  The loss is the sum of the cross-entropy on the emotion targets and on the
  token targets. Returns the list of mean losses, one entry per epoch.
  """
  model = model.to(device)
  optimizer = optim.Adam(model.parameters())
  loss_fn = nn.CrossEntropyLoss()
  model.train()
  loss_to_plot = []
  batch_keys = ('texts', 'emotions', 'target_texts', 'target_emotions')
  for epoch in range(epochs):
    losses = []
    print(f"Epoch {epoch+1}/{epochs}")
    for batch in tqdm(train_loader, total=len(train_loader)):
      batch = {key: batch[key].to(device) for key in batch_keys}
      optimizer.zero_grad()

      pt, pe = model(batch['texts'], batch['emotions'])
      # CrossEntropyLoss expects the class axis in second position. The
      # scores have it last (batch, seq, classes), so collapse batch and
      # sequence into one axis and flatten the targets accordingly.
      # NOTE: padding positions (index 0) still contribute to the loss.
      loss_emotion = loss_fn(pe.reshape(-1, pe.size(-1)),
                             batch['target_emotions'].reshape(-1).long())
      loss_text = loss_fn(pt.reshape(-1, pt.size(-1)),
                          batch['target_texts'].reshape(-1).long())
      loss = loss_emotion + loss_text
      loss.backward()
      optimizer.step()
      losses.append(loss.item())
    loss_to_append = sum(losses)/len(losses)
    loss_to_plot.append(loss_to_append)
    print("loss: ",loss_to_append)
  return loss_to_plot


In [None]:
# Hyper-parameters and model construction.
device = activate_gpu()
emotion_dim = 30   # size of the learned emotion embedding
n_emotions = 7     # passed to SimpleModel as n_emotion
n_words = len(stoi) # vocabulary size, special tokens included


model = SimpleModel(emotion_dim,n_emotions,n_words)
print(device)
# Bare expression: the notebook displays the model summary as the cell output.
model

In [None]:
# Print the shapes of the four tensors of the first training batch only.
for i in train_loader:
  print(i['texts'].shape)
  print(i['emotions'].shape)
  print(i['target_texts'].shape)
  print(i['target_emotions'].shape)
  break

In [None]:
# Train for a fixed number of epochs and plot the mean loss of each epoch.
epochs = 8
losses = train(model,epochs,device)
plt.plot(np.arange(1,epochs+1),losses)

# Description of the issues faced:

It is not trivial to deal with the gradient flow in this case. Maybe fixing the 2 matrices would work better, or just training the matrices with fixed weights and updating the weights not in the forward pass but at prediction time. This way we can cache the weights, and every time we restart we will be fine.

I believe that the issue with this approach specifically is that it updates the weights directly rather than, for instance, a hidden state.

# Problem with Daily Dialog:
By plotting the frequency of each emotion, we notice that the dataset is not diverse at all: it essentially contains only 2 emotions. This is not ideal, because the models will most likely overfit to predicting those 2 emotions...

In [None]:
def compute_test_loss(model,loss_fn,data_loader,device):
  """Evaluate the emotion predictions of `model` on `data_loader`.

  model       : module whose call returns (token_scores, emotion_scores),
                with the class axis last.
  loss_fn     : loss applied to (flattened emotion scores, flattened targets).
  data_loader : yields batches with the keys 'texts', 'emotions',
                'target_texts' and 'target_emotions'.
  device      : device every batch is moved to.

  Prints the mean loss and the mean per-batch accuracy (padding positions
  included) and returns (trues, preds, pred_words): flat per-token lists of
  the target emotions, the predicted emotions and the predicted token ids,
  accumulated over all batches.
  """
  model.eval()
  losses = []
  accs = []
  preds = []
  trues = []
  pred_words = []
  batch_keys = ('texts', 'emotions', 'target_texts', 'target_emotions')
  with torch.no_grad():  # evaluation only, no gradients needed
    for batch in tqdm(data_loader, total=len(data_loader)):
      batch = {key: batch[key].to(device) for key in batch_keys}
      pt, pe = model(batch['texts'], batch['emotions'])
      targets = batch['target_emotions'].reshape(-1).long()
      # Collapse batch and sequence axes so the class axis is in second
      # position, as the cross-entropy loss requires.
      loss = loss_fn(pe.reshape(-1, pe.size(-1)), targets)
      # The predicted class is the argmax over the last (class) axis.
      index_pred = torch.argmax(pe, -1).reshape(-1)
      word_p = torch.argmax(pt, -1).reshape(-1)
      acc = (targets == index_pred).float().mean()
      accs.append(acc.item())
      losses.append(loss.item())
      trues.extend(targets.tolist())
      preds.extend(index_pred.tolist())
      pred_words.extend(word_p.tolist())
  print("average loss: ", sum(losses)/len(losses))
  print("average acc: ", sum(accs)/len(accs))
  return trues, preds, pred_words
# Evaluate on the validation loader with a cross-entropy loss.
trues, preds, pred_words = compute_test_loss(model,nn.CrossEntropyLoss(),val_loader,device)
# Class names '1'..'7', used as target_names by the classification report.
names = [f'{i}' for i in range(1,8)]



In [None]:
#for i,w,q in zip(trues[:50],preds[:50],pred_words[:50]):
#  print(i,w,itos_train1[q.item()])
#
# Per-class precision / recall / F1 and the confusion matrix of the emotion
# predictions collected above.
# NOTE(review): target_names holds 7 names; this presumably requires exactly
# 7 distinct labels in trues/preds - confirm when padding label 0 is present.
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, RocCurveDisplay
print(classification_report(np.array(trues).flatten(), np.array(preds).flatten(), target_names=names))
confusion_matrix(trues,preds)

              precision    recall  f1-score   support

           1       0.96      0.72      0.82     82011
           2       0.00      0.00      0.00      1083
           3       0.00      0.00      0.00        41
           4       0.16      0.17      0.16       126
           5       0.01      0.02      0.01      6231
           6       0.93      0.93      0.93      1085
           7       0.81      0.27      0.40      1033

    accuracy                           0.66     91610
   macro avg       0.41      0.30      0.33     91610
weighted avg       0.88      0.66      0.75     91610



array([[59060,  9073,   402,     7, 13373,    68,    28],
       [ 1064,     1,     5,     1,    10,     2,     0],
       [   28,     0,     0,     0,     1,     0,    12],
       [    7,    54,     0,    21,    24,     0,    20],
       [  509,    20,  5588,     2,   106,     4,     2],
       [   49,     4,     5,     1,    16,  1008,     2],
       [  621,     4,    11,   100,    19,     3,   275]])

In [None]:
def eval_sentence(model, sentence, encoded_sentence, emotions, device):
  """Print, word by word, the predicted next emotion against its target.

  model            : module whose call returns (token_scores, emotion_scores),
                     with the class axis last.
  sentence         : sequence of words (only used for display).
  encoded_sentence : indexable collection of tensors, one entry per word.
  emotions         : indexable collection of tensors, one label per word.
  device           : device the inputs are moved to.

  For every position i except the last one, the model is fed entry i and the
  emotion predicted for word i+1 is printed next to the target emotions[i+1].
  """
  model.eval()
  with torch.no_grad():  # inference only
    for i in range(len(sentence) - 1):
      # The model returns (token scores, emotion scores): the emotion
      # prediction is the second element.
      _, pe = model(encoded_sentence[i].to(device),emotions[i].to(device))
      # Predicted class = argmax over the last (class) axis; tolist() gives
      # a plain int for a single token and a list for a sequence.
      t = torch.argmax(pe, -1).tolist()
      print(f"word:{sentence[i+1]}; predicted_emotion: {t} ; target_emotion: {emotions[i+1]}")

# eval_sentence(model,sentence,encod,emot,device)