# Models
This notebook contains the full training process for all the models used in this work.

In [None]:
# Importing Necessary packages
from collections import Counter
from official.nlp import optimization  
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras import backend as K 
from tensorflow.keras.callbacks import CSVLogger
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

import librosa
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import pathlib
import pandas as pd
import re
import seaborn as sns
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_hub as hub
import tensorflow_text as text
import urllib.request
import zipfile

# os.environ["CUDA_VISIBLE_DEVICES"] = "2"
# gpus = tf.config.experimental.list_physical_devices('GPU')
# for gpu in gpus:
#     tf.config.experimental.set_memory_growth(gpu, True)

In [None]:
# Load the annotated clips and normalise the raw answers into emotion labels
df = pd.read_csv("Annotated_Files.csv")

# Emotion classes kept for modelling
list_emotions = ["Angry", "Bored", "Happy", "Sad", "Surprised", "Neutral"]

# Answers marking clips to discard (no audio / noise / other)
to_rem = [
    "Can't play the audio",
    'No  audio',
    'No audio',
    'No one is talking',
    'No other',
    'No voice',
    'Noise',
    'None',
    'Other',
]

# Answers that are folded into the "Happy" class
happy_lab = ["Excited", "Laughing", "Grateful", "Glad", "Good", "Hyped", "Satisfied"]

# Normalise whitespace / casing so the answers can be matched against the lists above
df["label"] = df["Answer"].str.strip().str.capitalize()
df = df.loc[~df["label"].isin(to_rem)].copy()
df["label"] = df["label"].mask(df["label"].isin(happy_lab), "Happy")
df = df[df["label"].isin(list_emotions)].copy()

# "NA" marks an original (non-augmented) clip; the audio lives under All_Audio/
df = df.assign(augment="NA", file_path="All_Audio/" + df["file_name"])

In [None]:
from models import train_test_split

# Hold out 30% of the rows, then cut that hold-out in half for dev and test
# (assuming the project helper follows sklearn-style `test_size` semantics).
train, val_ = train_test_split(df, test_size=0.3, random_state=42)
dev, test = train_test_split(val_, test_size=0.5, random_state=42)

for split_name, split_df in (("Training", train), ("Validation", dev), ("Test", test)):
    print(f"{split_name} set size", len(split_df))

In [None]:
# Assigning augmentation to audio files.
# For every augmentation type, each non-neutral emotion that is not already
# more frequent than "Neutral" gets `min_value` randomly chosen original clips
# tagged with that augmentation; the tagged copies are appended to `train`.
df2 = train.groupby('label').count().sort_values(by='label', ascending=False).reset_index()
min_value = df2["file_name"].min()  # smallest per-label clip count in the training split
augment_type = ["low_noise",
                "high_noise",
                "slower",
                "faster",
                "pitch",
                "slow_low_noise",
                "fast_low_noise",
                "slow_high_noise",
                "fast_high_noise"]

# Only the original (augment == "NA") non-neutral clips are candidates
df_filter = train[train["label"] != "Neutral"].copy()
numb_neut = int((train["label"] == "Neutral").sum())  # row count of the Neutral class

list_em_no_neut = ["Angry",
                   "Bored",
                   "Happy",
                   "Sad",
                   "Surprised"]

# One shared, seeded RNG: the draw is reproducible across runs, while each
# augmentation type still receives a different random subset of clips.
aug_rng = np.random.RandomState(42)

for aug in augment_type:
    df_aug = df_filter.copy()
    for emot in list_em_no_neut:
        # Counted on the growing `train`, so a class stops being augmented
        # as soon as it outnumbers Neutral.
        if int((train["label"] == emot).sum()) > numb_neut:
            continue
        aug_file = (df_aug.loc[df_aug["label"] == emot, "file_name"]
                    .sample(min_value, random_state=aug_rng))
        df_aug.loc[df_aug["file_name"].isin(aug_file), "augment"] = aug
    # Keep only the clips that were tagged in this round
    df_aug = df_aug[df_aug["augment"] != "NA"].copy()
    train = pd.concat([train, df_aug]).sort_values("folder").reset_index(drop=True)

In [None]:
# Encoding the labels
# The encoder is fitted ONCE on the fixed class list, so all three splits share
# the same label -> integer mapping even if a split happens to miss a class
# (refitting per split would silently shift the integer codes in that case).
label_encoder = LabelEncoder()
label_encoder.fit(list_emotions)

y_train = label_encoder.transform(train['label'].to_numpy())
y_val = label_encoder.transform(dev['label'].to_numpy())
y_test = label_encoder.transform(test['label'].to_numpy())

# Audio file paths per split
X_train_path = train["file_path"].to_numpy()
X_val_path = dev["file_path"].to_numpy()
X_test_path = test["file_path"].to_numpy()

# Augmentation tag per clip ("NA" for original clips)
aug_train = train["augment"].to_numpy()
aug_val = dev["augment"].to_numpy()
aug_test = test["augment"].to_numpy()

In [None]:
from models import Audio_Generator

batch_size = 32

# One batch generator per split, built from (paths, labels, augmentation tags)
train_batch_generator, val_batch_generator, test_batch_generator = [
    Audio_Generator(paths, labels, augments, batch_size)
    for paths, labels, augments in [
        (X_train_path, y_train, aug_train),
        (X_val_path, y_val, aug_val),
        (X_test_path, y_test, aug_test),
    ]
]

# Model-building parameters (each model cell may override `input_shape`)
num_labels = len(list_emotions)
input_shape = (40, 844, 1)

## ResNet
The ResNet is the audio model that performed best.

In [None]:
# Importing model
from models import create_res_net

# Training
# NOTE(review): `epochs` was never set before this cell; 50 mirrors the value
# used for the combined model further down — confirm.
epochs = 50
# NOTE(review): `batch_size` only tags the output file names here; the real
# batch size is whatever the generators were built with.
batch_size = 16
opt = "sgd"
name = "res_net"
input_shape = (160, 211, 1)
num_labels = len(list_emotions)
model_audio = create_res_net(opt, input_shape, num_labels)

# Keep only the weights of the epoch with the highest validation F1 (`val_f1_l`)
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='weights/'+opt+str(batch_size)+'_'+name+'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
    save_weights_only=True,
    monitor='val_f1_l',
    mode='max',
    save_best_only=True)

# The generators already yield batches, so `batch_size` is not forwarded to Keras
history = model_audio.fit(train_batch_generator,
                          epochs=epochs,
                          verbose=2,
                          validation_data=val_batch_generator,
                          callbacks=[model_checkpoint_callback],
                          max_queue_size=20,
                          workers=45,
                          use_multiprocessing=True)

# `test_*` names keep the metric callables `recall` / `precision` / `f1_l`
# (needed when compiling the combined model) from being overwritten by floats.
test_loss, test_accuracy, test_recall, test_precision, test_f1 = model_audio.evaluate(
    test_batch_generator,
    verbose=False,
    workers=30,
    use_multiprocessing=True)

# Stored under its own name so the annotated `df` stays available to later cells
history_df = pd.DataFrame(history.history)
history_df.to_csv(f'log/{opt+str(batch_size)}_batch_{str(epochs)}_{name}.csv', index=False)

print("\nTesting Accuracy:  {:.4f}".format(test_accuracy))
print("\nTesting Recall:  {:.4f}".format(test_recall))
print("\nTesting Precision:  {:.4f}".format(test_precision))
print("\nTesting F1:  {:.4f}".format(test_f1))

## CONV LSTM
To train the model, we split the conversations into dialogues of length 10.

In [None]:
# Splitting the dataset into shorter dialogues
# Expects `df` to be the annotated DataFrame (with `folder` and `file_name` columns).
dialogue_len = 10

# Per conversation folder: number of clips rounded down to a multiple of `dialogue_len`
file_parse_df = df.groupby("folder", sort=False)["file_name"].count() // dialogue_len * dialogue_len

# Keep the first `value` rows of every folder; collect the pieces and concatenate
# once instead of growing a DataFrame inside the loop.
folder_chunks = [df[df["folder"] == folder][:value]
                 for folder, value in file_parse_df.items()]
df_in = pd.concat(folder_chunks) if folder_chunks else pd.DataFrame()

# Consecutive blocks of `dialogue_len` rows form one dialogue; shuffle whole dialogues
df_in['group_id'] = np.arange(len(df_in)) // dialogue_len
groups = [group for _, group in df_in.groupby('group_id')]
np.random.seed(42)
np.random.shuffle(groups)
df_in = pd.concat(groups).reset_index(drop=True)

# NOTE(review): the split points are hard-coded row offsets (both multiples of 10);
# they presumably match the size of the original dataset — confirm.
train = df_in[:39910].copy()
dev = df_in[39910:48460].copy()
test = df_in[48460:].copy()

print('Training set size', len(train))
print('Validation set size', len(dev))
print('Test set size', len(test))

In [None]:
from models import Audio_Generator_CNNLSTM

# Generator
batch_size = group_sent = 20

# Trim the test split to a whole number of `group_sent`-sized groups
n_test = test.shape[0] // group_sent * group_sent

X_test_path = test["file_path"].to_numpy()[:n_test]
aug_test = test["augment"].to_numpy()[:n_test]

# Encode against the fixed class list so the integer codes do not depend on
# which labels happen to be present in this split.
label_encoder.fit(list_emotions)
y_test = label_encoder.transform(test['label'].to_numpy())[:n_test]

# NOTE(review): only the test generator is rebuilt here; `train_batch_generator`
# and `val_batch_generator` keep whatever an earlier cell assigned — confirm
# that this is intended for the CNN-LSTM run.
test_batch_generator = Audio_Generator_CNNLSTM(X_test_path, y_test, aug_test, batch_size)

In [None]:
# Importing model
from models import ConvLSTM_Model

# Training
# NOTE(review): `epochs` was never set before this cell; 50 mirrors the value
# used for the combined model further down — confirm.
epochs = 50
# NOTE(review): `batch_size` only tags the output file names here; the real
# batch size is whatever the generators were built with.
batch_size = 16
opt = "sgd"
name = "cnn_lstm"

model_cnn_lstm = ConvLSTM_Model(10, 40, 844, 1, list_emotions)
model_cnn_lstm.compile(optimizer=opt,
                       loss='sparse_categorical_crossentropy',
                       metrics=['accuracy'])

# Keep only the weights of the epoch with the highest validation accuracy.
# The model is compiled with `accuracy` only, so `val_f1_l` would never be
# reported and no checkpoint would ever be written.
# NOTE(review): to track recall / precision / F1 instead, add those metric
# callables to `metrics=` above (as the combined-model cell does).
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='weights/'+opt+str(batch_size)+'_'+name+'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

# The generators already yield batches, so `batch_size` is not forwarded to Keras
history = model_cnn_lstm.fit(train_batch_generator,
                             epochs=epochs,
                             verbose=2,
                             validation_data=val_batch_generator,
                             callbacks=[model_checkpoint_callback],
                             max_queue_size=20,
                             workers=45,
                             use_multiprocessing=True)

# evaluate() returns one value per compiled metric: loss and accuracy
test_loss, test_accuracy = model_cnn_lstm.evaluate(test_batch_generator,
                                                   verbose=False,
                                                   workers=30,
                                                   use_multiprocessing=True)

# Stored under its own name so the annotated `df` is not overwritten
history_df = pd.DataFrame(history.history)
history_df.to_csv(f'log/{opt+str(batch_size)}_batch_{str(epochs)}_{name}.csv', index=False)

print("\nTesting Accuracy:  {:.4f}".format(test_accuracy))

## AlexNet

In [None]:
# Importing model
from models import model_alex

# Training
# NOTE(review): `epochs` was never set before this cell; 50 mirrors the value
# used for the combined model further down — confirm.
epochs = 50
input_shape = (160, 211, 1)
num_labels = len(list_emotions)
# NOTE(review): `batch_size` only tags the output file names here; the real
# batch size is whatever the generators were built with.
batch_size = 16
opt = "sgd"
name = "alexnet"

# Bound to `model_alex_` so the imported factory `model_alex` is not shadowed
model_alex_ = model_alex(opt, input_shape, num_labels)
model_alex_.compile(optimizer=opt,
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])

# Keep only the weights of the epoch with the highest validation accuracy.
# The model is compiled with `accuracy` only, so `val_f1_l` would never be
# reported and no checkpoint would ever be written.
# NOTE(review): to track recall / precision / F1 instead, add those metric
# callables to `metrics=` above (as the combined-model cell does).
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='weights/'+opt+str(batch_size)+'_'+name+'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

# The generators already yield batches, so `batch_size` is not forwarded to Keras
history = model_alex_.fit(train_batch_generator,
                          epochs=epochs,
                          verbose=2,
                          validation_data=val_batch_generator,
                          callbacks=[model_checkpoint_callback],
                          max_queue_size=20,
                          workers=45,
                          use_multiprocessing=True)

# evaluate() returns one value per compiled metric: loss and accuracy
test_loss, test_accuracy = model_alex_.evaluate(test_batch_generator,
                                                verbose=False,
                                                workers=30,
                                                use_multiprocessing=True)

# Stored under its own name so the annotated `df` is not overwritten
history_df = pd.DataFrame(history.history)
history_df.to_csv(f'log/{opt+str(batch_size)}_batch_{str(epochs)}_{name}.csv', index=False)

print("\nTesting Accuracy:  {:.4f}".format(test_accuracy))

## VGG

In [None]:
# Importing model
from models import model_vgg

# Training
# NOTE(review): `epochs` was never set before this cell; 50 mirrors the value
# used for the combined model further down — confirm.
epochs = 50
input_shape = (160, 211, 1)
num_labels = len(list_emotions)
# NOTE(review): `batch_size` only tags the output file names here; the real
# batch size is whatever the generators were built with.
batch_size = 16
opt = "sgd"
name = "vggnet"

model_vgg_ = model_vgg(opt, input_shape, num_labels)
model_vgg_.compile(optimizer=opt,
                   loss='sparse_categorical_crossentropy',
                   metrics=['accuracy'])

# Keep only the weights of the epoch with the highest validation accuracy.
# The model is compiled with `accuracy` only, so `val_f1_l` would never be
# reported and no checkpoint would ever be written.
# NOTE(review): to track recall / precision / F1 instead, add those metric
# callables to `metrics=` above (as the combined-model cell does).
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='weights/'+opt+str(batch_size)+'_'+name+'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

# The generators already yield batches, so `batch_size` is not forwarded to Keras
history = model_vgg_.fit(train_batch_generator,
                         epochs=epochs,
                         verbose=2,
                         validation_data=val_batch_generator,
                         callbacks=[model_checkpoint_callback],
                         max_queue_size=20,
                         workers=45,
                         use_multiprocessing=True)

# evaluate() returns one value per compiled metric: loss and accuracy
test_loss, test_accuracy = model_vgg_.evaluate(test_batch_generator,
                                               verbose=False,
                                               workers=30,
                                               use_multiprocessing=True)

# Stored under its own name so the annotated `df` is not overwritten
history_df = pd.DataFrame(history.history)
history_df.to_csv(f'log/{opt+str(batch_size)}_batch_{str(epochs)}_{name}.csv', index=False)

print("\nTesting Accuracy:  {:.4f}".format(test_accuracy))

---
# Textual Features Models
Models trained on the transcripts rather than on the audio features.

## BiLSTM

In [None]:
# Reload the annotations from disk and rebuild the emotion labels from scratch
df = pd.read_csv("Annotated_Files.csv")

list_emotions = ["Angry", "Bored", "Happy", "Sad", "Surprised", "Neutral"]

# `to_rem` and `happy_lab` are the answer lists defined in the first data-loading cell
df["label"] = df["Answer"].str.strip().str.capitalize()
df = df.loc[~df["label"].isin(to_rem)].copy()
df["label"] = df["label"].mask(df["label"].isin(happy_lab), "Happy")
df = df[df["label"].isin(list_emotions)].copy()
df = df.assign(augment="NA", file_path="All_Audio/" + df["file_name"])

# Splitting
# NOTE(review): `split_data` is not imported anywhere in the visible cells —
# confirm where it is defined.
train, dev, test = split_data(df, train_dim=0.70, dev_dim=0.15, test_dim=0.15)
for split_name, split_df in (("Training", train), ("Validation", dev), ("Test", test)):
    print(f"{split_name} set size", len(split_df))

In [None]:
# Joining all text
# NOTE(review): `clean_text` and `create_embedding_matrix` are not imported in
# the visible cells — confirm where they are defined.
text_all = [' '.join(clean_text(transcript)) for transcript in df["Transcript"]]
train_text = [' '.join(clean_text(transcript)) for transcript in train["Transcript"]]
val_text = [' '.join(clean_text(transcript)) for transcript in dev["Transcript"]]
test_text = [' '.join(clean_text(transcript)) for transcript in test["Transcript"]]

# Tokenizing (the vocabulary is built from the transcripts of all splits)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_all)
sequence_train = tokenizer.texts_to_sequences(train_text)
sequence_val = tokenizer.texts_to_sequences(val_text)
sequence_test = tokenizer.texts_to_sequences(test_text)
index_of_words = tokenizer.word_index
vocab_size = len(index_of_words) + 1  # +1 because Keras reserves index 0 for padding
embed_num_dims = 300
max_seq_len = 40

# Padding sequences
X_train_pad = pad_sequences(sequence_train, maxlen=max_seq_len)
X_val_pad = pad_sequences(sequence_val, maxlen=max_seq_len)
X_test_pad = pad_sequences(sequence_test, maxlen=max_seq_len)

# Encoding labels
# Fit once on the fixed class list so all splits share one label -> integer
# mapping (refitting per split can shift the codes if a split misses a class).
label_encoder.fit(list_emotions)
y_train = label_encoder.transform(train['label'].to_numpy())
y_val = label_encoder.transform(dev['label'].to_numpy())
y_test = label_encoder.transform(test['label'].to_numpy())

# Embedding matrix
fname = 'crawl-300d-2M.vec'
embedd_matrix = create_embedding_matrix(fname, index_of_words, embed_num_dims)

In [None]:
# Importing model
from models import model_text

# Training
# NOTE(review): `epochs` was never set before this cell; 50 mirrors the value
# used for the combined model further down — confirm.
epochs = 50
batch_size = 16
opt = "sgd"
name = "bilstm"
num_labels = len(list_emotions)
model_text_ = model_text(opt, vocab_size, embed_num_dims, max_seq_len, embedd_matrix, num_labels)
model_text_.compile(optimizer=opt,
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])

# Keep only the weights of the epoch with the highest validation accuracy.
# The model is compiled with `accuracy` only, so `val_f1_l` would never be
# reported and no checkpoint would ever be written.
# NOTE(review): to track recall / precision / F1 instead, add those metric
# callables to `metrics=` above (as the combined-model cell does).
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='weights/'+opt+str(batch_size)+'_'+name+'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

history = model_text_.fit(X_train_pad, y_train,
                          batch_size=batch_size,
                          epochs=epochs,
                          validation_data=(X_val_pad, y_val),
                          verbose=2,
                          callbacks=[model_checkpoint_callback],
                          use_multiprocessing=True)

# evaluate() returns one value per compiled metric: loss and accuracy
test_loss, test_accuracy = model_text_.evaluate(X_test_pad, y_test,
                                                batch_size=batch_size,
                                                verbose=False)

# Stored under its own name so the annotated `df` is not overwritten
history_df = pd.DataFrame(history.history)
history_df.to_csv(f'log/{opt+str(batch_size)}_batch_{str(epochs)}_{name}.csv', index=False)

print("\nTesting Accuracy:  {:.4f}".format(test_accuracy))

### BERT Model

In [None]:
# TF-Hub handle of the cased BERT preprocessing model, wrapped as a Keras layer
bert_preprocess = 'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3'
bert_preprocess_model = hub.KerasLayer(bert_preprocess)

# Cased BERT encoder (L-12, H-768, A-12), loaded straight from its TF-Hub handle
bert_model = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3')

In [None]:
from models import TextGenerator

# Raw transcripts of each split as NumPy arrays
X_train_text, X_val_text, X_test_text = (
    split["Transcript"].to_numpy() for split in (train, dev, test)
)

# Batch generators feeding (transcript, label) pairs to the BERT model
batch_size = 32
train_batch_generator, val_batch_generator, test_batch_generator = [
    TextGenerator(texts, labels, batch_size)
    for texts, labels in [
        (X_train_text, y_train),
        (X_val_text, y_val),
        (X_test_text, y_test),
    ]
]

In [None]:
# Importing model (the cell builds a BERT classifier, so `model_bert` is what is needed)
from models import model_bert

# Training
# NOTE(review): `epochs` was never set before this cell; 50 mirrors the value
# used for the combined model further down — confirm.
epochs = 50
input_shape = (128, 512, 1)
num_labels = len(list_emotions)
# NOTE(review): `batch_size` only tags the output file names here; the real
# batch size is whatever the generators were built with.
batch_size = 16
opt = "sgd"
name = "bert"
# NOTE(review): only the preprocessing layer is passed; the encoder layer
# `bert_model` is not — confirm against `model_bert`'s signature.
model_bert_ = model_bert(opt, bert_preprocess_model, num_labels)
model_bert_.compile(optimizer=opt,
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])

# Keep only the weights of the epoch with the highest validation accuracy.
# The model is compiled with `accuracy` only, so `val_f1_l` would never be
# reported and no checkpoint would ever be written.
# NOTE(review): to track recall / precision / F1 instead, add those metric
# callables to `metrics=` above (as the combined-model cell does).
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='weights/'+opt+str(batch_size)+'_'+name+'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

# The generators already yield batches, so `batch_size` is not forwarded to Keras
history = model_bert_.fit(train_batch_generator,
                          epochs=epochs,
                          verbose=2,
                          validation_data=val_batch_generator,
                          callbacks=[model_checkpoint_callback],
                          max_queue_size=20,
                          workers=45,
                          use_multiprocessing=True)

# evaluate() returns one value per compiled metric: loss and accuracy
test_loss, test_accuracy = model_bert_.evaluate(test_batch_generator,
                                                verbose=False,
                                                workers=30,
                                                use_multiprocessing=True)

# Stored under its own name so the annotated `df` is not overwritten
history_df = pd.DataFrame(history.history)
history_df.to_csv(f'log/{opt+str(batch_size)}_batch_{str(epochs)}_{name}.csv', index=False)

print("\nTesting Accuracy:  {:.4f}".format(test_accuracy))

---
# Combined Model - Multimodal
This section trains the model that combines the two feature types, audio and text. It starts from the best weights of the best-performing model of each modality: ResNet (audio) and BiLSTM (text).

In [None]:
# Importing the models
from models import CombinedGenerator, model_combined

# Fit the encoder once on the fixed class list so all three splits share the
# same label -> integer mapping (refitting per split can shift the codes if a
# split happens to miss a class).
label_encoder = LabelEncoder()
label_encoder.fit(list_emotions)
y_train = label_encoder.transform(train['label'].to_numpy())
y_val = label_encoder.transform(dev['label'].to_numpy())
y_test = label_encoder.transform(test['label'].to_numpy())

# Raw transcripts per split
X_train_text = train["Transcript"].to_numpy()
X_val_text = dev["Transcript"].to_numpy()
X_test_text = test["Transcript"].to_numpy()

# Audio file paths per split
X_train_path = train["file_path"].to_numpy()
X_val_path = dev["file_path"].to_numpy()
X_test_path = test["file_path"].to_numpy()

# Augmentation tag per clip
aug_train = train["augment"].to_numpy()
aug_val = dev["augment"].to_numpy()
aug_test = test["augment"].to_numpy()

batch_size = 16

# Each generator pairs the audio inputs with the padded token sequences of the same split
train_combined_generator = CombinedGenerator(X_train_path, y_train, aug_train, X_train_pad, batch_size)
val_combined_generator = CombinedGenerator(X_val_path, y_val, aug_val, X_val_pad, batch_size)
test_combined_generator = CombinedGenerator(X_test_path, y_test, aug_test, X_test_pad, batch_size)

# Loading best weights
input_shape = (160, 211, 1)
num_labels = len(list_emotions)

model_audio = create_res_net("sgd", input_shape, num_labels)
model_audio.load_weights("weights/Best_ResNet_SGD.12-2.34.hdf5")

model_txt = model_text("adam", vocab_size, embed_num_dims, max_seq_len, embedd_matrix, num_labels)
model_txt.load_weights("weights/Best_BILSTM_text_embeddings.30-2.72.hdf5")
print("done loading")

model = model_combined(model_audio, model_txt, "sgd")

epochs = 50
name = "combined_res_bilstm"
opt = "SGD"  # also used as a prefix in the weight / log file names

# NOTE(review): `recall`, `precision` and `f1_l` must be metric callables in
# scope here; they are not imported in the visible cells — confirm.
model.compile(optimizer=opt,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy',
                       recall,
                       precision,
                       f1_l
                       ])

In [None]:
# Keep only the weights of the epoch with the highest validation F1 (`val_f1_l`)
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='weights/'+opt+str(batch_size)+'_'+name+'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
    save_weights_only=True,
    monitor='val_f1_l',
    mode='max',
    save_best_only=True)


# The generators already yield batches, so `batch_size` is not forwarded to Keras
history = model.fit(train_combined_generator,
                    epochs=epochs,
                    verbose=2,
                    validation_data=val_combined_generator,
                    callbacks=[model_checkpoint_callback],
                    max_queue_size=20,
                    workers=45,
                    use_multiprocessing=True)


# `test_*` names keep the metric callables `recall` / `precision` / `f1_l`
# intact, so the compile cell can be re-run after this one.
test_loss, test_accuracy, test_recall, test_precision, test_f1 = model.evaluate(
    test_combined_generator,
    verbose=False,
    workers=30,
    use_multiprocessing=True)

# Stored under its own name so the annotated `df` is not overwritten
history_df = pd.DataFrame(history.history)
history_df.to_csv(f'log/{opt+str(batch_size)}_batch_{str(epochs)}_{name}.csv', index=False)
print("\nTesting Accuracy:  {:.4f}".format(test_accuracy))
print("\nTesting Recall:  {:.4f}".format(test_recall))
print("\nTesting Precision:  {:.4f}".format(test_precision))
print("\nTesting F1:  {:.4f}".format(test_f1))