<a href="https://colab.research.google.com/github/anitayadav3/EmotionRecognitionInConversation/blob/master/Final_ERC_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, GRU, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
import pickle
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from keras.utils.np_utils import to_categorical  
import string
import requests

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
!pip install sentencepiece
import tokenization

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |▎                               | 10kB 25.1MB/s eta 0:00:01[K     |▌                               | 20kB 32.3MB/s eta 0:00:01[K     |▉                               | 30kB 22.0MB/s eta 0:00:01[K     |█                               | 40kB 25.6MB/s eta 0:00:01[K     |█▍                              | 51kB 27.7MB/s eta 0:00:01[K     |█▋                              | 61kB 30.2MB/s eta 0:00:01[K     |██                              | 71kB 20.1MB/s eta 0:00:01[K     |██▏                             | 81kB 21.4MB/s eta 0:00:01[K     |██▌                             | 92kB 20.2MB/s eta 0:00:01[K     |██▊                             | 102kB 20.4MB/s eta 0:00:01[K     |███                             | 112kB 20.4MB/s eta 0:00:01[K     |███▎        

In [3]:
def albert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [4]:
def build_model(bert_layer, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = pooled_output
    out = Dense(100, activation='relu')(clf_output)
    out = Dense(100, activation='relu')(out)
    out = Dense(6, activation='softmax')(out)
    
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(lr=2e-6), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [5]:
%%time
module_url = "https://tfhub.dev/tensorflow/albert_en_base/1"
albert_layer = hub.KerasLayer(module_url, trainable=True)

CPU times: user 3.61 s, sys: 573 ms, total: 4.18 s
Wall time: 9.08 s


In [52]:
with open('/content/vader_featured_sentences.pkl', 'rb') as f:
    processed_data = pickle.load(f)
with open('/content/arranged_processed_label.pkl', 'rb') as f:
    processed_label = pickle.load(f)
with open('/content/new_test_vader_features.pkl', 'rb') as f:
    test_processed_data = pickle.load(f)
with open('/content/new_test_vader_labels.pkl', 'rb') as f:
    test_processed_label = pickle.load(f)

In [8]:
processed_data=np.asarray(processed_data)
Y=to_categorical(processed_label, num_classes=6)

In [9]:
sp_model_file = albert_layer.resolved_object.sp_model_file.asset_path.numpy()
tokenizer = tokenization.FullSentencePieceTokenizer(sp_model_file)

In [10]:
train_input = albert_encode(processed_data, tokenizer, max_len=160)
train_labels = Y

In [11]:
model = build_model(albert_layer, max_len=160)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 11683584    input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [12]:
train_history = model.fit(
    train_input, train_labels,
    epochs=6,
    batch_size=1
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [53]:
test_processed_data=np.asarray(test_processed_data)
test_Y=to_categorical(test_processed_label, num_classes=6)

In [54]:
test_input = albert_encode(test_processed_data, tokenizer, max_len=160)
test_labels = test_Y

In [55]:
y_pred=model.predict(test_input, batch_size=1)

In [56]:
y_pred1=np.argmax(y_pred,axis=1)
test_processed_label=np.asarray(test_processed_label)
print("Accuracy : " + str(accuracy_score(test_processed_label, y_pred1)))
print("Weighted F1-score : " + str(f1_score(test_processed_label, y_pred1, average='weighted')))

Accuracy : 0.6428571428571429
Weighted F1-score : 0.6450342791947398
