<a href="https://colab.research.google.com/github/anitayadav3/EmotionRecognitionInConversation/blob/master/BERT_on_IEMOCAP_for_ERC_Improvization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
import re
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Flatten
from tensorflow.keras.optimizers import Adam  
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
import pickle
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from keras.utils.np_utils import to_categorical  
import time

In [2]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |▎                               | 10kB 21.4MB/s eta 0:00:01[K     |▋                               | 20kB 22.2MB/s eta 0:00:01[K     |▉                               | 30kB 17.5MB/s eta 0:00:01[K     |█▏                              | 40kB 16.4MB/s eta 0:00:01[K     |█▌                              | 51kB 13.2MB/s eta 0:00:01[K     |█▊                              | 61kB 12.9MB/s eta 0:00:01[K     |██                              | 71kB 14.2MB/s eta 0:00:01[K     |██▍                             | 81kB 13.6MB/s eta 0:00:01[K     |██▋                             | 92kB 14.4MB/s eta 0:00:01[K     |███                             | 102kB 14.0MB/s eta 0:00:01[K     |███▎                            | 112kB 14.0MB/s eta 0:00:01[K     |███▌        

In [3]:
import tokenization

In [4]:
def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [5]:
def build_model(bert_layer, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    clf_output1 = tf.reshape(clf_output,[1,clf_output.shape[1],1])
    text_layer = Conv1D(128, 100, activation='relu', input_shape=(clf_output.shape[1],1))(clf_output1)
    text_layer = Conv1D(128, 90, activation='relu')(text_layer)
    text_layer = MaxPooling1D(3)(text_layer)
    text_layer = Flatten()(text_layer)
    dense_layer1 = Dense(100,activation='relu')(text_layer)
    dense_layer2 = Dense(100,activation='relu')(dense_layer1)
    dense_layer3 = Dense(100,activation='relu')(dense_layer2)
    out = Dense(6, activation='softmax')(dense_layer3)
    
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(lr=2e-6), loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [6]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [7]:
%%time
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

CPU times: user 41.8 s, sys: 12.1 s, total: 53.9 s
Wall time: 2min 29s


In [8]:
with open('/content/gdrive/My Drive/iemocap/train/sentences.pkl', 'rb') as f:
    data = pickle.load(f)
with open('/content/gdrive/My Drive/iemocap/train/labels.pkl', 'rb') as f:
    labels = pickle.load(f)
with open('/content/gdrive/My Drive/iemocap/test/sentences.pkl', 'rb') as f:
    test_data = pickle.load(f)
with open('/content/gdrive/My Drive/iemocap/test/labels.pkl', 'rb') as f:
    test_labels = pickle.load(f)

In [9]:
def preprocessing(data,labels):
  processed_data=[]
  processed_label=[]
  for i in range(0,len(data)):
    for j in range(0,len(data[i])):
      intermediate_data=[]
      intermediate_label=[]
      for k in range(0,len(data[i][j])):
        text=data[i][j][k]
        if text != '<eos>'and text!='<pad>':
          intermediate_data.append(text)
      processed_data.append(intermediate_data)
  for i in labels:
    for j in i:
      processed_label.append(j)
  return processed_data,processed_label

In [10]:
processed_data,processed_label = preprocessing(data,labels)
test_processed_data,test_processed_label = preprocessing(test_data,test_labels)

In [11]:
for i in range(0,len(processed_data)):
  processed_data[i]= ' '.join(processed_data[i])
for i in range(0,len(test_processed_data)):
  test_processed_data[i]=' '.join(test_processed_data[i])

In [12]:
processed_data=np.asarray(processed_data)
test_processed_data=np.asarray(test_processed_data)
Y=to_categorical(processed_label, num_classes=6)
test_Y=to_categorical(test_processed_label, num_classes=6)

In [13]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [14]:
t0 = time.time()
train_input = bert_encode(processed_data, tokenizer, max_len=160)
test_input = bert_encode(test_processed_data, tokenizer, max_len=160)
train_labels = Y
test_labels = test_Y

In [15]:
model = build_model(bert_layer, max_len=161)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 161)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 161)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 161)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]      

In [None]:
train_history = model.fit(
    train_input, train_labels,
    epochs=3,
    batch_size=1
)

model.save('model.h5')

Epoch 1/3
























 236/4699 [>.............................] - ETA: 12:34 - loss: 1.7774 - accuracy: 0.2669

In [None]:
y_pred=model.predict(test_input, batch_size=1)

In [None]:
y_pred1=np.argmax(y_pred,axis=1)
test_processed_label=np.asarray(test_processed_label)
t1 = time.time()
total = t1-t0
print("Total Execution time (Training + Testing): " + str(total))
print("Accuracy : " + str(accuracy_score(test_processed_label, y_pred1)))
print("Weighted F1-score : " + str(f1_score(test_processed_label, y_pred1, average='weighted')))