<a href="https://colab.research.google.com/github/anitayadav3/EmotionRecognitionInConversation/blob/master/BERT_on_IEMOCAP_for_ERC_Improvization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
import re
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
import pickle
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from keras.utils.np_utils import to_categorical  

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
!pip install sentencepiece



In [8]:
import tokenization

In [4]:
def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [38]:
def build_model(bert_layer, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    dense_layer1 = Dense(64,activation='relu')(clf_output)
    dense_layer2 = Dense(32,activation='relu')(dense_layer1)
    out = Dense(6, activation='softmax')(dense_layer2)
    
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(lr=2e-6), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [10]:
%%time
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

CPU times: user 42.7 s, sys: 11.8 s, total: 54.5 s
Wall time: 7min 33s


In [39]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [40]:
with open('/content/gdrive/My Drive/iemocap/train/sentences.pkl', 'rb') as f:
    data = pickle.load(f)
with open('/content/gdrive/My Drive/iemocap/train/labels.pkl', 'rb') as f:
    labels = pickle.load(f)
with open('/content/gdrive/My Drive/iemocap/test/sentences.pkl', 'rb') as f:
    test_data = pickle.load(f)
with open('/content/gdrive/My Drive/iemocap/test/labels.pkl', 'rb') as f:
    test_labels = pickle.load(f)

In [41]:
def preprocessing(data,labels):
  processed_data=[]
  processed_label=[]
  for i in range(0,len(data)):
    for j in range(0,len(data[i])):
      intermediate_data=[]
      intermediate_label=[]
      for k in range(0,len(data[i][j])):
        text=data[i][j][k]
        if text != '<eos>'and text!='<pad>':
          intermediate_data.append(text)
      processed_data.append(intermediate_data)
  for i in labels:
    for j in i:
      processed_label.append(j)
  return processed_data,processed_label

In [42]:
processed_data,processed_label = preprocessing(data,labels)
test_processed_data,test_processed_label = preprocessing(test_data,test_labels)

In [43]:
for i in range(0,len(processed_data)):
  processed_data[i]= ' '.join(processed_data[i])
for i in range(0,len(test_processed_data)):
  test_processed_data[i]=' '.join(test_processed_data[i])

In [44]:
processed_data=np.asarray(processed_data)
test_processed_data=np.asarray(test_processed_data)
Y=to_categorical(processed_label, num_classes=6)
test_Y=to_categorical(test_processed_label, num_classes=6)

In [45]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [46]:
train_input = bert_encode(processed_data, tokenizer, max_len=160)
test_input = bert_encode(test_processed_data, tokenizer, max_len=160)
train_labels = Y
test_labels = test_Y

In [47]:
model = build_model(bert_layer, max_len=160)
model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]      

In [48]:
train_history = model.fit(
    train_input, train_labels,
    epochs=6,
    batch_size=1
)

model.save('model.h5')

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [49]:
y_pred=model.predict(test_input)

In [50]:
y_pred=np.argmax(y_pred,axis=1)
test_processed_label=np.asarray(test_processed_label)
print("Accuracy : " + str(accuracy_score(test_processed_label, y_pred1)))
print("Weighted F1-score : " + str(f1_score(test_processed_label, y_pred1, average='weighted')))

Accuracy : 0.5163277880468269
Weighted F1-score : 0.5136379046111991
