# Pembangunan Model Leksikal Bertahap - Sistem Pengenal Emosi Cakapan

## Library Preparation

In [1]:
!pip install imbalanced-learn
!pip install transformers
!pip install sentence_transformers



In [2]:
import os

import numpy as np
import pandas as pd
import statistics
import joblib
import torch

import nltk
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, AutoModel

import gensim
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from keras import backend
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras import layers
from imblearn.over_sampling import SMOTE

from google.colab import drive
from google.colab import files



In [3]:
# Mount Google Drive; every dataset/model path used below is built from `drive_dir`.
drive.mount('/content/gdrive')
# NOTE(review): hard-coded, user-specific Drive folder — adjust before running elsewhere.
drive_dir = "/content/gdrive/MyDrive/Teknik Informatika/Semester 7/TUGAS AKHIR/TA 2"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Feature Extraction

In [4]:
# Load the pickled dataset; later cells read its 'Transkripsi', 'ID Ucapan',
# 'Label Emosi' and 'Abstraksi Emosi' columns.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files you trust.
data = joblib.load(drive_dir + '/Eksperimen/Data/stopword.pkl')

In [5]:
# Fit TF-IDF on every transcription and keep the result as a dense 2-D NumPy array.
# NOTE(review): the vectorizer is fitted on all rows, before any train/test split
# is made further down — confirm that this is intended.
tfidf_vectorizer = TfidfVectorizer()
feature_tfidf = tfidf_vectorizer.fit_transform(data['Transkripsi']).toarray()
feature_tfidf.shape

(10822, 5017)

## Data Preparation

### Construct Sequential Data 

In [6]:
def construct_sequential(id, data):
  """Group consecutive rows of `data` into sequences delimited by their IDs.

  A new group is opened every time an ID ends with '001'. Anything seen before
  the first such ID lands in a leading group, which is dropped from the result.

  Args:
    id: indexable collection of ID strings, aligned position-by-position with `data`.
    data: indexable collection of per-row items (e.g. feature rows or label rows).

  Returns:
    An object-dtype NumPy array built from the list of groups, without the
    leading group.
  """
  # groups[0] collects whatever precedes the first ID ending in '001'.
  groups = [[]]
  for position in range(len(id)):
    if id[position].endswith('001'):
      groups.append([])
    groups[-1].append(data[position])

  return np.array(groups, dtype=object)[1:]

In [7]:
def construct_abstract(id, abstraksi):
  """Pick the `abstraksi` entry that sits at the start of every group.

  Args:
    id: indexable collection of ID strings, aligned with `abstraksi`.
    abstraksi: indexable collection of values, one per ID.

  Returns:
    An object-dtype NumPy array holding `abstraksi[i]` for every position `i`
    whose ID ends with '001', in their original order.
  """
  group_starts = [i for i in range(len(id)) if id[i].endswith('001')]
  return np.array([abstraksi[i] for i in group_starts], dtype=object)

In [8]:
# Group the per-row TF-IDF vectors into sequences; a new sequence starts at
# every ID that ends with '001' (see construct_sequential).
# NOTE(review): `id` shadows the Python builtin of the same name for the rest of the notebook.
id = data['ID Ucapan']
sequential_features_tfidf = construct_sequential(id, feature_tfidf)
print(sequential_features_tfidf.shape)

(2003,)


### Prepare Labels

In [9]:
# One-hot encode the per-row emotion labels, then group them with the same
# '001'-delimited grouping that was applied to the features.
# NOTE(review): newer scikit-learn releases spell this argument `sparse_output` —
# confirm against the installed version.
encoder_ucapan = OneHotEncoder(sparse=False)
labels = np.array(data['Label Emosi']).reshape(-1, 1)
labels = encoder_ucapan.fit_transform(labels)
labels = construct_sequential(id, labels)
labels.shape

(2003,)

In [10]:
# One 'Abstraksi Emosi' value is taken per group (at each ID ending in '001'),
# reshaped to a single column and one-hot encoded.
encoder_cakapan = OneHotEncoder(sparse=False)
abstraksi = encoder_cakapan.fit_transform(
    construct_abstract(id, data['Abstraksi Emosi']).reshape(-1, 1)
)
abstraksi.shape

(2003, 6)

### Split Train and Test Data

In [11]:
# Short aliases used by the padding and splitting cells:
#   X            -> grouped TF-IDF features
#   y_sequential -> grouped per-row one-hot labels
#   y            -> one one-hot label row per group
X = sequential_features_tfidf
y_sequential = labels
y = abstraksi

In [12]:
# Pad at the end ("post") to a common length and cast everything to float32.
# Per the printed shapes, `y` keeps its (2003, 6) shape after this call.
X = tf.keras.preprocessing.sequence.pad_sequences(X, padding="post", dtype='float32')
y_sequential = tf.keras.preprocessing.sequence.pad_sequences(y_sequential, padding="post", dtype='float32')
y = tf.keras.preprocessing.sequence.pad_sequences(y, padding="post", dtype='float32')
print(X.shape, y.shape, y_sequential.shape)

(2003, 15, 5017) (2003, 6) (2003, 15, 6)


In [13]:
# A single stratified shuffle split (15% test, fixed seed), stratified on `y`.
# Only one split is generated, so it is taken directly from the generator.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
train_index, test_index = next(sss.split(X, y))

## Modelling Preparation

### Define Model

In [14]:
def get_contextual_model(shape, label_shape):
  """Build the (uncompiled) sequence-to-sequence model.

  The GRU returns its full sequence, so the softmax head emits one
  prediction per timestep.

  Args:
    shape: input shape handed to the Masking layer as `input_shape`.
    label_shape: number of units of the final softmax layer.

  Returns:
    A Sequential model: Masking -> GRU(32) -> Dropout(0.2) -> Dense(64, relu)
    -> Dense(label_shape, softmax).
  """
  model = Sequential()
  stack = (
      # Timesteps whose features all equal 0 are masked out.
      layers.Masking(mask_value=0, input_shape=shape),
      layers.GRU(32, return_sequences=True),
      layers.Dropout(0.2),
      layers.Dense(64, activation='relu'),
      layers.Dense(label_shape, activation='softmax'),
  )
  for layer in stack:
    model.add(layer)
  return model

In [15]:
def get_classification_model(shape, label_shape):
  """Build the (uncompiled) sequence-to-one model.

  The GRU returns only its last output, so the softmax head emits a single
  prediction per input sequence.

  Args:
    shape: input shape handed to the Masking layer as `input_shape`.
    label_shape: number of units of the final softmax layer.

  Returns:
    A Sequential model: Masking -> GRU(32) -> Dropout(0.2) -> Dense(64, relu)
    -> Dense(label_shape, softmax).
  """
  model = Sequential()
  stack = (
      # Timesteps whose features all equal 0 are masked out.
      layers.Masking(mask_value=0, input_shape=shape),
      layers.GRU(32, return_sequences=False),
      layers.Dropout(0.2),
      layers.Dense(64, activation='relu'),
      layers.Dense(label_shape, activation='softmax'),
  )
  for layer in stack:
    model.add(layer)
  return model

### Define Training Parameters

In [16]:
# Module-level training settings; the train_* functions read these names as globals.
batch_size = 32
epochs = 10
# NOTE(review): `validation_split` is not referenced by any training function visible in this notebook.
validation_split = 0.2
optimizer = 'adam'
loss = 'categorical_crossentropy'
metrics = ['accuracy']
# Stop once validation accuracy has not improved by at least 0.0001 for 2 epochs,
# and roll the model back to its best weights.
callbacks = [EarlyStopping(monitor='val_accuracy', mode='max', min_delta=0.0001, patience=2, restore_best_weights=True)]

### Define Experiment Schemes

In [17]:
def train_5fold_cv(X, y, shape, label_shape, return_sequences):
  """Train and score a fresh model on each fold of a shuffled 5-fold split.

  Reads the module-level `loss`, `optimizer`, `metrics`, `epochs`,
  `batch_size` and `callbacks` settings.

  Args:
    X: feature array, indexed along its first axis by the fold indices.
    y: label array aligned with `X`.
    shape: input shape forwarded to the model builder.
    label_shape: output size forwarded to the model builder.
    return_sequences: truthy -> get_contextual_model, otherwise
      get_classification_model.

  Returns:
    (mean loss, mean accuracy) over the five validation folds, each rounded
    to 4 decimals. The same two numbers are also printed.
  """
  build_model = get_contextual_model if return_sequences else get_classification_model
  splitter = KFold(n_splits=5, random_state=42, shuffle=True)
  fold_results = []

  for fit_idx, holdout_idx in splitter.split(X):
    X_train, y_train = X[fit_idx], y[fit_idx]
    X_val, y_val = X[holdout_idx], y[holdout_idx]

    # A brand-new model per fold so that no weights carry over between folds.
    model = build_model(shape, label_shape)
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks,
    )

    fold_loss, fold_accuracy = model.evaluate(X_val, y_val)
    fold_results.append((fold_loss, fold_accuracy))

  loss_score = np.round(np.mean([result[0] for result in fold_results]), 4)
  accuracy_score = np.round(np.mean([result[1] for result in fold_results]), 4)
  print("Loss Score : ", loss_score, "Accuracy Score : ", accuracy_score)
  return loss_score, accuracy_score

In [18]:
def train_stratified_random_sampling(X_train, y_train, X_test, y_test, shape, label_shape, return_sequences):
  """Train one model on a fixed train/test split and score it on the test part.

  Reads the module-level `loss`, `optimizer`, `metrics`, `epochs`,
  `batch_size` and `callbacks` settings.

  NOTE(review): (X_test, y_test) is passed to `fit` as validation data and is
  then also used for the reported score — confirm that this is intended.

  Args:
    X_train, y_train: training features and labels.
    X_test, y_test: features and labels used for validation and evaluation.
    shape: input shape forwarded to the model builder.
    label_shape: output size forwarded to the model builder.
    return_sequences: truthy -> get_contextual_model, otherwise
      get_classification_model.

  Returns:
    (model, accuracy) where accuracy is rounded to 4 decimals. Loss and
    accuracy are also printed.
  """
  build_model = get_contextual_model if return_sequences else get_classification_model
  model = build_model(shape, label_shape)
  model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
  model.fit(
      X_train, y_train,
      validation_data=(X_test, y_test),
      epochs=epochs,
      batch_size=batch_size,
      callbacks=callbacks,
  )

  raw_loss, raw_accuracy = model.evaluate(X_test, y_test)
  loss_score = np.round(np.mean(raw_loss), 4)
  accuracy_score = np.round(np.mean(raw_accuracy), 4)

  print("Loss Score : ", loss_score, "Accuracy Score : ", accuracy_score)

  return model, accuracy_score

## Modul 1 : Pembangunan Contextual Model

In [62]:
# Module 1 data: padded features paired with the per-timestep labels
# (`y_sequential`), selected with the stratified split indices.
X_train, y_train = X[train_index], y_sequential[train_index]
X_test, y_test = X[test_index], y_sequential[test_index]
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1702, 15, 5017) (1702, 15, 6)
(301, 15, 5017) (301, 15, 6)


In [None]:
def run_contextual_model(X_train, y_train, X_test, y_test, n):
  """Train the contextual model `n` times and keep the most accurate run.

  Each run calls train_stratified_random_sampling with return_sequences=True;
  the input shape is `X_train.shape[1:]` and the output size is
  `y_train.shape[2]`. Runs are ranked by the accuracy that function reports
  for (X_test, y_test).

  Args:
    X_train, y_train: training features and per-timestep labels.
    X_test, y_test: data used to score (and therefore to select) each run.
    n: number of independent training runs; must be at least 1.

  Returns:
    The model of the run with the highest accuracy (the earliest one on ties).

  Raises:
    ValueError: if `n` is smaller than 1.
  """
  if n < 1:
    raise ValueError("n must be at least 1")

  best_model = None
  best_acc = 0
  for _ in range(n):
    model, accuracy_score = train_stratified_random_sampling(
        X_train, y_train, X_test, y_test, X_train.shape[1:], y_train.shape[2], True)
    # The first run is always recorded, so a model is returned even when every
    # run scores 0.0 (previously `best_model` was left unbound in that case).
    if best_model is None or best_acc < accuracy_score:
      best_acc = accuracy_score
      best_model = model
  return best_model

In [None]:
# Train the contextual model 5 times and keep the run with the best accuracy on (X_test, y_test).
contextual_model = run_contextual_model(X_train, y_train, X_test, y_test, 5)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Loss Score :  0.4352 Accuracy Score :  0.4955
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Loss Score :  0.4845 Accuracy Score :  0.4981
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Loss Score :  0.4384 Accuracy Score :  0.5051
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Loss Score :  0.4482 Accuracy Score :  0.4923
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Loss Score :  0.4654 Accuracy Score :  0.5013


In [None]:
# Loss and accuracy of the selected contextual model on (X_test, y_test).
contextual_model.evaluate(X_test, y_test)



[0.43836164474487305, 0.5051085352897644]

In [None]:
# Persist the selected contextual model to Drive as an .h5 file.
contextual_model.save(drive_dir + "/Eksperimen/Model/model_leksikal_bertahap_contextual.h5")

In [63]:
# Reload the saved contextual model and re-check its score on the current X_test / y_test.
contextual_model = keras.models.load_model(drive_dir + "/Eksperimen/Model/model_leksikal_bertahap_contextual.h5")
contextual_model.evaluate(X_test, y_test)



[0.43836164474487305, 0.5051085352897644]

### Extract Contextual Features

In [26]:
# Reload the saved contextual model before feature extraction and evaluate it on
# the test indices directly, without relying on the X_test / y_test variables.
# NOTE(review): this repeats the load performed in the preceding cell.
contextual_model = keras.models.load_model(drive_dir + "/Eksperimen/Model/model_leksikal_bertahap_contextual.h5")
contextual_model.evaluate(X[test_index], y_sequential[test_index])



[0.43836164474487305, 0.5051085352897644]

In [18]:
def extract_contextual_features(model, idx_output, data):
  """Run `data` through `model` and return the output of one of its layers.

  Args:
    model: model whose `layers[0].input` is fed and whose
      `layers[idx_output].output` is read.
    idx_output: index into `model.layers` of the layer to read.
    data: input batch; it is passed to the backend function in a single call.

  Returns:
    The selected layer's output as a NumPy array.
  """
  source_tensor = model.layers[0].input
  target_tensor = model.layers[idx_output].output
  forward = backend.function([source_tensor], [target_tensor])
  # The backend function returns one result per requested output; only one was requested.
  return np.array(forward([data])[0])

In [19]:
# Read the output of layer index 3 for every padded sequence in X
# (the Dense(64, relu) layer if the model was built by get_contextual_model).
contextual_features = extract_contextual_features(contextual_model, 3, X)
contextual_features.shape

(2003, 15, 64)

In [45]:
# Debug check: print every non-zero feature value of timestep 14 of test sample 100.
for a in X[test_index][100][14]:
  if a == 0.0:
    continue
  print(a)

## Modul 2 : Pembangunan Classification Model

In [47]:
# Module 2 data: layer-3 outputs of the contextual model as inputs, the
# per-sequence one-hot labels `y` as targets, using the same split indices.
X_train, y_train = extract_contextual_features(contextual_model, 3, X[train_index]), y[train_index]
X_test, y_test = extract_contextual_features(contextual_model, 3, X[test_index]), y[test_index]
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1702, 15, 64) (1702, 6)
(301, 15, 64) (301, 6)


In [37]:
def run_classification_model(X_train, y_train, X_test, y_test, n):
  """Train the classification model `n` times and keep the most accurate run.

  Each run calls train_stratified_random_sampling with return_sequences=False;
  the input shape is `X_train.shape[1:]` and the output size is
  `y_train.shape[1]`. Runs are ranked by the accuracy that function reports
  for (X_test, y_test).

  Args:
    X_train, y_train: training features and one label row per sequence.
    X_test, y_test: data used to score (and therefore to select) each run.
    n: number of independent training runs; must be at least 1.

  Returns:
    The model of the run with the highest accuracy (the earliest one on ties).

  Raises:
    ValueError: if `n` is smaller than 1.
  """
  if n < 1:
    raise ValueError("n must be at least 1")

  best_model = None
  best_acc = 0
  for _ in range(n):
    model, accuracy_score = train_stratified_random_sampling(
        X_train, y_train, X_test, y_test, X_train.shape[1:], y_train.shape[1], False)
    # The first run is always recorded, so a model is returned even when every
    # run scores 0.0 (previously `best_model` was left unbound in that case).
    if best_model is None or best_acc < accuracy_score:
      best_acc = accuracy_score
      best_model = model
  return best_model

In [None]:
# Train the 6-class classification model 5 times and keep the run with the best accuracy on (X_test, y_test).
classification_model = run_classification_model(X_train, y_train, X_test, y_test, 5)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Loss Score :  1.6066 Accuracy Score :  0.2625
Epoch 1/10
Epoch 2/10
Epoch 3/10
Loss Score :  1.5872 Accuracy Score :  0.3621
Epoch 1/10
Epoch 2/10
Epoch 3/10
Loss Score :  1.598 Accuracy Score :  0.3189
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Loss Score :  1.578 Accuracy Score :  0.2558
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Loss Score :  1.552 Accuracy Score :  0.412


In [None]:
# Loss and accuracy of the current `classification_model` on (X_test, y_test).
classification_model.evaluate(X_test, y_test)



[1.2955167293548584, 0.5714285969734192]

In [48]:
# Load the saved 6-class classification model and report per-class metrics on (X_test, y_test).
# NOTE(review): this rebinds `classification_model`, replacing the model trained above, and in
# cell order the file is loaded here before the later cell that saves it — confirm the intended order.
classification_model = keras.models.load_model(drive_dir + "/Eksperimen/Model/model_leksikal_bertahap_classification_6.h5")
pred = classification_model.predict(X_test)
# NOTE(review): inverse_transform receives the raw prediction scores; presumably it
# resolves each row to its highest-scoring category — verify against the scikit-learn docs.
y_pred = encoder_cakapan.inverse_transform(pred)
y_true = encoder_cakapan.inverse_transform(y_test)
print(classification_report(y_true, y_pred, digits=4))
print(confusion_matrix(y_true, y_pred))

              precision    recall  f1-score   support

       Jijik     0.0000    0.0000    0.0000         1
       Marah     0.5667    0.4048    0.4722        42
       Sedih     0.4907    0.6163    0.5464        86
      Senang     0.6525    0.6814    0.6667       113
       Takut     0.0000    0.0000    0.0000         9
    Terkejut     0.5556    0.5000    0.5263        50

    accuracy                         0.5714       301
   macro avg     0.3776    0.3671    0.3686       301
weighted avg     0.5565    0.5714    0.5597       301

[[ 0  0  1  0  0  0]
 [ 0 17 18  3  0  4]
 [ 0  6 53 21  0  6]
 [ 0  2 25 77  0  9]
 [ 0  1  4  3  0  1]
 [ 0  4  7 14  0 25]]


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Persist the current 6-class `classification_model` to Drive as an .h5 file.
classification_model.save(drive_dir + "/Eksperimen/Model/model_leksikal_bertahap_classification_6.h5")

### Remove Data for Under-Represented Labels

In [None]:
def get_label_idxs(list, label):
  """Return the positions in `list` whose value equals any entry of `label`.

  Args:
    list: indexable collection of values (the name shadows the builtin and is
      kept only for backward compatibility).
    label: iterable of values to look for.

  Returns:
    A 1-D integer NumPy array of matching positions in ascending order. Each
    position appears once even if `label` contains repeated values, and the
    array keeps an integer dtype when nothing matches.
  """
  # any() stops at the first matching label, so a position is never reported twice.
  idxs = [i for i in range(len(list)) if any(list[i] == l for l in label)]
  # An explicit dtype keeps an empty result usable as an index array
  # (np.array([]) would otherwise default to float64).
  return np.array(idxs, dtype=int)

In [None]:
def delete_elArr_by_idx(arr, idx):
  """Return a copy of `arr` without the elements at the positions in `idx`.

  Args:
    arr: indexable collection; elements are taken along its first axis.
    idx: positions to drop (array, list, set or similar). Values that do not
      correspond to a position in `arr` are ignored.

  Returns:
    A NumPy array built from the remaining elements, in their original order.
  """
  # Membership is tested once per element of `arr`; a set makes each test O(1)
  # instead of a scan over the whole `idx` array.
  if isinstance(idx, (set, frozenset, dict)):
    drop = idx
  else:
    drop = set(np.asarray(idx).ravel().tolist())
  return np.array([arr[i] for i in range(len(arr)) if i not in drop])

In [None]:
# Decode the one-hot labels back to their names, then drop every sequence
# labelled "Jijik" or "Takut" from both the contextual features and the labels.
dec_abstraksi = encoder_cakapan.inverse_transform(abstraksi)
del_idxs = get_label_idxs(dec_abstraksi, ["Jijik", "Takut"])
updated_features = delete_elArr_by_idx(contextual_features, del_idxs)
updated_labels = delete_elArr_by_idx(dec_abstraksi, del_idxs)
print(updated_features.shape, updated_labels.shape)

(1936, 15, 64) (1936, 1)


In [None]:
# Re-encode the remaining labels with a fresh encoder so that the one-hot width
# matches the reduced label set.
# NOTE(review): newer scikit-learn releases spell this argument `sparse_output` —
# confirm against the installed version.
encoder_cakapan_2 = OneHotEncoder(sparse=False)
updated_labels = updated_labels.reshape(-1, 1)
updated_labels = encoder_cakapan_2.fit_transform(updated_labels)
updated_labels.shape

(1936, 4)

In [None]:
# A new single stratified shuffle split (15% test, fixed seed) over the reduced data.
# This rebinds `sss`, `train_index` and `test_index` from the earlier split.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
train_index, test_index = next(sss.split(updated_features, updated_labels))

In [None]:
# Train/test arrays for the reduced label set, selected with the new split indices.
X_train, y_train = updated_features[train_index], updated_labels[train_index]
X_test, y_test = updated_features[test_index], updated_labels[test_index]
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1645, 15, 64) (1645, 4)
(291, 15, 64) (291, 4)


In [None]:
# Train the classification model on the reduced label set 5 times and keep the best run on (X_test, y_test).
classification_model = run_classification_model(X_train, y_train, X_test, y_test, 5)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Loss Score :  0.8913 Accuracy Score :  0.6873
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Loss Score :  0.8829 Accuracy Score :  0.6873
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Loss Score :  0.858 Accuracy Score :  0.6873
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Loss Score :  0.8883 Accuracy Score :  0.6804
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Loss Score :  0.8749 Accuracy Score :  0.6942


In [None]:
# Loss and accuracy of the selected reduced-label model on (X_test, y_test).
classification_model.evaluate(X_test, y_test)



[0.8749446868896484, 0.6941580772399902]

In [None]:
# Per-class metrics and confusion matrix for the reduced-label model on (X_test, y_test).
pred = classification_model.predict(X_test)
# NOTE(review): inverse_transform receives the raw prediction scores; presumably it
# resolves each row to its highest-scoring category — verify against the scikit-learn docs.
y_pred = encoder_cakapan_2.inverse_transform(pred)
y_true = encoder_cakapan_2.inverse_transform(y_test)
print(classification_report(y_true, y_pred, digits=4))
print(confusion_matrix(y_true, y_pred))

              precision    recall  f1-score   support

       Marah     0.6735    0.7857    0.7253        42
       Sedih     0.6914    0.6512    0.6707        86
      Senang     0.7120    0.7876    0.7479       113
    Terkejut     0.6667    0.4800    0.5581        50

    accuracy                         0.6942       291
   macro avg     0.6859    0.6761    0.6755       291
weighted avg     0.6925    0.6942    0.6892       291

[[33  5  2  2]
 [ 7 56 20  3]
 [ 5 12 89  7]
 [ 4  8 14 24]]


In [None]:
# Persist the reduced-label classification model to Drive as an .h5 file.
classification_model.save(drive_dir + "/Eksperimen/Model/model_leksikal_bertahap_classification_4.h5")