# Pembangunan Model Leksikal Langsung - Sistem Pengenal Emosi Cakapan

## Library Preparation

In [1]:
!pip install imbalanced-learn
!pip install transformers
!pip install sentence_transformers



In [2]:
import os

import numpy as np
import pandas as pd
import statistics
import joblib
import torch

import nltk
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, AutoModel

import gensim
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras import layers
from imblearn.over_sampling import SMOTE

from google.colab import drive
from google.colab import files



In [3]:
drive.mount('/content/gdrive')
drive_dir = "/content/gdrive/MyDrive/Teknik Informatika/Semester 7/TUGAS AKHIR/TA 2"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
print(tf.__version__)

2.5.0


## Feature Extraction

In [5]:
data = joblib.load(drive_dir + '/Eksperimen/Data/basic.pkl')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10822 entries, 0 to 10821
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ID Ucapan        10822 non-null  object
 1   Transkripsi      10822 non-null  object
 2   Label Emosi      10822 non-null  object
 3   Abstraksi Emosi  2003 non-null   object
dtypes: object(4)
memory usage: 338.3+ KB


### TFIDF

In [6]:
tfidf_vectorizer = TfidfVectorizer()
feature_tfidf = tfidf_vectorizer.fit_transform(data['Transkripsi'])
feature_tfidf = feature_tfidf.todense()
feature_tfidf.shape

(10822, 8304)

### Average Word2Vec

In [7]:
def w2v_embedding(data, model):
    matrix_data = []
    for text in data :
        matrix = []
        for word in text.split() :
            try :
                matrix.append(model.wv[word])
            except :
                matrix.append(np.zeros((300,), dtype=float))
        matrix = np.array(matrix)
        matrix_data.append((matrix.mean(axis=0)))   # Average word2vec
        # matrix_data.append(matrix)   # Concatenate vectors word2vec
    return np.array(matrix_data)

In [8]:
path = drive_dir + '/Eksperimen/Pretrained Model/Word2Vec/idwiki_word2vec_300.model'
w2v_model = gensim.models.word2vec.Word2Vec.load(path)

In [9]:
feature_w2v = w2v_embedding(data['Transkripsi'], w2v_model)   
feature_w2v = tf.keras.preprocessing.sequence.pad_sequences(feature_w2v, padding="post", dtype='float32')
feature_w2v.shape

(10822, 300)

### IndoBERT

In [10]:
# load pretrained model/tokenizer
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
model = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=229167.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1534.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=497810400.0, style=ProgressStyle(descri…




In [11]:
# prepare data

tokenized = data['Transkripsi'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

max_len = 0
for i in tokenized.values: 
  if len(i) > max_len: max_len = len(i)
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

attention_mask = np.where(padded != 0, 1, 0)

In [None]:
# modelling
input_ids = torch.LongTensor(padded)
attention_mask = torch.LongTensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [None]:
feature_bert = last_hidden_states[0][:,0,:].numpy()
feature_bert.shape

## Data Preparation

### Construct Sequential Data 

In [7]:
def construct_sequential(id, data):
  sequential_datas = []
  new_data= []
  for i in range(len(id)):
    if id[i].endswith('001'):
      sequential_datas.append(new_data)
      new_data = []
    new_data.append(data[i])
  sequential_datas.append(new_data)

  return np.array(sequential_datas, dtype=object)[1:]

In [8]:
def construct_abstract(id, abstraksi):
  new_abstraksi = []
  for i in range(len(id)):
    if id[i].endswith('001'):
      new_abstraksi.append(abstraksi[i])

  return np.array(new_abstraksi, dtype=object)

In [9]:
id = data['ID Ucapan']
sequential_features_tfidf, sequential_features_w2v, sequential_features_bert = construct_sequential(id, feature_tfidf), construct_sequential(id, feature_tfidf), construct_sequential(id, feature_tfidf)
print(sequential_features_tfidf.shape, sequential_features_w2v.shape, sequential_features_bert.shape)

(2003,) (2003,) (2003,)


In [10]:
abstraksi = construct_abstract(id, data['Abstraksi Emosi'])
print(abstraksi.shape)

(2003,)


### Label One Hot Encoding

In [11]:
encoder_cakapan = OneHotEncoder(sparse=False)
abstraksi = abstraksi.reshape(-1, 1)
abstraksi = encoder_cakapan.fit_transform(abstraksi)
abstraksi.shape

(2003, 6)

### Split Train and Test Data

In [12]:
X_tfidf = sequential_features_tfidf
X_w2v = sequential_features_w2v
X_bert = sequential_features_bert
y = abstraksi

In [13]:
X_tfidf = tf.keras.preprocessing.sequence.pad_sequences(X_tfidf, padding="post", dtype='float32')
X_w2v = tf.keras.preprocessing.sequence.pad_sequences(X_w2v, padding="post", dtype='float32')
X_bert = tf.keras.preprocessing.sequence.pad_sequences(X_bert, padding="post", dtype='float32')
X_tfidf = X_tfidf.reshape(2003, 15, 8304)
y = tf.keras.preprocessing.sequence.pad_sequences(y, padding="post", dtype='float32')
print("TFIDF", X_tfidf.shape, "W2V", X_w2v.shape, "BERT", X_bert.shape)
print("Abstract", y.shape)

TFIDF (2003, 15, 8304) W2V (2003, 15, 1, 8304) BERT (2003, 15, 1, 8304)
Abstract (2003, 6)


In [14]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
for train_index, test_index in sss.split(X_tfidf, y):
  train_index, test_index = train_index, test_index
print("Train data", train_index.shape)
print("Test data", test_index.shape)

Train data (1702,)
Test data (301,)


## Modelling Preparation

### Define Model

In [15]:
def get_lstm_model(shape, label_shape):
  model = Sequential()
  model.add(layers.Masking(mask_value=0, input_shape=shape))
  model.add(layers.LSTM(32))
  model.add(layers.Dense(label_shape, activation='softmax'))
  return model

In [16]:
def get_gru_model(shape, label_shape):
  model = Sequential()
  model.add(layers.Masking(mask_value=0, input_shape=shape))
  model.add(layers.GRU(32))
  model.add(layers.Dense(label_shape, activation='softmax'))
  return model

In [17]:
def get_model_1(shape, label_shape):
  model = Sequential()
  model.add(layers.Masking(mask_value=0, input_shape=shape))
  model.add(layers.GRU(32))
  model.add(layers.Dense(label_shape, activation='softmax'))
  return model

In [18]:
def get_model_2(shape, label_shape):
  model = Sequential()
  model.add(layers.Masking(mask_value=0, input_shape=shape))
  model.add(layers.GRU(32))
  model.add(layers.Dropout(0.2))
  model.add(layers.Dense(label_shape, activation='softmax'))
  return model

In [19]:
def get_model_3(shape, label_shape):
  model = Sequential()
  model.add(layers.Masking(mask_value=0, input_shape=shape))
  model.add(layers.GRU(32))
  model.add(layers.Dropout(0.2))
  model.add(layers.Dense(64, activation='relu'))
  model.add(layers.Dense(label_shape, activation='softmax'))
  return model

In [20]:
def create_model(shape, label_shape, mode):
  model = None
  if mode == 1:
    model = get_model_1(shape, label_shape)
  elif mode == 2 :
    model = get_model_2(shape, label_shape)
  elif mode == 3 :
    model = get_model_3(shape, label_shape)
  elif mode == "lstm":
    model = get_lstm_model(shape, label_shape)
  elif mode == "gru":
    model = get_gru_model(shape, label_shape)
  return model

### Define Training Parameter

In [21]:
batch_size = 10
epochs = 10
validation_split = 0.2
optimizer = 'rmsprop'
loss = 'categorical_crossentropy'
metrics = ['accuracy']
callbacks = [EarlyStopping(monitor='val_loss', mode='min', min_delta=0.0001, patience=1, restore_best_weights=True)]

### Define Schema Experiment

In [22]:
def train_5fold_cv(X, y, shape, label_shape, mode):

  loss_score = []
  accuracy_score = []  

  cv = KFold(n_splits=5, random_state=42, shuffle=True)

  for train_index, val_index in cv.split(X):
      
      X_train, X_val, y_train, y_val = X[train_index], X[val_index], y[train_index], y[val_index]
      
      model = create_model(shape, label_shape, mode)
      model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
      model.fit(X_train, y_train,
              validation_data=(X_val, y_val),
              epochs=epochs,
              batch_size=batch_size,
              callbacks=callbacks)
      
      l, a = model.evaluate(X_val, y_val)
      loss_score.append(l)
      accuracy_score.append(a)
  
  loss_score = np.round(np.mean(loss_score), 4)
  accuracy_score = np.round(np.mean(accuracy_score), 4)
  print("Loss Score : ", loss_score, "Accuracy Score : ", accuracy_score)
  return loss_score, accuracy_score

In [23]:
def train_stratified_random_sampling(X_train, y_train, X_test, y_test, shape, label_shape, mode, encoder):

  model = create_model(shape, label_shape, mode)
  model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
  model.fit(X_train, y_train,
            validation_data=(X_test, y_test),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks)
      
  loss_score, accuracy_score = model.evaluate(X_test, y_test)
  loss_score = np.round(np.mean(loss_score), 4)
  accuracy_score = np.round(np.mean(accuracy_score), 4)

  pred = model.predict(X_test)
  y_pred = encoder.inverse_transform(pred)
  y_true = encoder.inverse_transform(y_test)

  print("Loss Score : ", loss_score, "Accuracy Score : ", accuracy_score)
  print(classification_report(y_true, y_pred, digits=4))
  print(confusion_matrix(y_true, y_pred))
  
  return model, accuracy_score

## Tahap 1 : Pencarian Fitur dan Algoritma Terbaik

In [None]:
feature = [("TFIDF", X_tfidf), ("W2V", X_w2v), ("BERT", X_bert)]
label = y
model = ["lstm", "gru"]
n = 5

In [None]:
def run_tahap_1(feature, label, model, train_index, n):
  performances = []
  for i in range(n):
    p = []
    for f in feature:
      X_train, y_train = f[1][train_index], label[train_index]
      for m in model:
        loss_score, accuracy_score = train_5fold_cv(X_train, y_train, X_train.shape[1:], y.shape[1], m)
        p.append((f[0] + ' ' + m, loss_score, accuracy_score))
    performances.append((i, p))
  return performances

In [None]:
performances = run_tahap_1(feature, label, model, train_index, n)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Loss Score :  1.4294 Accuracy Score :  0.4377
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Loss Score :  1.3741 Accuracy Score :  0.4513
Epoch 1/10
Epoch 2/10
Epoch 1/10
Epoch 2/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 1/10
Epoch 2/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Loss Score :  1.4007 Accuracy Score :  0.389
Epoch 1/10
Epoch 2/10
Epoch 1/10
Epoch 2/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Loss Score :  1.3636 Accuracy Score :  0.416
Epoc

In [None]:
for p in performances:
  print("Training ke- : ", p[0])
  res = pd.DataFrame(p[1], columns=['Feature + Model', 'Loss Score', 'Accuracy Score'])
  print(res)

Training ke- :  0
  Feature + Model  Loss Score  Accuracy Score
0      TFIDF lstm      1.4294          0.4377
1       TFIDF gru      1.3741          0.4513
2        W2V lstm      1.4007          0.3890
3         W2V gru      1.3636          0.4160
4       BERT lstm      1.4526          0.3919
5        BERT gru      1.4004          0.4295
Training ke- :  1
  Feature + Model  Loss Score  Accuracy Score
0      TFIDF lstm      1.4102          0.4319
1       TFIDF gru      1.3483          0.4660
2        W2V lstm      1.3947          0.3919
3         W2V gru      1.4473          0.3502
4       BERT lstm      1.5467          0.3619
5        BERT gru      1.4547          0.3972
Training ke- :  2
  Feature + Model  Loss Score  Accuracy Score
0      TFIDF lstm      1.3955          0.4478
1       TFIDF gru      1.3523          0.4419
2        W2V lstm      1.4225          0.3836
3         W2V gru      1.4029          0.3796
4       BERT lstm      1.3949          0.3932
5        BERT gru      1.3

In [None]:
res = pd.DataFrame(performances[0][1], columns=['Feature + Model', 'Loss Score', 'Accuracy Score'])
for performance in performances[1:]:
  temp = pd.DataFrame(performance[1], columns=['Feature + Model', 'Loss Score', 'Accuracy Score'])
  res['Loss Score'] += temp['Loss Score']
  res['Accuracy Score'] += temp['Accuracy Score']
res['Loss Score'] /= n
res['Accuracy Score'] /= n
print(res.sort_values(by='Accuracy Score', ascending=False, ignore_index=True))

  Feature + Model  Loss Score  Accuracy Score
0       TFIDF gru     1.37598         0.45022
1      TFIDF lstm     1.41360         0.43540
2        BERT gru     1.43676         0.40240
3       BERT lstm     1.45106         0.38570
4         W2V gru     1.40848         0.38146
5        W2V lstm     1.40966         0.38118


## Tahap 2 : Penentuan Teknik Preprocessing

In [None]:
preprocess = ['basic', 'normalization', 'stemming', 'stopword']
label = abstraksi
n = 5

In [None]:
def run_tahap_2(preprocess, label, train_index, n):
  performances = []
  for i in range(n):
    p = []
    for pp in preprocess:
      data = joblib.load(drive_dir + '/Eksperimen/Data/' + pp + '.pkl')
      vectorizer = TfidfVectorizer()
      feature = vectorizer.fit_transform(data['Transkripsi'])
      sequential_feature = construct_sequential(data["ID Ucapan"], feature.todense())

      X = tf.keras.preprocessing.sequence.pad_sequences(sequential_feature, padding="post", dtype='float32')
      X = X.reshape(X.shape[0], X.shape[1], X.shape[3])

      X_train, y_train = X[train_index], label[train_index]
      
      loss_score, accuracy_score = train_5fold_cv(X_train, y_train, X_train.shape[1:], y.shape[1], 'gru')
      p.append((pp, loss_score, accuracy_score))

    performances.append((i, p))
  return performances

In [None]:
performances = run_tahap_2(preprocess, label, model, train_index, n)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Loss Score :  1.3513 Accuracy Score :  0.4442
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 1/10
Epoch 2/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Loss Score :  1.3667 Accuracy Score :  0.4436
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Loss Score :  1.3665 Accuracy Score :  0.4495
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoc

In [None]:
for p in performances:
  print("Training ke- : ", p[0])
  res = pd.DataFrame(p[1], columns=['Preprocess', 'Loss Score', 'Accuracy Score'])
  print(res['Accuracy Score'])

Training ke- :  0
0    0.4442
1    0.4436
2    0.4495
3    0.4477
Name: Accuracy Score, dtype: float64
Training ke- :  1
0    0.4407
1    0.4454
2    0.4431
3    0.4654
Name: Accuracy Score, dtype: float64
Training ke- :  2
0    0.4530
1    0.4460
2    0.4483
3    0.4537
Name: Accuracy Score, dtype: float64
Training ke- :  3
0    0.4442
1    0.4301
2    0.4648
3    0.4360
Name: Accuracy Score, dtype: float64
Training ke- :  4
0    0.4413
1    0.4524
2    0.4624
3    0.4736
Name: Accuracy Score, dtype: float64


In [None]:
res = pd.DataFrame(performances[0][1], columns=['Preprocess', 'Loss Score', 'Accuracy Score'])
for performance in performances[1:]:
  temp = pd.DataFrame(performance[1], columns=['Preprocess', 'Loss Score', 'Accuracy Score'])
  res['Loss Score'] += temp['Loss Score']
  res['Accuracy Score'] += temp['Accuracy Score']
res['Loss Score'] /= n
res['Accuracy Score'] /= n
print(res.sort_values(by='Accuracy Score', ascending=False, ignore_index=True))

      Preprocess  Loss Score  Accuracy Score
0       stopword     1.35268         0.45528
1       stemming     1.36138         0.45362
2          basic     1.36694         0.44468
3  normalization     1.39116         0.44350


## Tahap 3 : Pencarian Arsitektur Model Terbaik

In [None]:
model = [1, 2, 3]
label = abstraksi
n = 5

In [None]:
data = joblib.load(drive_dir + '/Eksperimen/Data/stopword.pkl')
vectorizer = TfidfVectorizer()
feature = vectorizer.fit_transform(data['Transkripsi'])
sequential_feature = construct_sequential(data["ID Ucapan"], feature.todense())
X = tf.keras.preprocessing.sequence.pad_sequences(sequential_feature, padding="post", dtype='float32')
X = X.reshape(X.shape[0], X.shape[1], X.shape[3])

In [None]:
def run_tahap_3(X, label, model, train_index, n):
  performances = []
  for i in range(n):
    p = []
    X_train, y_train = X[train_index], label[train_index]
    for m in model:
      loss_score, accuracy_score = train_5fold_cv(X_train, y_train, X_train.shape[1:], y.shape[1], m)
      p.append((m, loss_score, accuracy_score))
    performances.append((i, p))
  return performances

In [None]:
performances = run_tahap_3(X, label, model, train_index, n)

In [None]:
for p in performances:
  print("Training ke- : ", p[0])
  res = pd.DataFrame(p[1], columns=['Architecture', 'Loss Score', 'Accuracy Score'])
  print(res['Accuracy Score'])

In [None]:
res = pd.DataFrame(performances[0][1], columns=['Architecture', 'Loss Score', 'Accuracy Score'])
for performance in performances[1:]:
  temp = pd.DataFrame(performance[1], columns=['Architecture', 'Loss Score', 'Accuracy Score'])
  res['Loss Score'] += temp['Loss Score']
  res['Accuracy Score'] += temp['Accuracy Score']
res['Loss Score'] /= n
res['Accuracy Score'] /= n
print(res.sort_values(by='Accuracy Score', ascending=False, ignore_index=True))

## Tahap 4 : Penentuan Parameter Model

In [None]:
data = joblib.load(drive_dir + '/Eksperimen/Data/stopword.pkl')
vectorizer = TfidfVectorizer()
feature = vectorizer.fit_transform(data['Transkripsi'])
sequential_feature = construct_sequential(data["ID Ucapan"], feature.todense())
X = tf.keras.preprocessing.sequence.pad_sequences(sequential_feature, padding="post", dtype='float32')
X = X.reshape(X.shape[0], X.shape[1], X.shape[3])
label = abstraksi

In [None]:
n = 5

In [None]:
def run_tahap_4(X, label, train_index, n):
  performances = []
  for i in range(n):
    p = []
    X_train, y_train = X[train_index], label[train_index]
    loss_score, accuracy_score = train_5fold_cv(X_train, y_train, X_train.shape[1:], y.shape[1], 3)
    p.append((batch_size, loss_score, accuracy_score))
    performances.append((i, p))
  return performances

### Batch Size

In [None]:
batch_size = 10
run_tahap_4(X, label, train_index, n)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 1/10
Epoch 2/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Loss Score :  1.3565 Accuracy Score :  0.4283
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Loss Score :  1.3035 Accuracy Score :  0.4419
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Loss Score :  1.3168 Accuracy Score :  0.4366
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoc

[(0, [(10, 1.3565, 0.4283)]),
 (1, [(10, 1.3035, 0.4419)]),
 (2, [(10, 1.3168, 0.4366)]),
 (3, [(10, 1.308, 0.439)]),
 (4, [(10, 1.3317, 0.4342)])]

In [None]:
batch_size = 32
performances.append(run_tahap_4(X, label, train_index, n))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Loss Score :  1.3035 Accuracy Score :  0.4583
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Loss Score :  1.3227 Accuracy Score :  0.4448
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 1/10
Epoch 2/10
Epoch 

In [None]:
batch_size = 64
performances.append(run_tahap_4(X, label, train_index, n))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Loss Score :  1.3243 Accuracy Score :  0.4395
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Loss Score :  1.3527 Accuracy Score :  0.3961
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch

In [None]:
batch_size = 128
performances.append(run_tahap_4(X, label, train_index, n))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss Score :  1.3483 Accuracy Score :  0.436
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Ep

### Optimizer

In [None]:
batch_size = 32

In [None]:
optimizer = 'adam'
run_tahap_4(X, label, train_index, n)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Loss Score :  1.3158 Accuracy Score :  0.4513
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Loss Score :  1.3322 Accuracy Score :  0.4319
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Loss S

In [None]:
optimizer = 'rmsprop'
run_tahap_4(X, label, train_index, n)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Loss Score :  1.3064 Accuracy Score :  0.439
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Loss Score :  1.314 Accuracy Score :  0.4348
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 1/10
Epoch 2/

[(0, [(32, 1.3064, 0.439)]),
 (1, [(32, 1.314, 0.4348)]),
 (2, [(32, 1.3283, 0.4248)]),
 (3, [(32, 1.3055, 0.4624)]),
 (4, [(32, 1.3261, 0.4366)])]

In [None]:
optimizer = 'sgd'
run_tahap_4(X, label, train_index, n)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss Score :  1.4594 Accuracy Score :  0.3749
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


[(0, [(32, 1.4594, 0.3749)]),
 (1, [(32, 1.4649, 0.3749)]),
 (2, [(32, 1.4558, 0.3749)]),
 (3, [(32, 1.4599, 0.3749)]),
 (4, [(32, 1.4573, 0.3749)])]

## Tahap 5 : Penentuan Penggunaan Teknik Resampling

In [24]:
batch_size = 32
optimizer = 'adam'
callbacks = [EarlyStopping(monitor='val_accuracy', mode='max', min_delta=0.0001, patience=2, restore_best_weights=True)]

In [25]:
data = joblib.load(drive_dir + '/Eksperimen/Data/stopword.pkl')

vectorizer = TfidfVectorizer()
feature = vectorizer.fit_transform(data['Transkripsi'])
sequential_features = construct_sequential(data["ID Ucapan"], feature.todense())

X = tf.keras.preprocessing.sequence.pad_sequences(sequential_features, padding="post", dtype='float32')
X = X.reshape(2003, 15, X.shape[3])
y = abstraksi

In [36]:
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(1702, 15, 5017) (1702, 6) (301, 15, 5017) (301, 6)


In [31]:
n = 5
encoder = encoder_cakapan

In [30]:
def run_tahap_4(X_train, y_train, X_test, y_test, n, encoder):
  model = None
  best_acc = 0
  for i in range(n):
    model, accuracy_score = train_stratified_random_sampling(X_train, y_train, X_test, y_test, X_train.shape[1:], y.shape[1], 3, encoder)
    if best_acc < accuracy_score:
      best_acc = accuracy_score
      best_model = model
  return best_model

In [39]:
def run_tahap_5_cv(X_train, y_train, n):
  performances = []
  for i in range(n):
    p = []
    loss_score, accuracy_score = train_5fold_cv(X_train, y_train, X_train.shape[1:], y.shape[1], 3)
    p.append((batch_size, loss_score, accuracy_score))
    performances.append((i, p))
  return performances

### Tanpa Resampling

In [44]:
callbacks = [EarlyStopping(monitor='val_accuracy', mode='max', min_delta=0.0001, patience=3, restore_best_weights=True)]
run_tahap_5_cv(X_train, y_train, 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss Score :  1.8251 Accuracy Score :  0.4842


[(0, [(32, 1.8251, 0.4842)])]

In [37]:
best_model = run_tahap_4(X_train, y_train, X_test, y_test, n, encoder)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Loss Score :  1.4217 Accuracy Score :  0.3754
              precision    recall  f1-score   support

       Jijik     0.0000    0.0000    0.0000         1
       Marah     0.0000    0.0000    0.0000        42
       Sedih     0.0000    0.0000    0.0000        86
      Senang     0.3754    1.0000    0.5459       113
       Takut     0.0000    0.0000    0.0000         9
    Terkejut     0.0000    0.0000    0.0000        50

    accuracy                         0.3754       301
   macro avg     0.0626    0.1667    0.0910       301
weighted avg     0.1409    0.3754    0.2049       301

[[  0   0   0   1   0   0]
 [  0   0   0  42   0   0]
 [  0   0   0  86   0   0]
 [  0   0   0 113   0   0]
 [  0   0   0   9   0   0]
 [  0   0   0  50   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Loss Score :  1.5838 Accuracy Score :  0.5316
              precision    recall  f1-score   support

       Jijik     0.0000    0.0000    0.0000         1
       Marah     0.5789    0.2619    0.3607        42
       Sedih     0.4474    0.5930    0.5100        86
      Senang     0.6148    0.6637    0.6383       113
       Takut     0.0000    0.0000    0.0000         9
    Terkejut     0.5000    0.4600    0.4792        50

    accuracy                         0.5316       301
   macro avg     0.3568    0.3298    0.3314       301
weighted avg     0.5224    0.5316    0.5153       301

[[ 0  0  0  0  0  1]
 [ 0 11 18  9  0  4]
 [ 0  4 51 23  0  8]
 [ 0  1 30 75  0  7]
 [ 0  1  3  2  0  3]
 [ 0  2 12 13  0 23]]


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Loss Score :  1.4398 Accuracy Score :  0.5249
              precision    recall  f1-score   support

       Jijik     0.0000    0.0000    0.0000         1
       Marah     0.5000    0.2619    0.3438        42
       Sedih     0.4583    0.6395    0.5340        86
      Senang     0.6400    0.5664    0.6009       113
       Takut     0.0000    0.0000    0.0000         9
    Terkejut     0.4746    0.5600    0.5138        50

    accuracy                         0.5249       301
   macro avg     0.3455    0.3380    0.3321       301
weighted avg     0.5198    0.5249    0.5115       301

[[ 0  0  0  0  0  1]
 [ 0 11 18  7  0  6]
 [ 0  6 55 19  0  6]
 [ 0  0 34 64  0 15]
 [ 0  1  4  1  0  3]
 [ 0  4  9  9  0 28]]


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Loss Score :  1.3041 Accuracy Score :  0.5017
              precision    recall  f1-score   support

       Jijik     0.0000    0.0000    0.0000         1
       Marah     0.4074    0.2619    0.3188        42
       Sedih     0.4194    0.3023    0.3514        86
      Senang     0.5724    0.7699    0.6566       113
       Takut     0.0000    0.0000    0.0000         9
    Terkejut     0.4500    0.5400    0.4909        50

    accuracy                         0.5017       301
   macro avg     0.3082    0.3124    0.3030       301
weighted avg     0.4663    0.5017    0.4729       301

[[ 0  0  0  0  0  1]
 [ 0 11 14  8  0  9]
 [ 0  8 26 43  0  9]
 [ 0  2 13 87  0 11]
 [ 0  2  2  2  0  3]
 [ 0  4  7 12  0 27]]


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Loss Score :  1.7088 Accuracy Score :  0.515
              precision    recall  f1-score   support

       Jijik     0.0000    0.0000    0.0000         1
       Marah     0.5135    0.4524    0.4810        42
       Sedih     0.4397    0.5930    0.5050        86
      Senang     0.6778    0.5398    0.6010       113
       Takut     0.0000    0.0000    0.0000         9
    Terkejut     0.4138    0.4800    0.4444        50

    accuracy                         0.5150       301
   macro avg     0.3408    0.3442    0.3386       301
weighted avg     0.5205    0.5150    0.5108       301

[[ 0  0  0  0  0  1]
 [ 0 19 15  6  0  2]
 [ 0  6 51 16  0 13]
 [ 0  4 34 61  0 14]
 [ 0  2  3  0  0  4]
 [ 0  6 13  7  0 24]]


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
best_model.save(drive_dir + "/Eksperimen/Model/model_leksikal_langsung_tes.h5")

### Dengan Resampling

In [26]:
loss_score = []
accuracy_score = []

cv = KFold(n_splits=5, random_state=42, shuffle=True)
i = 0
for train_index, val_index in cv.split(X):
    
    X_train, X_val, y_train, y_val = X[train_index], X[val_index], y[train_index], y[val_index]
    try:
      X_train_reshape = X_train.reshape((X_train.shape[0], X_train.shape[1]*X_train.shape[2]))
      oversample = SMOTE(kind='regular', k_neighbors=2)
      X_train_resampling, y_train_resampling = oversample.fit_resample(X_train_reshape, y_train)
      X_train_resampling = X_train_resampling.reshape(X_train_resampling.shape[0], 15, int(X_train_resampling.shape[1]/15))

      model = create_model(X_train_resampling.shape[1:], y_train_resampling.shape[1], 3)
      model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
      model.fit(X_train_resampling, y_train_resampling,
                validation_data=(X_val, y_val),
                epochs=epochs,
                batch_size=batch_size,
                callbacks=callbacks)
    except:
      model = create_model(X_train.shape[1:], y_train.shape[1], 3)
      model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
      model.fit(X_train, y_train,
              validation_data=(X_val, y_val),
              epochs=epochs,
              batch_size=batch_size,
              callbacks=callbacks)
        
    l, a = model.evaluate(X_val, y_val)
    loss_score.append(l)
    accuracy_score.append(a)
  
loss_score = np.round(np.mean(loss_score), 4)
accuracy_score = np.round(np.mean(accuracy_score), 4)
print("Loss Score : ", loss_score, "Accuracy Score : ", accuracy_score)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [27]:
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1]*X_train.shape[2]))
oversample = SMOTE(kind='regular', k_neighbors=2)
X_train_resampling, y_train_resampling = oversample.fit_resample(X_train, y_train)
X_train_resampling = X_train_resampling.reshape(3828, 15, 5017)



In [34]:
best_model = run_tahap_4(X_train_resampling, y_train_resampling, X_test, y_test, 5, encoder)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss Score :  3.0047 Accuracy Score :  0.4884
              precision    recall  f1-score   support

       Jijik     0.0000    0.0000    0.0000         1
       Marah     0.4400    0.2619    0.3284        42
       Sedih     0.4556    0.4767    0.4659        86
      Senang     0.5917    0.6283    0.6094       113
       Takut     0.0000    0.0000    0.0000         9
    Terkejut     0.3871    0.4800    0.4286        50

    accuracy                         0.4884       301
   macro avg     0.3124    0.3078    0.3054       301
weighted avg     0.4780    0.4884    0.4789       301

[[ 0  0  0  0  0  1]
 [ 0 11 13  6  2 10]
 [ 0  8 41 27  1  9]
 [ 0  2 25 71  0 15]
 [ 0  1  2  3  0  3]
 [ 0  3  9 13  1 24]]


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Loss Score :  1.663 Accuracy Score :  0.4983
              precision    recall  f1-score   support

       Jijik     0.0000    0.0000    0.0000         1
       Marah     0.4706    0.1905    0.2712        42
       Sedih     0.4886    0.5000    0.4943        86
      Senang     0.5882    0.6195    0.6034       113
       Takut     0.0000    0.0000    0.0000         9
    Terkejut     0.3867    0.5800    0.4640        50

    accuracy                         0.4983       301
   macro avg     0.3224    0.3150    0.3055       301
weighted avg     0.4903    0.4983    0.4827       301

[[ 0  0  0  0  0  1]
 [ 0  8 11  9  0 14]
 [ 0  5 43 25  0 13]
 [ 0  2 26 70  1 14]
 [ 0  1  2  2  0  4]
 [ 0  1  6 13  1 29]]


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Loss Score :  1.396 Accuracy Score :  0.4086
              precision    recall  f1-score   support

       Jijik     0.0000    0.0000    0.0000         1
       Marah     0.3636    0.2857    0.3200        42
       Sedih     0.3452    0.3372    0.3412        86
      Senang     0.4425    0.6814    0.5366       113
       Takut     0.0000    0.0000    0.0000         9
    Terkejut     0.5000    0.1000    0.1667        50

    accuracy                         0.4086       301
   macro avg     0.2752    0.2341    0.2274       301
weighted avg     0.3986    0.4086    0.3713       301

[[ 0  0  1  0  0  0]
 [ 0 12  8 22  0  0]
 [ 0 10 29 46  0  1]
 [ 0  6 26 77  0  4]
 [ 0  1  5  3  0  0]
 [ 0  4 15 26  0  5]]


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Loss Score :  2.5346 Accuracy Score :  0.4419
              precision    recall  f1-score   support

       Jijik     0.0000    0.0000    0.0000         1
       Marah     0.4375    0.1667    0.2414        42
       Sedih     0.4051    0.3721    0.3879        86
      Senang     0.5077    0.5841    0.5432       113
       Takut     0.0000    0.0000    0.0000         9
    Terkejut     0.3733    0.5600    0.4480        50

    accuracy                         0.4419       301
   macro avg     0.2873    0.2805    0.2701       301
weighted avg     0.4294    0.4419    0.4229       301

[[ 0  0  0  0  0  1]
 [ 0  7  8 13  0 14]
 [ 0  1 32 41  0 12]
 [ 0  5 27 66  0 15]
 [ 0  0  2  2  0  5]
 [ 0  3 10  8  1 28]]


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Loss Score :  1.6604 Accuracy Score :  0.4784
              precision    recall  f1-score   support

       Jijik     0.0000    0.0000    0.0000         1
       Marah     0.4762    0.2381    0.3175        42
       Sedih     0.4302    0.4302    0.4302        86
      Senang     0.5649    0.6549    0.6066       113
       Takut     0.0000    0.0000    0.0000         9
    Terkejut     0.4035    0.4600    0.4299        50

    accuracy                         0.4784       301
   macro avg     0.3125    0.2972    0.2974       301
weighted avg     0.4685    0.4784    0.4663       301

[[ 0  0  0  0  0  1]
 [ 0 10 19  6  1  6]
 [ 0  8 37 30  2  9]
 [ 0  3 20 74  1 15]
 [ 0  0  2  4  0  3]
 [ 0  0  8 17  2 23]]


  _warn_prf(average, modifier, msg_start, len(result))


## Tahap 6 : Penyesuaian Design Eksperimen Untuk Meningkatkan Performa

### Delete Small Size Label Data

In [None]:
def get_label_idxs(list, label):
  idxs = []
  for i in range(len(list)):
    for l in label:
      if list[i] == l:
        idxs.append(i)
  return np.array(idxs)

In [None]:
dec_abstraksi = encoder_cakapan.inverse_transform(abstraksi)
del_idxs = get_label_idxs(dec_abstraksi, ["Jijik", "Takut"])
updated_features = np.delete(sequential_features, del_idxs)
updated_labels = np.delete(dec_abstraksi, del_idxs)
updated_labels = updated_labels.reshape(-1, 1)
print(updated_features.shape, updated_labels.shape)

(1936,) (1936, 1)


In [None]:
new_encoder_cakapan = OneHotEncoder(sparse=False)
updated_labels = updated_labels.reshape(-1, 1)
updated_labels = new_encoder_cakapan.fit_transform(updated_labels)
updated_labels.shape

(1936, 4)

In [None]:
X = tf.keras.preprocessing.sequence.pad_sequences(updated_features, padding="post", dtype='float32')
X = X.reshape(X.shape[0], X.shape[1], X.shape[3])
y = tf.keras.preprocessing.sequence.pad_sequences(updated_labels, padding="post", dtype='float32')
print(X.shape, y.shape)

(1936, 15, 5017) (1936, 4)


In [None]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
for train_index, test_index in sss.split(X, y):
  train_index, test_index = train_index, test_index

In [None]:
n = 5
encoder = new_encoder_cakapan

In [None]:
best_model = run_tahap_4(X[train_index], y[train_index], X[test_index], y[test_index], n, new_encoder_cakapan)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Loss Score :  1.8195 Accuracy Score :  0.4777
              precision    recall  f1-score   support

       Marah     0.3793    0.2619    0.3099        42
       Sedih     0.3735    0.3605    0.3669        86
      Senang     0.5541    0.7257    0.6284       113
    Terkejut     0.4839    0.3000    0.3704        50

    accuracy                         0.4777       291
   macro avg     0.4477    0.4120    0.4189       291
weighted avg     0.4634    0.4777    0.4608       291

[[11 17 10  4]
 [10 31 41  4]
 [ 2 21 82  8]
 [ 6 14 15 15]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Loss Score :  1.1788 Accuracy Score :  0.4777
              precision    recall  f1-score   support

       Marah     0.8000    0.0952    0.1702        42
       Sedih     0.4828    0.1628    0.2435        86
      Senang     0.4783    0.8761    0.6188       113
    Terkejut     0.4400    0.4400    0.4400        5

In [None]:
best_model.save(drive_dir + "/Eksperimen/Model/model_leksikal_langsung_4.h5")

In [None]:
model_leksikal = keras.models.load_model(drive_dir + "/Eksperimen/Model/model_leksikal_langsung_4.h5")
model_leksikal.evaluate(X[test_index], y[test_index])



[1.110121726989746, 0.5498281717300415]

In [None]:
pred = model_leksikal.predict(X[test_index])
y_pred = new_encoder_cakapan.inverse_transform(pred)
y_true = new_encoder_cakapan.inverse_transform(y[test_index])

print(classification_report(y_true, y_pred, digits=4))
print(confusion_matrix(y_true, y_pred))

              precision    recall  f1-score   support

       Marah     0.5526    0.5000    0.5250        42
       Sedih     0.4375    0.4070    0.4217        86
      Senang     0.6202    0.7080    0.6612       113
    Terkejut     0.5455    0.4800    0.5106        50

    accuracy                         0.5498       291
   macro avg     0.5389    0.5237    0.5296       291
weighted avg     0.5436    0.5498    0.5449       291

[[21 13  5  3]
 [ 8 35 34  9]
 [ 3 22 80  8]
 [ 6 10 10 24]]
