# Feeling Detection using Deep Learning NLP

### Import Library

In [None]:
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [None]:
!pip install pandas nltk tensorflow scikit-learn keras-tuner



In [None]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import json
from sklearn.model_selection import train_test_split
import tensorflow as tf
import keras_tuner as kt

## Load Dataset

In [None]:
# Read the labelled corpus from the CSV file
dataset = pd.read_csv('dataset-feeling.csv')

# Show which columns were loaded
print("Columns in the dataset:", dataset.columns)

# Fail fast unless both the text column and the label column are present
required_columns = ('Teks', 'Label')
if not all(column in dataset.columns for column in required_columns):
    raise ValueError("The dataset must contain 'Teks' and 'Label' columns.")

Columns in the dataset: Index(['Teks', 'Label'], dtype='object')


## Data Preprocessing

In [None]:
# Fetch the NLTK corpora used below: WordNet (lemmatizer) and the stop-word lists
for corpus_name in ('wordnet', 'stopwords'):
    nltk.download(corpus_name)

# Indonesian stop-word list, stored as a set for fast membership tests
indonesian_stopwords = stopwords.words('indonesian')
stop_words_id = set(indonesian_stopwords)

# Clean the texts
def clean_text(text):
    """Normalize raw text before tokenization.

    Lower-cases the text, strips digits and punctuation, drops words of one
    or two characters, and collapses runs of whitespace.

    Short words are removed *after* digits and punctuation have been
    stripped, so fragments produced by those steps (e.g. "ab12" -> "ab")
    are dropped as well.

    Args:
        text: The raw text. ``None`` and float NaN (a missing cell in a
            pandas column) are treated as empty text; any other non-string
            value is converted with ``str``.

    Returns:
        The cleaned text with words separated by single spaces (possibly
        an empty string).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = re.sub(r'\d+', '', text)  # remove digits
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\b\w{1,2}\b', '', text)  # remove short words
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces
    return text

# Single lemmatizer instance shared by every call to preprocess_text
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Clean ``text``, drop Indonesian stop words and lemmatize what remains.

    Returns the surviving words joined by single spaces.
    """
    kept_words = []
    for token in clean_text(text).split():
        if token in stop_words_id:
            continue
        kept_words.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_words)

# Overwrite the text column with its preprocessed form
dataset['Teks'] = dataset['Teks'].apply(preprocess_text)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Tokenization, Padding, and Embeddings

In [None]:
# Pull the preprocessed texts and their labels out of the DataFrame
texts = dataset['Teks'].tolist()
labels = dataset['Label'].tolist()

# Fit the tokenizer (num_words=5000) on the texts and encode each text as a sequence of word indices
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)

# Persist the fitted tokenizer to tokenizer.json
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w') as f:
    json.dump(tokenizer_json, f)

# Pad every sequence to a fixed length and turn the labels into a NumPy array
maxlen = 50
data = pad_sequences(sequences, maxlen=maxlen)
labels = np.array(labels)

# Load GloVe embeddings
def load_glove_embeddings(path):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each non-blank line is expected to hold a word followed by its
    whitespace-separated vector components. Blank lines are skipped.

    Args:
        path: Path to the UTF-8 encoded embeddings file.

    Returns:
        dict mapping each word to a 1-D ``float32`` NumPy array.
    """
    embeddings_index = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a stray trailing newline): nothing to parse.
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Download GloVe embeddings if not already done
!wget --no-check-certificate http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip -d glove

glove_path = 'glove/glove.6B.50d.txt'
embeddings_index = load_glove_embeddings(glove_path)

embedding_dim = 50
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

--2024-06-17 13:56:36--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-06-17 13:56:37--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-06-17 13:56:37--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

## Split Dataset

In [None]:
# Split the data into training, validation, and test sets.
# Both splits are stratified on the labels so that every subset keeps the
# class proportions of the data it was drawn from.
X_temp, X_test, y_temp, y_test = train_test_split(
    data, labels, test_size=0.1, random_state=42, stratify=labels)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.2, random_state=42, stratify=y_temp)

## Model Building

In [None]:
# Three-class classifier: frozen pretrained embeddings -> stacked LSTMs -> max-pooled dense head
sequence_encoder = [
    tf.keras.layers.Embedding(
        len(word_index) + 1,
        embedding_dim,
        input_length=maxlen,
        weights=[embedding_matrix],
        trainable=False,  # keep the pretrained vectors fixed
    ),
    tf.keras.layers.SpatialDropout1D(0.2),
    # All three LSTM layers return full sequences so the pooling layer sees every time step
    tf.keras.layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(32, return_sequences=True),
    tf.keras.layers.GlobalMaxPooling1D(),
]
classifier_head = [
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(3, activation='softmax'),  # one softmax output per class
]

model = tf.keras.Sequential(sequence_encoder + classifier_head)

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 50, 50)            128900    
                                                                 
 spatial_dropout1d_2 (Spati  (None, 50, 50)            0         
 alDropout1D)                                                    
                                                                 
 lstm_4 (LSTM)               (None, 50, 64)            29440     
                                                                 
 lstm_5 (LSTM)               (None, 50, 64)            33024     
                                                                 
 lstm_6 (LSTM)               (None, 50, 32)            12416     
                                                                 
 global_max_pooling1d_2 (Gl  (None, 32)                0         
 obalMaxPooling1D)                                    

## Compile Model

In [None]:
# Adam optimizer with sparse categorical cross-entropy loss; accuracy is tracked as the metric
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

## Train Model

In [None]:
# Custom callback that stops training once training accuracy exceeds 85% and validation accuracy exceeds 80%
class StopEpoch(tf.keras.callbacks.Callback):
    """Stop training as soon as both accuracy targets are exceeded.

    Args:
        accuracy_target: Training accuracy (0-1) that must be exceeded.
        val_accuracy_target: Validation accuracy (0-1) that must be exceeded.
    """

    def __init__(self, accuracy_target=0.85, val_accuracy_target=0.80):
        super().__init__()
        self.accuracy_target = accuracy_target
        self.val_accuracy_target = val_accuracy_target

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's metrics and request a stop when both targets are met."""
        logs = logs or {}  # the signature allows logs=None
        accuracy = logs.get('accuracy')
        val_accuracy = logs.get('val_accuracy')
        if accuracy is None or val_accuracy is None:
            # One of the metrics was not reported for this epoch; nothing to compare.
            return
        if accuracy > self.accuracy_target and val_accuracy > self.val_accuracy_target:
            print(f"\nStopping training as accuracy ({accuracy*100:.2f}%) and validation accuracy ({val_accuracy*100:.2f}%) have reached target")
            self.model.stop_training = True

# Create the accuracy-threshold stopping callback
stop_epoch = StopEpoch()

# Fit on the training split while validating on the validation split;
# the callback can end the run before the 500-epoch cap is reached
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=500,
    batch_size=512,
    callbacks=[stop_epoch],
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

## Evaluate Model

In [None]:
# Measure loss and accuracy of the baseline model on the held-out test split
evaluation = model.evaluate(X_test, y_test)
test_loss, test_accuracy = evaluation
print(f'Test Accuracy: {test_accuracy*100:.2f}%')

Test Accuracy: 78.20%


## Tuning

In [None]:
def build_model(hp):
    """Build and compile one candidate model for the hyperparameter search.

    Args:
        hp: Hyperparameter container supplied by the tuner. It is used to
            choose the number of LSTM layers (1-3), each layer's units,
            dropout and recurrent dropout, and the dense layer's units and
            dropout rate.

    Returns:
        A compiled ``tf.keras.Sequential`` model ending in a 3-unit softmax layer.
    """
    layers = tf.keras.layers

    candidate = tf.keras.Sequential()
    candidate.add(layers.Embedding(
        len(word_index) + 1,
        embedding_dim,
        input_length=maxlen,
        weights=[embedding_matrix],
        trainable=False,
    ))
    candidate.add(layers.SpatialDropout1D(0.2))

    # Variable-depth LSTM stack; every layer returns sequences for the pooling layer
    lstm_count = hp.Int('num_layers', 1, 3)
    for layer_idx in range(lstm_count):
        lstm_units = hp.Int('units_' + str(layer_idx), min_value=64, max_value=256, step=64)
        lstm_dropout = hp.Float('dropout_' + str(layer_idx), 0.1, 0.5, step=0.1)
        lstm_recurrent_dropout = hp.Float('recurrent_dropout_' + str(layer_idx), 0.1, 0.5, step=0.1)
        candidate.add(layers.LSTM(
            units=lstm_units,
            dropout=lstm_dropout,
            recurrent_dropout=lstm_recurrent_dropout,
            return_sequences=True,
        ))

    candidate.add(layers.GlobalMaxPooling1D())

    dense_units = hp.Int('dense_units', 64, 256, step=64)
    candidate.add(layers.Dense(dense_units, activation='relu'))

    dense_dropout = hp.Float('dropout_dense', 0.1, 0.5, step=0.1)
    candidate.add(layers.Dropout(dense_dropout))

    candidate.add(layers.Dense(3, activation='softmax'))

    candidate.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return candidate

# Bayesian hyperparameter search that maximises validation accuracy.
# A fixed seed makes the sequence of sampled hyperparameters repeatable.
# Trial results are stored under my_dir/emotion_detection_bayesian and are
# reloaded when the tuner is created again with the same directory/project.
tuner = kt.BayesianOptimization(
    build_model,
    objective='val_accuracy',
    max_trials=3,
    seed=42,
    directory='my_dir',
    project_name='emotion_detection_bayesian'
)

tuner.search(X_train, y_train, epochs=15, validation_data=(X_val, y_val))
best_model = tuner.get_best_models(num_models=1)[0]
best_model.summary()

# Continue training the best model found by the search on the training split,
# still validating on the validation split (the validation data is not merged
# into the training data here).
best_model.fit(X_train, y_train, epochs=25, validation_data=(X_val, y_val))

# Save the best model
best_model.save('feeling_detection_model_2.h5')

Reloading Tuner from my_dir/emotion_detection_bayesian/tuner0.json
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 50)            128900    
                                                                 
 spatial_dropout1d (Spatial  (None, 50, 50)            0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 50, 192)           186624    
                                                                 
 global_max_pooling1d (Glob  (None, 192)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 128)               24704     
                                                       

In [None]:
# Measure loss and accuracy of the tuned model on the held-out test split
evaluation = best_model.evaluate(X_test, y_test)
test_loss, test_accuracy = evaluation
print(f'Test Accuracy: {test_accuracy*100:.2f}%')

Test Accuracy: 78.95%


### Download Model ke Lokal

Download Model h5 Non-Tuning

In [None]:
# Save the baseline model (the one trained without hyperparameter tuning) to an .h5 file
model.save('feeling_detection_model_notuning_2.h5')

In [None]:
from google.colab import files

# Download the file directly to the local computer
files.download('feeling_detection_model_notuning_2.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download Model h5 Tuning

In [None]:
from google.colab import files

# Download the file directly to the local computer
files.download('feeling_detection_model_2.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Save Tokenizer

In [None]:
# The best model is already saved after tuning
# Write the fitted tokenizer to tokenizer.json
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w') as f:
    json.dump(tokenizer_json, f)

## Deployment on Colab

In [None]:
# Load the model
model = tf.keras.models.load_model('feeling_detection_model_notuning_2.h5')  # this can be swapped for the tuned or the non-tuned .h5 file

def preprocess_text_for_prediction(text, tokenizer, maxlen):
    """Turn one raw text into a padded index sequence for the model.

    Args:
        text: Raw input text.
        tokenizer: Fitted tokenizer used to map words to indices.
        maxlen: Length every sequence is padded to.

    Returns:
        The padded sequence for the single input text, as returned by ``pad_sequences``.
    """
    cleaned = preprocess_text(text)
    encoded = tokenizer.texts_to_sequences([cleaned])
    return pad_sequences(encoded, maxlen=maxlen)

import ipywidgets as widgets
from IPython.display import display

def get_user_input_and_predict(model, tokenizer, maxlen):
    """Display a text box and a button that classifies the entered text.

    Clicking the button preprocesses the text, runs the model on it and
    prints the input together with the predicted emotion. When the text is
    empty or contains no word known to the tokenizer, no prediction is made
    and a notice is printed instead.

    Args:
        model: Trained model whose ``predict`` returns one score per class.
        tokenizer: Fitted tokenizer used to encode the text.
        maxlen: Length every input sequence is padded to.
    """
    # Label names indexed by predicted class id.
    # NOTE(review): assumes the dataset encodes labels as 0=Bahagia, 1=Sedih,
    # 2=Marah — confirm against the 'Label' column.
    emotion_labels = ["Bahagia", "Sedih", "Marah"]

    text_box = widgets.Textarea(
        value='',
        placeholder='Masukkan teks di sini...',
        description='Input:',
        disabled=False
    )
    submit_button = widgets.Button(description="Prediksi")
    output = widgets.Output()

    def on_button_click(b):
        with output:
            output.clear_output()
            input_text = text_box.value
            preprocessed_text = preprocess_text_for_prediction(input_text, tokenizer, maxlen)
            if not preprocessed_text.any():
                # Every position is padding (index 0): the input was empty or
                # held no word the tokenizer knows, so a prediction would not
                # reflect the text at all.
                print(f'Teks: "{input_text}"')
                print('Prediksi: - (teks kosong atau tidak mengandung kata yang dikenali)')
                return
            prediction = model.predict(preprocessed_text)
            predicted_label = np.argmax(prediction, axis=1)[0]
            result = emotion_labels[predicted_label]
            print(f'Teks: "{input_text}"')
            print(f'Prediksi: {result}')

    submit_button.on_click(on_button_click)

    display(text_box, submit_button, output)

get_user_input_and_predict(model, tokenizer, maxlen)

Textarea(value='', description='Input:', placeholder='Masukkan teks di sini...')

Button(description='Prediksi', style=ButtonStyle())

Output()