In [77]:
import pandas as pd
import re

# RNN

# Prepare Dataset

In [78]:
df_train = pd.read_csv("train.csv")
df_valid = pd.read_csv("valid.csv")
df_test = pd.read_csv("test.csv")
df =  pd.concat([df_train, df_valid, df_test], ignore_index=True)
df

Unnamed: 0,id,text,label
0,219,Nikmati cicilan 0% hingga 12 bulan untuk pemes...,neutral
1,209,Kue-kue yang disajikan bikin saya bernostalgia...,positive
2,436,Ibu pernah bekerja di grab indonesia,neutral
3,394,Paling suka banget makan siang di sini ayam sa...,positive
4,592,Pelayanan bus DAMRI sangat baik,positive
...,...,...,...
995,502,Saya sudah sering kali datang menikmati makana...,positive
996,268,Banyak orang yang kurang suka untuk berobat ke...,negative
997,282,"Pelayanan baik, tempat parkir cukup luas, kebu...",positive
998,407,"Demi apa pun tes cpns bikin macet, sialannnnnnn",negative


In [79]:
df.label.value_counts()

label
negative    383
positive    378
neutral     239
Name: count, dtype: int64

# Text Normalization

In [80]:
def cleansing(sent):
    string = sent.lower()
    string = re.sub(r'[^a-zA-Z0-9]', ' ', string)
    return string

In [81]:
df['text_clean'] = df.text.apply(cleansing)
df.head()

Unnamed: 0,id,text,label,text_clean
0,219,Nikmati cicilan 0% hingga 12 bulan untuk pemes...,neutral,nikmati cicilan 0 hingga 12 bulan untuk pemes...
1,209,Kue-kue yang disajikan bikin saya bernostalgia...,positive,kue kue yang disajikan bikin saya bernostalgia...
2,436,Ibu pernah bekerja di grab indonesia,neutral,ibu pernah bekerja di grab indonesia
3,394,Paling suka banget makan siang di sini ayam sa...,positive,paling suka banget makan siang di sini ayam sa...
4,592,Pelayanan bus DAMRI sangat baik,positive,pelayanan bus damri sangat baik


In [82]:
neg = df.loc[df.label == 'negative'].text_clean.tolist()
neu = df.loc[df.label == 'neutral'].text_clean.tolist()
pos = df.loc[df.label == 'positive'].text_clean.tolist()

neg_label = df.loc[df.label == 'negative'].label.tolist()
neu_label = df.loc[df.label == 'neutral'].label.tolist()
pos_label = df.loc[df.label == 'positive'].label.tolist()

total_data = pos + neu + neg
labels = pos_label + neu_label + neg_label

print("Pos: %s, Neu: %s, Neg: %s" % (len(pos), len(neu), len(neg)))
print("Total data: %s" % len(total_data))

Pos: 378, Neu: 239, Neg: 383
Total data: 1000


# Feature Extraction

In [83]:
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import defaultdict


In [84]:
max_features = 100000
tokenizer = Tokenizer(num_words=max_features, split=' ', lower=True)
tokenizer.fit_on_texts(total_data)

with open('tokenizer.pickle','wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("tokenizer.pickle has created!")

tokenizer.pickle has created!


In [85]:
X = tokenizer.texts_to_sequences(total_data)

vocab_size = len(tokenizer.word_index)
maxlen = max(len(x) for x in X)

In [86]:
X = pad_sequences(X)

with open('x_pad_sequences.pickle','wb') as handle:
    pickle.dump(X, handle, protocol = pickle.HIGHEST_PROTOCOL)
    print("x_pad_sequences.pickle has created!")

X

x_pad_sequences.pickle has created!


array([[   0,    0,    0, ...,   49,   14,  104],
       [   0,    0,    0, ...,    7,   12,  564],
       [   0,    0,    0, ..., 1876,   16,  103],
       ...,
       [   0,    0,    0, ...,  918,   14, 1769],
       [   0,    0,    0, ...,  111,  185, 4257],
       [   0,    0,    0, ..., 4261,  120,  435]])

In [87]:
Y = pd.get_dummies(labels)
Y = Y.values

with open('y_labels.pickle', 'wb') as handle:
    pickle.dump(Y, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("y_labels.pickle has created!")

y_labels.pickle has created!


In [88]:
df_array = pd.DataFrame(data=X)
print(df_array)

     0   1   2   3   4   5   6   7    8   9   ...    67    68    69    70  \
0     0   0   0   0   0   0   0   0    0   0  ...    17   919   389    32   
1     0   0   0   0   0   0   0   0    0   0  ...  1874     9  1875    41   
2     0   0   0   0   0   0   0   0    0   0  ...     0     0     0     0   
3     0   0   0   0   0   0   0   0    0   0  ...    20   755  1229     2   
4    77  28  12  20  79  80  67   3  756  67  ...   352  1231   220    97   
..   ..  ..  ..  ..  ..  ..  ..  ..  ...  ..  ...   ...   ...   ...   ...   
995   0   0   0   0   0   0   0   0    0   0  ...     3   271    48   117   
996   0   0   0   0   0   0   0   0    0   0  ...  1171     2   385    91   
997   0   0   0   0   0   0   0   0    0   0  ...   101  4253  4254  4255   
998   0   0   0   0   0   0   0   0    0   0  ...     0     0   633    98   
999   0   0   0   0   0   0   0   0    0   0  ...    50   778    71    65   

       71    72    73    74    75    76  
0    1226     7     3    49    14

# Split traning and testing dataset

In [89]:
from sklearn.model_selection import train_test_split

In [90]:
file = open("x_pad_sequences.pickle", "rb")
X = pickle.load(file)
file.close()

In [91]:
file = open("y_labels.pickle", "rb")
Y = pickle.load(file)
file.close()

In [92]:
# split dataset to 80% train and 20% test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 1)

# Model Training

In [93]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, SimpleRNN, Activation
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.layers import Flatten
from tensorflow.keras import backend as K

In [94]:
embed_dim = 100
units = 64

max_features = 100000

In [95]:
model = Sequential()
model.add(Embedding(max_features, embed_dim))
model.add(SimpleRNN(units, dropout=0.2))
model.add(layers.Dense(3, activation='softmax'))
sgd = optimizers.Adam(learning_rate= 0.001)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary)

adam = optimizers.Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

<bound method Model.summary of <Sequential name=sequential_17, built=False>>


In [96]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
history = model.fit(X_train, y_train, epochs=10, batch_size=10, validation_data = (X_test, y_test), verbose=1, callbacks=[es])

Epoch 1/10
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 61ms/step - accuracy: 0.4599 - loss: 1.0208 - val_accuracy: 0.5850 - val_loss: 0.8990
Epoch 2/10
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 59ms/step - accuracy: 0.8512 - loss: 0.5620 - val_accuracy: 0.6750 - val_loss: 0.7655
Epoch 3/10
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 58ms/step - accuracy: 0.9791 - loss: 0.1511 - val_accuracy: 0.6000 - val_loss: 0.8792
Epoch 3: early stopping


In [97]:
model.save("API/model_of_rnn/model_rnn.keras")
print("Model has created!")

Model has created!


# Model Evaluation

In [98]:
from sklearn import metrics

In [99]:
predictions = model.predict(X_test)
y_pred = predictions
matrix_test = metrics.classification_report(y_test.argmax(axis=1), y_pred.argmax(axis=1))
print("Testing selesai")
print(matrix_test)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Testing selesai
              precision    recall  f1-score   support

           0       0.52      0.47      0.49        73
           1       0.47      0.48      0.48        54
           2       0.76      0.82      0.79        73

    accuracy                           0.60       200
   macro avg       0.58      0.59      0.59       200
weighted avg       0.59      0.60      0.60       200



In [100]:
from numpy import arange
 
# Load the training and validation loss dictionaries
train_loss = load(open('train_loss.pkl', 'rb'))
val_loss = load(open('val_loss.pkl', 'rb'))
 
# Retrieve each dictionary's values
train_values = train_loss.values()
val_values = val_loss.values()
 
# Generate a sequence of integers to represent the epoch numbers
epochs = range(1, 21)
 
# Plot and label the training and validation loss values
plt.plot(epochs, train_values, label='Training Loss')
plt.plot(epochs, val_values, label='Validation Loss')
 
# Add in a title and axes labels
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
 
# Set the tick locations
plt.xticks(arange(0, 21, 2))
 
# Display the plot
plt.legend(loc='best')
plt.show()

NameError: name 'load' is not defined

## K Cross Validation

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold

In [None]:
kf = KFold(n_splits=5, random_state=42, shuffle=True)

accuracies = []
embed_dim = 100
units = 64

In [None]:
# iterate sebanyak k cross validation
for iteration, data in enumerate(kf.split(X_train), start=1):

    # get data and target train
    data_train = X_train[data[0]]
    target_train = y_train[data[0]]

    # get data and target test
    data_test =  X_train[data[1]]
    target_test =  y_train[data[1]]

    # model training menggunakan data train
    model = Sequential()
    model.add(Embedding(max_features, embed_dim))
    model.add(SimpleRNN(units, dropout=0.2))
    model.add(layers.Dense(3, activation='softmax'))
    sgd = optimizers.Adam(learning_rate= 0.001)
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary)
    
    adam = optimizers.Adam(learning_rate=0.001)
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
    history = model.fit(data_train, target_train, epochs=10, batch_size=10, validation_data = (data_test, target_test), verbose=1, callbacks=[es])

    # prediksi data test
    predictions = model.predict(data_test)
    y_pred = predictions

    # menghitung accuracy
    accuracy = accuracy_score(target_test.argmax(axis=1), y_pred.argmax(axis=1))

    print("Training ke-", iteration)
    print(classification_report(target_test.argmax(axis=1), y_pred.argmax(axis=1)))
    print("=================================================================")

    accuracies.append(accuracy)
    

In [None]:
average_accuracy = np.mean(accuracies)

print("Rata-rata accuracy: ", average_accuracy)

## Evaluation Visualization

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12,5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title("Training and validation accuracy")
    plt.legend()

    
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title("Training and validation loss")
    plt.legend()

%matplotlib inline
plot_history(history)

# Prediksi data baru

In [None]:
import re
from keras.models import load_model

In [None]:
input_text = """Pelayanan baik, tempat parkir cukup luas"""

In [None]:
def cleansing(sent):
    string = sent.lower()
    string = re.sub(r'[^a-zA-Z0-9]', ' ', string)
    return string

In [None]:
sentiment = ['negative','neutral','positive']

text = [cleansing(input_text)]
predicted = tokenizer.texts_to_sequences(text)
guess = pad_sequences(predicted, maxlen=X.shape[1])

model = load_model("model_rnn.keras")
prediction = model.predict(guess)
polarity = np.argmax(prediction[0])

print("Text: ", text[0])
print("Sentiment: ", sentiment[polarity])

In [None]:
file = open("API/resources_of_rnn/x_pad_sequences.pickle",'rb')
feature_file_from_rnn = pickle.load(file)
file = open("API/resources_of_rnn/tokenizer.pickle",'rb')
tokenizer_from_rnn = pickle.load(file)
file.close()

model_file_from_rnn = load_model('API/model_of_rnn/model_rnn.keras')

original_text = "Pelayanan baik, tempat parkir cukup luas"
text = [cleansing(original_text)]

feature = tokenizer_from_rnn.texts_to_sequences(text)
feature = pad_sequences(feature, maxlen=feature_file_from_rnn.shape[1])

prediction = model_file_from_rnn.predict(feature)
get_sentiment = sentiment[np.argmax(prediction[0])]

get_sentiment