###Import Library

In [3]:
import pandas as pd
import numpy as np
import re
import csv
import string
import requests
from io import StringIO

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Conv1D, MaxPooling1D
from tensorflow.keras.utils import to_categorical

from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

###Loading Dataset

In [4]:
df = pd.read_csv('ulasan_aplikasi.csv')
df.shape

(247500, 1)

In [5]:
df.head()

Unnamed: 0,Review
0,saya beralih dari myBCA ke aplikasi BCA mobile...
1,aplikasinya oke bgt kok nggak ada komplain apa...
2,Aplikasi kenapa gak bisa di pakai padahal suda...
3,"u/ versi 4.62 Di menu transfer sesama Bank, un..."
4,"tolong segera diperbaiki ya, Bank BCA, agar ti..."


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247500 entries, 0 to 247499
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Review  247499 non-null  object
dtypes: object(1)
memory usage: 1.9+ MB


In [7]:
df.isna().sum()

Unnamed: 0,0
Review,1


In [8]:
df.duplicated().sum()

91437

In [9]:
clean_df = df.dropna()

In [10]:
clean_df = clean_df.drop_duplicates()

In [11]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 156062 entries, 0 to 247499
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Review  156062 non-null  object
dtypes: object(1)
memory usage: 2.4+ MB


###Preprocessing Text

In [12]:
def cleaningText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    text = re.sub(r'#[A-Za-z0-9]+', '', text)
    text = re.sub(r'RT[\s]', '', text)
    text = re.sub(r"http\S+", '', text)
    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    text = text.replace('\n', ' ')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip(' ')
    return text

def casefoldingText(text):
    return text.lower()

def tokenizingText(text):
    return word_tokenize(text)

def filteringText(text):
    listStopwords = set(stopwords.words('indonesian') + stopwords.words('english'))
    listStopwords.update(['iya','yaa','gak','nya','na','sih','ku',"di","ga","ya","gaa","loh","kah","woi","woii","woy"])
    filtered = [txt for txt in text if txt not in listStopwords]
    return filtered

def toSentence(list_words):
    return ' '.join(list_words)

In [13]:
slangwords = {"@": "di", "abis": "habis", "wtb": "beli", "masi": "masih", "wts": "jual", "wtt": "tukar", "bgt": "banget", "maks": "maksimal"}

def fix_slangwords(text):
    words = text.split()
    fixed_words = [slangwords.get(word.lower(), word) for word in words]
    return ' '.join(fixed_words)

In [14]:
clean_df['text_clean'] = clean_df['Review'].apply(cleaningText)
clean_df['text_casefolding'] = clean_df['text_clean'].apply(casefoldingText)
clean_df['text_slangwords'] = clean_df['text_casefolding'].apply(fix_slangwords)
clean_df['text_tokenizing'] = clean_df['text_slangwords'].apply(tokenizingText)
clean_df['text_stopword'] = clean_df['text_tokenizing'].apply(filteringText)
clean_df['text_akhir'] = clean_df['text_stopword'].apply(toSentence)

###Pelabelan

In [15]:
lexicon_positive = {}
response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_positive.csv')
if response.status_code == 200:
    reader = csv.reader(StringIO(response.text), delimiter=',')
    for row in reader:
        lexicon_positive[row[0]] = int(row[1])

lexicon_negative = {}
response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_negative.csv')
if response.status_code == 200:
    reader = csv.reader(StringIO(response.text), delimiter=',')
    for row in reader:
        lexicon_negative[row[0]] = int(row[1])

In [39]:
def sentiment_analysis_lexicon_indonesia(text):
    score = 0

    for word in text:
        if (word in lexicon_positive):
            score = score + lexicon_positive[word]

    for word in text:
        if (word in lexicon_negative):
            score = score + lexicon_negative[word]

    polarity=''

    if (score >= 0):
        polarity = 'positive'
    elif (score < 0):
        polarity = 'negative'
    else :
        polarity = 'neutral'

    return score, polarity

In [40]:
results = clean_df['text_stopword'].apply(sentiment_analysis_lexicon_indonesia)
results = list(zip(*results))
clean_df['polarity_score'] = results[0]
clean_df['polarity'] = results[1]

print(clean_df['polarity'].value_counts())

polarity
positive    96704
negative    59358
Name: count, dtype: int64


In [41]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(clean_df['text_akhir'])

X_seq = tokenizer.texts_to_sequences(clean_df['text_akhir'])
X_pad = pad_sequences(X_seq, maxlen=200)

In [42]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(clean_df['polarity'])
y_categorical = to_categorical(y_encoded)

###Modeling

####LSTM + Tokenizer

In [20]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_pad, y_categorical, test_size=0.2, random_state=42)

model1 = Sequential()
model1.add(Embedding(input_dim=10000, output_dim=128, input_shape=(200,)))
model1.add(LSTM(128))
model1.add(Dropout(0.5))
model1.add(Dense(2, activation='softmax'))

model1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model1.fit(X_train1, y_train1, epochs=10, batch_size=64, validation_data=(X_test1, y_test1))

loss1_train, acc1_train = model1.evaluate(X_train1, y_train1, verbose=0)
loss1_test, acc1_test = model1.evaluate(X_test1, y_test1, verbose=0)
print(f"Akurasi Training: {acc1_train*100:.2f}%")
print(f"Akurasi Testing: {acc1_test*100:.2f}%")

  super().__init__(**kwargs)


Epoch 1/10
[1m1951/1951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m980s[0m 501ms/step - accuracy: 0.8927 - loss: 0.2430 - val_accuracy: 0.9690 - val_loss: 0.0859
Epoch 2/10
[1m1951/1951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1000s[0m 513ms/step - accuracy: 0.9767 - loss: 0.0658 - val_accuracy: 0.9725 - val_loss: 0.0753
Epoch 3/10
[1m1951/1951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m992s[0m 509ms/step - accuracy: 0.9855 - loss: 0.0433 - val_accuracy: 0.9749 - val_loss: 0.0777
Epoch 4/10
[1m1951/1951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1042s[0m 509ms/step - accuracy: 0.9892 - loss: 0.0329 - val_accuracy: 0.9724 - val_loss: 0.0854
Epoch 5/10
[1m1951/1951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1039s[0m 507ms/step - accuracy: 0.9917 - loss: 0.0260 - val_accuracy: 0.9770 - val_loss: 0.0900
Epoch 6/10
[1m1951/1951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m996s[0m 511ms/step - accuracy: 0.9937 - loss: 0.0199 - val_accuracy: 0.9740 - val_lo

####TF-IDF + SVM

In [21]:
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(clean_df['text_akhir'])

X_train2, X_test2, y_train2, y_test2 = train_test_split(X_tfidf, y_encoded, test_size=0.2, random_state=42)

model2 = SVC(kernel='linear', C=1, probability=True)
model2.fit(X_train2, y_train2)

train_accuracy2 = model2.score(X_train2, y_train2)
test_accuracy2 = model2.score(X_test2, y_test2)

print(f"Akurasi Training: {train_accuracy2*100:.2f}%")
print(f"Akurasi Testing: {test_accuracy2*100:.2f}%")

Akurasi Training: 97.95%
Akurasi Testing: 97.30%


####CNN + LSTM + Word2Vec

In [22]:
sentences = [text.split() for text in clean_df['text_akhir']]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1)

embedding_matrix = np.zeros((10000, 100))
for word, i in tokenizer.word_index.items():
    if i >= 10000:
        continue
    try:
        embedding_vector = w2v_model.wv[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        pass

In [23]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(X_pad, y_categorical, test_size=0.2, random_state=42)

model3 = Sequential()
model3.add(Embedding(input_dim=10000, output_dim=100, weights=[embedding_matrix], input_length=200, trainable=False))
model3.add(Conv1D(128, 5, activation='relu'))
model3.add(MaxPooling1D(pool_size=2))
model3.add(LSTM(64))
model3.add(Dropout(0.5))
model3.add(Dense(2, activation='softmax'))

model3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model3.fit(X_train3, y_train3, epochs=10, batch_size=64, validation_data=(X_test3, y_test3))

loss3_train, acc3_train = model3.evaluate(X_train3, y_train3, verbose=0)
loss3_test, acc3_test = model3.evaluate(X_test3, y_test3, verbose=0)
print(f"Akurasi Training: {acc3_train*100:.2f}%")
print(f"Akurasi Testing: {acc3_test*100:.2f}%")



Epoch 1/10
[1m1951/1951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m441s[0m 224ms/step - accuracy: 0.8179 - loss: 0.3957 - val_accuracy: 0.8817 - val_loss: 0.2716
Epoch 2/10
[1m1951/1951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m452s[0m 230ms/step - accuracy: 0.8883 - loss: 0.2630 - val_accuracy: 0.8974 - val_loss: 0.2425
Epoch 3/10
[1m1951/1951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m492s[0m 224ms/step - accuracy: 0.9029 - loss: 0.2331 - val_accuracy: 0.9052 - val_loss: 0.2288
Epoch 4/10
[1m1951/1951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m448s[0m 230ms/step - accuracy: 0.9144 - loss: 0.2112 - val_accuracy: 0.9025 - val_loss: 0.2281
Epoch 5/10
[1m1951/1951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m501s[0m 229ms/step - accuracy: 0.9220 - loss: 0.1950 - val_accuracy: 0.9119 - val_loss: 0.2154
Epoch 6/10
[1m1951/1951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m448s[0m 229ms/step - accuracy: 0.9287 - loss: 0.1802 - val_accuracy: 0.9096 - val_loss:

####Inference Testing

#####Inference Testing Model 1 (LSTM + Tokenizer)

In [34]:
sample_text = ["aplikasinya bagus dan mudah digunakan"]

sample_seq = tokenizer.texts_to_sequences(sample_text)
sample_pad = pad_sequences(sample_seq, maxlen=200)

prediction = model1.predict(sample_pad)
predicted_class = np.argmax(prediction)

reverse_label_mapping = {0: 'negative', 1: 'positive'}

print(f"Kalimat Uji: {sample_text[0]}")
print(f"Prediksi Sentimen: {reverse_label_mapping[predicted_class]}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
Kalimat Uji: aplikasinya bagus dan mudah digunakan
Prediksi Sentimen: positive


#####Inference Testing Model 2 (TF-IDF + SVM)

In [36]:
sample_text = ["aplikasi sangat jelek dan sering error"]

sample_tfidf = tfidf.transform(sample_text)
prediction = model2.predict(sample_tfidf)

reverse_label_mapping = {0: 'negative', 1: 'positive'}

print(f"Kalimat Uji: {sample_text[0]}")
print(f"Prediksi Sentimen: {reverse_label_mapping[int(prediction[0])]}")

Kalimat Uji: aplikasi sangat jelek dan sering error
Prediksi Sentimen: negative


#####Inference Testing Model 3 (CNN + LSTM + Word2Vec)

In [38]:
sample_text = ["fitur aplikasinya sangat lengkap dan mudah digunakan"]

sample_seq = tokenizer.texts_to_sequences(sample_text)
sample_pad = pad_sequences(sample_seq, maxlen=200)

prediction = model3.predict(sample_pad)
predicted_class = np.argmax(prediction, axis=1)

reverse_label_mapping = {0: 'negative', 1: 'positive'}

print(f"Kalimat Uji: {sample_text[0]}")
print(f"Prediksi Sentimen: {reverse_label_mapping[int(predicted_class[0])]}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
Kalimat Uji: fitur aplikasinya sangat lengkap dan mudah digunakan
Prediksi Sentimen: positive
