<a href="https://colab.research.google.com/github/blythecarr/NLP_E-CommersRatingAndReview/blob/main/STI202303367_WindyPangestuti_NLP_KlasifikasiSentimenE_CommerceRatingAndReview.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Import Library

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

2. Load dataset

In [6]:
df = pd.read_csv("/content/raw_review_googleplay.csv")

3. Tampilkan kolom yang tersedi

In [7]:
print("Kolom dalam dataset:", df.columns)

Kolom dalam dataset: Index(['reviewId', 'userName', 'userImage', 'content', 'score',
       'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent',
       'repliedAt', 'appVersion'],
      dtype='object')


4. Tampilkan 5 data pertama

In [8]:
print(df[['content', 'score']].head())

                                             content  score
0                   bakalan langganan belanja disini    5.0
1  Saya dengar ada promo pengguna baru diskon sam...    3.0
2                                             mantap    5.0
3                                      banyak diskon    5.0
4  Kurang Transparan TAGIHAN TOKOPEDIA CARD, apa ...    2.0


5. Buat label sentimen berdasarkan skor rating

In [9]:
def label_sentimen(score):
    if score >= 4:
        return 1  # Positif
    elif score <= 2:
        return 0  # Negatif
    else:
        return None  # Netral

df['label'] = df['score'].apply(label_sentimen)
df = df.dropna(subset=['label']).copy()  # Hapus data netral
df.loc[:, 'label'] = df['label'].astype(int)

6. Tokenisasi teks ulasan

In [10]:
tokenizer = Tokenizer(num_words=5000, oov_token='')
tokenizer.fit_on_texts(df['content'])

sequences = tokenizer.texts_to_sequences(df['content'])
padded = pad_sequences(sequences, maxlen=100, padding='post')

In [16]:
print("Token hasil tokenisasi:", sequences)

Token hasil tokenisasi: [[1203, 460, 15, 163], [12], [23, 67], [208, 3659, 558, 3, 1125, 87, 10, 14, 33, 1], [648, 147, 986, 319, 92, 4, 437, 108, 345, 104, 61, 1635, 1, 149, 11, 10, 199, 307, 179, 41, 1, 1, 463, 65, 28, 52, 17, 606, 2045], [12], [1], [421, 488, 45, 129, 1126, 109, 2341, 45, 450, 397, 17, 70, 377], [143, 20, 6, 93, 150], [471, 234, 236, 164, 202, 74, 67, 1691, 30, 181, 2, 405, 169, 169, 2, 183, 63, 422], [1, 80, 112], [812, 58, 15, 112, 3969], [110], [31, 1043, 159, 46, 11, 138, 2, 14, 34, 172, 4, 73, 21, 1138, 965, 30, 11, 339, 1138, 965, 832, 832, 407, 21, 1138, 965, 78, 1437, 33, 227, 889, 3, 50, 912, 199, 87, 87, 4, 117, 6, 569, 17, 599, 337, 4, 1526, 154, 95, 3590, 241, 80, 14, 10, 1460, 517, 159], [9, 1091, 24, 67, 185, 251], [189, 46, 39], [54], [4, 1571, 584, 56, 529, 189, 30, 3887, 27, 125, 29, 164, 102, 763, 80, 16, 302, 189, 27, 348, 697, 209, 5, 61, 53, 104, 759, 38, 1028, 1, 30, 675, 521, 25, 104, 2, 87, 1, 4, 164, 102, 751, 2, 821, 563, 662, 3450, 39, 30,

7. Split data

In [19]:
X_train, X_test, y_train, y_test = train_test_split(padded, df['label'], test_size=0.2, random_state=42)

8. Bangun model LSTM

In [18]:
model = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=100),
    LSTM(64, return_sequences=True),
    Dropout(0.3),
    LSTM(32),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])



In [20]:
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [21]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)

9. Training model

In [23]:
model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stop]
)

Epoch 1/5
[1m3959/3959[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m419s[0m 106ms/step - accuracy: 0.6887 - loss: 0.6242 - val_accuracy: 0.6911 - val_loss: 0.6184
Epoch 2/5
[1m3959/3959[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m442s[0m 106ms/step - accuracy: 0.6932 - loss: 0.6176 - val_accuracy: 0.6911 - val_loss: 0.6183
Epoch 3/5
[1m3959/3959[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m461s[0m 111ms/step - accuracy: 0.6915 - loss: 0.6182 - val_accuracy: 0.6911 - val_loss: 0.6183
Epoch 4/5
[1m3959/3959[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m440s[0m 110ms/step - accuracy: 0.6928 - loss: 0.6169 - val_accuracy: 0.6911 - val_loss: 0.6183
Epoch 5/5
[1m3959/3959[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m443s[0m 111ms/step - accuracy: 0.6917 - loss: 0.6178 - val_accuracy: 0.6911 - val_loss: 0.6183


<keras.src.callbacks.history.History at 0x789f80ffa6d0>

10. Evaluasi model

In [24]:
loss, acc = model.evaluate(X_test, y_test)
print(f'\n✅ Akurasi Testing: {acc * 100:.2f}%')

[1m1238/1238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 25ms/step - accuracy: 0.6918 - loss: 0.6176

✅ Akurasi Testing: 69.48%


11. Uji coba

In [28]:
# Fungsi pembersihan teks
def bersihkan_teks(teks):
    teks = teks.lower()
    teks = re.sub(r'[^a-zA-Z0-9\s]', '', teks)
    return teks


In [40]:
import re

print("\n🔍 Uji Coba Prediksi Ulasan Baru")


ulasan_baru = input("Masukkan ulasan Anda: ")


ulasan_bersih = bersihkan_teks(ulasan_baru)

ulasan_seq = tokenizer.texts_to_sequences([ulasan_bersih])
print("Token hasil tokenisasi:", ulasan_seq)  # Debug

if len(ulasan_seq[0]) == 0:
    print("⚠️ Kata-kata dalam ulasan tidak dikenali oleh model.")
else:
    ulasan_pad = pad_sequences(ulasan_seq, maxlen=100, padding='post')
    prediksi = model.predict(ulasan_pad)
    skor = prediksi[0][0]
    print(f"\n📊 Skor prediksi: {skor:.4f}")

    if skor > 0.5:
        print("✅ Hasil Prediksi: Positif")
    else:
        print("❌ Hasil Prediksi: Negatif")


🔍 Uji Coba Prediksi Ulasan Baru
Masukkan ulasan Anda: barang bagus sesuai dengan pesanan
Token hasil tokenisasi: [[17, 9, 138, 43, 92]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step

📊 Skor prediksi: 0.6914
✅ Hasil Prediksi: Positif
