In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.utils.class_weight import compute_class_weight
import gensim
from gensim.models import Word2Vec, FastText
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import optimizers
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from transformers import pipeline
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import StratifiedKFold
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
from torch.optim import AdamW

**Pre Processing**

In [3]:
# download stopwords bahasa Indonesia
nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))

# load dataset hasil scraping
df = pd.read_csv("/content/drive/MyDrive/Asah/playstore_tokopedia_reviews.csv")

def fast_clean(text):
    text = str(text).lower()                           # lowercase
    text = re.sub(r'http\S+|www\S+|https\S+', '', text) # hapus link
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)           # hapus angka/simbol
    tokens = text.split()
    tokens = [w for w in tokens if w not in stop_words] # hapus stopwords
    return " ".join(tokens)


df["cleaned"] = df["content"].map(fast_clean)

df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,content,score,cleaned
0,kok produc jualan ya,1,produc jualan ya
1,tok the tok,3,tok the tok
2,Mantap,5,mantap
3,good,5,good
4,proses retur ribet cs lambat banget responnya ...,1,proses retur ribet cs lambat banget responnya ...


In [4]:
def label_sentiment(score):
    if score <= 2:
        return "negatif"
    elif score == 3:
        return "netral"
    else:
        return "positif"

df["label"] = df["score"].apply(label_sentiment)

In [5]:
df.head()

Unnamed: 0,content,score,cleaned,label
0,kok produc jualan ya,1,produc jualan ya,negatif
1,tok the tok,3,tok the tok,netral
2,Mantap,5,mantap,positif
3,good,5,good,positif
4,proses retur ribet cs lambat banget responnya ...,1,proses retur ribet cs lambat banget responnya ...,negatif


# **Skema 1 : LSTM + TF-IDF**

In [6]:
# ambil teks & label
texts = df["cleaned"].astype(str).tolist()
labels = df["label"].astype(str).tolist()

# TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(texts).toarray()

# ubah label ke angka (0=negatif, 1=netral, 2=positif)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(labels)

# split train-test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Shape data train:", X_train.shape)
print("Shape data test :", X_test.shape)

Shape data train: (8000, 5000)
Shape data test : (2000, 5000)


In [7]:
# ubah label ke one-hot encoding
num_classes = len(np.unique(y))
y_train_cat = to_categorical(y_train, num_classes)
y_test_cat = to_categorical(y_test, num_classes)

# reshape TF-IDF ke bentuk 3D (samples, timesteps, features)
# di sini timesteps = jumlah fitur, features = 1
X_train_3d = np.expand_dims(X_train, axis=2)
X_test_3d = np.expand_dims(X_test, axis=2)

print("Shape untuk LSTM:", X_train_3d.shape)

# bangun model LSTM sederhana
model = Sequential()
model.add(LSTM(128, input_shape=(X_train_3d.shape[1], 1)))
model.add(Dropout(0.5))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(num_classes, activation="softmax"))

# compile model
model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])

# latih model
history = model.fit(
    X_train_3d, y_train_cat,
    validation_data=(X_test_3d, y_test_cat),
    epochs=5,      # bisa dinaikkan (10–20) kalau butuh akurasi lebih
    batch_size=128,
    verbose=1
)

# evaluasi
loss, acc = model.evaluate(X_test_3d, y_test_cat, verbose=0)
print(f"Akurasi Test Set: {acc*100:.2f}%")

Shape untuk LSTM: (8000, 5000, 1)


  super().__init__(**kwargs)


Epoch 1/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 306ms/step - accuracy: 0.4515 - loss: 0.9775 - val_accuracy: 0.5080 - val_loss: 0.8711
Epoch 2/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 325ms/step - accuracy: 0.5067 - loss: 0.8753 - val_accuracy: 0.5080 - val_loss: 0.8723
Epoch 3/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 269ms/step - accuracy: 0.4996 - loss: 0.8836 - val_accuracy: 0.5080 - val_loss: 0.8688
Epoch 4/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 271ms/step - accuracy: 0.5030 - loss: 0.8767 - val_accuracy: 0.5080 - val_loss: 0.8691
Epoch 5/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 273ms/step - accuracy: 0.4920 - loss: 0.8816 - val_accuracy: 0.5080 - val_loss: 0.8700
Akurasi Test Set: 50.80%


# **Skema 2 word2Vec with BILSTM**

In [8]:
df = pd.read_csv("/content/drive/MyDrive/Asah/playstore_tokopedia_cleaned.csv")
df["cleaned"] = df["cleaned"].astype(str)

# (opsional) buang teks sangat pendek yg berisik
df = df[df["cleaned"].str.split().str.len() > 2].reset_index(drop=True)

In [9]:
le = LabelEncoder()
df["y"] = le.fit_transform(df["label"])  # 0/1/2
num_classes = len(le.classes_)

In [10]:
X_train_full, X_test, y_train_full, y_test = train_test_split(
    df["cleaned"], df["y"], test_size=0.1, random_state=42, stratify=df["y"]
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.1111, random_state=42, stratify=y_train_full
)

In [11]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X_train.tolist())
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
print("Vocab size:", vocab_size)

max_len = 100  # bisa 100–150; jangan terlalu panjang biar cepat & stabil
Xtr = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len, padding="post")
Xva = pad_sequences(tokenizer.texts_to_sequences(X_val),   maxlen=max_len, padding="post")
Xte = pad_sequences(tokenizer.texts_to_sequences(X_test),  maxlen=max_len, padding="post")

ytr = to_categorical(y_train, num_classes)
yva = to_categorical(y_val,   num_classes)
yte = to_categorical(y_test,  num_classes)

Vocab size: 7972


In [12]:
sentences = [text.split() for text in X_train.tolist()]
w2v_dim = 100


w2v = Word2Vec(
    sentences=sentences,
    vector_size=w2v_dim,
    window=5,
    min_count=2,
    workers=4,
    sg=1,
    epochs=15)

embedding_matrix = np.zeros((vocab_size, w2v_dim))
for word, i in word_index.items():
    if word in w2v.wv:
        embedding_matrix[i] = w2v.wv[word]

In [13]:
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)
class_weights = {i: w for i, w in enumerate(class_weights)}
print("Class weights:", class_weights)

Class weights: {0: 0.5528735632183908, 1: 4.487701441899915, 2: 1.0325917252146761}


In [14]:
model = Sequential([
    Embedding(input_dim=vocab_size,
              output_dim=w2v_dim,
              weights=[embedding_matrix],
              input_length=max_len,
              trainable=False),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)),
    Dense(64, activation="relu"),
    Dropout(0.4),
    Dense(num_classes, activation="softmax")
])

opt = optimizers.Adam(learning_rate=2e-3)
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
model.summary()



In [15]:
history = model.fit(
    Xtr, ytr,
    validation_data=(Xva, yva),
    epochs=20,
    batch_size=128,
    class_weight=class_weights,
    verbose=1
)

Epoch 1/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 601ms/step - accuracy: 0.5096 - loss: 1.0155 - val_accuracy: 0.6103 - val_loss: 0.8188
Epoch 2/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 676ms/step - accuracy: 0.6180 - loss: 0.8567 - val_accuracy: 0.5891 - val_loss: 0.8378
Epoch 3/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 581ms/step - accuracy: 0.6322 - loss: 0.8631 - val_accuracy: 0.5453 - val_loss: 0.8733
Epoch 4/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 576ms/step - accuracy: 0.5868 - loss: 0.8516 - val_accuracy: 0.6495 - val_loss: 0.7989
Epoch 5/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 558ms/step - accuracy: 0.6533 - loss: 0.8118 - val_accuracy: 0.6012 - val_loss: 0.8406
Epoch 6/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 641ms/step - accuracy: 0.6550 - loss: 0.8394 - val_accuracy: 0.7251 - val_loss: 0.6901
Epoch 7/20
[1m42/42[

In [16]:
test_loss, test_acc = model.evaluate(Xte, yte, verbose=0)
print(f"\nAkurasi Test Set: {test_acc*100:.2f}%")


Akurasi Test Set: 63.60%


# **Skema 3 dengan IndoBERT**

In [29]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


**Load Data dan Pelabelan**

In [30]:
# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Asah/playstore_tokopedia_cleaned.csv")

# Pastikan kolom content sebagai string
df["content"] = df["content"].astype(str)

# Cek nama kolom
print("Kolom dataset:", df.columns)
print("Contoh data:", df.head())


from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

def label_vader(text):
    score = analyzer.polarity_scores(text)["compound"]
    if score >= 0.05:
        return "positif"
    elif score <= -0.05:
        return "negatif"
    else:
        return "netral"

df["label"] = df["content"].apply(label_vader)
print("Distribusi label:\n", df["label"].value_counts())

# Encode label
le = LabelEncoder()
df["label_enc"] = le.fit_transform(df["label"])
num_classes = len(le.classes_)

Kolom dataset: Index(['content', 'score', 'cleaned', 'label'], dtype='object')
Contoh data:                                              content  score  \
0                               kok produc jualan ya      1   
1                                        tok the tok      3   
2                                             Mantap      5   
3                                               good      5   
4  proses retur ribet cs lambat banget responnya ...      1   

                                             cleaned    label  
0                                   produc jualan ya  negatif  
1                                        tok the tok   netral  
2                                             mantap  positif  
3                                               good  positif  
4  proses retur ribet cs lambat banget responnya ...  negatif  
Distribusi label:
 label
netral     7970
positif    1555
negatif     475
Name: count, dtype: int64


In [31]:
X_temp, X_test, y_temp, y_test = train_test_split(
    df["content"], df["label_enc"], test_size=0.1, stratify=df["label_enc"], random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=1/9, stratify=y_temp, random_state=42
)
print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

Train: 8000, Val: 1000, Test: 1000


**Tokenizer**

In [32]:
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(label, dtype=torch.long)
        }

# Load IndoBERT tokenizer
model_name = "indobenchmark/indobert-base-p1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

train_dataset = SentimentDataset(X_train.values, y_train.values, tokenizer)
val_dataset   = SentimentDataset(X_val.values, y_val.values, tokenizer)
test_dataset  = SentimentDataset(X_test.values, y_test.values, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=16)
test_loader  = DataLoader(test_dataset, batch_size=16)

In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
model.to(device)

optimizer = AdamW(model.parameters(), lr=3e-5)
total_steps = len(train_loader) * 5
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
criterion = CrossEntropyLoss()

best_val_acc = 0
patience, counter = 2, 0

for epoch in range(5):
    # Training
    model.train()
    total_correct = 0
    loop = tqdm(train_loader, leave=False)
    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        preds = torch.argmax(logits, dim=1)
        total_correct += (preds == labels).sum().item()

    train_acc = total_correct / len(train_dataset)

    # Validation
    model.eval()
    val_correct, val_total = 0, 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            val_correct += (preds == labels).sum().item()
            val_total += labels.size(0)

    val_acc = val_correct / val_total
    print(f"Epoch {epoch+1} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

    # Early stopping
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        counter = 0
        torch.save(model.state_dict(), "best_model.pt")
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping aktif!")
            break


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 | Train Acc: 0.9211 | Val Acc: 0.9460




Epoch 2 | Train Acc: 0.9619 | Val Acc: 0.9610




Epoch 3 | Train Acc: 0.9721 | Val Acc: 0.9550




Epoch 4 | Train Acc: 0.9816 | Val Acc: 0.9580
Early stopping aktif!


In [34]:
model.load_state_dict(torch.load("best_model.pt"))
model.eval()

test_correct, test_total = 0, 0
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

        test_correct += (preds == labels).sum().item()
        test_total += labels.size(0)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

test_acc = test_correct / test_total
print(f"Test Accuracy: {test_acc*100:.2f}%")

Test Accuracy: 94.10%
