# 0. Install dependency

In [None]:
!pip -q install Sastrawi emoji nltk requests pandas scikit-learn joblib scipy threadpoolctl
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# 1. Setup & Load Dataset

In [5]:
import pandas as pd

df_raw = pd.read_csv("playstore_reviews_raw.csv")
df_raw = df_raw[df_raw["lang"] == "id"].copy()  # fokus ID biar sesuai arahan reviewer
df_raw = df_raw.dropna(subset=["text"]).copy()

print("Total ID:", len(df_raw))
df_raw.head()

Total ID: 5678


Unnamed: 0,text,rating,date,lang
0,saya juga suka dengan geme ini saya gak di ber...,1,2026-01-28 10:43:47,id
1,Dev. serakah author yang serakah?,1,2026-01-28 09:11:21,id
2,game asuuuuu,1,2026-01-28 03:27:44,id
3,"alahhhh... dulu bagus banget ini game, sekaran...",1,2026-01-28 00:45:17,id
4,epic comeback pun hadia nya dikit kikir,1,2026-01-26 15:22:15,id


# 2. Preprocessing

In [7]:
import re
import emoji
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

stopwords = set(StopWordRemoverFactory().get_stop_words())
stemmer = StemmerFactory().create_stemmer()

# slang dict contoh (silakan tambah)
SLANG = {
    "gk":"tidak", "ga":"tidak", "gak":"tidak", "nggak":"tidak", "tdk":"tidak",
    "bgt":"banget", "bgtt":"banget", "aja":"saja", "udh":"sudah", "dpt":"dapat",
    "sy":"saya", "gw":"saya", "gue":"saya", "kmu":"kamu",
    "mantul":"mantap", "kzl":"kesal", "bt":"bosan"
}

def normalize_slang(tokens):
    return [SLANG.get(t, t) for t in tokens]

def preprocess(text: str):
    t = str(text).lower()
    t = emoji.replace_emoji(t, replace=" ")              # emoji handling
    t = re.sub(r"http\S+|www\.\S+", " ", t)              # hapus url
    t = re.sub(r"(.)\1{2,}", r"\1\1", t)                 # huruf berulang: asuuuu -> asuu
    t = re.sub(r"[^a-z\s]", " ", t)                      # hanya huruf
    t = re.sub(r"\s+", " ", t).strip()

    tokens = t.split()                                  # tokenization sederhana
    tokens = normalize_slang(tokens)                     # slang normalization
    tokens = [x for x in tokens if x not in stopwords]   # stopwords removal
    tokens = [stemmer.stem(x) for x in tokens]           # stemming (MPStemmer)

    return tokens

df_raw["tokens"] = df_raw["text"].apply(preprocess)
df_raw["text_clean"] = df_raw["tokens"].apply(lambda x: " ".join(x))
df_raw = df_raw[df_raw["text_clean"].str.len() >= 10].copy()

print("Setelah preprocessing:", len(df_raw))
df_raw[["text","text_clean"]].head()

Setelah preprocessing: 5391


Unnamed: 0,text,text_clean
0,saya juga suka dengan geme ini saya gak di ber...,suka geme beri masuk hapus
1,Dev. serakah author yang serakah?,dev serakah author serakah
3,"alahhhh... dulu bagus banget ini game, sekaran...",alahh dulu bagus banget game sekarang solo lev...
4,epic comeback pun hadia nya dikit kikir,epic comeback hadia nya dikit kikir
6,game bagus karna grafik dan cerita namun banya...,game bagus karna grafik cerita banyak banget k...


# 3. Labeling Lexicon

In [9]:
import requests

POS_URL = "https://raw.githubusercontent.com/fajri91/InSet/refs/heads/master/positive.tsv"
NEG_URL = "https://raw.githubusercontent.com/fajri91/InSet/refs/heads/master/negative.tsv"

def download_to_file(url, out_path):
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(r.text)

download_to_file(POS_URL, "lexicon_positive.tsv")
download_to_file(NEG_URL, "lexicon_negative.tsv")

print("Downloaded lexicon files")

Downloaded lexicon files


In [10]:
def load_lexicon_words(path):
    words = set()
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            w = line.split("\t")[0].split(",")[0].strip()
            if w:
                words.add(w)
    return words

pos_lex = load_lexicon_words("lexicon_positive.tsv")
neg_lex = load_lexicon_words("lexicon_negative.tsv")
print("pos:", len(pos_lex), "neg:", len(neg_lex))

def lexicon_score(tokens):
    score = 0
    for w in tokens:
        if w in pos_lex: score += 1
        if w in neg_lex: score -= 1
    return score

def score_to_label(score: int, threshold: int = 1):
    if score >= threshold: return "positif"
    if score <= -threshold: return "negatif"
    return "netral"


pos: 3610 neg: 6608


In [11]:
import pandas as pd

df_raw = pd.read_csv("playstore_reviews_raw.csv").dropna(subset=["text"]).copy()
if "lang" in df_raw.columns:
    df_raw = df_raw[df_raw["lang"] == "id"].copy()

df_raw["tokens"] = df_raw["text"].apply(preprocess)
df_raw["text_clean"] = df_raw["tokens"].apply(lambda x: " ".join(x))
df_raw = df_raw[df_raw["text_clean"].str.len() > 0].copy()

print("rows:", len(df_raw))
df_raw[["text","text_clean"]].head()

rows: 5652


Unnamed: 0,text,text_clean
0,saya juga suka dengan geme ini saya gak di ber...,suka geme beri masuk hapus
1,Dev. serakah author yang serakah?,dev serakah author serakah
2,game asuuuuu,game asuu
3,"alahhhh... dulu bagus banget ini game, sekaran...",alahh dulu bagus banget game sekarang solo lev...
4,epic comeback pun hadia nya dikit kikir,epic comeback hadia nya dikit kikir


In [12]:
df_raw["score_lex"] = df_raw["tokens"].apply(lexicon_score)
df_raw["label"] = df_raw["score_lex"].apply(lambda s: score_to_label(s, threshold=1))

print(df_raw["label"].value_counts())
df_raw[["text_clean","score_lex","label"]].head(10)

df_labeled = df_raw[["text","label"]].copy()
df_labeled.to_csv("playstore_reviews_labeled.csv", index=False)
print("Saved playstore_reviews_labeled.csv | rows:", len(df_labeled))

label
positif    2008
negatif    1928
netral     1716
Name: count, dtype: int64
Saved playstore_reviews_labeled.csv | rows: 5652


In [13]:
df = df_raw[df_raw["label"].isin(["negatif","positif"])].copy()
print("Total 2 kelas:", len(df))
print(df["label"].value_counts())

Total 2 kelas: 3936
label
positif    2008
negatif    1928
Name: count, dtype: int64


In [14]:
df_raw[["text","tokens","text_clean","score_lex","label"]].head(10)

Unnamed: 0,text,tokens,text_clean,score_lex,label
0,saya juga suka dengan geme ini saya gak di ber...,"[suka, geme, beri, masuk, hapus]",suka geme beri masuk hapus,-2,negatif
1,Dev. serakah author yang serakah?,"[dev, serakah, author, serakah]",dev serakah author serakah,0,netral
2,game asuuuuu,"[game, asuu]",game asuu,1,positif
3,"alahhhh... dulu bagus banget ini game, sekaran...","[alahh, dulu, bagus, banget, game, sekarang, s...",alahh dulu bagus banget game sekarang solo lev...,6,positif
4,epic comeback pun hadia nya dikit kikir,"[epic, comeback, hadia, nya, dikit, kikir]",epic comeback hadia nya dikit kikir,-1,negatif
5,Keren,[keren],keren,0,netral
6,game bagus karna grafik dan cerita namun banya...,"[game, bagus, karna, grafik, cerita, banyak, b...",game bagus karna grafik cerita banyak banget k...,-5,negatif
7,"cukup bagus sih,cuma kadang bug nya aneh","[cukup, bagus, sih, cuma, kadang, bug, nya, aneh]",cukup bagus sih cuma kadang bug nya aneh,-1,negatif
8,game ga jelas tiap tiap update mana besar bang...,"[game, jelas, tiap, tiap, update, mana, besar,...",game jelas tiap tiap update mana besar banget ...,3,positif
9,FavoriteðŸ‘Œtapi hp skrng dh xkuat... terpaksa un...,"[favorite, hp, skrng, dh, xkuat, paksa, uninstal]",favorite hp skrng dh xkuat paksa uninstal,0,netral


# 4. Split Data

In [15]:
from sklearn.model_selection import train_test_split

X = df["text_clean"]   # df = hasil 2 kelas (positif/negatif)
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train:", len(X_train), "Test:", len(X_test))
print("Distribusi train:\n", y_train.value_counts())
print("Distribusi test:\n", y_test.value_counts())

Train: 3148 Test: 788
Distribusi train:
 label
positif    1606
negatif    1542
Name: count, dtype: int64
Distribusi test:
 label
positif    402
negatif    386
Name: count, dtype: int64


# 5. Feature Extraction

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    ngram_range=(1,2),
    max_df=0.95,
    min_df=2,
    sublinear_tf=True
)

X_train_tfidf = tfidf.fit_transform(X_train)   # FIT di train
X_test_tfidf  = tfidf.transform(X_test)        # TRANSFORM di test

# 6. Tranning

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

best_model = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=80000,
        ngram_range=(1,2),
        min_df=2,
        max_df=0.95,
        sublinear_tf=True
    )),
    ("clf", LogisticRegression(
        max_iter=5000,
        C=3.0,
        class_weight="balanced"
    ))
])

best_model.fit(X_train, y_train)
pred = best_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, pred))
print("\nReport:\n", classification_report(y_test, pred))
print("\nConfusion:\n", confusion_matrix(y_test, pred))

Accuracy: 0.9149746192893401

Report:
               precision    recall  f1-score   support

     negatif       0.90      0.93      0.91       386
     positif       0.93      0.90      0.92       402

    accuracy                           0.91       788
   macro avg       0.92      0.92      0.91       788
weighted avg       0.92      0.91      0.91       788


Confusion:
 [[360  26]
 [ 41 361]]


# 7. Inference

In [18]:
def predict_sentiment(text: str) -> str:
    tokens = preprocess(text)
    clean = " ".join(tokens)
    return best_model.predict([clean])[0]

samples = [
    "game nya seru banget, grafik bagus, mantap!",
    "parah, crash terus dan lag, bikin kesel",
    "update lumayan tapi masih ada bug",
    "server sering down, rank jadi rugi",
    "pay to win banget, kecewa",
    "eventnya keren, hadiah banyak",
    "top up error item ga masuk, parah",
    "kontrol enak, gameplay smooth",
    "loading lama banget padahal wifi kenceng",
    "aplikasi ringan dan ga bikin panas",
    "banyak cheater, jadi males main",
    "sistemnya adil, makin lama makin seru",
    "support cepat respon, mantap dev",
    "update bikin force close terus",
    "overall bagus, recommended",
    "matchmaking jelek, ketemu musuh ga seimbang",
    "grafik hd tapi tetap lancar",
    "banyak bug quest ga kelar",
    "komunitasnya asik",
    "parah banget, nge-freeze terus"
]
for i,s in enumerate(samples,1):
    print(f"{i:02d}. {s} -> {predict_sentiment(s)}")

01. game nya seru banget, grafik bagus, mantap! -> positif
02. parah, crash terus dan lag, bikin kesel -> negatif
03. update lumayan tapi masih ada bug -> negatif
04. server sering down, rank jadi rugi -> positif
05. pay to win banget, kecewa -> positif
06. eventnya keren, hadiah banyak -> positif
07. top up error item ga masuk, parah -> negatif
08. kontrol enak, gameplay smooth -> negatif
09. loading lama banget padahal wifi kenceng -> negatif
10. aplikasi ringan dan ga bikin panas -> negatif
11. banyak cheater, jadi males main -> negatif
12. sistemnya adil, makin lama makin seru -> positif
13. support cepat respon, mantap dev -> positif
14. update bikin force close terus -> negatif
15. overall bagus, recommended -> positif
16. matchmaking jelek, ketemu musuh ga seimbang -> negatif
17. grafik hd tapi tetap lancar -> positif
18. banyak bug quest ga kelar -> positif
19. komunitasnya asik -> positif
20. parah banget, nge-freeze terus -> positif


# 8. Simpan Model

In [19]:
import joblib
joblib.dump(best_model, "sentiment_model.joblib")
print("Saved: sentiment_model.joblib")

Saved: sentiment_model.joblib


In [21]:
bundle = joblib.load("sentiment_model.joblib")
print(type(bundle))
print(bundle)

<class 'sklearn.pipeline.Pipeline'>
Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=0.95, max_features=80000, min_df=2,
                                 ngram_range=(1, 2), sublinear_tf=True)),
                ('clf',
                 LogisticRegression(C=3.0, class_weight='balanced',
                                    max_iter=5000))])


In [22]:
bundle.predict(["game bagus banget"])

array(['positif'], dtype=object)