# 1. Setup & Load Dataset

In [1]:
import pandas as pd

df_raw = pd.read_csv("playstore_reviews_raw.csv")
print("Total raw:", len(df_raw))
print(df_raw["rating"].value_counts().sort_index())

Total raw: 11132
rating
1    4054
2     964
3    1180
4     986
5    3948
Name: count, dtype: int64


In [2]:
df = df_raw[df_raw["rating"].isin([1,5])].copy()

df["label"] = df["rating"].map({1: "negatif", 5: "positif"})
df = df.dropna(subset=["text", "label"]).copy()

print("Total ekstrem (1 & 5):", len(df))
print(df["label"].value_counts())

Total ekstrem (1 & 5): 8002
label
negatif    4054
positif    3948
Name: count, dtype: int64


# 2. Cleaning Data

In [3]:
import re

def clean_text(text: str) -> str:
    text = str(text).lower()
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    text = re.sub(r"(.)\1{2,}", r"\1\1", text)   # aaaa -> aa
    text = re.sub(r"[^a-z0-9\s]", " ", text)     # buang simbol
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["text_clean"] = df["text"].apply(clean_text)

# filter minimal (jangan terlalu ketat dulu)
df = df[df["text_clean"].str.len() >= 10]
df = df[df["text_clean"].str.split().str.len() >= 2]

print("Total setelah filter:", len(df))
print(df["label"].value_counts())

Total setelah filter: 7371
label
negatif    3931
positif    3440
Name: count, dtype: int64


# 3. Split Data

In [4]:
from sklearn.model_selection import train_test_split

X = df["text_clean"].astype(str)
y = df["label"].astype(str)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train:", len(X_train), "Test:", len(X_test))
print("Label train:", sorted(set(y_train)))
print("Label test :", sorted(set(y_test)))

Train: 5896 Test: 1475
Label train: ['negatif', 'positif']
Label test : ['negatif', 'positif']


# 4a. Model 1: TF-IDF + Logistic Regression

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

configs = [
    {"C": 1.0, "max_df": 0.95, "sublinear_tf": True},
    {"C": 2.0, "max_df": 0.95, "sublinear_tf": True},
    {"C": 3.0, "max_df": 0.95, "sublinear_tf": True},
    {"C": 5.0, "max_df": 0.95, "sublinear_tf": True},
    {"C": 2.0, "max_df": 0.9,  "sublinear_tf": True},
    {"C": 3.0, "max_df": 0.9,  "sublinear_tf": True},
]

best = None

for cfg in configs:
    model = Pipeline([
        ("tfidf", TfidfVectorizer(
            max_features=80000,
            ngram_range=(1,2),
            min_df=2,
            max_df=cfg["max_df"],
            sublinear_tf=cfg["sublinear_tf"]
        )),
        ("clf", LogisticRegression(
            max_iter=5000,
            C=cfg["C"],
            class_weight="balanced"
        ))
    ])
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    acc = accuracy_score(y_test, pred)
    print(cfg, "-> acc:", acc)

    if (best is None) or (acc > best["acc"]):
        best = {"acc": acc, "cfg": cfg, "model": model}

print("\nBEST:", best["cfg"], "acc:", best["acc"])
print("\nReport best:\n", classification_report(y_test, best["model"].predict(X_test)))

{'C': 1.0, 'max_df': 0.95, 'sublinear_tf': True} -> acc: 0.8630508474576272
{'C': 2.0, 'max_df': 0.95, 'sublinear_tf': True} -> acc: 0.8684745762711864
{'C': 3.0, 'max_df': 0.95, 'sublinear_tf': True} -> acc: 0.8705084745762712
{'C': 5.0, 'max_df': 0.95, 'sublinear_tf': True} -> acc: 0.8684745762711864
{'C': 2.0, 'max_df': 0.9, 'sublinear_tf': True} -> acc: 0.8684745762711864
{'C': 3.0, 'max_df': 0.9, 'sublinear_tf': True} -> acc: 0.8705084745762712

BEST: {'C': 3.0, 'max_df': 0.95, 'sublinear_tf': True} acc: 0.8705084745762712

Report best:
               precision    recall  f1-score   support

     negatif       0.87      0.90      0.88       787
     positif       0.88      0.84      0.86       688

    accuracy                           0.87      1475
   macro avg       0.87      0.87      0.87      1475
weighted avg       0.87      0.87      0.87      1475



# 4b. Model 2: TF-IDF + LinearSVC

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = df["text_clean"].astype(str)
y = df["label"].astype(str)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=80000,
        ngram_range=(1,2),
        min_df=2,
        max_df=0.95,
        sublinear_tf=True
    )),
    ("clf", LinearSVC(C=1.0, class_weight="balanced"))
])

model.fit(X_train, y_train)
pred = model.predict(X_test)

acc = accuracy_score(y_test, pred)
print("Accuracy:", acc)
print("\nReport:\n", classification_report(y_test, pred))
print("\nConfusion:\n", confusion_matrix(y_test, pred))

Accuracy: 0.8589830508474576

Report:
               precision    recall  f1-score   support

     negatif       0.86      0.88      0.87       787
     positif       0.86      0.83      0.85       688

    accuracy                           0.86      1475
   macro avg       0.86      0.86      0.86      1475
weighted avg       0.86      0.86      0.86      1475


Confusion:
 [[695  92]
 [116 572]]


In [9]:
for C in [0.5, 1.0, 2.0, 3.0]:
    m = Pipeline([
        ("tfidf", TfidfVectorizer(max_features=80000, ngram_range=(1,2),
                                  min_df=2, max_df=0.95, sublinear_tf=True)),
        ("clf", LinearSVC(C=C, class_weight="balanced"))
    ])
    m.fit(X_train, y_train)
    pred = m.predict(X_test)
    print("C:", C, "acc:", accuracy_score(y_test, pred))

C: 0.5 acc: 0.8671186440677966
C: 1.0 acc: 0.8589830508474576
C: 2.0 acc: 0.8569491525423729
C: 3.0 acc: 0.8542372881355932


# 4c. Model 3: TF-IDF + Naive Bayes

In [10]:
from sklearn.naive_bayes import ComplementNB

cnb_model = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1,2),
        min_df=2,
        max_df=0.95,
        sublinear_tf=True
    )),
    ("clf", ComplementNB(alpha=0.5))
])

cnb_model.fit(X_train, y_train)
pred = cnb_model.predict(X_test)

print("Accuracy (TFIDF + ComplementNB):", accuracy_score(y_test, pred))
print("\nReport:\n", classification_report(y_test, pred))

Accuracy (TFIDF + ComplementNB): 0.8257627118644068

Report:
               precision    recall  f1-score   support

     negatif       0.83      0.84      0.84       787
     positif       0.82      0.81      0.81       688

    accuracy                           0.83      1475
   macro avg       0.83      0.82      0.82      1475
weighted avg       0.83      0.83      0.83      1475



In [11]:
for a in [0.1, 0.3, 0.5, 1.0]:
    model = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=2, max_df=0.95, sublinear_tf=True)),
        ("clf", ComplementNB(alpha=a))
    ])
    model.fit(X_train, y_train)
    print("alpha", a, "acc", accuracy_score(y_test, model.predict(X_test)))

alpha 0.1 acc 0.8345762711864407
alpha 0.3 acc 0.8271186440677966
alpha 0.5 acc 0.8257627118644068
alpha 1.0 acc 0.8244067796610169


# 5. Model Terbaik

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

best_model = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=80000,
        ngram_range=(1,2),
        min_df=2,
        max_df=0.95,
        sublinear_tf=True
    )),
    ("clf", LogisticRegression(
        max_iter=5000,
        C=3.0,
        class_weight="balanced"
    ))
])

best_model.fit(X_train, y_train)
pred = best_model.predict(X_test)

print("Best Model: TF-IDF + Logistic Regression (C=3.0, max_df=0.95, sublinear_tf=True)")
print("Accuracy:", accuracy_score(y_test, pred))
print("\nReport:\n", classification_report(y_test, pred))
print("\nConfusion:\n", confusion_matrix(y_test, pred))

Best Model: TF-IDF + Logistic Regression (C=3.0, max_df=0.95, sublinear_tf=True)
Accuracy: 0.8705084745762712

Report:
               precision    recall  f1-score   support

     negatif       0.87      0.90      0.88       787
     positif       0.88      0.84      0.86       688

    accuracy                           0.87      1475
   macro avg       0.87      0.87      0.87      1475
weighted avg       0.87      0.87      0.87      1475


Confusion:
 [[706  81]
 [110 578]]


# 6. Inference

In [14]:
def predict_sentiment(text: str) -> str:
    t = clean_text(text)
    return best_model.predict([t])[0]

samples = [
    "Game nya seru banget, grafiknya bagus, mantap!",
    "Parah, crash terus dan lag, bikin kesel.",
    "Update terbaru lumayan, tapi masih ada bug kecil.",
    "Eventnya keren, hadiah banyak, bikin betah main.",
    "Login susah, sering force close, tolong diperbaiki.",
    "Kontrolnya enak, gameplay smooth, recommended!",
    "Pay to win banget, balance jelek, kecewa.",
    "Suaranya jernih, efeknya keren, overall bagus.",
    "Server sering down, match gagal terus, capek.",
    "Setelah update malah makin berat dan panas di HP.",
    "Bagus sih, tapi iklannya kebanyakan dan ganggu.",
    "Mantap! Banyak fitur, ringan, dan tidak cepat bosan.",
    "UI nya rapi, mudah dipahami, pengalaman bagus.",
    "Bug quest tidak selesai, progress hilang, parah.",
    "Top up error, item tidak masuk, refund susah.",
    "Komunitasnya asik, mode permainan variatif.",
    "Sering disconnect pas rank, jadi rugi terus.",
    "Loading lama banget, padahal jaringan stabil.",
    "Grafik HD tapi tetap lancar, salut developernya.",
    "Aplikasi sering ngadat, kadang tidak bisa dibuka sama sekali."
]

for i, s in enumerate(samples, 1):
    print(f"{i:02d}. {s} -> {predict_sentiment(s)}")

01. Game nya seru banget, grafiknya bagus, mantap! -> positif
02. Parah, crash terus dan lag, bikin kesel. -> negatif
03. Update terbaru lumayan, tapi masih ada bug kecil. -> positif
04. Eventnya keren, hadiah banyak, bikin betah main. -> positif
05. Login susah, sering force close, tolong diperbaiki. -> negatif
06. Kontrolnya enak, gameplay smooth, recommended! -> positif
07. Pay to win banget, balance jelek, kecewa. -> negatif
08. Suaranya jernih, efeknya keren, overall bagus. -> positif
09. Server sering down, match gagal terus, capek. -> negatif
10. Setelah update malah makin berat dan panas di HP. -> negatif
11. Bagus sih, tapi iklannya kebanyakan dan ganggu. -> positif
12. Mantap! Banyak fitur, ringan, dan tidak cepat bosan. -> positif
13. UI nya rapi, mudah dipahami, pengalaman bagus. -> positif
14. Bug quest tidak selesai, progress hilang, parah. -> negatif
15. Top up error, item tidak masuk, refund susah. -> negatif
16. Komunitasnya asik, mode permainan variatif. -> positif
17

# 7. Simpan Model

In [15]:
import joblib

joblib.dump(best_model, "sentiment_model.joblib")
print("Saved model: sentiment_model.joblib")

Saved model: sentiment_model.joblib
