##1.Import Library

In [1]:
# Install dependencies (untuk Colab/Jupyter Notebook)
!pip install google-play-scraper
!pip install emoji
!pip install sastrawi
!pip install nlp-id

# --- Data Collection ---
from google_play_scraper import Sort, reviews

# --- Data Handling ---
import pandas as pd
import numpy as np

# --- Text Processing ---
import re
import emoji
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nlp_id.lemmatizer import Lemmatizer
from collections import defaultdict

# --- Feature Extraction ---
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# --- Machine Learning ---
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# --- Evaluation ---
from sklearn.metrics import accuracy_score, classification_report


Collecting google-play-scraper
  Downloading google_play_scraper-1.2.7-py3-none-any.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading google_play_scraper-1.2.7-py3-none-any.whl (28 kB)
Installing collected packages: google-play-scraper
Successfully installed google-play-scraper-1.2.7
Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1
Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hIns

In [2]:
!pip install some_package
!pip freeze > requirements.txt

Collecting some_package
  Downloading some-package-0.1.zip (2.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: some_package
  Building wheel for some_package (setup.py) ... [?25l[?25hdone
  Created wheel for some_package: filename=some_package-0.1-py3-none-any.whl size=1420 sha256=68730f596bd312513a92f625643bb6746511c7d99b68be2d07c5849e1b96d693
  Stored in directory: /root/.cache/pip/wheels/79/65/17/0555c887f2a873f0170c91acb34c531cdc4e15546962aab916
Successfully built some_package
Installing collected packages: some_package
Successfully installed some_package-0.1


## 2.Scraping Data

In [3]:
def fetch_reviews(app_id, target_score, limit=5000):
    all_reviews = []
    token = None

    while len(all_reviews) < limit:
        result, token = reviews(
            app_id,
            lang='id',
            country='id',
            sort=Sort.NEWEST,
            count=200,
            continuation_token=token
        )

        for r in result:
            if r['score'] == target_score:
                all_reviews.append(r)

            if len(all_reviews) >= limit:
                break

        if not token:
            break

    return all_reviews


data = []
for score in range(1, 6):
    data.extend(fetch_reviews('com.whatsapp', score))

df = pd.DataFrame(data)
print(df['score'].value_counts())
print(df[['score', 'content']].head())

score
1    5000
2    5000
3    5000
4    5000
5    5000
Name: count, dtype: int64
   score                                            content
0      1  kalo buat sw kok pecah kenapa? nunggu thn 2090...
1      1               tolong baikin bug tidak bisa unblock
2      1  kenapa bentar bentar harus di updates sih memo...
3      1  error jika nelpon tidak ada suara, dan kadang ...
4      1  banyak bug nya ,mau update story aja nunggu be...


In [4]:
df

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,2cca4720-48fd-4469-953e-6abc6e5890d2,Rifki azis,https://play-lh.googleusercontent.com/a-/ALV-U...,kalo buat sw kok pecah kenapa? nunggu thn 2090...,1,0,2.23.5.78,2025-05-12 15:01:57,,,2.23.5.78
1,62a0cb04-8105-4653-9047-81c7ad5dc11e,Miku Kanaeru,https://play-lh.googleusercontent.com/a/ACg8oc...,tolong baikin bug tidak bisa unblock,1,0,2.25.14.76,2025-05-12 15:01:04,,,2.25.14.76
2,868b2ddd-f0ec-458e-b523-8294df9a7953,Icha Hutajulu,https://play-lh.googleusercontent.com/a-/ALV-U...,kenapa bentar bentar harus di updates sih memo...,1,0,2.25.5.74,2025-05-12 14:55:37,,,2.25.5.74
3,ebffba86-a494-44e6-8a60-95db5bf8977e,Novita Novi,https://play-lh.googleusercontent.com/a/ACg8oc...,"error jika nelpon tidak ada suara, dan kadang ...",1,0,2.25.14.76,2025-05-12 14:53:37,,,2.25.14.76
4,ece9e0d6-06c2-417b-8f0c-3c6bc158d206,Damian Kater,https://play-lh.googleusercontent.com/a/ACg8oc...,"banyak bug nya ,mau update story aja nunggu be...",1,0,,2025-05-12 14:53:26,,,
...,...,...,...,...,...,...,...,...,...,...,...
24995,576da8ee-3f0b-4124-8b5d-e8c28b085066,Oktavian Gandut,https://play-lh.googleusercontent.com/a/ACg8oc...,mantap👍,5,0,,2025-05-05 23:29:48,,,
24996,c53568ac-e6b7-4655-8bd9-9708db102a3c,Riki Riki,https://play-lh.googleusercontent.com/a/ACg8oc...,bgus,5,0,,2025-05-05 23:15:33,,,
24997,90579036-9ae4-4998-874f-b5f4935fb124,Siti Fatonah,https://play-lh.googleusercontent.com/a/ACg8oc...,bagus banget,5,0,2.25.7.80,2025-05-05 23:10:43,,,2.25.7.80
24998,c91b990d-f57d-40a1-ba9d-ef1ddf6575fc,Harni Nugraha,https://play-lh.googleusercontent.com/a/ACg8oc...,ohhh tentu bagus,5,0,2.25.13.74,2025-05-05 23:09:31,,,2.25.13.74


## 3.Preprocessing

In [5]:
def preprocess_pipeline(df):
    df_preprocessing = df.copy()
    df_preprocessing = df_preprocessing[['content', 'score']]

    # Cleaning teks
    def clean_text(text):
        if not isinstance(text, str):
            return ""
        text = emoji.replace_emoji(text, '')  # hapus emoji
        text = re.sub(r'http\S+|<.*?>|[@#]\S+|[^a-zA-Z\s]', '', text)  # hapus elemen tidak relevan
        text = re.sub(r'\s+', ' ', text).strip().lower()  # hapus spasi berlebih & lowercase
        return text

    df_preprocessing['content_clean'] = df_preprocessing['content'].apply(clean_text)

    # Stopword Removal
    factory_stopword = StopWordRemoverFactory()
    stopwords = set(factory_stopword.get_stop_words())

    def remove_stopwords(text):
        tokens = text.split()
        return [word for word in tokens if word not in stopwords]

    df_preprocessing['tokens_nostop'] = df_preprocessing['content_clean'].apply(remove_stopwords)

    # Lemmatization (atau Stemming)
    lemmatizer = Lemmatizer()

    def lemmatize_tokens(tokens):
        return [lemmatizer.lemmatize(token) for token in tokens]

    df_preprocessing['content_preprocessed'] = df_preprocessing['tokens_nostop'].apply(lemmatize_tokens)

    # Hapus baris kosong
    df_preprocessing = df_preprocessing[df_preprocessing['content_preprocessed'].apply(lambda x: len(x) > 0)]

    # Return hasil akhir
    return df_preprocessing[['content_preprocessed', 'score']]


In [6]:
df_preprocessing = preprocess_pipeline(df)

In [7]:
df_preprocessing

Unnamed: 0,content_preprocessed,score
0,"[kalo, buat, sw, kok, pecah, nunggu, thn, dulu...",1
1,"[baikin, bug, unblock]",1
2,"[bentar, bentar, updates, sih, memo, aku, gak,...",1
3,"[error, nelpon, suara, kadang, hubung, panggil...",1
4,"[banyak, bug, nya, mau, update, story, aja, nu...",1
...,...,...
24995,[mantap],5
24996,[bgus],5
24997,"[bagus, banget]",5
24998,"[ohhh, bagus]",5


## 4.Labeling

In [8]:
# Pembuatan bobot sentimen setiap kata berdasarkan score review

# Mapping skor review ke bobot
weight_map = {5: 2, 4: 1, 3: 0, 2: -1, 1: -2}

# Inisialisasi
df_word_weights = df_preprocessing.copy()
word_scores = defaultdict(int)
word_counts = defaultdict(int)

# Hitung skor rata-rata tiap kata berdasarkan skor review
for _, row in df_word_weights.iterrows():
    score = row['score']
    tokens = row['content_preprocessed']
    weight = weight_map.get(score, 0)

    unique_words = set(tokens)
    for word in unique_words:
        word_scores[word] += weight
        word_counts[word] += 1

word_weights = {}
for word in word_scores:
    avg = word_scores[word] / word_counts[word]
    word_weights[word] = round(avg, 2)  # Membulatkan ke dua angka desimal

# Hitung IDF untuk kata-kata di semua dokumen
tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x, token_pattern=None, lowercase=False)
tfidf_vectorizer.fit(df_word_weights['content_preprocessed'])
idf_scores = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.idf_))

# Kalibrasi bobot dengan IDF
for word in word_weights:
    if word in idf_scores:
        word_weights[word] *= idf_scores[word]
    else:
        word_weights[word] *= 1.0

# Tambahkan kolom untuk menampung pasangan kata dan bobotnya
all_word_weights = []
for tokens in df_word_weights['content_preprocessed']:
    token_weights = [(word, word_weights.get(word, 0.0)) for word in tokens]  # Gunakan 0.0 jika kata tidak ditemukan
    all_word_weights.append(token_weights)

df_word_weights['word_weights'] = all_word_weights

# Output hasil
df_word_weights[['content_preprocessed', 'word_weights']]


Unnamed: 0,content_preprocessed,word_weights
0,"[kalo, buat, sw, kok, pecah, nunggu, thn, dulu...","[(kalo, -1.1471985619625078), (buat, -1.334645..."
1,"[baikin, bug, unblock]","[(baikin, -7.755026218748823), (bug, -2.455204..."
2,"[bentar, bentar, updates, sih, memo, aku, gak,...","[(bentar, -9.255344610406688), (bentar, -9.255..."
3,"[error, nelpon, suara, kadang, hubung, panggil...","[(error, -3.584121552626839), (nelpon, -3.5930..."
4,"[banyak, bug, nya, mau, update, story, aja, nu...","[(banyak, -2.0316643012711495), (bug, -2.45520..."
...,...,...
24995,[mantap],"[(mantap, 6.3075833181922345)]"
24996,[bgus],"[(bgus, 5.439872318265129)]"
24997,"[bagus, banget]","[(bagus, 2.311252292944158), (banget, -1.16072..."
24998,"[ohhh, bagus]","[(ohhh, 20.807533417004485), (bagus, 2.3112522..."


In [9]:
#PELABELAN DENGAN BOBOT BARU

# 1. Hitung total bobot per kalimat dari word_weights
def hitung_sentence_weight(word_weights):
    if not word_weights:
        return 0.0  # Hindari pembagian nol jika input kosong
    total = 0
    for word, weight in word_weights:
        total += weight
    return total / len(word_weights)  # Rata-rata: total dibagi jumlah kata

# 2. Fungsi pelabelan berdasarkan bobot kalimat
def label_by_weight(w):
    if w > 0.1:
        return 'Positif'
    elif w < -0.1:
        return 'Negatif'
    else:
        return 'Netral'

# 3. Terapkan ke DataFrame df_label (ganti df_word_weights dengan df_label)
df_label = df_word_weights.copy()  # Membuat salinan df_word_weights ke df_label
df_label['sentence_weight'] = df_label['word_weights'].apply(hitung_sentence_weight)
df_label['label_by_weight'] = df_label['sentence_weight'].apply(label_by_weight)

# 4. Tampilkan hasil di df_label
df_label[['content_preprocessed', 'sentence_weight', 'label_by_weight']]


Unnamed: 0,content_preprocessed,sentence_weight,label_by_weight
0,"[kalo, buat, sw, kok, pecah, nunggu, thn, dulu...",-2.280030,Negatif
1,"[baikin, bug, unblock]",-10.339255,Negatif
2,"[bentar, bentar, updates, sih, memo, aku, gak,...",-6.277977,Negatif
3,"[error, nelpon, suara, kadang, hubung, panggil...",-1.744999,Negatif
4,"[banyak, bug, nya, mau, update, story, aja, nu...",-2.307293,Negatif
...,...,...,...
24995,[mantap],6.307583,Positif
24996,[bgus],5.439872,Positif
24997,"[bagus, banget]",0.575265,Positif
24998,"[ohhh, bagus]",11.559393,Positif


##5.Pelatihan dan Evaluasi Model

In [10]:
# Siapkan data
X = df_label['content_preprocessed']
y = df_label['label_by_weight']

In [11]:
# Split data 70:30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x, token_pattern=None, lowercase=False)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Model Logistic Regression
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Prediksi
y_train_pred = model.predict(X_train_tfidf)
y_test_pred = model.predict(X_test_tfidf)

# Evaluasi
print("TF-IDF + Logistic Regression + (70:30)")
print(f"Akurasi Train: {accuracy_score(y_train, y_train_pred):.2%}")
print(f"Akurasi Test: {accuracy_score(y_test, y_test_pred):.2%}")
print("Classification Report (Test Data):")
print(classification_report(y_test, y_test_pred, zero_division=0))


TF-IDF + Logistic Regression + (70:30)
Akurasi Train: 95.70%
Akurasi Test: 93.20%
Classification Report (Test Data):
              precision    recall  f1-score   support

     Negatif       0.93      0.98      0.96      5289
      Netral       0.00      0.00      0.00        70
     Positif       0.93      0.83      0.88      1922

    accuracy                           0.93      7281
   macro avg       0.62      0.60      0.61      7281
weighted avg       0.92      0.93      0.93      7281



In [12]:
# Split 75:25
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# CountVectorizer
bow_vectorizer = CountVectorizer(tokenizer=lambda x: x, token_pattern=None, lowercase=False)
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# Model
model = SVC(kernel='linear', random_state=42)
model.fit(X_train_bow, y_train)

# Evaluasi
y_train_pred = model.predict(X_train_bow)
y_test_pred = model.predict(X_test_bow)

print("BoW + SVM + (75:25) ")
print(f"Akurasi Train: {accuracy_score(y_train, y_train_pred):.2%}")
print(f"Akurasi Test: {accuracy_score(y_test, y_test_pred):.2%}")
print(classification_report(y_test, y_test_pred, zero_division=0))


BoW + SVM + (75:25) 
Akurasi Train: 98.11%
Akurasi Test: 92.53%
              precision    recall  f1-score   support

     Negatif       0.94      0.97      0.95      4414
      Netral       0.50      0.02      0.03        56
     Positif       0.89      0.84      0.87      1597

    accuracy                           0.93      6067
   macro avg       0.78      0.61      0.62      6067
weighted avg       0.92      0.93      0.92      6067



In [13]:
# Split 80:20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF Vectorization
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_tfidf, y_train)

# Evaluasi
y_train_pred = model.predict(X_train_tfidf)
y_test_pred = model.predict(X_test_tfidf)

print("TF-IDF + Random Forest + (80:20)")
print(f"Akurasi Train: {accuracy_score(y_train, y_train_pred):.2%}")
print(f"Akurasi Test: {accuracy_score(y_test, y_test_pred):.2%}")
print(classification_report(y_test, y_test_pred, zero_division=0))


TF-IDF + Random Forest + (80:20)
Akurasi Train: 100.00%
Akurasi Test: 92.75%
              precision    recall  f1-score   support

     Negatif       0.94      0.97      0.95      3523
      Netral       1.00      0.11      0.19        47
     Positif       0.89      0.85      0.87      1284

    accuracy                           0.93      4854
   macro avg       0.94      0.64      0.67      4854
weighted avg       0.93      0.93      0.92      4854

