In [1]:
!pip install google-play-scraper



In [25]:
from google_play_scraper import reviews, Sort
import pandas as pd

# Apps to scrape: short name (also used as the CSV file stem) -> Play Store package ID
apps = dict(
    tiktok="com.zhiliaoapp.musically",
    shopee="com.shopee.id",
    gojek="com.gojek.app",
    ruangguru="com.ruangguru.livestudents",
    whatsapp="com.whatsapp",
)

# Review-scraping helper built on google-play-scraper
def scrape_reviews(app_name, app_id, max_reviews=900):
    """Fetch the newest Play Store reviews for one app and save them as CSV.

    Requests up to ``max_reviews`` reviews for ``app_id`` with ``lang='id'``
    and ``country='id'``, sorted newest first, and writes them to
    ``{app_name}.csv`` in the current working directory.

    Args:
        app_name: Short name used in progress messages and as the CSV file stem.
        app_id: Play Store package ID passed to ``reviews``.
        max_reviews: Value passed as ``count`` to ``reviews``.

    Returns:
        None. The result is the CSV file written to disk.
    """
    print(f"Scraping {app_name}...")
    # reviews() returns a pair; only the first element (the review records)
    # is used here, the second is discarded.
    result, _ = reviews(
        app_id,
        lang='id',
        country='id',
        sort=Sort.NEWEST,
        count=max_reviews
    )
    df = pd.DataFrame(result)
    if df.empty and 'content' not in df.columns:
        # Nothing came back: a frame without columns would be written as a CSV
        # with no header row, which the later pd.read_csv / 'content' column
        # checks cannot handle. Write an empty file that still has the header.
        print(f"Tidak ada review untuk {app_name} ⚠️")
        df = pd.DataFrame(columns=['content'])
    elif 'content' in df.columns:
        # Keep only the review text; the other fields are not used downstream.
        df = df[['content']]
    df.to_csv(f"{app_name}.csv", index=False)
    print(f"{app_name}.csv saved ✅\n")

# Scrape every configured app in turn, then report completion
for name, app_id in apps.items():
    scrape_reviews(app_name=name, app_id=app_id, max_reviews=900)
else:
    # for/else without a break: always runs once the loop has finished
    print("SEMUA DATA TELAH DIAMBIL ✅")

Scraping tiktok...
tiktok.csv saved ✅

Scraping shopee...
shopee.csv saved ✅

Scraping gojek...
gojek.csv saved ✅

Scraping ruangguru...
ruangguru.csv saved ✅

Scraping whatsapp...
whatsapp.csv saved ✅

SEMUA DATA TELAH DIAMBIL ✅


In [5]:
# PEMBERSIHAN & PELABELAN

import pandas as pd
import re
import string
import os
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# --- 2.1 Text cleaning --- #

# Build the Sastrawi stop-word remover factory and take its stop-word
# collection; clean_text() drops every token that appears in `stopwords`.
stop_factory = StopWordRemoverFactory()
stopwords = stop_factory.get_stop_words()

# Text-cleaning helper

# Matches one ASCII punctuation character. re.escape is required because
# string.punctuation contains characters that are special inside a character
# class: unescaped, the sequence "\]" is read as an escaped "]" and the
# backslash itself is never matched.
_PUNCT_RE = re.compile('[' + re.escape(string.punctuation) + ']')


def clean_text(text, stop_words=None):
    """Normalise one review string for keyword labeling and TF-IDF.

    Steps, in order: lowercase, replace ASCII punctuation with spaces, delete
    digit runs, collapse whitespace, and drop stop words.

    Args:
        text: The raw review. Non-string values are converted with ``str``;
            ``None`` and float NaN (a missing cell read from CSV) are treated
            as empty text instead of becoming the words "none" / "nan".
        stop_words: Optional container of tokens to drop. When omitted, the
            module-level ``stopwords`` collection is used.

    Returns:
        The cleaned text as a single space-separated string (possibly ``''``).
    """
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if stop_words is None:
        stop_words = stopwords

    text = str(text).lower()
    text = _PUNCT_RE.sub(' ', text)
    text = re.sub(r'\d+', '', text)
    # split() with no argument already collapses and trims whitespace.
    return ' '.join(word for word in text.split() if word not in stop_words)

# CSV files produced by the scraping step, one per app
files = [
    'tiktok.csv',
    'shopee.csv',
    'gojek.csv',
    'ruangguru.csv',
    'whatsapp.csv',
]

# Clean each scraped file; files that are missing or lack a 'content'
# column are reported and skipped.
for file in files:
    if not os.path.exists(file):
        print(f"File {file} tidak ditemukan ⚠️\n")
        continue

    print(f"Cleaning {file}...")
    df = pd.read_csv(file)
    if 'content' not in df.columns:
        print(f"Kolom 'content' tidak ditemukan dalam {file} ⚠️\n")
        continue

    # Output goes next to the input, e.g. tiktok.csv -> tiktok_cleaned.csv
    cleaned_path = file.replace('.csv', '_cleaned.csv')
    df['cleaned'] = df['content'].apply(clean_text)
    df.to_csv(cleaned_path, index=False)
    print(f"{cleaned_path} saved ✅\n")

print("SEMUA FILE TELAH DIBERSIHKAN ✅")

Cleaning tiktok.csv...
tiktok_cleaned.csv saved ✅

Cleaning shopee.csv...
shopee_cleaned.csv saved ✅

Cleaning gojek.csv...
gojek_cleaned.csv saved ✅

Cleaning ruangguru.csv...
ruangguru_cleaned.csv saved ✅

Cleaning whatsapp.csv...
whatsapp_cleaned.csv saved ✅

SEMUA FILE TELAH DIBERSIHKAN ✅


In [7]:
# --- 2.2 Pelabelan Semi-Otomatis --- #

# Keyword lists that drive the labeling
positive_words = [
    'bagus', 'mantap', 'cepat',
    'puas', 'keren', 'lancar',
]
negative_words = [
    'jelek', 'error', 'lemot',
    'buruk', 'parah', 'gagal',
]

# Keyword-based labeling function
def label_sentiment(text):
    """Label a cleaned review as 'positif', 'negatif' or 'netral'.

    A keyword counts as present when it occurs anywhere in ``text`` as a
    substring. Positive keywords are checked first, so a text containing both
    positive and negative keywords is labeled 'positif'. Non-string input
    (for example NaN from an empty CSV cell) is labeled 'netral'.
    """
    if not isinstance(text, str):
        return 'netral'

    # Order matters: the first keyword list with a hit decides the label.
    for label, keywords in (('positif', positive_words), ('negatif', negative_words)):
        if any(keyword in text for keyword in keywords):
            return label
    return 'netral'

# Label every cleaned file; files that are missing or lack a 'cleaned'
# column are reported and skipped.
for file in files:
    cleaned_file = file.replace('.csv', '_cleaned.csv')
    if not os.path.exists(cleaned_file):
        print(f"File {cleaned_file} tidak ditemukan ⚠️\n")
        continue

    print(f"Labeling {cleaned_file}...")
    df = pd.read_csv(cleaned_file)
    if 'cleaned' not in df.columns:
        print(f"Kolom 'cleaned' tidak ditemukan dalam {cleaned_file} ⚠️\n")
        continue

    # Output name is derived from the original scrape file,
    # e.g. tiktok.csv -> tiktok_labeled.csv
    labeled_file = file.replace('.csv', '_labeled.csv')
    df['sentiment'] = df['cleaned'].apply(label_sentiment)
    df.to_csv(labeled_file, index=False)
    print(f"{labeled_file} saved ✅\n")

print("SEMUA FILE TELAH DILABELI ✅")

Labeling tiktok_cleaned.csv...
tiktok_labeled.csv saved ✅

Labeling shopee_cleaned.csv...
shopee_labeled.csv saved ✅

Labeling gojek_cleaned.csv...
gojek_labeled.csv saved ✅

Labeling ruangguru_cleaned.csv...
ruangguru_labeled.csv saved ✅

Labeling whatsapp_cleaned.csv...
whatsapp_labeled.csv saved ✅

SEMUA FILE TELAH DILABELI ✅


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Example: load one labeled file and make sure the text column has no NaN
df = pd.read_csv('tiktok_labeled.csv')
df = df.assign(cleaned=lambda frame: frame['cleaned'].fillna(''))

# Fit a TF-IDF vectorizer capped at 200 features and transform the cleaned text
tfidf = TfidfVectorizer(max_features=200)
X_tfidf = tfidf.fit_transform(df['cleaned'])

# Densify into a DataFrame with one column per term so the weights can be inspected
feature_names = tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=feature_names)

# Show the first few rows
print(tfidf_df.head())

   air  aja       aku  akun  apa  apk  aplikasi  aplikasinya  app  asik  ...  \
0  0.0  0.0  0.000000   0.0  0.0  0.0  0.000000          0.0  0.0   0.0  ...   
1  0.0  0.0  0.462748   0.0  0.0  0.0  0.000000          0.0  0.0   0.0  ...   
2  0.0  0.0  0.000000   0.0  0.0  0.0  0.369211          0.0  0.0   0.0  ...   
3  0.0  0.0  0.000000   0.0  0.0  0.0  0.000000          0.0  0.0   0.0  ...   
4  0.0  0.0  0.000000   0.0  0.0  0.0  0.000000          0.0  0.0   0.0  ...   

   versi  very  video  vidio  yah  yes   yg  you  على   من  
0    0.0   0.0    0.0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  
1    0.0   0.0    0.0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  
2    0.0   0.0    0.0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  
3    0.0   0.0    0.0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  
4    0.0   0.0    0.0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 200 columns]
