In [25]:
# PEMBERSIHAN & PELABELAN

import pandas as pd
import re
import string
import os
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# --- 2.1 Pembersihan Teks --- #

# Inisialisasi stopword
stop_factory = StopWordRemoverFactory()
stopwords = stop_factory.get_stop_words()

# Fungsi pembersihan teks
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[' + string.punctuation + ']', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = ' '.join([word for word in text.split() if word not in stopwords])
    return text

# Daftar file hasil scraping
files = ['tiktok.csv', 'shopee.csv', 'gojek.csv', 'ruangguru.csv', 'whatsapp.csv']

# Proses cleaning
for file in files:
    if os.path.exists(file):
        print(f"Cleaning {file}...")
        df = pd.read_csv(file)
        if 'content' in df.columns:
            df['cleaned'] = df['content'].apply(clean_text)
            df.to_csv(file.replace('.csv', '_cleaned.csv'), index=False)
            print(f"{file.replace('.csv', '_cleaned.csv')} saved ✅\n")
        else:
            print(f"Kolom 'content' tidak ditemukan dalam {file} ⚠️\n")
    else:
        print(f"File {file} tidak ditemukan ⚠️\n")

print("SEMUA FILE TELAH DIBERSIHKAN ✅")

Cleaning tiktok.csv...
tiktok_cleaned.csv saved ✅

Cleaning shopee.csv...
shopee_cleaned.csv saved ✅

Cleaning gojek.csv...
gojek_cleaned.csv saved ✅

Cleaning ruangguru.csv...
ruangguru_cleaned.csv saved ✅

Cleaning whatsapp.csv...
whatsapp_cleaned.csv saved ✅

SEMUA FILE TELAH DIBERSIHKAN ✅


In [27]:
# --- 2.2 Pelabelan Semi-Otomatis --- #

# Kata kunci untuk labeling
positive_words = ['bagus', 'mantap', 'cepat', 'puas', 'keren', 'lancar']
negative_words = ['jelek', 'error', 'lemot', 'buruk', 'parah', 'gagal']

# Fungsi labeling berbasis kata kunci
def label_sentiment(text):
    if isinstance(text, str):
        if any(word in text for word in positive_words):
            return 'positif'
        elif any(word in text for word in negative_words):
            return 'negatif'
        else:
            return 'netral'
    else:
        return 'netral'

# Labeling untuk semua file cleaned
for file in files:
    cleaned_file = file.replace('.csv', '_cleaned.csv')
    if os.path.exists(cleaned_file):
        print(f"Labeling {cleaned_file}...")
        df = pd.read_csv(cleaned_file)
        if 'cleaned' in df.columns:
            df['sentiment'] = df['cleaned'].apply(label_sentiment)
            df.to_csv(file.replace('.csv', '_labeled.csv'), index=False)
            print(f"{file.replace('.csv', '_labeled.csv')} saved ✅\n")
        else:
            print(f"Kolom 'cleaned' tidak ditemukan dalam {cleaned_file} ⚠️\n")
    else:
        print(f"File {cleaned_file} tidak ditemukan ⚠️\n")

print("SEMUA FILE TELAH DILABELI ✅")

Labeling tiktok_cleaned.csv...
tiktok_labeled.csv saved ✅

Labeling shopee_cleaned.csv...
shopee_labeled.csv saved ✅

Labeling gojek_cleaned.csv...
gojek_labeled.csv saved ✅

Labeling ruangguru_cleaned.csv...
ruangguru_labeled.csv saved ✅

Labeling whatsapp_cleaned.csv...
whatsapp_labeled.csv saved ✅

SEMUA FILE TELAH DILABELI ✅


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Contoh load data
df = pd.read_csv('tiktok_labeled.csv')

# Pastikan tidak ada NaN atau data kosong
df['cleaned'] = df['cleaned'].fillna('')

# Inisialisasi TF-IDF
tfidf = TfidfVectorizer(max_features=200)

# Terapkan TF-IDF
X_tfidf = tfidf.fit_transform(df['cleaned'])

# Konversi ke DataFrame untuk melihat hasil
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())

# Lihat beberapa baris pertama
print(tfidf_df.head())

   air  aja       aku  akun  apa  apk  aplikasi  aplikasinya  app  asik  ...  \
0  0.0  0.0  0.000000   0.0  0.0  0.0  0.000000          0.0  0.0   0.0  ...   
1  0.0  0.0  0.462748   0.0  0.0  0.0  0.000000          0.0  0.0   0.0  ...   
2  0.0  0.0  0.000000   0.0  0.0  0.0  0.369211          0.0  0.0   0.0  ...   
3  0.0  0.0  0.000000   0.0  0.0  0.0  0.000000          0.0  0.0   0.0  ...   
4  0.0  0.0  0.000000   0.0  0.0  0.0  0.000000          0.0  0.0   0.0  ...   

   versi  very  video  vidio  yah  yes   yg  you  على   من  
0    0.0   0.0    0.0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  
1    0.0   0.0    0.0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  
2    0.0   0.0    0.0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  
3    0.0   0.0    0.0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  
4    0.0   0.0    0.0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 200 columns]


In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data yang sudah dibersihkan dan dilabeli
df = pd.read_csv('tiktok_labeled.csv')

# Tangani NaN dengan mengganti dengan string kosong
df['cleaned'] = df['cleaned'].fillna('')

# Ekstraksi fitur TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(df['cleaned'])
y = df['sentiment'].apply(lambda x: 1 if x == 'positif' else (0 if x == 'negatif' else 2))  # Mengubah label menjadi numerik

# Pembagian data menjadi training dan testing set
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

from sklearn.metrics import precision_score, recall_score, f1_score

# Fungsi untuk mengevaluasi model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)  # Menangani label dengan tidak ada prediksi
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)        # Menangani label dengan tidak ada prediksi
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)                # Menangani label dengan tidak ada prediksi
    return accuracy, precision, recall, f1

# 1. Logistic Regression
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
logreg_accuracy, logreg_precision, logreg_recall, logreg_f1 = evaluate_model(logreg, X_test, y_test)

# 2. Random Forest
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
rf_accuracy, rf_precision, rf_recall, rf_f1 = evaluate_model(rf, X_test, y_test)

# 3. Neural Network (MLP)
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500)
mlp.fit(X_train, y_train)
mlp_accuracy, mlp_precision, mlp_recall, mlp_f1 = evaluate_model(mlp, X_test, y_test)

# Evaluasi model
logreg_accuracy, logreg_precision, logreg_recall, logreg_f1 = evaluate_model(logreg, X_test, y_test)
rf_accuracy, rf_precision, rf_recall, rf_f1 = evaluate_model(rf, X_test, y_test)
mlp_accuracy, mlp_precision, mlp_recall, mlp_f1 = evaluate_model(mlp, X_test, y_test)

# Hasil evaluasi
print("Evaluasi Model:")
print(f"Logistic Regression: Accuracy={logreg_accuracy:.4f}, Precision={logreg_precision:.4f}, Recall={logreg_recall:.4f}, F1-score={logreg_f1:.4f}")
print(f"Random Forest: Accuracy={rf_accuracy:.4f}, Precision={rf_precision:.4f}, Recall={rf_recall:.4f}, F1-score={rf_f1:.4f}")
print(f"Neural Network (MLP): Accuracy={mlp_accuracy:.4f}, Precision={mlp_precision:.4f}, Recall={mlp_recall:.4f}, F1-score={mlp_f1:.4f}")

Evaluasi Model:
Logistic Regression: Accuracy=0.9444, Precision=0.9482, Recall=0.9444, F1-score=0.9329
Random Forest: Accuracy=0.9500, Precision=0.9517, Recall=0.9500, F1-score=0.9390
Neural Network (MLP): Accuracy=0.9278, Precision=0.9290, Recall=0.9278, F1-score=0.9170


In [37]:
# Fungsi untuk melakukan prediksi sentimen pada teks baru
def predict_sentiment(text, model, tfidf_vectorizer=None):
    # Pembersihan teks
    text_cleaned = clean_text(text)
    
    # Ekstraksi fitur (menggunakan TF-IDF jika model menggunakan TF-IDF)
    if tfidf_vectorizer:
        text_tfidf = tfidf_vectorizer.transform([text_cleaned])
        pred = model.predict(text_tfidf)
    else:
        # Jika menggunakan model yang sudah dilatih dengan representasi lain seperti Word2Vec
        tokens = text_cleaned.split()
        text_w2v = get_avg_w2v(tokens, model_w2v)
        pred = model.predict([text_w2v])
    
    # Mengembalikan prediksi sentimen
    return pred[0]

# Contoh penggunaan untuk teks baru:
text_input = "Aplikasi ini sangat bagus, mudah digunakan dan cepat."
sentiment_prediction = predict_sentiment(text_input, logreg, tfidf)

print(f"Sentimen untuk teks '{text_input}' adalah: {sentiment_prediction}")

Sentimen untuk teks 'Aplikasi ini sangat bagus, mudah digunakan dan cepat.' adalah: 1
