In [None]:
# Pseudo-labeling Parfume Dataset — Versi teliti & siap run
# Pastikan path FILE_PATH sesuai environment Anda

import os
import re
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# ---------------------
# Config paths
# ---------------------
# NOTE(review): both paths below are absolute, machine-specific Windows paths;
# they must be edited before the notebook can run on any other machine.
# FILE_PATH is the input CSV; it is read with pd.read_csv and must provide the
# 'comment' and 'label' columns.
FILE_PATH = r"C:\Users\A S U S\OneDrive\Documents\TUGASSS\Proyek-Web-Scraping-Shopee-Perbandingan-Dua-Produk\Lebelingdata\dataset_parfume_komentar.csv"
# OUTPUT_FOLDER is the directory that receives the prediction CSV (created on demand).
OUTPUT_FOLDER = r"C:\Users\A S U S\OneDrive\Documents\TUGASSS\Proyek-Web-Scraping-Shopee-Perbandingan-Dua-Produk\Lebelingdata"
# OUTPUT_FILE is the full path of the CSV written at the end of the notebook.
OUTPUT_FILE = os.path.join(OUTPUT_FOLDER, "hasil_prediksi_pseudo_label.csv")



In [25]:
# ---------------------
# Preprocessing function
# ---------------------
def clean_text(s: str) -> str:
    """Normalize one raw comment into lowercase alphanumeric tokens.

    Missing values (anything ``pd.isna`` reports as missing) become the empty
    string. Otherwise the value is stringified and lowercased, quote characters
    and every other character outside ``a-z``, ``0-9`` and whitespace are
    replaced by spaces, whitespace runs are collapsed to a single space, and
    leading/trailing whitespace is removed.

    Args:
        s: The raw comment value (a string, or a missing marker such as NaN/None).

    Returns:
        The cleaned text; may be an empty string.
    """
    if pd.isna(s):
        return ""

    lowered = str(s).lower()
    # Turn both quote characters into spaces before the general punctuation pass.
    for quote_char in ('"', "'"):
        lowered = lowered.replace(quote_char, " ")

    # Anything that is not a lowercase ASCII letter, digit or whitespace becomes a space.
    alnum_only = re.sub(r"[^a-z0-9\s]+", " ", lowered)
    # Collapse whitespace runs and trim the ends.
    return re.sub(r"\s+", " ", alnum_only).strip()

# ---------------------
# Load dataset
# ---------------------
print("Membaca dataset dari:", FILE_PATH)
df = pd.read_csv(FILE_PATH)
print("Kolom dataset:", df.columns.tolist())

# Make sure the 'comment' and 'label' columns exist
if 'comment' not in df.columns or 'label' not in df.columns:
    raise ValueError("Dataset harus memiliki kolom 'comment' dan 'label' (periksa header CSV).")

# Clean the text and drop empty rows.
# clean_text is applied to the raw values (no astype(str) beforehand): casting
# first would turn a missing comment into the literal string "nan", which
# bypasses clean_text's pd.isna() guard and survives the empty-row filter.
df['comment'] = df['comment'].apply(clean_text)
# clean_text already strips its result, so a plain comparison finds the empty rows.
df = df[df['comment'] != ""].reset_index(drop=True)
print(f"Jumlah baris setelah preprocessing: {len(df)}")

# ---------------------
# Label summary & sample comments
# ---------------------
print("\nJumlah per label:")
print(df['label'].value_counts())

print("\n15 contoh label positive:")
print(df[df['label']=='positive']['comment'].head(15).to_list())

print("\n15 contoh label negative:")
print(df[df['label']=='negative']['comment'].head(15).to_list())

print("\n10 contoh label neutral:")
print(df[df['label']=='neutral']['comment'].head(10).to_list())



Membaca dataset dari: C:\Users\A S U S\OneDrive\Documents\TUGASSS\Proyek-Web-Scraping-Shopee-Perbandingan-Dua-Produk\Lebelingdata\dataset_parfume_komentar.csv
Kolom dataset: ['comment', 'label']
Jumlah baris setelah preprocessing: 250

Jumlah per label:
label
positive    100
negative    100
neutral      50
Name: count, dtype: int64

15 contoh label positive:
['parfumnya sesuai deskripsi dan enak dipakai', 'parfumnya sesuai deskripsi dan enak dipakai waktu dipakai', 'wangi parfum ini bikin nyaman seharian menurut saya', 'parfumnya sesuai deskripsi dan enak dipakai menurut saya', 'wangi parfum ini bikin nyaman seharian', 'suka banget sama aroma parfumnya nggak nyengat waktu dipakai', 'aromanya bikin mood jadi lebih baik waktu dipakai', 'aromanya bikin mood jadi lebih baik waktu dipakai', 'aromanya lembut dan elegan cocok buat sehari hari sejauh ini', 'suka banget sama aroma parfumnya nggak nyengat waktu dipakai', 'suka banget sama aroma parfumnya nggak nyengat waktu dipakai', 'parfum ini

In [26]:
# ---------------------
# Split: train / val / unlabeled(sim)
# 60% train, 20% val, 20% unlabeled (simulated when no real unlabeled data exists)
#
# The raw text is split BEFORE vectorizing so that the TF-IDF vocabulary and IDF
# weights are learned from the training portion only. Fitting the vectorizer on
# the full dataset would leak validation / unlabeled text into the features.
# The split arguments (sizes, random_state, stratify) are unchanged.
#
# NOTE(review): the sample preview printed by the loading cell shows exact
# duplicate comments; identical texts can therefore land in both train and val,
# which inflates validation accuracy. Consider de-duplicating before splitting.
# ---------------------
texts_all = df['comment'].values
y_all = df['label'].values

texts_temp, texts_unlabeled, y_temp, y_unlabeled = train_test_split(
    texts_all, y_all, test_size=0.20, random_state=42, stratify=y_all
)
texts_train, texts_val, y_train, y_val = train_test_split(
    texts_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)  # 0.25 * 0.8 = 0.2

# ---------------------
# TF-IDF (fit on train only, then transform every other subset)
# ---------------------
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train = vectorizer.fit_transform(texts_train)   # sparse matrix
X_val = vectorizer.transform(texts_val)
X_unlabeled = vectorizer.transform(texts_unlabeled)
# Kept so later cells that use X_temp / X_all keep working with the same feature space.
X_temp = vectorizer.transform(texts_temp)
X_all = vectorizer.transform(texts_all)

print(f"Ukuran train: {X_train.shape}, val: {X_val.shape}, unlabeled(sim): {X_unlabeled.shape}")



Ukuran train: (150, 286), val: (50, 286), unlabeled(sim): (50, 286)


In [27]:
# ---------------------
# Pseudo-labeling util
# ---------------------
def pseudo_labeling(model, unlabeled_X, threshold=0.85):
    """Select confident predictions on unlabeled rows to use as pseudo-labels.

    Args:
        model: A fitted classifier exposing ``predict_proba`` and ``classes_``.
        unlabeled_X: Feature matrix of the unlabeled rows (indexed with a
            boolean row mask to pick the confident ones).
        threshold: Minimum top-class probability a row needs to be selected.

    Returns:
        ``(labels, rows)`` where ``labels`` holds the predicted class of each
        selected row and ``rows`` is the matching subset of ``unlabeled_X``.
        When no row reaches the threshold, returns ``(np.array([]), None)``.
    """
    class_probs = model.predict_proba(unlabeled_X)
    top_class_idx = np.argmax(class_probs, axis=1)
    predicted = model.classes_[top_class_idx]

    # A row is confident when its highest class probability reaches the threshold.
    confident = np.max(class_probs, axis=1) >= threshold
    if not confident.any():
        return np.array([]), None

    return predicted[confident], unlabeled_X[confident]

# ---------------------
# Baseline: logistic regression fitted on the labeled training split only
# ---------------------
# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the baseline on the validation split.
pred_val = model.predict(X_val)
baseline_accuracy = accuracy_score(y_val, pred_val)
baseline_report = classification_report(y_val, pred_val)
print("\nEvaluasi awal pada validation set:")
print("Accuracy:", baseline_accuracy)
print(baseline_report)

# ---------------------
# Pseudo-labeling and training-set augmentation
# ---------------------
THRESHOLD = 0.85
pseudo_preds, pseudo_X = pseudo_labeling(model, X_unlabeled, threshold=THRESHOLD)
print(f"\nJumlah sample yang di-pseudo-label (threshold={THRESHOLD}): {0 if pseudo_X is None else pseudo_X.shape[0]}")

# Only extend the training data when at least one row was confidently labeled;
# otherwise the augmented set is just the original training split.
has_pseudo = pseudo_X is not None and pseudo_X.shape[0] > 0
X_train_aug = sparse.vstack([X_train, pseudo_X]) if has_pseudo else X_train
y_train_aug = np.concatenate([y_train, pseudo_preds]) if has_pseudo else y_train

# Retrain a fresh model on the (possibly) augmented data.
model_aug = LogisticRegression(max_iter=1000).fit(X_train_aug, y_train_aug)

# Score the retrained model on the same validation split as the baseline.
pred_val_aug = model_aug.predict(X_val)
augmented_accuracy = accuracy_score(y_val, pred_val_aug)
augmented_report = classification_report(y_val, pred_val_aug)
print("\nEvaluasi setelah pseudo-labeling pada validation set:")
print("Accuracy:", augmented_accuracy)
print(augmented_report)

# ---------------------
# Predict the whole dataset & save to CSV
# ---------------------
pred_all = model_aug.predict(X_all)

# Output columns: cleaned comment, original label, model prediction (in that order).
result_df = (
    df[['comment', 'label']]
    .rename(columns={'label': 'label_asli'})
    .assign(prediksi_model=pred_all)
)

# Create the output folder if needed, then write the predictions.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
result_df.to_csv(OUTPUT_FILE, index=False, encoding='utf-8')
print("\nHasil prediksi disimpan di:", OUTPUT_FILE)

# ---------------------
# Notes / next steps
# ---------------------
closing_notes = (
    "\nSelesai.",
    "- Untuk menambah kolom probabilitas prediksi, kita bisa simpan model.predict_proba(X_all).",
    "- Untuk meningkatkan performa: coba model lain, grid-search, atau augmentasi data.",
)
for note in closing_notes:
    print(note)



Evaluasi awal pada validation set:
Accuracy: 1.0
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00        20
     neutral       1.00      1.00      1.00        10
    positive       1.00      1.00      1.00        20

    accuracy                           1.00        50
   macro avg       1.00      1.00      1.00        50
weighted avg       1.00      1.00      1.00        50


Jumlah sample yang di-pseudo-label (threshold=0.85): 0

Evaluasi setelah pseudo-labeling pada validation set:
Accuracy: 1.0
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00        20
     neutral       1.00      1.00      1.00        10
    positive       1.00      1.00      1.00        20

    accuracy                           1.00        50
   macro avg       1.00      1.00      1.00        50
weighted avg       1.00      1.00      1.00        50


Hasil prediksi disimpan di: C:\Users\A S U S\OneDrive\Document