In [None]:
# ========================================================================
# CELL 1: INSTALASI LIBRARY
# Jalankan ini sekali saja di awal.
# Tanda seru (!) artinya kita menyuruh terminal komputer melakukan perintah ini.
# ========================================================================
!pip install pandas torch transformers seaborn matplotlib tqdm openpyxl scikit-learn ipywidgets

In [1]:
# ==============================================================================
# CELL 2: IMPORT & KONFIGURASI
# Menyiapkan variabel-variabel utama proyek.
# ==============================================================================
import os
import glob
import re
import pandas as pd
import torch
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import pipeline
from tqdm.notebook import tqdm  # Pakai tqdm khusus notebook biar loading bar-nya cantik

# --- PENGATURAN TAMPILAN GRAFIK ---
sns.set_style("whitegrid") # Biar background grafik ada garis-garis grid (lebih rapi)
%matplotlib inline 
# ^ Perintah "Magic" agar grafik muncul langsung di bawah cell ini (bukan pop-up)

# --- KONFIGURASI PROYEK ---
# Sesuaikan dengan nama folder di GitHub kamu
NAMA_FOLDER_DATA = 'DatasetHotel' 

# Label Kategori (Kamu bisa ubah/tambah list ini sesuai kebutuhan manajemen)
# AI akan menggunakan "Zero-Shot Learning" untuk mencocokkan review ke kategori ini.
CANDIDATE_LABELS = [
    "Kualitas Makanan & Restoran",      
    "Kebersihan & Kenyamanan Kamar",    
    "Pelayanan Staf & Keramahan",       
    "Fasilitas Hotel (Kolam/Gym/Spa)",  
    "Lokasi & Akses Strategis",         
    "Infrastruktur (AC/WiFi/Parkir/Air)", 
    "Harga & Value for Money"           
]

print("‚úÖ Konfigurasi selesai. Lanjut ke Cell berikutnya.")

  from .autonotebook import tqdm as notebook_tqdm


‚úÖ Konfigurasi selesai. Lanjut ke Cell berikutnya.


In [None]:
def setup_models(
    model_sentiment="w11wo/indonesian-roberta-base-sentiment-classifier",
    model_zeroshot="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
):
    """Load the sentiment and zero-shot (aspect) classification pipelines.

    Args:
        model_sentiment: Hugging Face model id used for both the model and the
            tokenizer of the "sentiment-analysis" pipeline.
        model_zeroshot: Hugging Face model id used for the
            "zero-shot-classification" pipeline.

    Returns:
        A tuple ``(clf_sentiment, clf_aspect)``. When loading fails for any
        reason, the error is printed and ``(None, None)`` is returned, so the
        caller must check for ``None`` before using the pipelines.
    """
    print("üß† MEMUAT KECERDASAN BUATAN (AI MODEL)...")

    # transformers pipelines take a CUDA device index, or -1 to run on CPU.
    device = 0 if torch.cuda.is_available() else -1
    if device == 0:
        print(f"‚úÖ GPU Terdeteksi: {torch.cuda.get_device_name(0)}")
    else:
        print("‚ö†Ô∏è Menggunakan CPU.")

    # NOTE(review): the defaults were picked by the original author as lighter
    # models (noted as ~400MB) to make the download more likely to succeed;
    # the size is not verified here.
    try:
        print("‚è≥ Sedang memuat model... (Harap pastikan internet lancar)")
        clf_sentiment = pipeline(
            "sentiment-analysis",
            model=model_sentiment,
            tokenizer=model_sentiment,
            device=device,
        )
        clf_aspect = pipeline(
            "zero-shot-classification",
            model=model_zeroshot,
            device=device,
        )
    except Exception as e:
        # Not every failure is a network problem (missing package, bad model
        # id, out of memory, ...), so report the real exception type instead
        # of always blaming the internet connection.
        print(f"‚ùå Gagal memuat model ({type(e).__name__}): {e}")
        return None, None

    print("‚úÖ SUKSES! Model Siap.")
    return clf_sentiment, clf_aspect


clf_sentiment, clf_aspect = setup_models()

üß† MEMUAT KECERDASAN BUATAN (AI MODEL)...
‚ö†Ô∏è Menggunakan CPU.
‚è≥ Sedang memuat model Multilingual... (Mungkin download lagi sekitar 500MB)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Error while downloading from https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment/resolve/main/pytorch_model.bin: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Error while downloading from https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment/resolve/main/pytorch_model.bin: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
'(MaxRetryError('HTTPSConnectio

‚ùå Error: 'NoneType' object has no attribute 'endswith'


Error while downloading from https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment/resolve/refs%2Fpr%2F15/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /cardiffnlp/twitter-xlm-roberta-base-sentiment/resolve/refs%2Fpr%2F15/model.safetensors (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001E012C2DE50>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 9e086cf9-9005-44cd-9957-ff626b892a55)')' thrown while requesting GET https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment/resolve/refs%2Fpr%2F15/model.safetensors
Retrying in 1s [Retry 1/5].
Error while downloading from https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment/resolve/refs%2Fpr%2F15/model.safetensors: HTTPSConnectionPool(ho

In [None]:
# ==============================================================================
# CELL 4: HELPER FUNCTIONS (DATA LOADING & CLEANING)
# ==============================================================================

def clean_text_safe(text):
    """Lightly clean a review while keeping emoji and punctuation intact.

    Removes HTML tags and ``http...`` links and collapses runs of whitespace.

    Args:
        text: The raw review. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as an empty review.

    Returns:
        The cleaned text as a single-spaced string ("" for missing input).
    """
    # Missing values would otherwise turn into the literal words "None"/"nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)
    # Only strip real tags (an optional "/" followed by a letter). A looser
    # pattern such as <.*?> also deletes ordinary text that merely sits
    # between "<" and ">", e.g. "harga < 500rb tapi > 300rb" or "<3".
    text = re.sub(r'</?[A-Za-z][^<>]*>', ' ', text)
    text = re.sub(r'http\S+', '', text)  # drop links/URLs
    return " ".join(text.split())        # collapse repeated whitespace

def _find_review_column(columns):
    """Return the single column that holds the review text, or None."""
    # An already-normalised column wins, so it is never shadowed by a looser match.
    if 'text_review' in columns:
        return 'text_review'
    for col in columns:
        lower_col = str(col).lower()  # headers are not guaranteed to be strings
        if ('review' in lower_col and 'text' in lower_col) or 'content' in lower_col:
            return col
    return None


def load_data(folder_name):
    """Read every review CSV below ``folder_name`` into one DataFrame.

    The folder is looked up in the current working directory first and then
    in its parent. CSV files are searched recursively. The expected layout is
    ``<folder>/<Tipe>/<Kelas>/<NamaHotel>.csv``; the two folder levels and the
    file name become the ``Tipe``, ``Kelas`` and ``Nama_Hotel`` columns.
    Files without enough folder levels get "Unknown" for the missing labels.

    Args:
        folder_name: Name (or path) of the dataset folder.

    Returns:
        A DataFrame with ``text_review``, ``Tipe``, ``Kelas``, ``Nama_Hotel``
        and, when the CSV provides it, ``Rating``. Rows without review text
        and repeated reviews of the same hotel are removed. An empty
        DataFrame is returned when the folder or usable data is missing.
    """
    print(f"üìÇ Mencari data di folder: {folder_name}")

    # Look next to the notebook first, then one level up (notebook inside e.g. src/).
    current_dir = os.getcwd()
    target_path = os.path.join(current_dir, folder_name)
    if not os.path.exists(target_path):
        target_path = os.path.join(current_dir, "..", folder_name)

    if not os.path.exists(target_path):
        print(f"‚ùå Gagal menemukan folder '{folder_name}'. Cek struktur foldermu.")
        return pd.DataFrame()

    # Sorted so the result does not depend on the file system's listing order.
    files = sorted(glob.glob(os.path.join(target_path, "**", "*.csv"), recursive=True))
    all_dfs = []

    print(f"üîç Ditemukan {len(files)} file CSV. Mulai membaca...")

    for file_path in files:
        try:
            # Derive the labels from the path RELATIVE to the dataset folder,
            # so unrelated parent directories can never leak into Tipe/Kelas.
            rel_parts = os.path.relpath(file_path, target_path).split(os.sep)
            tipe_hotel = rel_parts[-3] if len(rel_parts) >= 3 else "Unknown"
            kelas_bintang = rel_parts[-2] if len(rel_parts) >= 2 else "Unknown"
            # splitext removes only the extension, not a ".csv" inside the name.
            nama_hotel = os.path.splitext(os.path.basename(file_path))[0]

            df = pd.read_csv(file_path)

            # Rename exactly ONE column to 'text_review'; renaming every match
            # would create duplicate column names when several columns match.
            review_col = _find_review_column(df.columns)
            if review_col is None:
                continue
            rename_map = {review_col: 'text_review'}

            # Accept the rating column regardless of letter case.
            if 'Rating' not in df.columns:
                for col in df.columns:
                    if col != review_col and str(col).strip().lower() == 'rating':
                        rename_map[col] = 'Rating'
                        break

            df = df.rename(columns=rename_map).assign(
                Tipe=tipe_hotel,
                Kelas=kelas_bintang,
                Nama_Hotel=nama_hotel,
            )

            # Keep only the columns the analysis needs ('Rating' is optional).
            cols = ['text_review', 'Rating', 'Tipe', 'Kelas', 'Nama_Hotel']
            valid_cols = [c for c in cols if c in df.columns]
            all_dfs.append(df[valid_cols])

        except Exception as e:
            # One unreadable file must not abort the whole load.
            print(f"‚ö†Ô∏è Error baca file {file_path}: {e}")
            continue

    if not all_dfs:
        print("‚ùå Data Kosong.")
        return pd.DataFrame()

    final_df = pd.concat(all_dfs, ignore_index=True)
    # Drop empty reviews, then duplicates WITHIN a hotel. De-duplicating on the
    # text alone would also throw away identical short reviews ("Bagus") that
    # belong to different hotels and skew the per-hotel counts.
    final_df = (
        final_df
        .dropna(subset=['text_review'])
        .drop_duplicates(subset=['text_review', 'Nama_Hotel'])
        .reset_index(drop=True)
    )
    print(f"‚úÖ BERHASIL! Total Data Bersih: {len(final_df)} Review.")
    return final_df

print("‚úÖ Fungsi siap. Lanjut ke Cell Load Data.")

In [None]:
# ==============================================================================
# CELL 5: LOAD THE DATA NOW
# Check that the dataset is picked up and parsed correctly.
# ==============================================================================

# Read every CSV below the configured dataset folder.
df_hotel = load_data(NAMA_FOLDER_DATA)

# Preview the first five rows; display() renders the frame as a rich Jupyter table.
if df_hotel.empty:
    print("‚ö†Ô∏è Tidak ada data untuk ditampilkan.")
else:
    print("\nContoh 5 Data Pertama:")
    display(df_hotel.head())

In [None]:
# ==============================================================================
# CELL 6: RUNNING THE AI ANALYSIS
# The reviews are classified one at a time.
# ==============================================================================

def run_analysis_notebook(df):
    """Classify sentiment and main aspect for every review in ``df``.

    Uses the notebook-level ``clf_sentiment`` and ``clf_aspect`` pipelines,
    ``clean_text_safe`` and ``CANDIDATE_LABELS``.

    Args:
        df: DataFrame with a ``text_review`` column.

    Returns:
        A copy of ``df`` with three extra columns: ``clean_text``,
        ``AI_Sentiment`` and ``AI_Aspek``. The input frame is not modified.
        An empty ``df`` is returned unchanged.

    Raises:
        RuntimeError: If the models were not loaded (either pipeline is None).
    """
    if df.empty:
        print("Data kosong, tidak bisa analisis.")
        return df

    # Without this check every review would silently get the fallback labels
    # and the charts would show a meaningless all-"neutral" result.
    if clf_sentiment is None or clf_aspect is None:
        raise RuntimeError(
            "Model AI belum dimuat. Jalankan ulang cell setup_models() sampai sukses."
        )

    print("üöÄ Memulai Analisis Sentimen & Topik...")

    # 1. Clean the text on a copy, so the caller's frame (possibly a slice such
    #    as df.head(20)) is left untouched and the cell stays re-runnable.
    df = df.copy()
    df['clean_text'] = df['text_review'].apply(clean_text_safe)
    texts = df['clean_text'].astype(str).tolist()

    results_sentiment = []
    results_aspect = []
    failed_sentiment = 0
    failed_aspect = 0

    # 2. Classify review by review, with a progress bar.
    for text in tqdm(texts, desc="Sedang Menganalisis"):
        # Cheap pre-cut to 512 characters; for the sentiment model the
        # tokenizer additionally truncates to 512 tokens.
        snippet = text[:512]

        # A. Sentiment
        try:
            sentiment_label = clf_sentiment(snippet, truncation=True, max_length=512)[0]['label']
        except Exception:
            # Best effort: one bad review must not stop the run, but count it.
            sentiment_label = "neutral"
            failed_sentiment += 1

        # B. Aspect (zero-shot); labels come back sorted by score, best first.
        try:
            best_aspect = clf_aspect(snippet, CANDIDATE_LABELS, multi_label=False)['labels'][0]
        except Exception:
            best_aspect = "Lainnya"
            failed_aspect += 1

        results_sentiment.append(sentiment_label)
        results_aspect.append(best_aspect)

    # Make fallbacks visible instead of hiding them in the results.
    if failed_sentiment or failed_aspect:
        print(
            f"PERINGATAN: {failed_sentiment} review gagal dianalisis sentimennya "
            f"(diisi 'neutral') dan {failed_aspect} review gagal dianalisis "
            f"aspeknya (diisi 'Lainnya')."
        )

    # 3. Attach the results to the table.
    df['AI_Sentiment'] = results_sentiment
    df['AI_Aspek'] = results_aspect

    return df

# --- RUN THE ANALYSIS ---
# For a quick trial run, analyse a small sample first by uncommenting these lines:
# df_hotel_sample = df_hotel.head(20)
# df_result = run_analysis_notebook(df_hotel_sample)

# Process ALL reviews (this can take a long time):
df_result = run_analysis_notebook(df_hotel)

if df_result.empty:
    # With no data there are no AI_* columns: do not write an empty Excel
    # file, do not claim success, and do not fail on the column selection.
    print("Tidak ada hasil analisis; file Excel tidak dibuat.")
else:
    # Persist to Excel so the (slow) results survive a kernel restart.
    df_result.to_excel("Hasil_Analisis_Lengkap.xlsx", index=False)
    print("‚úÖ Analisis Selesai! File Excel 'Hasil_Analisis_Lengkap.xlsx' sudah disimpan.")

    # Preview the result.
    display(df_result[['text_review', 'AI_Sentiment', 'AI_Aspek']].head())

In [None]:
# ==============================================================================
# CELL 7: CHART 1 - COMPARISON BY HOTEL TYPE
# ==============================================================================
plt.figure(figsize=(10, 6))

# Grouped bar chart: one group per hotel type, one bar per sentiment label.
# countplot draws on the current figure and returns the Axes it used.
ax = sns.countplot(
    data=df_result,
    x='Tipe',
    hue='AI_Sentiment',
    palette='viridis',
)

ax.set_title('Head-to-Head: Sentimen BUMN vs KOMPETITOR', fontsize=14)
ax.set_xlabel('Tipe Hotel')
ax.set_ylabel('Jumlah Review')
plt.show()

In [None]:
# ==============================================================================
# CELL 8: CHART 2 - COMPARISON BY STAR CLASS
# ==============================================================================
plt.figure(figsize=(10, 6))

try:
    # Sort the classes so the x-axis reads Bintang3 -> Bintang4 -> Bintang5.
    order_bintang = list(df_result['Kelas'].unique())
    order_bintang.sort()

    # countplot draws on the current figure and returns the Axes it used.
    ax = sns.countplot(
        data=df_result,
        x='Kelas',
        hue='AI_Sentiment',
        order=order_bintang,
        palette='rocket',
    )

    ax.set_title('Distribusi Sentimen Berdasarkan Kelas Bintang', fontsize=14)
    ax.set_xlabel('Kelas Hotel')
    ax.set_ylabel('Jumlah Review')
    plt.show()
except Exception as e:
    # Report the problem instead of stopping the notebook on a plotting error.
    print(f"Gagal membuat grafik bintang: {e}")

In [None]:
# ==============================================================================
# CELL 9: CHART 3 - PROBLEM MAP (negative reviews only)
# ==============================================================================

# Keep only the reviews classified as negative. The comparison ignores letter
# case and surrounding spaces, because label casing depends on the sentiment
# model in use; an exact match would silently report "no negative reviews".
is_negative = df_result['AI_Sentiment'].astype(str).str.strip().str.lower() == 'negative'
df_neg = df_result[is_negative]

if df_neg.empty:
    print("üéâ Wow! Tidak ada review negatif sama sekali.")
else:
    plt.figure(figsize=(12, 8))

    # Most-complained-about aspect first (value_counts sorts descending).
    order_aspek = df_neg['AI_Aspek'].value_counts().index

    # Horizontal bars, split by hotel type.
    sns.countplot(data=df_neg, y='AI_Aspek', hue='Tipe', order=order_aspek, palette='magma')

    plt.title('TOP KELUHAN UTAMA (Berdasarkan Review Negatif)', fontsize=14)
    plt.xlabel('Jumlah Komplain')
    plt.ylabel('Kategori Masalah')
    plt.show()