In [None]:
import os
import glob
import re
import pandas as pd
import torch
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import pipeline
from tqdm.auto import tqdm

In [None]:
# ==============================================================================
# KONFIGURASI PROYEK
# ==============================================================================
# Name of the root data folder; it is resolved relative to the current
# working directory when the data is loaded.
NAMA_FOLDER_DATA = "Project_Hotel"

# Candidate aspect categories offered to the zero-shot classifier.
# The list can be adjusted; the label strings are used verbatim.
CANDIDATE_LABELS = [
    "Kualitas Makanan & Restoran",         # food & restaurant quality
    "Kebersihan & Kenyamanan Kamar",       # room cleanliness & comfort
    "Pelayanan Staf & Keramahan",          # staff service & friendliness
    "Fasilitas Hotel (Kolam/Gym/Spa)",     # hotel facilities
    "Lokasi & Akses Strategis",            # location & access
    "Infrastruktur (AC/WiFi/Parkir/Air)",  # infrastructure
    "Harga & Value for Money",             # price & value
]

In [None]:
# ==============================================================================
# BAGIAN 1: SETUP HARDWARE & MODEL AI
# ==============================================================================
def setup_models():
    """Build the sentiment and zero-shot aspect classification pipelines.

    Uses CUDA device 0 when ``torch.cuda.is_available()`` reports a GPU and
    falls back to the CPU (``device=-1``) otherwise. Progress messages are
    printed to stdout.

    Returns:
        tuple: ``(classifier_sentiment, classifier_aspect)`` as returned by
        ``pipeline("sentiment-analysis", ...)`` and
        ``pipeline("zero-shot-classification", ...)``.

    Raises:
        RuntimeError: If either pipeline cannot be created. The underlying
            exception is chained as ``__cause__``.
    """
    print("\n" + "="*60)
    print(" üß† MEMUAT KECERDASAN BUATAN (AI MODEL)")
    print("="*60)

    # Check for an NVIDIA GPU; the pipelines take 0 for the first GPU, -1 for CPU.
    device = 0 if torch.cuda.is_available() else -1
    if device == 0:
        print(f"‚úÖ GPU Terdeteksi: {torch.cuda.get_device_name(0)}")
        print("   -> Analisis akan berjalan CEPAT.")
    else:
        print("‚ö†Ô∏è GPU Tidak Terdeteksi. Menggunakan CPU.")
        print("   -> Analisis akan berjalan LAMBAT. Mohon bersabar.")

    # Checkpoints: Indonesian RoBERTa (sentiment) and multilingual mDeBERTa (NLI zero-shot).
    model_sentiment = "w11wo/indonesian-roberta-base-sentiment-classifier"
    model_zeroshot = "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"

    print("\n‚è≥ Sedang mendownload/memuat model... (Hanya lama di awal)")
    try:
        classifier_sentiment = pipeline("sentiment-analysis", model=model_sentiment, tokenizer=model_sentiment, device=device)
        classifier_aspect = pipeline("zero-shot-classification", model=model_zeroshot, device=device)
    except Exception as e:
        print(f"‚ùå Gagal memuat model. Error: {e}")
        # Raise instead of calling exit(): exit() is an interactive helper that
        # shuts down a notebook kernel and, in a script, ends with status 0
        # even though loading failed.
        raise RuntimeError(f"Gagal memuat model: {e}") from e

    print("‚úÖ Semua Model Berhasil Dimuat!")
    return classifier_sentiment, classifier_aspect

In [None]:
# ==============================================================================
# BAGIAN 2: PREPROCESSING ILMIAH
# ==============================================================================
def clean_text_safe(text):
    """Remove technical noise from a review while keeping sentiment cues.

    HTML tags are replaced by a space, ``http...`` URLs are removed and runs
    of whitespace are collapsed. Emoji and punctuation are left untouched
    because they carry sentiment.

    Args:
        text: The raw review; it is converted with ``str()`` first.

    Returns:
        str: The cleaned text.
    """
    text = str(text)
    # Strip only spans shaped like markup (<b>, </p>, <br/>, <!-- ... -->).
    # A generic '<.*?>' would also delete ordinary text between a '<' and a
    # later '>' (e.g. "<3 ... > 4") and would miss tags spanning a newline.
    text = re.sub(r'<(?:/?[A-Za-z]|!)[^<>]*>', ' ', text)
    # Drop URLs.
    text = re.sub(r'http\S+', '', text)
    # Collapse repeated whitespace and trim both ends.
    text = " ".join(text.split())
    return text

In [None]:
# ==============================================================================
# BAGIAN 3: LOAD DATA DENGAN DETEKSI BINTANG
# ==============================================================================
def load_data_from_folder(root_folder):
    """Load every review CSV found under ``root_folder`` into one DataFrame.

    The expected layout is ``<root_folder>/<Tipe>/<Kelas>/<Nama_Hotel>.csv``
    (for example ``BUMN/Bintang_5/GrandAston.csv``). The two directory levels
    directly above each file, measured relative to ``root_folder``, become the
    ``Tipe`` and ``Kelas`` columns; a missing level is recorded as
    ``"Unknown"``.

    The first column whose name contains both "review" and "text", or contains
    "content" (case-insensitive), is renamed to ``text_review``. Files without
    such a column are skipped with a message, and files that fail to load are
    reported and skipped.

    Args:
        root_folder: Folder name or path, resolved against the current
            working directory.

    Returns:
        pandas.DataFrame: Rows with ``text_review``, ``Tipe``, ``Kelas``,
        ``Nama_Hotel`` and, when present in the file, ``Rating``. Rows with a
        missing review text are dropped, and so are repeated review texts
        within the same hotel. An empty DataFrame is returned when the folder
        is missing or nothing usable was found.
    """
    print("\n" + "="*60)
    print(f" üìÇ MEMBACA DATA DARI FOLDER: {root_folder}")
    print("="*60)

    target_path = os.path.join(os.getcwd(), root_folder)

    if not os.path.isdir(target_path):
        print(f"‚ùå Folder '{root_folder}' tidak ditemukan!")
        return pd.DataFrame()

    # Sorted so the load order, and therefore which duplicate is kept, is
    # reproducible; glob itself does not guarantee an order.
    files = sorted(glob.glob(os.path.join(target_path, "**", "*.csv"), recursive=True))
    all_dfs = []

    print(f"üîç Ditemukan {len(files)} file CSV.")

    for file_path in files:
        try:
            # Derive metadata from the path relative to the data root, so that
            # directories above the root never leak into Tipe/Kelas.
            rel_parts = os.path.relpath(file_path, target_path).split(os.sep)
            tipe_hotel = rel_parts[-3] if len(rel_parts) >= 3 else "Unknown"
            kelas_bintang = rel_parts[-2] if len(rel_parts) >= 2 else "Unknown"
            # splitext removes only the final extension; str.replace would also
            # strip ".csv" from the middle of a name.
            nama_hotel = os.path.splitext(os.path.basename(file_path))[0]

            df = pd.read_csv(file_path)

            # Find the review-text column; str() guards non-string labels.
            text_col = None
            for col in df.columns:
                lowered = str(col).lower()
                if ('review' in lowered and 'text' in lowered) or 'content' in lowered:
                    text_col = col
                    break

            if text_col is None:
                print(f"   ‚ö†Ô∏è Skip {nama_hotel}: Kolom teks tidak ditemukan.")
                continue

            df = df.rename(columns={text_col: 'text_review'})
            df['Tipe'] = tipe_hotel
            df['Kelas'] = kelas_bintang
            df['Nama_Hotel'] = nama_hotel

            # Keep only the columns the analysis uses; Rating is optional.
            cols = ['text_review', 'Rating', 'Tipe', 'Kelas', 'Nama_Hotel']
            valid_cols = [c for c in cols if c in df.columns]
            all_dfs.append(df[valid_cols])

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load.
            print(f"   ‚ùå Error membaca {file_path}: {e}")

    if not all_dfs:
        return pd.DataFrame()

    final_df = pd.concat(all_dfs, ignore_index=True)
    final_df = final_df.dropna(subset=['text_review'])
    # De-duplicate per hotel: the same short review for two different hotels
    # is two genuine data points and must not be collapsed into one.
    final_df = final_df.drop_duplicates(
        subset=['Tipe', 'Kelas', 'Nama_Hotel', 'text_review']
    )
    print(f"‚úÖ Total Data Bersih: {len(final_df)} Review.")
    return final_df

In [None]:
# ==============================================================================
# BAGIAN 4: EKSEKUSI ANALISIS
# ==============================================================================
def run_analysis(df, sentiment_model, aspect_model):
    """Label every review with a sentiment and its most likely aspect.

    ``df`` is modified in place and also returned. The columns added are
    ``clean_text`` (output of ``clean_text_safe``), ``AI_Sentiment``,
    ``AI_Aspek`` and ``AI_Confidence``.

    Args:
        df: DataFrame with a ``text_review`` column.
        sentiment_model: Callable invoked as
            ``sentiment_model(text, truncation=True, max_length=512)``; the
            ``'label'`` of its first result is used.
        aspect_model: Callable invoked as
            ``aspect_model(text, CANDIDATE_LABELS, multi_label=False)``; the
            first entries of its ``'labels'`` and ``'scores'`` are used.

    Returns:
        The same ``df`` object with the result columns filled in. A review
        whose inference raises gets ``"neutral"`` as sentiment, or
        ``"Lainnya"`` with confidence ``0.0`` as aspect.
    """
    print("\n" + "="*60)
    print(" üöÄ MEMULAI ANALISIS AI")
    print("="*60)

    df['clean_text'] = df['text_review'].apply(clean_text_safe)
    texts = df['clean_text'].astype(str).tolist()

    results_sentiment = []
    results_aspect = []
    scores_aspect = []
    failed_sentiment = 0
    failed_aspect = 0

    print("ü§ñ Sedang berpikir (Inference)...")
    for text in tqdm(texts, unit="review"):
        # Both models receive at most the first 512 characters of the review.
        snippet = text[:512]

        # A. Sentiment. Only Exception is caught, so KeyboardInterrupt and
        # SystemExit still stop a long run instead of being swallowed.
        try:
            sentiment_label = sentiment_model(snippet, truncation=True, max_length=512)[0]['label']
        except Exception:
            sentiment_label = "neutral"
            failed_sentiment += 1

        # B. Aspect: keep the first label returned and its score.
        try:
            aspect_result = aspect_model(snippet, CANDIDATE_LABELS, multi_label=False)
            best_aspect = aspect_result['labels'][0]
            best_score = aspect_result['scores'][0]
        except Exception:
            best_aspect = "Lainnya"
            best_score = 0.0
            failed_aspect += 1

        results_sentiment.append(sentiment_label)
        results_aspect.append(best_aspect)
        scores_aspect.append(best_score)

    # Report fallbacks so default labels are not mistaken for predictions.
    if failed_sentiment or failed_aspect:
        print(f"   -> Peringatan: {failed_sentiment} sentimen dan {failed_aspect} aspek "
              "gagal dianalisis (memakai nilai default).")

    df['AI_Sentiment'] = results_sentiment
    df['AI_Aspek'] = results_aspect
    df['AI_Confidence'] = scores_aspect
    return df

In [None]:
# ==============================================================================
# BAGIAN 5: MAIN (VISUALISASI BINTANG DISINI)
# ==============================================================================
if __name__ == "__main__":
    # Step 1: build both inference pipelines.
    clf_sentiment, clf_aspect = setup_models()

    # Step 2: gather the reviews from the data folder.
    df_hotel = load_data_from_folder(NAMA_FOLDER_DATA)

    if df_hotel.empty:
        print("‚ùå Data kosong.")
    else:
        # Optional: uncomment the next line for a quick test on 10 rows.
        # df_hotel = df_hotel.head(10)

        # Step 3: run sentiment and aspect inference.
        df_result = run_analysis(df_hotel, clf_sentiment, clf_aspect)

        # Step 4: write the labelled reviews to Excel.
        df_result.to_excel('Laporan_Analisis_AI_Lengkap.xlsx', index=False)
        print(f"\n‚úÖ Hasil Excel tersimpan.")

        # Step 5: charts.
        print("\nüìä Membuat Grafik...")
        sns.set_style("whitegrid")

        # Chart 1: sentiment counts per hotel type.
        fig_tipe, ax_tipe = plt.subplots(figsize=(10, 6))
        sns.countplot(data=df_result, x='Tipe', hue='AI_Sentiment',
                      palette='viridis', ax=ax_tipe)
        ax_tipe.set_title('Sentimen: BUMN vs Kompetitor')
        fig_tipe.savefig('Grafik_1_BUMN_vs_Kompetitor.png')

        # Chart 2: aspects of negative reviews (pain points), most frequent
        # first. Skipped when there are no negative reviews.
        is_negative = df_result['AI_Sentiment'] == 'negative'
        df_neg = df_result[is_negative]
        if not df_neg.empty:
            fig_neg, ax_neg = plt.subplots(figsize=(12, 8))
            aspect_order = df_neg['AI_Aspek'].value_counts().index
            sns.countplot(data=df_neg, y='AI_Aspek', hue='Tipe',
                          order=aspect_order, palette='magma', ax=ax_neg)
            ax_neg.set_title('Peta Masalah Utama (Review Negatif)')
            fig_neg.tight_layout()
            fig_neg.savefig('Grafik_2_PainPoints.png')

        # Chart 3: sentiment counts per star class. The classes are sorted by
        # name so the bars appear in a stable order; adjust this list if the
        # folder names do not sort the way you want.
        fig_kelas, ax_kelas = plt.subplots(figsize=(10, 6))
        urutan_bintang = sorted(df_result['Kelas'].unique())
        sns.countplot(data=df_result, x='Kelas', hue='AI_Sentiment',
                      order=urutan_bintang, palette='rocket', ax=ax_kelas)
        ax_kelas.set_title('Distribusi Sentimen Berdasarkan Kategori Bintang')
        ax_kelas.set_xlabel('Kelas Hotel')
        ax_kelas.set_ylabel('Jumlah Review')
        fig_kelas.savefig('Grafik_3_Per_Bintang.png')
        print("   -> Grafik_3_Per_Bintang.png tersimpan! (Cek foldermu)")

        print("\nüéâ SELESAI! Silakan buka file Excel dan gambar Grafiknya.")