In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- 1. LOAD DATASET ---
# Change the file name below to match your own CSV file.
file_path = 'dataset_tugas_purbaya_vs_srimulyani_lengkap(5000).csv'
df = pd.read_csv(file_path)

# --- 2. INITIAL DATASET STATISTICS ---
# Number of raw rows, and the number of rows per distinct value of the
# 'tokoh' column.
jumlah_tokoh = df['tokoh'].value_counts()
total_data = df.shape[0]

# One print call, one line per argument, so the report is emitted together.
print(
    "=== STATISTIK DATASET AWAL ===",
    f"Total Data Mentah: {total_data} baris",
    f"Distribusi per Tokoh:\n{jumlah_tokoh}",
    sep="\n",
)

# --- 3. YOUTUBE CHANNEL ANALYSIS ---
# Count comments per channel and keep the 10 channels with the most comments
# (value_counts() already sorts by descending count).
channel_counts = df['channel_name'].value_counts().head(10)

# Bar chart with the comment counts on the x-axis and the channel names on
# the y-axis.
# NOTE(review): recent seaborn releases emit a deprecation warning when
# `palette` is passed without `hue` -- confirm against the installed version.
plt.figure(figsize=(12, 6))
sns.barplot(
    x=channel_counts.to_numpy(),
    y=channel_counts.index,
    palette='viridis',
)
plt.xlabel('Jumlah Komentar Diambil')
plt.ylabel('Nama Channel')
plt.title('Top 10 Channel YouTube Sumber Data', fontsize=15)
plt.show()

# --- 4. DATA RELEVANCE AGAINST THE TITLE ---
# Title: "Kebijakan Fiskal: Purbaya vs Sri Mulyani"
# A comment is treated as relevant when it contains at least one of these
# keywords. All entries are lowercase because the text is lowercased before
# matching.
keywords_fiskal = [
    # Fiscal / economic topic words
    'pajak',
    'utang',
    'apbn',
    'fiskal',
    'ekonomi',
    'anggaran',
    'gaji',
    'subsidi',
    'bansos',
    'keuangan',
    'rupiah',
    'investasi',
    'bumn',
    # Public figures, offices and the state
    'purbaya',
    'sri mulyani',
    'menkeu',
    'menteri',
    'prabowo',
    'negara',
]


def check_relevance(text: object, keywords=None) -> bool:
    """Return True when *text* contains at least one keyword as a substring.

    Matching is case-insensitive on both sides: the text and each keyword are
    lowercased before the substring test.

    Args:
        text: The value to inspect. Anything that is not a ``str`` (for
            example the float NaN pandas uses for missing cells) is treated
            as not relevant.
        keywords: Optional iterable of keyword strings. When omitted, the
            module-level ``keywords_fiskal`` list is used, which preserves the
            original single-argument behaviour.

    Returns:
        True if any keyword occurs in the text, otherwise False.
    """
    # Missing or non-text cells can never match a keyword.
    if not isinstance(text, str):
        return False
    if keywords is None:
        keywords = keywords_fiskal
    lowered = text.lower()
    return any(keyword.lower() in lowered for keyword in keywords)


# Apply the filter: flag each comment True when its text contains a keyword.
df['is_relevant'] = df['text'].apply(check_relevance)
relevance_counts = df['is_relevant'].value_counts()

# Read both counts with .get() so that a dataset containing only relevant
# (or only irrelevant) rows yields 0 instead of raising KeyError.
relevant_count = relevance_counts.get(True, 0)
oot_count = relevance_counts.get(False, 0)

# Percentage of relevant rows; an empty dataset reports 0% rather than
# dividing by zero.
relevant_percent = (relevant_count / total_data) * 100 if total_data else 0.0
print("\n=== TINGKAT RELEVANSI DATA ===")
print(
    f"Data Relevan (mengandung keyword fiskal/tokoh): {relevant_count} ({relevant_percent:.2f}%)")
print(f"Data Mungkin OOT (Out of Topic): {oot_count}")

# Relevance visualisation (pie chart).
# value_counts() orders its result by frequency, not by True/False, and may
# contain only one of the two classes. Derive the labels and colours from the
# index so every slice is always paired with the right caption.
pie_labels = {True: 'Relevan (Sesuai Konteks)', False: 'Potensi Noise/Umum'}
pie_colors = {True: '#66b3ff', False: '#ff9999'}
pie_flags = [bool(flag) for flag in relevance_counts.index]

plt.figure(figsize=(7, 7))
plt.pie(relevance_counts,
        labels=[pie_labels[flag] for flag in pie_flags],
        autopct='%1.1f%%',
        colors=[pie_colors[flag] for flag in pie_flags],
        startangle=90)
plt.title('Seberapa Sesuai Data dengan Judul "Kebijakan Fiskal"?', fontsize=14)
plt.show()