In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Display options: print every DataFrame column (no truncation) and use
# seaborn's "whitegrid" theme for the plots created below.
pd.set_option('display.max_columns', None)
sns.set_style("whitegrid")

# --- 1. LOAD DATA ---
# Dataset: chronic_disease_children_trend.csv (path is relative to the
# current working directory).
df = pd.read_csv('../data/chronic_disease_children_trend.csv')

# --- 2. INITIAL DATA INSPECTION ---
print("--- Data Info Awal ---")
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

# --- 3. RUN THE EDA ---

# 3.1. Missing-value check
print("\n--- Pengecekan Nilai Hilang ---")
missing_info = df.isna().sum()
# Report only the columns that contain at least one missing value.
print(missing_info.loc[missing_info.gt(0)])

# 3.2. Descriptive statistics for the prevalence columns
print("\n--- Statistik Deskriptif ---")
numerical_cols = [
    'Asthma_Prevalence_pct',
    'Pneumonia_Prevalence_pct',
    'Anemia_Prevalence_pct',
]
print(df.loc[:, numerical_cols].describe())

# 3.3. Distribution of disease prevalence (histograms)
# One panel per disease, drawn side by side; each histogram gets a KDE
# overlay and the disease name as its title.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
histogram_panels = (
    ('Asthma_Prevalence_pct', 'Asma'),
    ('Pneumonia_Prevalence_pct', 'Pneumonia'),
    ('Anemia_Prevalence_pct', 'Anemia'),
)
for hist_ax, (column, title) in zip(axes, histogram_panels):
    sns.histplot(df[column], kde=True, ax=hist_ax).set_title(title)
fig.tight_layout()
fig.savefig('prevalence_histograms.png')
# Close the figure once it is on disk so it is not kept open or displayed.
plt.close(fig)

# 3.4. Time-trend analysis (national average)
# Average each prevalence column over all rows that share the same year.
print("\n--- Tren Waktu Nasional ---")
df_trend = df.groupby('Year')[numerical_cols].mean().reset_index()
# Show the aggregated table so the header above is followed by its data
# in the text output, not only by the saved figure.
print(df_trend)

# One line per disease, with the year on the x-axis.
plt.figure(figsize=(12, 6))
plt.plot(df_trend['Year'], df_trend['Asthma_Prevalence_pct'], label='Asma', marker='o')
plt.plot(df_trend['Year'], df_trend['Pneumonia_Prevalence_pct'], label='Pneumonia', marker='o')
plt.plot(df_trend['Year'], df_trend['Anemia_Prevalence_pct'], label='Anemia', marker='o')
plt.title('Tren Prevalensi Rata-rata Nasional')
plt.xlabel('Tahun')
plt.ylabel('Prevalensi Rata-rata (%)')
plt.legend()
plt.grid(True)
plt.savefig('time_trend_analysis.png')
# Close the figure once it is on disk so it is not kept open or displayed.
plt.close()

# 3.5. Per-province analysis (example: highest anemia prevalence)
# Mean anemia prevalence per province, highest first, limited to the top 10.
print("\n--- Top 10 Provinsi Anemia ---")
df_province_anemia = df.groupby('Province')['Anemia_Prevalence_pct'].mean().sort_values(ascending=False).head(10)
# Show the ranking so the header above is followed by its data in the text
# output, not only by the saved figure.
print(df_province_anemia)

plt.figure(figsize=(10, 6))
sns.barplot(x=df_province_anemia.index, y=df_province_anemia.values)
# Rotate the province names so long labels do not overlap.
plt.xticks(rotation=45, ha='right')
plt.title('Top 10 Provinsi: Rata-rata Prevalensi Anemia Tertinggi')
plt.ylabel('Rata-rata Prevalensi Anemia (%)')
plt.tight_layout()
plt.savefig('province_anemia_bar_chart.png')
# Close the figure once it is on disk so it is not kept open or displayed.
plt.close()

--- Data Info Awal ---
     Province  Year  Asthma_Prevalence_pct  Pneumonia_Prevalence_pct  \
0  Jawa Barat  2015                  11.92                      2.79   
1  Jawa Barat  2016                  10.56                      2.94   
2  Jawa Barat  2017                  10.90                      3.08   
3  Jawa Barat  2018                  10.75                      4.36   
4  Jawa Barat  2019                   9.58                      3.63   

   Anemia_Prevalence_pct  
0                  24.12  
1                  32.21  
2                  25.76  
3                  28.24  
4                  27.52  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Province                  54 non-null     object 
 1   Year                      54 non-null     int64  
 2   Asthma_Prevalence_pct     54 non-null     float64
 3   