In [3]:
!pip install lightgbm gradio



In [26]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import joblib

# Path to the input Excel workbook (change it if needed)
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/jumlah siswa menurut kelompok umur tiap propinsi kab enrekang sd-2024.xlsx'
# When running in a local Jupyter on Windows, use r'C:\path\to\file.xlsx' or upload the file to the working directory
# Directory that receives every figure, model and CSV written by this notebook;
# it is created on first run and reused afterwards (exist_ok=True).
OUT_DIR = 'outputs'
os.makedirs(OUT_DIR, exist_ok=True)


In [27]:
# Read the workbook, taking the row at 0-based index 2 as the column header.
df = pd.read_excel(DATA_PATH, header=2)

# Only the final expression of a cell is echoed automatically, so the preview
# has to be displayed explicitly (`display` is provided by the IPython kernel).
display(df.head(10))
df.columns


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Colab Notebooks/jumlah siswa menurut kelompok umur tiap propinsi kab enrekang sd-2024.xlsx'

In [28]:
# Discard rows in which every cell is empty, then renumber the index from 0.
df = df.dropna(how="all").reset_index(drop=True)

df.head()


In [29]:
# Spreadsheet header -> short snake_case column name used by the rest of the notebook.
column_aliases = {
    'Kecamatan': 'kecamatan',
    '< 7 Tahun - Jml': 'lt7_jml',
    '< 7 Tahun - %': 'lt7_pct',
    '7 - 12 Tahun - Jml': 'j7_12_jml',
    '7 - 12 Tahun - %': 'j7_12_pct',
    '> 12 Tahun - Jml': 'gt12_jml',
    '> 12 Tahun - %': 'gt12_pct',
    'Jumlah': 'jumlah',
    'Status': 'status',
}
df = df.rename(columns=column_aliases)

df.columns


In [30]:
# Count, percentage and total columns that must hold numbers.
num_cols = [
    'lt7_jml', 'lt7_pct',
    'j7_12_jml', 'j7_12_pct',
    'gt12_jml', 'gt12_pct',
    'jumlah',
]

# Convert column by column; values that cannot be parsed become NaN (errors='coerce').
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')

df.info()


In [31]:
# Share of each age group in the row total.
# A total of 0 is treated as missing: dividing by 0 would otherwise produce
# +/-inf, which fillna(0) does not remove and which makes the later scaling and
# clustering steps fail. With NaN as the denominator the result is NaN, and it
# is then replaced by 0 together with every other undefined proportion.
total = df['jumlah'].replace(0, np.nan)

proportion_sources = {
    'prop_lt7': 'lt7_jml',
    'prop_7_12': 'j7_12_jml',
    'prop_gt12': 'gt12_jml',
}
for prop_col, count_col in proportion_sources.items():
    df[prop_col] = (df[count_col] / total).fillna(0)

df.head()


In [32]:
# One bar chart per age-group proportion: (column, chart title, output file name).
proportion_charts = [
    ('prop_lt7', "Proporsi Siswa < 7 Tahun per Kecamatan", "prop_lt7.png"),
    ('prop_7_12', "Proporsi Siswa 7–12 Tahun per Kecamatan", "prop_7_12.png"),
    ('prop_gt12', "Proporsi Siswa > 12 Tahun per Kecamatan", "prop_gt12.png"),
]

for column, chart_title, file_name in proportion_charts:
    plt.figure(figsize=(12,4))
    plt.bar(df['kecamatan'], df[column])
    plt.xticks(rotation=90)  # vertical tick labels
    plt.title(chart_title)
    plt.tight_layout()
    # Save before show() so the file is written from the still-open figure.
    plt.savefig(f"{OUT_DIR}/{file_name}")
    plt.show()


In [33]:
# Columns fed to the clustering model.
features = ['prop_lt7', 'prop_7_12', 'prop_gt12']
X = df[features].to_numpy()

# Fit the min-max scaler on the feature matrix, then rescale that same matrix.
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)

# Persist the fitted scaler so new inputs can be transformed identically later.
joblib.dump(scaler, f"{OUT_DIR}/scaler.pkl")

X_scaled[:5]


In [34]:
# Candidate cluster counts are 2..6, capped at n_samples - 1 because
# silhouette_score only accepts between 2 and n_samples - 1 distinct labels
# (and KMeans cannot form more clusters than there are samples).
n_samples = X_scaled.shape[0]
k_upper = min(6, n_samples - 1)
if k_upper < 2:
    raise ValueError(
        f"Need at least 3 samples to compare cluster counts, got {n_samples}."
    )

sse = {}         # k -> within-cluster sum of squares (KMeans inertia_)
sil_scores = {}  # k -> silhouette score of the fitted labels

for k in range(2, k_upper + 1):
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    km.fit(X_scaled)

    sse[k] = km.inertia_
    sil_scores[k] = silhouette_score(X_scaled, km.labels_)

# Elbow
# Each chart gets its own figure so the two curves can never end up on the same
# axes when the active backend keeps the figure open after show().
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()), marker='o')
plt.title("Elbow Method")
plt.xlabel("Jumlah Cluster (k)")
plt.ylabel("SSE")
plt.savefig(f"{OUT_DIR}/elbow.png")
plt.show()

# Silhouette
plt.figure()
plt.plot(list(sil_scores.keys()), list(sil_scores.values()), marker='o')
plt.title("Silhouette Score")
plt.xlabel("Jumlah Cluster (k)")
plt.ylabel("Score")
plt.savefig(f"{OUT_DIR}/silhouette.png")
plt.show()

sil_scores


In [35]:
# Pick the candidate k whose silhouette score is highest (first one wins a tie).
best_k = max(sil_scores.items(), key=lambda entry: entry[1])[0]
print("Cluster terbaik:", best_k)

# Refit with the chosen k and attach the resulting label to every row.
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X_scaled)

# Persist the fitted model and the labelled table.
joblib.dump(kmeans, f"{OUT_DIR}/kmeans.pkl")
df.to_csv(f"{OUT_DIR}/hasil_cluster.csv", index=False)

df.head()


In [37]:
# Reduce the scaled features to two principal components for a 2-D scatter plot.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

cluster_ids = df['cluster'].to_numpy()

plt.figure(figsize=(8,6))

# Draw one scatter series per cluster, in ascending label order, so each
# cluster gets its own colour and legend entry.
for lab in np.unique(cluster_ids):
    members = cluster_ids == lab
    plt.scatter(X_pca[members, 0], X_pca[members, 1], s=70, label=f"Cluster {lab}")

plt.legend()
plt.title("Visualisasi Cluster (PCA)")
plt.savefig(f"{OUT_DIR}/pca_clusters.png")
plt.show()


In [38]:
# Average value of every feature within each cluster, also exported as CSV.
rows_by_cluster = df.groupby('cluster')
summary = rows_by_cluster[features].agg('mean')
summary.to_csv(f"{OUT_DIR}/cluster_summary.csv")

summary


In [40]:
# Show the names of the entries currently present in the output directory.
os.listdir(OUT_DIR)


In [41]:
# Shell escape that installs gradio with pip. The install command in the first
# cell of this notebook already lists gradio, so this is a repeat of that step.
!pip install gradio


In [42]:
import gradio as gr
import joblib
import numpy as np

# Restore the saved scaler and K-Means model from disk, in that order.
scaler, kmeans = map(joblib.load, ("outputs/scaler.pkl", "outputs/kmeans.pkl"))


In [43]:
def prediksi_cluster(prop_lt7, prop_7_12, prop_gt12):
    """Predict the K-Means cluster for one set of age-group proportions.

    Parameters
    ----------
    prop_lt7, prop_7_12, prop_gt12 : float
        Proportion of students aged < 7, 7-12 and > 12 respectively. Each
        value must be a finite number between 0 and 1.

    Returns
    -------
    str
        A text block with the predicted cluster number and its description,
        or a message starting with "Input tidak valid" when an input is
        missing, not numeric, not finite, or outside [0, 1].
    """
    values = (prop_lt7, prop_7_12, prop_gt12)

    # A missing value (e.g. a form field left blank) would otherwise surface as
    # an exception from inside the scaler.
    if any(value is None for value in values):
        return "Input tidak valid: semua proporsi harus diisi."

    # Input -> 1 x 3 float array
    try:
        X = np.array([values], dtype=float)
    except (TypeError, ValueError):
        return "Input tidak valid: proporsi harus berupa angka."

    # Reject NaN/inf and anything that cannot be a proportion; such values
    # would otherwise be assigned to a cluster without any warning.
    if not np.isfinite(X).all() or (X < 0).any() or (X > 1).any():
        return "Input tidak valid: setiap proporsi harus berupa angka antara 0 dan 1."

    # Scaling with the scaler loaded at module level
    X_scaled = scaler.transform(X)

    # Cluster prediction
    cluster = int(kmeans.predict(X_scaled)[0])

    # Hand-written description of each cluster (edit as needed).
    # NOTE(review): these texts are hard-coded and not derived from the fitted
    # model -- confirm they match the cluster numbering of the saved model.
    penjelasan = {
        0: "Cluster 0 → Kecamatan dengan dominasi usia 7-12 tahun.",
        1: "Cluster 1 → Kecamatan dengan komposisi seimbang semua umur.",
        2: "Cluster 2 → Kecamatan dengan jumlah siswa <7 tahun cukup besar.",
        3: "Cluster 3 → Kecamatan dengan jumlah siswa >12 tahun relatif tinggi.",
        4: "Cluster 4 → Pola distribusi usia jarang ditemukan (outlier)."
    }

    return f"Cluster: {cluster}\n\nInterpretasi:\n{penjelasan.get(cluster, 'Tidak ada penjelasan')}"


In [44]:
# Labels of the three numeric inputs, in the argument order of prediksi_cluster.
input_labels = (
    "Proporsi Siswa < 7 Tahun",
    "Proporsi Siswa 7–12 Tahun",
    "Proporsi Siswa > 12 Tahun",
)

# Web form: three numbers in, the text returned by prediksi_cluster out.
interface = gr.Interface(
    fn=prediksi_cluster,
    inputs=[gr.Number(label=text) for text in input_labels],
    outputs="text",
    title="Aplikasi Prediksi Cluster Usia Siswa SD",
    description="Masukkan proporsi siswa berdasarkan kelompok umur untuk mengetahui cluster kecamatan.",
)

interface.launch()
