In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

import joblib
import gradio as gr
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print("Folder kerja:", os.getcwd())

file_path = "jumlah-siswa-laki-laki-menurut-tingkat-tiap-provinsi-sd-2023.xlsx"
df = pd.read_excel(file_path)

df = df.rename(columns={df.columns[0]: "Provinsi"})
print("Kolom data:", df.columns.tolist())
print("\n5 baris pertama:")
print(df.head())

Folder kerja: d:\AML
Kolom data: ['Provinsi', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8']

5 baris pertama:
               Provinsi   Unnamed: 1    Unnamed: 2     Unnamed: 3  \
0                   NaN          NaN           NaN            NaN   
1              Provinsi  Tingkat - I  Tingkat - II  Tingkat - III   
2  Prov. D.K.I. Jakarta        46080         44455          44131   
3      Prov. Jawa Barat       347282        350762         313852   
4     Prov. Jawa Tengah       193862        201087         188244   

     Unnamed: 4   Unnamed: 5    Unnamed: 6 Unnamed: 7 Unnamed: 8  
0           NaN          NaN           NaN        NaN        NaN  
1  Tingkat - IV  Tingkat - V  Tingkat - VI     Jumlah     Status  
2         46366        50129         48618     279779     Negeri  
3        327596       331468        336546    2007506     Negeri  
4        191176       198140        208929    1181438     Negeri  


In [3]:
num_cols = [c for c in df.columns if c != "Provinsi"]

for c in num_cols:
    # Ubah ke string dulu lalu buang semua karakter non-angka
    df[c] = (
        df[c]
        .astype(str)
        .str.replace(r"[^0-9]", "", regex=True)  # buang titik, koma, spasi, dll
        .replace("", np.nan)                     # string kosong -> NaN
    )
    df[c] = pd.to_numeric(df[c], errors="coerce")

# Buang baris yang semua kolom numeriknya NaN
df = df.dropna(subset=num_cols, how="all")

print("\nSetelah cleaning numerik:")
print(df.head())



Setelah cleaning numerik:
                Provinsi  Unnamed: 1  Unnamed: 2  Unnamed: 3  Unnamed: 4  \
2   Prov. D.K.I. Jakarta     46080.0     44455.0     44131.0     46366.0   
3       Prov. Jawa Barat    347282.0    350762.0    313852.0    327596.0   
4      Prov. Jawa Tengah    193862.0    201087.0    188244.0    191176.0   
5  Prov. D.I. Yogyakarta     15177.0     15972.0     15594.0     15527.0   
6       Prov. Jawa Timur    177450.0    179162.0    175194.0    179548.0   

   Unnamed: 5  Unnamed: 6  Unnamed: 7  Unnamed: 8  
2     50129.0     48618.0    279779.0         NaN  
3    331468.0    336546.0   2007506.0         NaN  
4    198140.0    208929.0   1181438.0         NaN  
5     16578.0     17376.0     96224.0         NaN  
6    190737.0    201092.0   1103183.0         NaN  


  .replace("", np.nan)                     # string kosong -> NaN


In [4]:

df['total_siswa'] = df[num_cols].sum(axis=1, min_count=1)


df['total_siswa'] = pd.to_numeric(df['total_siswa'], errors='coerce')


df = df.dropna(subset=['total_siswa'])


n_unique = df['total_siswa'].nunique()
print("Jumlah nilai unik total_siswa:", n_unique)

if n_unique < 3:

    median_val = df['total_siswa'].median()
    df['kategori_siswa'] = df['total_siswa'].apply(
        lambda x: 'Rendah' if x < median_val else 'Tinggi'
    )
else:

    df['kategori_siswa'] = pd.qcut(
        df['total_siswa'],
        q=3,
        labels=['Rendah', 'Sedang', 'Tinggi'],
        duplicates='drop'  # jaga-jaga kalau bin mepet
    )

print(df[['total_siswa', 'kategori_siswa']].head())


Jumlah nilai unik total_siswa: 79
   total_siswa kategori_siswa
2     559558.0         Tinggi
3    4015012.0         Tinggi
4    2362876.0         Tinggi
5     192448.0         Sedang
6    2206366.0         Tinggi


In [5]:
print("Distribusi kelas kategori_siswa:")
X = df[num_cols].copy()
y = df['kategori_siswa']

print(y.value_counts())


Distribusi kelas kategori_siswa:
kategori_siswa
Tinggi    27
Rendah    26
Sedang    26
Name: count, dtype: int64


In [6]:

from sklearn.model_selection import train_test_split

class_counts = y.value_counts()
if class_counts.min() < 2:
    strat = None
    print("\n⚠️ Ada kelas yang datanya sedikit, stratify dimatikan.")
else:
    strat = y

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=strat
)

print("\nJumlah data train:", len(X_train))
print("Jumlah data test :", len(X_test))


Jumlah data train: 55
Jumlah data test : 24


In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

preprocess = ColumnTransformer(
    transformers=[
        ("scale", StandardScaler(), num_cols)
    ]
)

rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)

clf = Pipeline(steps=[
    ("prep", preprocess),
    ("model", rf)
])

clf.fit(X_train, y_train)
print("\n✅ Model berhasil dilatih!")

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count



✅ Model berhasil dilatih!


In [8]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[8 0 0]
 [0 8 0]
 [0 0 8]]
              precision    recall  f1-score   support

      Rendah       1.00      1.00      1.00         8
      Sedang       1.00      1.00      1.00         8
      Tinggi       1.00      1.00      1.00         8

    accuracy                           1.00        24
   macro avg       1.00      1.00      1.00        24
weighted avg       1.00      1.00      1.00        24



In [9]:
import joblib

joblib.dump(clf, "model_absensi.pkl")
print("Model telah disimpan sebagai model_absensi.pkl")


Model telah disimpan sebagai model_absensi.pkl


In [10]:
import joblib
joblib.dump(clf, "model_kategori_siswa.pkl")


['model_kategori_siswa.pkl']

In [11]:
import gradio as gr
import joblib
import pandas as pd


MODEL_PATH = "model_kategori_siswa.pkl"   # GANTI sesuai file yang muncul di glob()

try:
    model = joblib.load(MODEL_PATH)
    print(f"Model '{MODEL_PATH}' berhasil dimuat.")
except FileNotFoundError:
    raise FileNotFoundError(f"❌ File model '{MODEL_PATH}' tidak ditemukan. "
                            f"Pastikan file tersebut berada di folder: {os.getcwd()}")


FEATURE_COLS = ['Kelas1', 'Kelas2', 'Kelas3', 'Kelas4', 'Kelas5', 'Kelas6']


def prediksi_siswa(k1, k2, k3, k4, k5, k6):
    df_input = pd.DataFrame([[k1, k2, k3, k4, k5, k6]], columns=FEATURE_COLS)

    pred = model.predict(df_input)[0]

    try:
        proba = model.predict_proba(df_input).max()
        return f"Kategori: {pred} (Probabilitas: {proba:.2f})"
    except:
        return f"Kategori: {pred}"



demo = gr.Interface(
    fn=prediksi_siswa,
    inputs=[
        gr.Number(label="Jumlah Siswa Kelas 1"),
        gr.Number(label="Jumlah Siswa Kelas 2"),
        gr.Number(label="Jumlah Siswa Kelas 3"),
        gr.Number(label="Jumlah Siswa Kelas 4"),
        gr.Number(label="Jumlah Siswa Kelas 5"),
        gr.Number(label="Jumlah Siswa Kelas 6"),
    ],
    outputs="text",
    title="Prediksi Kategori Jumlah Siswa SD",
    description="Masukkan jumlah siswa kelas 1–6 untuk memprediksi kategori (Rendah / Sedang / Tinggi)."
)

demo.launch(inbrowser=True)


Model 'model_kategori_siswa.pkl' berhasil dimuat.
* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "c:\Users\ASUS\AppData\Local\Programs\Python\Python313\Lib\site-packages\gradio\queueing.py", line 763, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
    )
    ^
  File "c:\Users\ASUS\AppData\Local\Programs\Python\Python313\Lib\site-packages\gradio\route_utils.py", line 354, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<11 lines>...
    )
    ^
  File "c:\Users\ASUS\AppData\Local\Programs\Python\Python313\Lib\site-packages\gradio\blocks.py", line 2106, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<8 lines>...
    )
    ^
  File "c:\Users\ASUS\AppData\Local\Programs\Python\Python313\Lib\site-packages\gradio\blocks.py", line 1588, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
          

Created dataset file at: .gradio\flagged\dataset1.csv
