In [1]:
# train_status_xgb.py
import os, json, time, joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from xgboost import XGBClassifier

# Every separator character that normalize_col rewrites to an underscore.
_SEPARATOR_TABLE = str.maketrans(dict.fromkeys(" -.,/()[]", "_"))


def normalize_col(c: str) -> str:
    """Return a lower-case, underscore-separated version of a column header.

    The value is converted with ``str()``, stripped of surrounding whitespace
    and lower-cased. Spaces and the punctuation ``- . , / ( ) [ ]`` become
    underscores, runs of underscores collapse to a single one, and leading or
    trailing underscores are removed.
    """
    lowered = str(c).strip().lower()
    underscored = lowered.translate(_SEPARATOR_TABLE)
    # Dropping the empty pieces collapses repeated underscores and trims the
    # ends in a single pass.
    pieces = [piece for piece in underscored.split("_") if piece]
    return "_".join(pieces)

def find_col(cols, substrs):
    """Return the first entry of ``cols`` that contains every string in ``substrs``.

    Matching is done against the lower-cased column name; the substrings are
    used as given. Returns ``None`` when no column qualifies.
    """
    matches = (
        name
        for name in cols
        if all(token in name.lower() for token in substrs)
    )
    return next(matches, None)

# === 1) LOAD DATA ===
# Location of the source workbook -- CHANGE this path for your environment.
EXCEL_PATH = r"E:\Coding\web-fullstack\ecugrow\notebooks\data\eucalyptus.xlsx"
df = pd.read_excel(EXCEL_PATH)
# Pass every header through normalize_col so later lookups use one spelling.
df.columns = list(map(normalize_col, df.columns))

# === 2) FEATURE MAPPING ===
# Canonical feature key -> column name expected once the headers have been
# normalised.
AUTO_MAP = {
    "suhu_udara":       "suhu_udara_°c",
    "kelembapan_udara": "kelembaban_udara_%",
    "suhu_tanah":       "suhu_tanah_°c",
    "kelembapan_tanah": "kelembaban_tanah_%",
    "ph_tanah":         "ph_tanah",
    "nitrogen":         "nitrogen_mg_kg",
    "fosfor":           "fosfor_mg_kg",
    "kalium":           "kalium_mg_kg",
    "curah_hujan":      "curah_hujan_mm",
}

# Substrings used to locate a column when the expected name is absent.
# Keys not listed here fall back to their own underscore-separated tokens.
_FALLBACK_SUBSTRS = {
    "suhu_udara":       ["suhu", "udara"],
    "kelembapan_udara": ["kelembab", "udara"],
    "suhu_tanah":       ["suhu", "tanah"],
    "kelembapan_tanah": ["kelembab", "tanah"],
    "curah_hujan":      ["curah", "hujan"],
}

for k, v in list(AUTO_MAP.items()):
    if v not in df.columns:
        # The headers contain no spaces after normalisation, so searching for
        # "ph tanah" could never match; search for the key's tokens instead.
        AUTO_MAP[k] = find_col(df.columns, _FALLBACK_SUBSTRS.get(k, k.split("_")))

# Keep only the columns that were resolved (unresolved keys map to None), in
# AUTO_MAP order. dict.fromkeys removes duplicates in case two keys resolved
# to the same column through the fallback search.
FEATURES = list(dict.fromkeys(col for col in AUTO_MAP.values() if col in df.columns))
# Explicit raises instead of assert: assertions are stripped under `python -O`.
if not FEATURES:
    raise ValueError("Tidak ada fitur yang terdeteksi.")

TARGET = "status_kesuburan"
if TARGET not in df.columns:
    raise ValueError("Kolom target 'status_kesuburan' tidak ditemukan.")

# === 3) DATA ===
# Coerce every feature to numeric; values that cannot be parsed become NaN.
X = df[FEATURES].apply(pd.to_numeric, errors="coerce")
X = X.mask(X == 0, np.nan)  # dataset policy: 0 is treated as missing
# Fill missing labels BEFORE casting to str. Casting first typically turns NaN
# into the literal string "nan", so the fill would never apply and missing
# labels would be trained as a class named "nan".
y_raw = df[TARGET].fillna("Unknown").astype(str)

# Encode the string labels as integers; `classes` keeps the label for each code.
le = LabelEncoder()
y = le.fit_transform(y_raw)
classes = list(le.classes_)

# Stratified 80/20 split with a fixed seed.
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# === 4) XGB PIPELINE ===
# Hyper-parameters for the gradient-boosted classifier.
xgb_params = {
    "objective": "multi:softprob",
    "num_class": len(classes),
    "random_state": 42,
    "n_estimators": 700,
    "max_depth": 6,
    "learning_rate": 0.05,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "reg_lambda": 1.0,
    "reg_alpha": 0.0,
    "tree_method": "hist",
    "n_jobs": -1,
}

# Median imputation fills the NaNs before the data reaches the classifier.
median_imputer = SimpleImputer(strategy="median")
xgb_classifier = XGBClassifier(**xgb_params)
pipe = Pipeline(steps=[("imp", median_imputer), ("xgb", xgb_classifier)])

# Fit on the training split, then score the held-out split.
pipe.fit(Xtr, ytr)
yhat = pipe.predict(Xte)

acc = accuracy_score(yte, yhat)
f1w = f1_score(yte, yhat, average="weighted")
cls_report = classification_report(yte, yhat, target_names=classes)
conf_mat = confusion_matrix(yte, yhat)

print(f"[STATUS-XGB] ACC={acc:.4f} | F1w={f1w:.4f}")
print(cls_report)
print("Confusion matrix:\n", conf_mat)

# === 5) SAVE ===
MODEL_PATH = "models/status_xgb_clf.pkl"
META_PATH  = "models/status_metadata.json"
os.makedirs("models", exist_ok=True)

# Persist the fitted pipeline (imputer + classifier) as one artifact.
joblib.dump(pipe, MODEL_PATH)

# Sidecar metadata written next to the model.
meta = {
    "model_kind": "classifier",
    "algo": "xgboost.XGBClassifier",
    "features": FEATURES,   # feature order that must be used at inference time
    "classes": classes,
    "trained_at": int(time.time())
}
meta_json = json.dumps(meta, ensure_ascii=False, indent=2)
with open(META_PATH, "w", encoding="utf-8") as f:
    f.write(meta_json)

for saved_path in (MODEL_PATH, META_PATH):
    print("Saved:", saved_path)

[STATUS-XGB] ACC=0.9533 | F1w=0.9534
              precision    recall  f1-score   support

Kurang Subur       1.00      0.96      0.98        75
Sangat Subur       0.92      0.90      0.91        60
      Sedang       0.95      0.97      0.96       165

    accuracy                           0.95       300
   macro avg       0.95      0.94      0.95       300
weighted avg       0.95      0.95      0.95       300

Confusion matrix:
 [[ 72   0   3]
 [  0  54   6]
 [  0   5 160]]
Saved: models/status_xgb_clf.pkl
Saved: models/status_metadata.json
