In [1]:
# ============================================================
# EUCAGROW - Train Classifier (Status Kesuburan) + (Optional) Regressor Waktu Tanam
# ============================================================
import os, json, warnings, random
from pathlib import Path

import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

SEED = 42
# Seed both the stdlib and NumPy global RNGs so repeated runs are reproducible.
random.seed(SEED)
np.random.seed(SEED)

# =========================
# 0) CONFIG
# =========================
# Location of the Excel dataset. Set the EUCAGROW_EXCEL_PATH environment
# variable to point at your own file instead of editing this cell; when it is
# unset (or empty) the original default location below is used.
EXCEL_PATH = os.getenv("EUCAGROW_EXCEL_PATH") or (
    r"E:\Coding\web-fullstack\ecugrow\notebooks\data\eucalyptus.xlsx"
)

# The project root is guessed from the Excel location by going three levels up
# (<root>/<dir>/<dir>/<file>); if the path is too shallow for that, the current
# working directory is used. To hard-code it, set EUCAGROW_PROJECT_ROOT or
# change this variable manually.
_excel_p = Path(EXCEL_PATH)
_guess_root = _excel_p.parents[2] if len(_excel_p.parents) >= 3 else Path.cwd()
PROJECT_ROOT = Path(os.getenv("EUCAGROW_PROJECT_ROOT", _guess_root))
SAVE_DIR = PROJECT_ROOT / "models"
# Side effect: the output directory is created as soon as this cell runs.
SAVE_DIR.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("SAVE_DIR    :", SAVE_DIR)

# =========================
# 1) UTIL
# =========================
def normalize_col(c: str) -> str:
    """Normalize a column header into a lowercase, underscore-separated name.

    The value is converted to ``str``, stripped of surrounding whitespace and
    lowercased. Spaces and the punctuation ``- . , / ( ) [ ]`` are turned into
    underscores, runs of underscores collapse into one, and leading/trailing
    underscores are removed. Any other character (for example ``°`` or ``%``)
    is kept as-is.
    """
    separators = " -.,/()[]"
    to_underscore = str.maketrans(separators, "_" * len(separators))
    name = str(c).strip().lower().translate(to_underscore)
    # Dropping the empty pieces both collapses repeated underscores and trims
    # them from the ends.
    return "_".join(piece for piece in name.split("_") if piece)

def find_with_tokens(cols, tokens):
    """Return the first column whose lowercased name contains every token.

    Substring matching keeps the lookup tolerant of unit suffixes and spelling
    variants in the column names. Tokens are compared as given (they are not
    lowercased), so they should be supplied in lowercase. Returns ``None``
    when no column matches.
    """
    return next(
        (name for name in cols if all(tok in name.lower() for tok in tokens)),
        None,
    )

# =========================
# 2) LOAD EXCEL
# =========================
# Validate the input path with an explicit exception: an `assert` would be
# stripped when Python runs with -O, letting a bad path reach read_excel.
if not os.path.exists(EXCEL_PATH):
    raise FileNotFoundError(f"File tidak ditemukan: {EXCEL_PATH}")
df = pd.read_excel(EXCEL_PATH)
# Normalize the headers so the token-based column lookup below works on
# lowercase, underscore-separated names.
df.columns = [normalize_col(c) for c in df.columns]
print("Kolom (normalized):", df.columns.tolist())

# =========================
# 3) AUTO-MAP -> RENAME KE KANONIK
# =========================
# Canonical feature names, in the order the models receive them. The same list
# is written into the saved metadata files.
FEATURES_CANON = [
    "suhu_udara","kelembapan_udara","suhu_tanah","kelembapan_tanah",
    "ph_tanah","nitrogen","fosfor","kalium","curah_hujan"
]

# Token patterns per canonical feature. A pattern matches a column when every
# token is a substring of the (normalized) column name; patterns are tried in
# order and the first hit wins. They fit the columns of this dataset:
# ['timestamp','suhu_udara_°c','kelembaban_udara_%','suhu_tanah_°c',
#  'kelembaban_tanah_%','ph_tanah','nitrogen_mg_kg','fosfor_mg_kg','kalium_mg_kg',
#  'status_kesuburan','rekomendasi','curah_hujan_mm','lokasi','waktu_tanam']
#
# Column names are normalized before matching (spaces become underscores), so
# tokens containing a space can never match. The former " n " / " p " / " k "
# patterns were dead and are omitted, and " potas" is now "potas" so that a
# column such as 'potassium_mg_kg' is actually found.
CANDIDATES = {
    "suhu_udara":        [("suhu","udara")],
    "kelembapan_udara":  [("kelembab","udara"), ("humidity","udara")],   # 'kelembaban' matches too
    "suhu_tanah":        [("suhu","tanah")],
    "kelembapan_tanah":  [("kelembab","tanah"), ("moist","tanah")],
    "ph_tanah":          [("ph","tanah"), ("ph","soil")],
    "nitrogen":          [("nitrogen",), ("_n",), ("n_",)],
    "fosfor":            [("fosfor",), ("phosph",), ("_p",), ("p_",)],
    "kalium":            [("kalium",), ("potas",), ("_k",), ("k_",)],
    "curah_hujan":       [("curah","hujan"), ("rain",), ("precip",)]
}

mapping = {}
for canon, patterns in CANDIDATES.items():
    found = None
    for toks in patterns:
        found = find_with_tokens(df.columns, toks)
        if found is not None:
            break
    mapping[canon] = found

missing = [k for k,v in mapping.items() if v is None]
if missing:
    raise AssertionError(f"Kolom belum ketemu (update CANDIDATES atau cek Excel): {missing}")

# The short fallback tokens ("_k", "n_", ...) are loose enough to hit an
# unrelated column (e.g. "_k" is a substring of 'nitrogen_mg_kg'). If two
# features resolve to the same source column, the rename dict below would keep
# only one of them and a feature would silently disappear, so fail loudly.
sources = {}
for canon, src in mapping.items():
    sources.setdefault(src, []).append(canon)
clashes = {src: names for src, names in sources.items() if len(names) > 1}
if clashes:
    raise ValueError(
        f"Satu kolom terpetakan ke lebih dari satu fitur (perbaiki CANDIDATES): {clashes}"
    )

print("AUTO-MAP ->", mapping)
# Rename to the canonical names used by the application/web side.
df = df.rename(columns={v:k for k,v in mapping.items()})

# Target columns
status_col = find_with_tokens(df.columns, ["status","kesubur"])   # -> status_kesuburan
hari_col   = find_with_tokens(df.columns, ["waktu","tanam"])      # -> waktu_tanam (optional)
# Explicit raise instead of `assert` so the check survives `python -O`.
if status_col is None:
    raise AssertionError("Kolom status kesuburan tidak ditemukan.")

# =========================
# 4) TRAIN CLASSIFIER STATUS
# =========================
X_raw = df[FEATURES_CANON].copy()
# Optional: treat 0 as "missing" and turn it into NaN so the imputer fills it.
# NOTE(review): this applies to every feature, including curah_hujan, where 0
# may be a genuine reading - confirm against how the dataset encodes missing
# values.
X_raw = X_raw.mask(X_raw == 0, np.nan)

# Drop rows without a label before casting: astype(str) would otherwise turn a
# missing status into the literal string "nan" and train it as a real class.
# X_raw itself is left unfiltered because the regressor section reuses it.
has_label = df[status_col].notna()
n_unlabeled = int((~has_label).sum())
if n_unlabeled:
    print(f"[STATUS-CLF] {n_unlabeled} baris tanpa label status dilewati.")
X_status = X_raw.loc[has_label]
y_status = df.loc[has_label, status_col].astype(str)

# Stratified hold-out split so class proportions are preserved in the test set.
Xtr, Xte, ytr, yte = train_test_split(
    X_status, y_status, test_size=0.2, random_state=SEED, stratify=y_status
)

# The imputer lives inside the pipeline, so medians are learned from the
# training folds only during cross-validation.
pipe_clf = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("rf",  RandomForestClassifier(
        n_estimators=400, random_state=SEED, class_weight="balanced"
    )),
])

# Search space for the random forest step (keys use the "rf__" step prefix).
param_dist = {
    "rf__n_estimators": [200, 300, 400, 600],
    "rf__max_depth": [None, 8, 12, 16, 24],
    "rf__min_samples_split": [2, 5, 10, 20],
    "rf__min_samples_leaf": [1, 2, 4, 8],
    "rf__max_features": ["sqrt", "log2", None],
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
search = RandomizedSearchCV(
    pipe_clf, param_distributions=param_dist, n_iter=25,
    scoring="f1_weighted", cv=cv, random_state=SEED, n_jobs=-1, verbose=0
)
search.fit(Xtr, ytr)
clf_best = search.best_estimator_

# Evaluate the tuned pipeline on the held-out test split.
pred = clf_best.predict(Xte)
acc = accuracy_score(yte, pred)
f1w = f1_score(yte, pred, average="weighted")
print(f"[STATUS-CLF] ACC={acc:.4f} | F1w={f1w:.4f}")
print(classification_report(yte, pred))
print("Confusion matrix:\n", confusion_matrix(yte, pred))

# =========================
# 5) SAVE MODEL + METADATA (ABSOLUTE PATH ke PROJECT_ROOT/models)
# =========================
# Output locations inside SAVE_DIR for the fitted pipeline and its metadata.
clf_path = SAVE_DIR / "status_rf_clf.pkl"
meta_path = SAVE_DIR / "status_metadata.json"

# Persist the whole tuned pipeline (imputer + random forest).
joblib.dump(clf_best, clf_path)
print("Saved:", clf_path.resolve())

# Metadata for consumers of the model: the feature order it expects and the
# class labels (as strings) for verification / UI display.
meta = dict(
    model_kind="classifier",
    features=FEATURES_CANON,
    classes=clf_best.named_steps["rf"].classes_.tolist(),
)
meta_path.write_text(
    json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
)
print("Saved:", meta_path.resolve())

# =========================
# 6) (OPSIONAL) TRAIN REGRESSOR WAKTU TANAM
# =========================
# Train the planting-time regressor only when the dataset has that column.
if hari_col is not None:
    # Values that cannot be parsed as numbers become NaN here.
    y_days = pd.to_numeric(df[hari_col], errors="coerce")

    # Keep only rows with a usable target: scikit-learn rejects NaN targets at
    # fit time, and NaN in the test targets would turn RMSE/MAE into NaN.
    has_target = y_days.notna()
    n_invalid = int((~has_target).sum())
    if n_invalid:
        print(f"[DAYS-REG] {n_invalid} baris tanpa nilai waktu tanam numerik dilewati.")
    X_days = X_raw.loc[has_target]
    y_days = y_days.loc[has_target]

    if len(y_days) < 2:
        # Not enough rows to form both a train and a test split.
        print("[DAYS-REG] Dilewati: data waktu tanam numerik tidak cukup.")
    else:
        Xtr_r, Xte_r, ytr_r, yte_r = train_test_split(
            X_days, y_days, test_size=0.2, random_state=SEED
        )
        pipe_reg = Pipeline([
            ("imp", SimpleImputer(strategy="median")),
            ("rf",  RandomForestRegressor(n_estimators=600, random_state=SEED, n_jobs=-1))
        ])
        pipe_reg.fit(Xtr_r, ytr_r)

        # Hold-out error metrics, computed directly with NumPy.
        pred_r = pipe_reg.predict(Xte_r)
        rmse = float(np.sqrt(np.mean((pred_r - yte_r)**2)))
        mae  = float(np.mean(np.abs(pred_r - yte_r)))
        print(f"[DAYS-REG] RMSE={rmse:.3f} | MAE={mae:.3f}")

        # Persist the fitted pipeline plus the feature order it expects.
        reg_path  = SAVE_DIR / "waktu_tanam_rf_reg.pkl"
        reg_meta  = SAVE_DIR / "waktu_tanam_metadata.json"
        joblib.dump(pipe_reg, reg_path)
        with open(reg_meta, "w", encoding="utf-8") as f:
            json.dump({"model_kind":"regressor","features":FEATURES_CANON}, f, ensure_ascii=False, indent=2)
        print("Saved:", reg_path.resolve())
        print("Saved:", reg_meta.resolve())


PROJECT_ROOT: E:\Coding\web-fullstack\ecugrow
SAVE_DIR    : E:\Coding\web-fullstack\ecugrow\models
Kolom (normalized): ['timestamp', 'suhu_udara_°c', 'kelembaban_udara_%', 'suhu_tanah_°c', 'kelembaban_tanah_%', 'ph_tanah', 'nitrogen_mg_kg', 'fosfor_mg_kg', 'kalium_mg_kg', 'status_kesuburan', 'rekomendasi', 'curah_hujan_mm', 'lokasi', 'waktu_tanam']
AUTO-MAP -> {'suhu_udara': 'suhu_udara_°c', 'kelembapan_udara': 'kelembaban_udara_%', 'suhu_tanah': 'suhu_tanah_°c', 'kelembapan_tanah': 'kelembaban_tanah_%', 'ph_tanah': 'ph_tanah', 'nitrogen': 'nitrogen_mg_kg', 'fosfor': 'fosfor_mg_kg', 'kalium': 'kalium_mg_kg', 'curah_hujan': 'curah_hujan_mm'}
[STATUS-CLF] ACC=0.9433 | F1w=0.9434
              precision    recall  f1-score   support

Kurang Subur       0.97      0.95      0.96        75
Sangat Subur       0.90      0.92      0.91        60
      Sedang       0.95      0.95      0.95       165

    accuracy                           0.94       300
   macro avg       0.94      0.94      0.9