In [3]:
!pip install xgboost


Collecting xgboost
  Using cached xgboost-3.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Using cached xgboost-3.1.1-py3-none-win_amd64.whl (72.0 MB)
Installing collected packages: xgboost
Successfully installed xgboost-3.1.1



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split

# Base paths: the project root is the parent of the current working directory.
ROOT = Path.cwd().parent
OUT_PATH = ROOT.joinpath("data", "processed", "features", "train_features.csv")

print("Feature file:", OUT_PATH)

# Read the feature table and preview its first rows.
df = pd.read_csv(OUT_PATH)
df.head()

Feature file: d:\Bismillah Kuliah\Semester 3\TPSC\New folder\rice-leaf-disease-ml\data\processed\features\train_features.csv


Unnamed: 0,contrast_mean,contrast_std,dissimilarity_mean,dissimilarity_std,homogeneity_mean,homogeneity_std,energy_mean,energy_std,ASM_mean,ASM_std,correlation_mean,correlation_std,entropy,cluster_shade,cluster_prominence,intensity_mean,intensity_std,intensity_var,label,filename
0,192.332773,160.534113,5.177595,2.878678,0.549261,0.046668,0.457731,0.001552,0.20952,0.00142,0.978471,0.01797,7.620698,408263.3,442047100.0,113.71916,29.970697,898.242655,bacterial_leaf_blight,bacterial_leaf_blight (1).JPG
1,160.716867,127.354181,1.887305,1.208933,0.881796,0.015744,0.85539,0.001064,0.731693,0.00182,0.959218,0.032288,2.399805,1781980.0,539123500.0,110.395865,43.555198,1897.055304,bacterial_leaf_blight,bacterial_leaf_blight (10).JPG
2,136.917713,106.136375,3.474702,1.786967,0.573344,0.055062,0.439615,0.001641,0.193264,0.001442,0.984,0.012404,7.398068,491160.3,486075100.0,106.934608,30.278577,916.792216,bacterial_leaf_blight,bacterial_leaf_blight (100).JPG
3,149.157389,118.73215,3.02203,1.693803,0.7613,0.026582,0.708265,0.001072,0.501641,0.001518,0.976686,0.018545,4.547847,1956823.0,557629800.0,107.343281,38.826943,1507.531466,bacterial_leaf_blight,bacterial_leaf_blight (101).JPG
4,150.107731,101.642898,1.953991,1.071589,0.829012,0.02255,0.778002,0.001105,0.605288,0.00172,0.973426,0.017975,3.37735,2012634.0,564275900.0,111.84546,35.328775,1248.122347,bacterial_leaf_blight,bacterial_leaf_blight (102).JPG


In [3]:
# Feature matrix: every column except the target and the source-file name.
X = df.drop(columns=["label", "filename"]).to_numpy()

# Fail fast on missing labels: categorical codes encode NaN as -1, which would
# otherwise flow into the split and the models as a bogus extra class.
if df["label"].isna().any():
    raise ValueError("Column 'label' contains missing values; cannot encode classes")

# Integer-encode the labels (0, 1, 2, ...); code i corresponds to class_names[i].
y_cat = df["label"].astype("category")
y = y_cat.cat.codes
class_names = list(y_cat.cat.categories)

print("Shape X:", X.shape)
print("Shape y:", y.shape)
print("Classes :", class_names)

Shape X: (2167, 18)
Shape y: (2167,)
Classes : ['bacterial_leaf_blight', 'brown_spot', 'healthy', 'leaf_blast', 'leaf_scald', 'narrow_brown_spot']


In [4]:
# Hold out 20% of the rows for validation, stratified on the encoded label;
# the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Train size:", len(X_train))
print("Val size  :", len(X_val))

Train size: 1733
Val size  : 434


In [5]:
# Baseline model: a 300-tree random forest with no depth limit, trained on all cores.
rf = RandomForestClassifier(
    n_estimators=300, max_depth=None, n_jobs=-1, random_state=42
).fit(X_train, y_train)  # fit() returns the fitted estimator itself

print("Random Forest training selesai")



Random Forest training selesai


In [6]:
# XGBoost with randomized hyperparameter search
# (RandomizedSearchCV is imported in the imports cell at the top of the notebook.)

xgb_clf = xgb.XGBClassifier(
    objective="multi:softmax",
    num_class=len(class_names),
    tree_method="hist",
    eval_metric="mlogloss",
    random_state=42,
    # Parallelism is handled by the search below (n_jobs=-1). Keeping each
    # booster single-threaded avoids two thread pools fighting over the cores.
    n_jobs=1,
)

# Candidate values sampled by the randomized search.
param_dist = {
    "n_estimators":      [100, 200, 300, 400],
    "max_depth":         [3, 4, 5, 6, 8],
    "learning_rate":     [0.01, 0.05, 0.1, 0.2],
    "subsample":         [0.7, 0.8, 0.9, 1.0],
    "colsample_bytree":  [0.7, 0.8, 0.9, 1.0],
    "gamma":             [0, 0.1, 0.3, 0.5],
    "min_child_weight":  [1, 3, 5],
}

# 20 sampled configurations x 3-fold CV, ranked by macro-averaged F1.
search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_dist,
    n_iter=20,
    scoring="f1_macro",
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)

# Only the training split is used here; X_val stays untouched for the final report.
search.fit(X_train, y_train)

print("Best params:", search.best_params_)
print("Best CV f1_macro:", search.best_score_)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best params: {'subsample': 0.9, 'n_estimators': 300, 'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}
Best CV f1_macro: 0.8744472530565677


In [7]:
# Persist the tuned model. Path, joblib and ROOT are already defined by the
# imports/path cell at the top of the notebook, so they are not repeated here.
MODEL_DIR = ROOT / "models"
MODEL_DIR.mkdir(parents=True, exist_ok=True)  # also create missing parent folders

FINAL_MODEL_PATH = MODEL_DIR / "rice_leaf_xgb.pkl"
# NOTE(review): this is a pickle of the estimator object; loading it later
# presumably needs compatible xgboost / scikit-learn versions - confirm before deploying.
joblib.dump(search.best_estimator_, FINAL_MODEL_PATH)

print("Model final disimpan ke:", FINAL_MODEL_PATH)


Model final disimpan ke: d:\Bismillah Kuliah\Semester 3\TPSC\New folder\rice-leaf-disease-ml\models\rice_leaf_xgb.pkl


In [8]:
# Write a plain-text summary of the tuned model: CV score, chosen
# hyperparameters, and the per-class report on the held-out validation split.
# (classification_report is imported in the imports cell at the top.)
RESULTS_DIR = ROOT / "results"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
REPORT_PATH = RESULTS_DIR / "xgb_report.txt"

y_pred_final = search.best_estimator_.predict(X_val)

# Pin the label order so target_names always lines up with the encoded
# classes, even if a class happens to be absent from y_val.
report_text = classification_report(
    y_val,
    y_pred_final,
    labels=list(range(len(class_names))),
    target_names=class_names,
)

with open(REPORT_PATH, "w", encoding="utf-8") as f:
    f.write("Model Final: XGBoost Tuned\n")
    f.write("Best CV F1-macro: {:.4f}\n\n".format(search.best_score_))

    f.write("Best Hyperparameters:\n")
    f.writelines(f"{k}: {v}\n" for k, v in search.best_params_.items())

    f.write("\nClassification Report (Validation Set):\n")
    f.write(report_text)

print("Report disimpan ke:", REPORT_PATH)


Report disimpan ke: d:\Bismillah Kuliah\Semester 3\TPSC\New folder\rice-leaf-disease-ml\results\xgb_report.txt


In [9]:
# Names of the model's input columns: everything except the target and the file name.
feature_cols = df.columns.drop(["label", "filename"]).tolist()
feature_cols


['contrast_mean',
 'contrast_std',
 'dissimilarity_mean',
 'dissimilarity_std',
 'homogeneity_mean',
 'homogeneity_std',
 'energy_mean',
 'energy_std',
 'ASM_mean',
 'ASM_std',
 'correlation_mean',
 'correlation_std',
 'entropy',
 'cluster_shade',
 'cluster_prominence',
 'intensity_mean',
 'intensity_std',
 'intensity_var']

In [10]:
# Show the class labels in encoding order (position i is the label for code i).
y_cat.dtype.categories

Index(['bacterial_leaf_blight', 'brown_spot', 'healthy', 'leaf_blast',
       'leaf_scald', 'narrow_brown_spot'],
      dtype='object')