## Import Libraries

In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier


## Load Dataset

In [2]:
# Read the lymphography table from the project's data directory.
df = pd.read_csv("../data/lymphography.csv")

# Separate the label column ("class") from the remaining predictor columns.
y = df["class"]
X = df.drop(columns=["class"])

## Cross-Validation Setup

In [3]:
# Shared 10-fold stratified splitter; shuffling with a fixed seed keeps the
# fold assignment identical across runs and across every model tuned below.
skf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

## Model Dictionary

In [4]:
# Candidate classifiers, keyed by the short name that is also used to look up
# each model's search space and to store its tuning results.
# DT, MLP and RF are seeded with 42; KNN and SVM are left at their defaults
# apart from SVC's probability estimates being switched on.
models = dict(
    [
        ("KNN", KNeighborsClassifier()),
        ("SVM", SVC(probability=True)),
        ("DT", DecisionTreeClassifier(random_state=42)),
        ("MLP", MLPClassifier(max_iter=500, random_state=42)),
        ("RF", RandomForestClassifier(random_state=42)),
    ]
)

## Hyperparameter Tuning

In [5]:
# Search space per model, keyed by the same short names as `models`.
# Every entry is a plain list of discrete values, so each space is a finite
# grid: KNN has 6 combinations, SVM 3, DT 12, MLP 4 and RF 4.
param_grids = dict(
    KNN=dict(
        n_neighbors=[5, 7, 9],
        weights=["uniform", "distance"],
    ),
    SVM=dict(
        C=[1, 10, 50],
        kernel=["rbf"],
        gamma=["scale"],
    ),
    DT=dict(
        max_depth=[5, 10, 20],
        min_samples_split=[2, 5],
        min_samples_leaf=[1, 2],
    ),
    MLP=dict(
        hidden_layer_sizes=[(50,), (100,)],
        activation=["relu"],
        alpha=[0.0001, 0.001],
    ),
    RF=dict(
        n_estimators=[100, 200],
        max_depth=[20, 30],
        min_samples_split=[2],
        min_samples_leaf=[1],
    ),
)


## Training and Cross Validation

In [6]:
# Tune every candidate model with a randomized search scored by accuracy on
# the shared `skf` splitter, and keep the best score, parameters and refitted
# estimator for each one under its short name.
#
# NOTE(review): `best_score` is the mean CV accuracy of the configuration that
# was *selected* on those same folds, so it is an optimistic estimate.
# NOTE(review): StandardScaler is imported but never applied; KNN, SVM and MLP
# are sensitive to feature scale — confirm whether the CSV is already scaled.
results = {}

for name, model in models.items():
    print(f"Training {name}...")

    grid = param_grids[name]

    # Every grid entry is a list, so the number of distinct candidates is the
    # product of the list lengths. Some grids are smaller than the 5-candidate
    # budget (SVM has 3, MLP and RF have 4); RandomizedSearchCV warns when
    # n_iter exceeds the grid size and then falls back to the full grid.
    # Capping n_iter requests the same candidates without the warning.
    n_candidates = int(np.prod([len(values) for values in grid.values()]))

    search = RandomizedSearchCV(
        model,
        grid,
        n_iter=min(5, n_candidates),
        cv=skf,
        scoring="accuracy",
        random_state=42,
        n_jobs=-1,
    )

    search.fit(X, y)

    results[name] = {
        "best_score": search.best_score_,
        "best_params": search.best_params_,
        "best_model": search.best_estimator_,
    }

    print(f"{name} Best Accuracy: {search.best_score_:.4f}")


Training KNN...




KNN Best Accuracy: 0.7786
Training SVM...
SVM Best Accuracy: 0.8390
Training DT...
DT Best Accuracy: 0.7305
Training MLP...




MLP Best Accuracy: 0.8457
Training RF...
RF Best Accuracy: 0.8590


## Compare Model Performance

In [7]:
# One line per model: short name and its best mean CV accuracy, rounded to
# four decimals, in the order the models were trained.
for model_name, summary in results.items():
    rounded_score = round(summary["best_score"], 4)
    print(model_name, "→", rounded_score)

KNN → 0.7786
SVM → 0.839
DT → 0.7305
MLP → 0.8457
RF → 0.859


## Save Best Model

In [8]:
# Persist the tuned Random Forest (the refitted best estimator from the search)
# so later notebooks can load it without re-running the tuning.
best_rf = results["RF"]["best_model"]

model_path = "../models/random_forest.pkl"

# joblib.dump does not create missing directories, so make sure the target
# folder exists first (it may be absent on a fresh checkout).
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

# Kept as the last expression so the cell still displays the written path(s).
joblib.dump(best_rf, model_path)


['../models/random_forest.pkl']

## Conclusion
- Multiple ML models were trained using stratified 10-fold cross-validation
- Hyperparameters were optimized using RandomizedSearchCV
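- Random Forest achieved the highest mean cross-validated accuracy (0.8590), ahead of MLP (0.8457), SVM (0.8390), KNN (0.7786) and Decision Tree (0.7305)
- The tuned Random Forest model was saved to `../models/random_forest.pkl` for further evaluation and explainability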
