<a href="https://colab.research.google.com/github/asteraaaaa/CSC4600_Data_Mining/blob/main/IzzahMahmud_AutoML_Lab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install -U scikit-learn==1.4.2
!pip install lazypredict optuna



In [3]:
# =========================
# IMPORTS AND DATA
# =========================
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from lazypredict.Supervised import LazyClassifier
import optuna
import pandas as pd
import numpy as np

# Load dataset
iris = load_iris(as_frame=True)
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [4]:
iris

{'data':      sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
 0                 5.10              3.50               1.40              0.20
 1                 4.90              3.00               1.40              0.20
 2                 4.70              3.20               1.30              0.20
 3                 4.60              3.10               1.50              0.20
 4                 5.00              3.60               1.40              0.20
 ..                 ...               ...                ...               ...
 145               6.70              3.00               5.20              2.30
 146               6.30              2.50               5.00              1.90
 147               6.50              3.00               5.20              2.00
 148               6.20              3.40               5.40              2.30
 149               5.90              3.00               5.10              1.80
 
 [150 rows x 4 columns],
 'target': 0     

# **PART A — Manual “Traditional” Data Mining Workflow**

In [5]:
# Define 3 common algorithms
models = {
    "Logistic Regression": LogisticRegression(max_iter=300),
    "SVM (RBF)": SVC(kernel="rbf", probability=True),
    "Random Forest": RandomForestClassifier(random_state=42)
}

print("=== MANUAL MODEL TESTING ===")
manual_results = []
for name, model in models.items():
    pipe = Pipeline([("scaler", StandardScaler()), ("model", model)])
    scores = cross_val_score(pipe, X_train, y_train, cv=5)
    mean_acc = np.mean(scores)
    manual_results.append((name, mean_acc))
    print(f"{name:20s} mean accuracy = {mean_acc:.3f}")

manual_df = pd.DataFrame(manual_results, columns=["Model", "Mean Accuracy"])
manual_df


=== MANUAL MODEL TESTING ===
Logistic Regression  mean accuracy = 0.958
SVM (RBF)            mean accuracy = 0.967
Random Forest        mean accuracy = 0.950


Unnamed: 0,Model,Mean Accuracy
0,Logistic Regression,0.96
1,SVM (RBF),0.97
2,Random Forest,0.95


# **PART B — AutoML (LazyPredict + Optuna)**

In [6]:
print("\n=== AUTOMATED MODEL COMPARISON (LazyPredict) ===")
lazy = LazyClassifier(ignore_warnings=True, random_state=42)
lazy_models, _ = lazy.fit(X_train, X_test, y_train, y_test)
display(lazy_models.head(10))  # Top 10 models leaderboard

# --- Automated hyperparameter tuning on the best (RandomForest) ---
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 400)
    max_depth = trial.suggest_int("max_depth", 2, 20)
    model = RandomForestClassifier(
        n_estimators=n_estimators, max_depth=max_depth,
        random_state=42, n_jobs=-1
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return accuracy_score(y_test, preds)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30, show_progress_bar=True)

best_params = study.best_params
best_acc = study.best_value
print(f"Best Auto-tuned RF params: {best_params}")
print(f"Best RF accuracy: {best_acc:.3f}")

best_rf = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)
best_rf.fit(X_train, y_train)
print("\nAutoML RandomForest report:\n")
print(classification_report(y_test, best_rf.predict(X_test), target_names=iris.target_names))



=== AUTOMATED MODEL COMPARISON (LazyPredict) ===


  0%|          | 0/29 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000281 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 87
[LightGBM] [Info] Number of data points in the train set: 120, number of used features: 4
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LinearDiscriminantAnalysis,1.0,1.0,,1.0,0.04
QuadraticDiscriminantAnalysis,1.0,1.0,,1.0,0.04
BaggingClassifier,0.97,0.97,,0.97,0.04
LabelPropagation,0.97,0.97,,0.97,0.02
NuSVC,0.97,0.97,,0.97,0.02
GaussianNB,0.97,0.97,,0.97,0.03
SVC,0.97,0.97,,0.97,0.01
LabelSpreading,0.97,0.97,,0.97,0.02
ExtraTreeClassifier,0.97,0.97,,0.97,0.02
ExtraTreesClassifier,0.93,0.93,,0.93,0.2


[I 2025-10-23 03:42:57,970] A new study created in memory with name: no-name-dc9e61d6-2df3-4fae-8823-ab046fcf6c17


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-10-23 03:42:58,581] Trial 0 finished with value: 0.9 and parameters: {'n_estimators': 175, 'max_depth': 8}. Best is trial 0 with value: 0.9.
[I 2025-10-23 03:42:58,986] Trial 1 finished with value: 0.9333333333333333 and parameters: {'n_estimators': 90, 'max_depth': 7}. Best is trial 1 with value: 0.9333333333333333.
[I 2025-10-23 03:42:59,412] Trial 2 finished with value: 0.9 and parameters: {'n_estimators': 98, 'max_depth': 19}. Best is trial 1 with value: 0.9333333333333333.
[I 2025-10-23 03:42:59,975] Trial 3 finished with value: 0.9 and parameters: {'n_estimators': 143, 'max_depth': 5}. Best is trial 1 with value: 0.9333333333333333.
[I 2025-10-23 03:43:02,108] Trial 4 finished with value: 0.9333333333333333 and parameters: {'n_estimators': 381, 'max_depth': 16}. Best is trial 1 with value: 0.9333333333333333.
[I 2025-10-23 03:43:02,497] Trial 5 finished with value: 0.9 and parameters: {'n_estimators': 60, 'max_depth': 20}. Best is trial 1 with value: 0.9333333333333333.
[

# **PART C — Compare Manual vs AutoML Results**

In [8]:
summary = pd.DataFrame({
    "Manual Best Model": [manual_df.loc[manual_df['Mean Accuracy'].idxmax(), 'Model']],
    "Manual Accuracy": [manual_df['Mean Accuracy'].max()],
    "AutoML Best Model": [lazy_models.index[0]],
    "AutoML Accuracy": [lazy_models['Accuracy'].iloc[0]],
    "Auto-Tuned RF Accuracy": [best_acc]
})
display(summary)


Unnamed: 0,Manual Best Model,Manual Accuracy,AutoML Best Model,AutoML Accuracy,Auto-Tuned RF Accuracy
0,SVM (RBF),0.97,LinearDiscriminantAnalysis,1.0,0.93
