In [1]:
# ================================================================
#   EXERCISE 1: CLASSIFIER PIPELINES (SVM, RF, NN)
# ================================================================

# === 1. Imports ===
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score


In [2]:
# === 2. Load all cleaned datasets ===
# Every dataset is read from "<name>_clean.csv" (a relative path) and stored
# under its name, in the order listed below.
datasets = {
    name: pd.read_csv(f"{name}_clean.csv")
    for name in ("dataset1", "dataset2", "dataset3", "dataset4")
}

print("‚úÖ All datasets loaded successfully!")


‚úÖ All datasets loaded successfully!


In [3]:
# === 3. Define classifier pipelines ===
# Each classifier is wrapped in its own pipeline behind a fresh StandardScaler,
# so scaling is fitted together with the classifier whenever the pipeline is fit.
models = {
    name: make_pipeline(StandardScaler(), classifier)
    for name, classifier in (
        ("SVM_rbf", SVC(kernel="rbf", C=1.0, gamma="scale")),
        ("RandomForest", RandomForestClassifier(n_estimators=100, random_state=42)),
        ("NeuralNet", MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)),
    )
}

print("‚úÖ Pipelines created for SVM, RF, NN")

‚úÖ Pipelines created for SVM, RF, NN


In [4]:
# # === 3. Define classifier pipelines ===
# models = {
#     "SVM_rbf": make_pipeline(StandardScaler(), SVC(kernel="rbf", C=1.0, gamma="scale")),
#     "RandomForest": make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=100, random_state=42)),
#     "NeuralNet": make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42))
# }
#
# print("‚úÖ Pipelines created for SVM, RF, NN")


from sklearn.model_selection import ParameterGrid

# === 3b. Define parameter grids per model ===
# The outer keys match the keys of `models`; the inner keys use the
# "<pipeline step>__<parameter>" form so each generated combination can be
# handed directly to the pipeline's set_params.
param_grids = {
    model_name: ParameterGrid(search_space)
    for model_name, search_space in {
        "SVM_rbf": {  # TODO: why are we using rbf?
            "svc__C": [0.1, 1, 10, 100],
            "svc__gamma": ["scale", "auto", 0.01, 0.001],
            "svc__class_weight": [None, "balanced"],
        },
        "RandomForest": {
            "randomforestclassifier__n_estimators": [100, 300, 600],
            "randomforestclassifier__max_depth": [None, 10, 20],
            "randomforestclassifier__min_samples_split": [2, 5],
            "randomforestclassifier__max_features": ["sqrt", "log2", None],
            "randomforestclassifier__class_weight": [None, "balanced"],
        },
        "NeuralNet": {
            "mlpclassifier__hidden_layer_sizes": [(100,), (200,), (100, 100)],
            "mlpclassifier__alpha": [0.0001, 0.001, 0.01],
            "mlpclassifier__learning_rate_init": [0.001, 0.01],
            "mlpclassifier__early_stopping": [True],  # helps stabilize runs
            "mlpclassifier__max_iter": [300],         # keep fixed here
        },
    }.items()
}

In [5]:
from sklearn.base import clone
from sklearn.model_selection import cross_validate, StratifiedKFold
import pandas as pd
import numpy as np
import json
import time

def evaluate_cv_paramset(model, X, y, params, cv=5, n_jobs=-1, random_state=42):
    """Cross-validate one hyper-parameter setting and summarise the folds.

    A clone of ``model`` is configured with ``params`` (the passed-in estimator
    is not modified) and scored with shuffled, stratified k-fold
    cross-validation on accuracy and macro-averaged F1.

    Parameters
    ----------
    model : estimator to clone; must support ``set_params``.
    X, y : features and labels passed through to ``cross_validate``.
    params : dict of parameter names to values, applied via ``set_params``.
    cv : number of stratified folds.
    n_jobs : forwarded to ``cross_validate``.
    random_state : seed for the fold shuffling.

    Returns
    -------
    dict
        Mean and standard deviation (``np.std`` defaults) across folds of the
        test accuracy, test macro-F1, fit time and score time.
    """
    candidate = clone(model).set_params(**params)
    splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
    fold_out = cross_validate(
        candidate, X, y,
        cv=splitter,
        scoring={"accuracy": "accuracy", "f1_macro": "f1_macro"},
        n_jobs=n_jobs,
        return_train_score=False,
    )

    # (output key prefix, cross_validate key, output key suffix)
    layout = (
        ("CV_Accuracy", "test_accuracy", ""),
        ("CV_F1", "test_f1_macro", ""),
        ("Fit_time", "fit_time", "_s"),
        ("Score_time", "score_time", "_s"),
    )
    summary = {}
    for prefix, cv_key, suffix in layout:
        per_fold = fold_out[cv_key]
        summary[f"{prefix}_mean{suffix}"] = np.mean(per_fold)
        summary[f"{prefix}_std{suffix}"] = np.std(per_fold)
    return summary

In [6]:
from sklearn.base import clone
from sklearn.model_selection import cross_validate, StratifiedKFold
import pandas as pd
import numpy as np
import json
import time

# NOTE(review): the preceding cell already defines evaluate_cv_paramset with the
# same behaviour; this later definition is the one in effect after Run All, and
# one of the two can be dropped.
def evaluate_cv_paramset(model, X, y, params, cv=5, n_jobs=-1, random_state=42):
    """Run stratified k-fold CV for a single parameter setting.

    Clones ``model``, applies ``params`` with ``set_params``, and evaluates the
    clone with ``cross_validate`` using accuracy and macro-F1 scorers on
    ``cv`` shuffled stratified folds seeded by ``random_state``.

    Returns a dict holding the across-fold mean and standard deviation of the
    test accuracy, test macro-F1, fit time and score time.
    """
    folds = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
    configured = clone(model).set_params(**params)
    metrics = {"accuracy": "accuracy", "f1_macro": "f1_macro"}
    raw = cross_validate(
        configured,
        X,
        y,
        cv=folds,
        scoring=metrics,
        n_jobs=n_jobs,
        return_train_score=False,
    )

    def _mean_std(key):
        # Mean and standard deviation of one per-fold array from cross_validate.
        per_fold = raw[key]
        return np.mean(per_fold), np.std(per_fold)

    acc_mean, acc_std = _mean_std("test_accuracy")
    f1_mean, f1_std = _mean_std("test_f1_macro")
    fit_mean, fit_std = _mean_std("fit_time")
    score_mean, score_std = _mean_std("score_time")

    return {
        "CV_Accuracy_mean": acc_mean,
        "CV_Accuracy_std": acc_std,
        "CV_F1_mean": f1_mean,
        "CV_F1_std": f1_std,
        "Fit_time_mean_s": fit_mean,
        "Fit_time_std_s": fit_std,
        "Score_time_mean_s": score_mean,
        "Score_time_std_s": score_std,
    }

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

def evaluate_holdout_paramset(model, X, y, params, test_size=0.2, random_state=42):
    """Fit one hyper-parameter setting on a stratified split and score the held-out part.

    A clone of ``model`` is configured with ``params`` (the passed-in estimator
    is not modified), fitted on the training portion of a stratified
    ``train_test_split`` and evaluated on the test portion.

    Parameters
    ----------
    model : estimator to clone; must support ``set_params``, ``fit``, ``predict``.
    X, y : features and labels to split.
    params : dict of parameter names to values, applied via ``set_params``.
    test_size : forwarded to ``train_test_split``.
    random_state : seed for the split, so every setting sees the same split
        for the same data.

    Returns
    -------
    dict
        ``Holdout_Accuracy``, ``Holdout_F1_macro`` (macro-averaged F1),
        ``Holdout_Train_time_s`` and ``Holdout_Infer_time_s`` (elapsed seconds
        for ``fit`` and ``predict`` respectively).
    """
    est = clone(model).set_params(**params)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # perf_counter() is monotonic and uses the highest-resolution clock
    # available; time.time() can jump with system clock adjustments and may be
    # too coarse for the sub-millisecond predict() calls measured here.
    fit_start = time.perf_counter()
    est.fit(X_train, y_train)
    train_time = time.perf_counter() - fit_start

    predict_start = time.perf_counter()
    y_pred = est.predict(X_test)
    infer_time = time.perf_counter() - predict_start

    return {
        "Holdout_Accuracy": accuracy_score(y_test, y_pred),
        "Holdout_F1_macro": f1_score(y_test, y_pred, average="macro"),
        "Holdout_Train_time_s": train_time,
        "Holdout_Infer_time_s": infer_time
    }

In [8]:
# One record per (dataset, model, parameter setting); turned into a frame below.
results = []

# If your CSVs include the target column, set this:
TARGET_COL = "target"  # <-- change to your actual target name

for ds_name, df in datasets.items():
    # Feature matrix = every column except the target; labels = the target column.
    X = df.drop(columns=[TARGET_COL]).values
    y = df[TARGET_COL].values

    # Flatten "for each model, for each of its parameter settings" into one stream.
    settings = (
        (model_name, base_model, params)
        for model_name, base_model in models.items()
        for params in param_grids[model_name]
    )

    for model_name, base_model, params in settings:
        # Identification columns come first so they lead the resulting table.
        row = dict(
            Dataset=ds_name,
            Model=model_name,
            Params=json.dumps(params, sort_keys=True),
        )

        # --- CV evaluation ---
        cv_metrics = evaluate_cv_paramset(base_model, X, y, params, cv=5, n_jobs=-1)

        # --- Holdout evaluation (optional; comment out if not needed) ---
        # NOTE(review): the holdout split is drawn from the same X, y that the
        # CV above already used, so it is not independent of the CV scores.
        holdout_metrics = evaluate_holdout_paramset(base_model, X, y, params, test_size=0.2)

        row.update(cv_metrics)
        row.update(holdout_metrics)
        results.append(row)

results_df = pd.DataFrame(results)
print(f"‚úÖ Completed {len(results_df)} evaluations.")

‚úÖ Completed 632 evaluations.


In [9]:
# Rank every parameter setting within its (Dataset, Model) group by CV_F1_mean,
# highest first; dense ranking means tied settings share the same rank.
f1_by_group = results_df.groupby(["Dataset", "Model"])["CV_F1_mean"]
results_df["Rank_in_dataset_model"] = f1_by_group.rank(method="dense", ascending=False)

# Persist for later analysis
results_df.to_csv("all_model_param_results.csv", index=False)
print("üíæ Saved to all_model_param_results.csv")

# Show top-3 settings per model/dataset
ordered = results_df.sort_values(
    by=["Dataset", "Model", "CV_F1_mean"],
    ascending=[True, True, False],
)
top3_per_group = ordered.groupby(["Dataset", "Model"]).head(3)
display(top3_per_group.reset_index(drop=True))

üíæ Saved to all_model_param_results.csv


Unnamed: 0,Dataset,Model,Params,CV_Accuracy_mean,CV_Accuracy_std,CV_F1_mean,CV_F1_std,Fit_time_mean_s,Fit_time_std_s,Score_time_mean_s,Score_time_std_s,Holdout_Accuracy,Holdout_F1_macro,Holdout_Train_time_s,Holdout_Infer_time_s,Rank_in_dataset_model
0,dataset1,NeuralNet,"{""mlpclassifier__alpha"": 0.001, ""mlpclassifier...",0.718861,0.062995,0.713829,0.064635,0.045008,0.025108,0.001709,0.000142,0.662338,0.647673,0.029142,0.000399,1.0
1,dataset1,NeuralNet,"{""mlpclassifier__alpha"": 0.001, ""mlpclassifier...",0.713564,0.028525,0.708461,0.031428,0.035847,0.012182,0.0016,0.000116,0.766234,0.760997,0.058984,0.000326,2.0
2,dataset1,NeuralNet,"{""mlpclassifier__alpha"": 0.0001, ""mlpclassifie...",0.711035,0.05127,0.705732,0.052827,0.04209,0.017852,0.001583,6.6e-05,0.662338,0.647673,0.028271,0.000344,3.0
3,dataset1,RandomForest,"{""randomforestclassifier__class_weight"": ""bala...",0.902368,0.019653,0.902358,0.019793,0.19323,0.000965,0.012392,0.000199,0.896104,0.89624,0.193487,0.011047,1.0
4,dataset1,RandomForest,"{""randomforestclassifier__class_weight"": ""bala...",0.902368,0.019653,0.902358,0.019793,0.192314,0.000443,0.012397,0.000165,0.896104,0.89624,0.189907,0.011678,1.0
5,dataset1,RandomForest,"{""randomforestclassifier__class_weight"": ""bala...",0.902368,0.019653,0.902358,0.019793,0.192234,0.000644,0.012288,0.000102,0.896104,0.89624,0.188042,0.011516,1.0
6,dataset1,SVM_rbf,"{""svc__C"": 100, ""svc__class_weight"": null, ""sv...",0.748748,0.037132,0.748329,0.037546,0.009186,0.000276,0.003726,0.000115,0.714286,0.714919,0.00879,0.002472,1.0
7,dataset1,SVM_rbf,"{""svc__C"": 100, ""svc__class_weight"": null, ""sv...",0.748748,0.037132,0.748329,0.037546,0.008952,0.000335,0.003519,0.00015,0.714286,0.714919,0.009564,0.002398,1.0
8,dataset1,SVM_rbf,"{""svc__C"": 100, ""svc__class_weight"": ""balanced...",0.748748,0.037132,0.748329,0.037546,0.008979,0.000167,0.003579,7.6e-05,0.714286,0.714919,0.009338,0.002409,1.0
9,dataset2,NeuralNet,"{""mlpclassifier__alpha"": 0.0001, ""mlpclassifie...",0.999941,2.1e-05,0.999939,2.1e-05,5.588167,0.217935,0.028802,0.001435,0.999958,0.999957,5.374732,0.0251,1.0
