In [1]:
# ================================================================
#   EXERCISE 1: CLASSIFIER PIPELINES (SVM, RF, NN)
# ================================================================

# === 1. Imports ===
import time

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


In [2]:
# === 2. Load all cleaned datasets ===
# Each (label, CSV path) pair below becomes one entry of `datasets`,
# keyed by the label and holding the DataFrame read from that path.
datasets = {
    label: pd.read_csv(csv_path)
    for label, csv_path in (
        ("dataset1", "dataset1_clean.csv"),
        ("dataset2", "dataset2_clean.csv"),
        ("dataset3", "dataset3_clean.csv"),
        ("dataset4", "dataset4_clean.csv"),
    )
}

print("✅ All datasets loaded successfully!")


✅ All datasets loaded successfully!


In [3]:
# === 3. Define classifier pipelines ===
# Every estimator is wrapped in its own pipeline that standardizes the
# features (a fresh StandardScaler per pipeline) before the classifier.
models = {
    label: make_pipeline(StandardScaler(), estimator)
    for label, estimator in {
        "SVM_rbf": SVC(kernel="rbf", C=1.0, gamma="scale"),
        "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
        "NeuralNet": MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42),
    }.items()
}

print("✅ Pipelines created for SVM, RF, NN")

✅ Pipelines created for SVM, RF, NN


In [4]:
# === 4. Define evaluation functions ===
def evaluate_holdout(model, X, y, test_size=0.2, random_state=42):
    """Fit `model` on a stratified train split and score it on the held-out split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is fitted in place, so the caller's object is left fitted on the
        training split after this call.
    X, y : features and target, passed straight to ``train_test_split``.
    test_size : fraction (or count) of samples held out for testing.
    random_state : seed for the split; the default keeps the original
        fixed split (42).

    Returns
    -------
    dict with keys ``Accuracy``, ``F1_macro``, ``Train_time_s`` and
    ``Infer_time_s`` (timings in seconds).

    Any exception raised by the split, the fit or the prediction propagates
    to the caller.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # perf_counter() is monotonic and high-resolution; time.time() follows the
    # wall clock, which can jump and is too coarse for sub-millisecond
    # prediction timings on some platforms.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - start

    start = time.perf_counter()
    y_pred = model.predict(X_test)
    infer_time = time.perf_counter() - start

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    return {"Accuracy": acc, "F1_macro": f1, "Train_time_s": train_time, "Infer_time_s": infer_time}

def evaluate_cv(model, X, y, cv=5, random_state=42):
    """Cross-validate `model` with shuffled stratified folds.

    Accuracy and macro-F1 are computed in a single ``cross_validate`` pass,
    so each fold is fitted once instead of once per metric.

    Parameters
    ----------
    model : estimator handed to ``cross_validate``.
    X, y : features and target.
    cv : number of stratified folds.
    random_state : seed for the fold shuffling; the default keeps the
        original fixed folds (42).

    Returns
    -------
    dict with keys ``CV_Accuracy_mean``, ``CV_Accuracy_std``,
    ``CV_F1_mean`` and ``CV_F1_std`` (mean / std across folds).
    """
    kfold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
    # With several scorers, cross_validate returns one "test_<scorer>" array
    # per metric, all evaluated on the same fitted fold models.
    scores = cross_validate(
        model, X, y, cv=kfold, scoring=("accuracy", "f1_macro"), n_jobs=-1
    )
    acc = scores["test_accuracy"]
    f1 = scores["test_f1_macro"]
    return {"CV_Accuracy_mean": acc.mean(), "CV_Accuracy_std": acc.std(),
            "CV_F1_mean": f1.mean(), "CV_F1_std": f1.std()}

In [5]:
# === 5. Run experiments ===
# Evaluate every model on every dataset. Each (dataset, model) pair that
# completes adds one row combining hold-out and cross-validation metrics;
# a pair that raises is reported on stdout and left out of `results`.
results = []

for dname, df in datasets.items():
    print(f"\n=== Dataset: {dname} ===")
    X = df.drop(columns=["target"])
    y = df["target"]

    # Handle non-numeric targets if any.
    # The pandas dtype helpers are used instead of np.issubdtype because the
    # latter raises TypeError on pandas extension dtypes (e.g. category), and
    # this check sits outside the try block, so it would abort the whole run.
    # Boolean targets are still integer-encoded, matching the previous
    # behaviour (np.bool_ is not a subtype of np.number).
    if pd.api.types.is_bool_dtype(y) or not pd.api.types.is_numeric_dtype(y):
        y = pd.factorize(y)[0]

    for mname, model in models.items():
        print(f"→ Running {mname}...")
        try:
            holdout_metrics = evaluate_holdout(model, X, y)
            cv_metrics = evaluate_cv(model, X, y)
            results.append({
                "Dataset": dname,
                "Model": mname,
                **holdout_metrics,
                **cv_metrics
            })
        except Exception as e:
            # Best-effort: one failing combination must not stop the others.
            print(f"❌ Error in {dname} - {mname}: {e}")



=== Dataset: dataset1 ===
→ Running SVM_rbf...
→ Running RandomForest...
→ Running NeuralNet...





=== Dataset: dataset2 ===
→ Running SVM_rbf...
→ Running RandomForest...
→ Running NeuralNet...

=== Dataset: dataset3 ===
→ Running SVM_rbf...
→ Running RandomForest...
→ Running NeuralNet...





=== Dataset: dataset4 ===
→ Running SVM_rbf...
→ Running RandomForest...
→ Running NeuralNet...


In [6]:
# === 6. Save & show results ===
# Turn the collected metric rows in `results` into a table, write it to
# disk without the index column, then display it.
results_df = pd.DataFrame(data=results)
results_df.to_csv(path_or_buf="results_summary.csv", index=False)
print("\n✅ All experiments complete! Results saved to results_summary.csv")
results_df  # bare last expression -> rendered as the cell output


✅ All experiments complete! Results saved to results_summary.csv


Unnamed: 0,Dataset,Model,Accuracy,F1_macro,Train_time_s,Infer_time_s,CV_Accuracy_mean,CV_Accuracy_std,CV_F1_mean,CV_F1_std
0,dataset1,SVM_rbf,0.701299,0.68145,0.150221,0.060483,0.686249,0.029627,0.670953,0.028535
1,dataset1,RandomForest,0.896104,0.89624,0.313368,0.021006,0.889364,0.028039,0.889636,0.027464
2,dataset1,NeuralNet,0.733766,0.72848,1.310231,0.003317,0.731771,0.013212,0.728019,0.014769
3,dataset2,SVM_rbf,0.999724,0.999718,110.288555,17.600029,0.999763,3.6e-05,0.999757,3.7e-05
4,dataset2,RandomForest,0.999979,0.999978,30.925761,0.375053,0.999996,8e-06,0.999996,9e-06
5,dataset2,NeuralNet,0.999958,0.999957,34.317478,0.10295,0.999945,2.2e-05,0.999944,2.2e-05
6,dataset3,SVM_rbf,0.885057,0.88319,0.012665,0.012983,0.914328,0.020334,0.913713,0.020609
7,dataset3,RandomForest,1.0,1.0,0.401967,0.026652,0.993077,0.005653,0.993075,0.005654
8,dataset3,NeuralNet,1.0,1.0,0.968211,0.002805,1.0,0.0,1.0,0.0
9,dataset4,SVM_rbf,0.816,0.669065,73.593191,14.177604,0.8189,0.003928,0.669935,0.005039
