In [1]:
from src.NestedCV import RepeatedNestedCV

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import pandas as pd


In [25]:
# Load the breast-cancer table and split it into a feature matrix X and a
# binary target y.
# The path is relative to the project root -- the same working directory that
# the `src.*` imports and the relative 'models/' output path already rely on --
# so the notebook is not tied to one machine's drive layout.
DATA_PATH = "data/breast_cancer.csv"
data = pd.read_csv(DATA_PATH)

# Encode the diagnosis column: malignant ('M') -> 0, benign ('B') -> 1.
# NOTE(review): with this encoding the label 1 is the *benign* class; confirm
# that this is the intended positive class for the downstream metrics.
data['target'] = data['diagnosis'].map({'M': 0, 'B': 1})

# Series.map() yields NaN for any value missing from the dictionary, which
# would silently produce a float target with holes. Fail loudly instead.
unexpected_labels = data.loc[data['target'].isna(), 'diagnosis'].unique()
assert len(unexpected_labels) == 0, f"Unexpected diagnosis labels: {unexpected_labels!r}"

# Features are every column except the identifier and the two label columns.
X = data.drop(columns=['id', 'diagnosis', 'target'])
# NOTE(review): the rendered table shows a leading 'Unnamed: 0' header. If that
# is a saved row index inside the CSV (rather than just the display index) it
# must not be used as a feature; this drop is a no-op when the column is absent.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')
feature_names = X.columns
y = data['target']
data

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,target
0,1,M,14.68,20.13,94.74,684.5,0.09867,0.07200,0.07395,0.05259,...,30.88,123.40,1138.0,0.1464,0.1871,0.29140,0.16090,0.3029,0.08216,0
1,2,B,11.50,18.45,73.28,407.4,0.09345,0.05991,0.02638,0.02069,...,22.46,83.12,508.9,0.1183,0.1049,0.08105,0.06544,0.2740,,1
2,3,M,15.85,23.95,103.70,782.7,0.08401,0.10020,0.09938,0.05364,...,27.66,112.00,876.5,0.1131,0.1924,0.23220,0.11190,0.2809,0.06287,0
3,4,M,18.82,21.97,123.70,1110.0,0.10180,0.13890,0.15940,0.08744,...,30.93,145.30,1603.0,0.1390,0.3463,0.39120,0.17080,0.3007,0.08314,0
4,5,B,12.95,16.02,83.14,513.7,0.10050,0.07943,0.06155,0.03370,...,19.93,88.81,585.4,0.1483,,0.22410,0.10560,0.3380,0.09584,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507,508,B,13.00,25.13,82.61,520.2,0.08369,0.05073,0.01206,0.01762,...,31.88,91.06,628.5,0.1218,0.1093,0.04462,0.05921,0.2306,0.06291,1
508,509,B,14.20,20.53,92.41,,0.08931,0.11080,0.05063,,...,27.26,112.10,828.5,0.1153,0.3429,0.25120,0.13390,0.2534,0.07858,1
509,510,M,13.86,16.93,90.96,578.9,0.10260,0.15170,,0.05602,...,26.93,104.40,750.1,0.1460,0.4370,0.46360,0.16540,0.3630,0.10590,0
510,511,M,17.30,17.08,113.00,928.2,0.10080,0.10410,0.12660,0.08353,...,25.09,130.90,1222.0,0.1416,0.2405,0.33780,0.18570,0.3138,0.08113,0


In [26]:
# One candidate pipeline per model family, keyed by the same short names that
# `param_grids` uses. Every pipeline mean-imputes missing values right before a
# final step named 'clf'; entries flagged with `needs_scaling` additionally put
# a StandardScaler in front of the imputer.
estimators = {
    model_key: Pipeline(
        ([('scaler', StandardScaler())] if needs_scaling else [])
        + [
            ('imputer', SimpleImputer(strategy='mean')),
            ('clf', classifier),
        ]
    )
    for model_key, needs_scaling, classifier in (
        ('lr_elasticnet', True, LogisticRegression(
            penalty='elasticnet',
            solver='saga',
            max_iter=5000,
            random_state=42,
            verbose=0,
        )),
        # GaussianNB and LinearDiscriminantAnalysis take no verbosity argument.
        ('gnb', False, GaussianNB()),
        ('lda', False, LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')),
        ('svc', True, SVC(probability=True, random_state=42, verbose=0)),
        ('rf', False, RandomForestClassifier(random_state=42, verbose=0)),
        ('lgbm', False, lgb.LGBMClassifier(random_state=42, verbosity=-1)),
    )
}
# Hyperparameter search space for each candidate model. The outer keys match
# the keys of `estimators`; the inner keys use the '<step>__<param>' form and
# all address the pipeline step named 'clf'.
param_grids = dict(
    lr_elasticnet=dict(
        clf__C=[0.01, 0.1, 1, 10],
        clf__l1_ratio=[0.0, 0.5, 1.0],
    ),
    gnb=dict(
        clf__var_smoothing=[1e-9, 1e-8, 1e-7],
    ),
    lda=dict(
        clf__shrinkage=[None, 0.5, 1.0],
    ),
    svc=dict(
        clf__C=[0.1, 1, 10],
        clf__gamma=['scale', 'auto'],
        clf__kernel=['rbf', 'linear'],
    ),
    rf=dict(
        clf__n_estimators=[100, 200],
        clf__max_depth=[None, 5, 10],
    ),
    lgbm=dict(
        clf__n_estimators=[100, 200],
        clf__num_leaves=[31, 63],
        clf__learning_rate=[0.01, 0.1],
    ),
)


In [28]:
# Compare all candidate pipelines with the project's RepeatedNestedCV helper,
# using ROC AUC as the inner-loop selection score.
# NOTE(review): presumably R = repeats, N = outer folds, K = inner folds
# (R * N = 50 matches the evaluation count noted below) -- confirm against
# src/NestedCV.
rncv = RepeatedNestedCV(
    estimators,
    param_grids,
    R=10,
    N=5,
    K=3,
    inner_scoring='roc_auc',
    random_state=42,
)
raw = rncv.run(X, y)  # 50 evaluations per metric
summary, winner = rncv.summarize(raw, purpose_metrics=('auc', 'mcc'), alpha=0.05)

# Print, per model and per metric, the median with its confidence bounds.
print("Summary with medians and 95% CI:")
for model_name, per_metric in summary.items():
    print(f"\n{model_name}:")
    for metric, estimate in per_metric.items():
        median, lower, upper = estimate['median'], estimate['ci_lower'], estimate['ci_upper']
        print(f"  {metric}: median={median:.3f}, CI=[{lower:.3f}, {upper:.3f}]")
print(f"\nWinner based on purpose metrics AUC then MCC: {winner}")




Summary with medians and 95% CI:

lr_elasticnet:
  mcc: median=0.938, CI=[0.918, 0.939]
  auc: median=0.995, CI=[0.988, 0.997]
  ba: median=0.961, CI=[0.954, 0.971]
  f1: median=0.977, CI=[0.970, 0.977]
  f2: median=0.985, CI=[0.981, 0.991]
  recall: median=1.000, CI=[0.984, 1.000]
  precision: median=0.968, CI=[0.955, 0.970]
  prauc: median=0.997, CI=[0.992, 0.998]
  specificity: median=0.947, CI=[0.921, 0.947]
  npv: median=1.000, CI=[0.973, 1.000]

gnb:
  mcc: median=0.853, CI=[0.833, 0.857]
  auc: median=0.986, CI=[0.983, 0.988]
  ba: median=0.914, CI=[0.909, 0.922]
  f1: median=0.946, CI=[0.939, 0.948]
  f2: median=0.960, CI=[0.956, 0.966]
  recall: median=0.969, CI=[0.969, 0.969]
  precision: median=0.924, CI=[0.913, 0.926]
  prauc: median=0.992, CI=[0.990, 0.993]
  specificity: median=0.868, CI=[0.842, 0.868]
  npv: median=0.944, CI=[0.941, 0.949]

lda:
  mcc: median=0.897, CI=[0.876, 0.916]
  auc: median=0.991, CI=[0.989, 0.994]
  ba: median=0.936, CI=[0.921, 0.947]
  f1: media

## The winning model is SVC. Let's now run CV to tune its hyperparameters

In [19]:
from src.CV import SVCParameterTuner
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC

In [20]:
# Wider SVC search space for the final tuning run; every key addresses the
# pipeline step named 'clf'.
# NOTE(review): if the tuner crosses all five lists, `degree` and `coef0` are
# also varied for the 'rbf' and 'linear' kernels, where SVC ignores them, so
# many candidates would be duplicates -- confirm how SVCParameterTuner expands
# the grid.
param_grid = dict(
    clf__C=[0.01, 0.1, 1, 10, 100],
    clf__gamma=['scale', 'auto', 0.001, 0.01, 0.1],
    clf__kernel=['rbf', 'linear', 'poly'],
    clf__degree=[2, 3, 4],        # only meaningful for kernel='poly'
    clf__coef0=[0.0, 0.1, 0.5],   # only meaningful for 'poly' / 'sigmoid' kernels
)

# Fit the project's SVCParameterTuner on the full data using shuffled 5-fold
# cross-validation, scored by accuracy, on all available cores.
# NOTE(review): model selection earlier in this notebook used
# inner_scoring='roc_auc', while this step scores by accuracy on unstratified
# folds -- confirm that the difference is intentional.
tuner = SVCParameterTuner(
    param_grid=param_grid,
    cv=KFold(n_splits=5, random_state=42, shuffle=True),
    n_jobs=-1,
    scoring='accuracy',
)
tuner.fit(X, y)

# Report the winning parameter combination and its cross-validated score.
print(
    f"Best parameters: {tuner.best_params()}",
    f"Best CV score: {tuner.best_score():.4f}",
    sep="\n",
)
# Keep a handle on the best pipeline found by the tuner.
best_model = tuner.best_estimator()

Best parameters: {'clf__C': 10, 'clf__coef0': 0.0, 'clf__degree': 2, 'clf__gamma': 0.01, 'clf__kernel': 'rbf'}
Best CV score: 0.9765


# Train final model on the entire dataset with best parameters

In [None]:
import pickle

In [None]:
    import os

    # Rebuild the winning SVC pipeline with the tuned hyperparameters and fit
    # it on the complete dataset (X, y).
    best_params = tuner.best_params()
    final_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('imputer', SimpleImputer(strategy='mean')),
        ('clf', SVC(
            probability=True,
            random_state=42,
            verbose=0
        ))
    ])
    # The tuned parameters are keyed as 'clf__<param>', i.e. exactly the form
    # Pipeline.set_params expects. Applying them in one call keeps this cell in
    # sync with the search grid: a parameter added to the grid later is picked
    # up automatically, and one absent from best_params keeps its SVC default.
    final_pipeline.set_params(**best_params)
    final_pipeline.fit(X, y)

    # Save the trained model to disk. Create the target directory first so a
    # fresh checkout without 'models/' does not fail with FileNotFoundError.
    model_path = 'models/winner.pkl'
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    with open(model_path, 'wb') as f:
        pickle.dump(final_pipeline, f)
    print("Trained model saved as 'winner.pkl'.")

Trained model saved as 'winner.pkl'.
