In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, f1_score


In [None]:

def get_models_results_df(X, y):
    """
    Cross-validate a fixed set of classifiers and summarize their F1 scores.

    The models are a LogisticRegression(max_iter=5000) and four
    KNeighborsClassifier instances (k = 2, 3, 5, 7). Each model is scored
    with two 5-fold strategies, KFold and StratifiedKFold, both shuffled
    with random_state=42. The table of defined models is printed as a
    side effect before the evaluation starts.

    Parameters
    ----------
    X : feature matrix, passed unchanged to ``cross_val_score``.
    y : target labels, passed unchanged to ``cross_val_score``.

    Returns
    -------
    pandas.DataFrame
        One row per (model, CV strategy) pair with the columns
        ``model``, ``cv_strategy``, ``f1_mean`` and ``f1_std``.
    """
    # Model setup: label -> estimator.
    models = {
        "logistic_regression": LogisticRegression(max_iter=5000),
    }
    for k in [2, 3, 5, 7]:
        models[f"knn_k={k}"] = KNeighborsClassifier(n_neighbors=k)

    df_models = pd.DataFrame({
        "model_name": list(models.keys()),
        "model_object": list(models.values())
    })
    print("===== MODELLI DEFINITI =====")
    print(df_models.head())

    # The two cross-validation strategies named in the summary above.
    cv_strategies = {
        "kfold": KFold(n_splits=5, shuffle=True, random_state=42),
        "stratified_kfold": StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    }
    scorer = make_scorer(f1_score)

    # Collect one record per combination and build the frame once at the
    # end, instead of growing a DataFrame row by row.
    records = []
    for model_name, model in models.items():
        for cv_name, cv in cv_strategies.items():
            scores = cross_val_score(model, X, y, cv=cv, scoring=scorer)
            records.append({
                "model": model_name,
                "cv_strategy": cv_name,
                "f1_mean": np.mean(scores),
                "f1_std": np.std(scores),
            })

    return pd.DataFrame(
        records, columns=["model", "cv_strategy", "f1_mean", "f1_std"]
    )

In [None]:
# Load the scikit-learn breast-cancer dataset and pull out features and labels.
data = load_breast_cancer()
X = data.data
y = data.target

# Run the model comparison helper on the full dataset.
df_results = get_models_results_df(X=X, y=y)

# Optional preview of the first rows of X and y (currently disabled):
#print("Prime righe di X:")
#print(pd.DataFrame(X, columns=data.feature_names).head(), "\n")

#print("Prime righe di y:")
#print(pd.Series(y).head(), "\n")

Prime righe di X:
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst radius  worst texture  worst perimet

In [None]:
# Score every (model, CV strategy) pair and tabulate the mean and standard
# deviation of the per-fold scores.
# NOTE: `models`, `cv_strategies` and `scorer` are not assigned in this cell;
# in this notebook they are defined in cells located further down, so those
# cells must have been executed before this one.
cv_records = []
for model_name, model in models.items():
    for cv_name, cv in cv_strategies.items():
        scores = cross_val_score(model, X, y, cv=cv, scoring=scorer)
        # Accumulate plain dicts; the DataFrame is built once after the loop
        # rather than enlarged row by row through `.loc`.
        cv_records.append({
            "model": model_name,
            "cv_strategy": cv_name,
            "f1_mean": np.mean(scores),
            "f1_std": np.std(scores)
        })

# Passing `columns` keeps the column layout even when no record was produced.
results_df = pd.DataFrame(
    cv_records, columns=["model", "cv_strategy", "f1_mean", "f1_std"]
)

# Print the results
print(results_df)


                 model       cv_strategy   f1_mean    f1_std
0  logistic_regression             kfold  0.960656  0.016474
1  logistic_regression  stratified_kfold  0.963964  0.014419
2              knn_k=2             kfold  0.932949  0.013839
3              knn_k=2  stratified_kfold  0.926589  0.011127
4              knn_k=3             kfold  0.936087  0.017667
5              knn_k=3  stratified_kfold  0.943201  0.017505
6              knn_k=5             kfold  0.949712  0.022984
7              knn_k=5  stratified_kfold  0.948953  0.014853
8              knn_k=7             kfold  0.942988  0.025222
9              knn_k=7  stratified_kfold  0.943811  0.014200


In [None]:
# Candidate classifiers, keyed by the label used in the result tables.
models = {"logistic_regression": LogisticRegression(max_iter=5000)}
for k in (2, 3, 5, 7):
    models.update({f"knn_k={k}": KNeighborsClassifier(n_neighbors=k)})

# Cross-validation splitters: both 5-fold, shuffled with a fixed seed.
cv_strategies = dict(
    kfold=KFold(n_splits=5, shuffle=True, random_state=42),
    stratified_kfold=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
)

# ----- PRINT MODELS -----
print("\nTABELLA MODELLI:")
df_models = pd.DataFrame(
    {"model_name": [*models], "model_object": [*models.values()]}
)
print(df_models.head())

# ----- PRINT CROSS-VALIDATION STRATEGIES -----
print("\nTABELLA STRATEGIE DI CROSS-VALIDATION:")
df_cv = pd.DataFrame(
    {"cv_strategy": [*cv_strategies], "cv_object": [*cv_strategies.values()]}
)
print(df_cv.head())



TABELLA MODELLI:
            model_name                         model_object
0  logistic_regression    LogisticRegression(max_iter=5000)
1              knn_k=2  KNeighborsClassifier(n_neighbors=2)
2              knn_k=3  KNeighborsClassifier(n_neighbors=3)
3              knn_k=5               KNeighborsClassifier()
4              knn_k=7  KNeighborsClassifier(n_neighbors=7)

TABELLA STRATEGIE DI CROSS-VALIDATION:
        cv_strategy                                          cv_object
0             kfold   KFold(n_splits=5, random_state=42, shuffle=True)
1  stratified_kfold  StratifiedKFold(n_splits=5, random_state=42, s...


In [None]:
# Metric: F1 score (chosen by the author as appropriate for class imbalance).
scorer = make_scorer(f1_score)

# One record per (model, CV strategy) pair; the DataFrame is built once below.
results = []

for model_name, model in models.items():
    for cv_name, cv in cv_strategies.items():
        scores = cross_val_score(model, X, y, cv=cv, scoring=scorer)
        results.append({
            "model": model_name,
            "cv_strategy": cv_name,
            "f1_mean": np.mean(scores),
            "f1_std": np.std(scores)
        })

df_results = pd.DataFrame(results)

# Preview of the first rows. The header names the last evaluated combination,
# read from the last record rather than from the loop variables, so the cell
# does not raise NameError when the model/CV grid is empty. The already-built
# `df_results` is reused instead of constructing the frame a second time.
if results:
    last_record = results[-1]
    print(f"TABELLA PARZIALE ({last_record['model']}, {last_record['cv_strategy']}):")
    print(df_results.head(), "\n")

TABELLA PARZIALE (knn_k=7, stratified_kfold):
                 model       cv_strategy   f1_mean    f1_std
0  logistic_regression             kfold  0.960656  0.016474
1  logistic_regression  stratified_kfold  0.963964  0.014419
2              knn_k=2             kfold  0.932949  0.013839
3              knn_k=2  stratified_kfold  0.926589  0.011127
4              knn_k=3             kfold  0.936087  0.017667 

