In [74]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
import plotly.express as pex
import pandas as pd
import sklearn.tree as tree

stats = pd.DataFrame()

def computeModelStats(model,stats,dataset,hyperParams,randomizedSearch=False):
    """
    :param model: model that will be assessed
    :type model: {tuple(predictor,name:str)}
    :param stats: Dataframe in which results will be sent
    :type stats: {Dataframe}
    :param dataset: Dataset that will be used
    :type dataset: {tuple(Dict(X:DataFrame,y:array), name)}
    :param hyperParams: HyperParams grid that will be assessed
    :type hyperParams: {Dict}
    :return: void
    :rtype: void
    """
    X=dataset[0]["X"]
    y=dataset[0]["y"]

    if not randomizedSearch:
        grid_search = GridSearchCV(model[0],cv=StratifiedKFold(n_splits=5),refit='f1',scoring=['f1','recall','precision'],verbose=2, n_jobs=-1, param_grid=hyperParams,return_train_score=True)
    else:
        grid_search = RandomizedSearchCV(model[0],cv=StratifiedKFold(n_splits=5),refit='f1',scoring=['f1','recall','precision'],verbose=2, n_jobs=-1, param_distributions=hyperParams,return_train_score=True, n_iter=randomizedSearch)

    grid_search.fit(X,y)

    print(grid_search.cv_results_.keys())

    temp_test = pd.DataFrame({ key.replace("_test",""): grid_search.cv_results_ [key] for key in ["mean_test_f1", "std_test_f1","mean_test_recall", "mean_test_precision","params"] })
    temp_train = pd.DataFrame({ key.replace("_train",""): grid_search.cv_results_ [key] for key in ["mean_train_f1", "std_train_f1", "mean_train_recall", "mean_train_precision" , "std_test_f1","params"]})
    temp_train["type"] = "train"
    temp_test["type"] = "test"
    temp = temp_test.append(temp_train)
    temp["dataset"] = dataset[1]
    temp["model"] = model[1]
    temp["params"] = temp["params"].astype(str)
    stats = stats.append(temp)
    fig = pex.bar(temp,y="mean_f1",hover_data=["params"],barmode="group",hover_name="params",color="type", title=f"Mean f1 score on testing and training set for each parameter combination <br> (Model: {model[1]}, Dataset: {dataset[1]})",labels={"mean_f1": f"Average f1","index": "Params"},error_y="std_f1")

    fig.show()
    return stats

In [75]:
#Chargement du df
df = pd.read_csv('drinking_water_potability.csv').dropna(axis=0)
y = df["Potability"]
#Preprosseing vite fait
df.drop("Potability", axis=1,inplace=True)


In [76]:
#Dataset (avec X contenant les feature et y contenant les prédicats) et en 2ème param la méthode utilisée
dataset = ({"X":df,"y":y},"Listwise deletion")

#Modèle et en 2ème param la méthode utilisée
model = (tree.DecisionTreeClassifier(),"Decision Tree")

#hyper grid pour tester quelques params (je dirais ~3 valeur par params)
pgrid = {"max_depth": [1, 2, 3, 4],
         "min_samples_split": [2, 3, 5, 10]}

#tada
stats = computeModelStats(model,stats,dataset,pgrid)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_min_samples_split', 'params', 'split0_test_f1', 'split1_test_f1', 'split2_test_f1', 'split3_test_f1', 'split4_test_f1', 'mean_test_f1', 'std_test_f1', 'rank_test_f1', 'split0_train_f1', 'split1_train_f1', 'split2_train_f1', 'split3_train_f1', 'split4_train_f1', 'mean_train_f1', 'std_train_f1', 'split0_test_recall', 'split1_test_recall', 'split2_test_recall', 'split3_test_recall', 'split4_test_recall', 'mean_test_recall', 'std_test_recall', 'rank_test_recall', 'split0_train_recall', 'split1_train_recall', 'split2_train_recall', 'split3_train_recall', 'split4_train_recall', 'mean_train_recall', 'std_train_recall', 'split0_test_precision', 'split1_test_precision', 'split2_test_precision', 'split3_test_precision', 'split4_test_precision', 'mean_test_precision', 'std_test_precision', 'rank_test_precision', 'split0_train_preci

# SVM

In [77]:
from sklearn.svm import SVC
from scipy.stats import reciprocal,uniform

clf_svm = (SVC(gamma="auto"),"Scalar vector machine")
pgrid_svm = {"gamma": [0.5,0.1,0.01,0.001,0.0001], "C": [1,3,5,8,10]}
stats = computeModelStats(clf_svm,stats,dataset,pgrid_svm,randomizedSearch=14)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_gamma', 'param_C', 'params', 'split0_test_f1', 'split1_test_f1', 'split2_test_f1', 'split3_test_f1', 'split4_test_f1', 'mean_test_f1', 'std_test_f1', 'rank_test_f1', 'split0_train_f1', 'split1_train_f1', 'split2_train_f1', 'split3_train_f1', 'split4_train_f1', 'mean_train_f1', 'std_train_f1', 'split0_test_recall', 'split1_test_recall', 'split2_test_recall', 'split3_test_recall', 'split4_test_recall', 'mean_test_recall', 'std_test_recall', 'rank_test_recall', 'split0_train_recall', 'split1_train_recall', 'split2_train_recall', 'split3_train_recall', 'split4_train_recall', 'mean_train_recall', 'std_train_recall', 'split0_test_precision', 'split1_test_precision', 'split2_test_precision', 'split3_test_precision', 'split4_test_precision', 'mean_test_precision', 'std_test_precision', 'rank_test_precision', 'split0_train_precision', 'split1_train

In [78]:
dataset_scaled = dataset[0]["X"].copy(deep=True)
for col in dataset_scaled:
    dataset_scaled[col]-= dataset_scaled[col].mean()
    dataset_scaled[col]/= dataset_scaled[col].std()
dataset_scaled = ({"X":dataset_scaled,"y":y}, "dataset scaled")
stats = computeModelStats(clf_svm,stats,dataset_scaled,pgrid_svm,randomizedSearch=14)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_gamma', 'param_C', 'params', 'split0_test_f1', 'split1_test_f1', 'split2_test_f1', 'split3_test_f1', 'split4_test_f1', 'mean_test_f1', 'std_test_f1', 'rank_test_f1', 'split0_train_f1', 'split1_train_f1', 'split2_train_f1', 'split3_train_f1', 'split4_train_f1', 'mean_train_f1', 'std_train_f1', 'split0_test_recall', 'split1_test_recall', 'split2_test_recall', 'split3_test_recall', 'split4_test_recall', 'mean_test_recall', 'std_test_recall', 'rank_test_recall', 'split0_train_recall', 'split1_train_recall', 'split2_train_recall', 'split3_train_recall', 'split4_train_recall', 'mean_train_recall', 'std_train_recall', 'split0_test_precision', 'split1_test_precision', 'split2_test_precision', 'split3_test_precision', 'split4_test_precision', 'mean_test_precision', 'std_test_precision', 'rank_test_precision', 'split0_train_precision', 'split1_train

In [79]:
def plot_params_incidence(model,dataset,params):
    mask1 = stats["dataset"] == dataset[1]
    mask2 = stats["model"] == model[1]
    mask3 = stats["type"] == "test"

    df_stats_temp = stats[mask1&mask2&mask3]
    df_stats_temp = df_stats_temp.join(pd.json_normalize(df_stats_temp["params"].apply(eval))).groupby(params).agg({"mean_f1":"mean","std_f1":"mean"})

    return pex.line(df_stats_temp, labels=params,y="mean_f1",title=f"Mean f1 score on testing depending on hyper parameter values <br>(Model: {model[1]}, Dataset: {dataset[1]}").show()

In [80]:
plot_params_incidence(clf_svm, dataset_scaled,"C")

In [81]:
plot_params_incidence(clf_svm, dataset_scaled,"gamma")

# Linear Regression

As we have seen in the exploratory phase it doesnt seem we have any Linear correlation but we're goiing to test

In [82]:
stats

Unnamed: 0,mean_f1,std_f1,mean_recall,mean_precision,params,type,std_test_f1,dataset,model
0,0.097143,0.069715,0.055427,0.430499,"{'max_depth': 1, 'min_samples_split': 2}",test,,Listwise deletion,Decision Tree
1,0.097143,0.069715,0.055427,0.430499,"{'max_depth': 1, 'min_samples_split': 3}",test,,Listwise deletion,Decision Tree
2,0.097143,0.069715,0.055427,0.430499,"{'max_depth': 1, 'min_samples_split': 5}",test,,Listwise deletion,Decision Tree
3,0.097143,0.069715,0.055427,0.430499,"{'max_depth': 1, 'min_samples_split': 10}",test,,Listwise deletion,Decision Tree
4,0.207770,0.039601,0.129524,0.651819,"{'max_depth': 2, 'min_samples_split': 2}",test,,Listwise deletion,Decision Tree
...,...,...,...,...,...,...,...,...,...
9,0.000000,0.000000,0.000000,0.000000,"{'gamma': 0.001, 'C': 5}",train,0.000000,dataset scaled,Scalar vector machine
10,0.045048,0.011594,0.023119,0.920474,"{'gamma': 0.01, 'C': 1}",train,0.008923,dataset scaled,Scalar vector machine
11,0.372351,0.019769,0.238601,0.849403,"{'gamma': 0.01, 'C': 8}",train,0.028098,dataset scaled,Scalar vector machine
12,0.998304,0.000756,0.997842,0.998766,"{'gamma': 0.5, 'C': 8}",train,0.023377,dataset scaled,Scalar vector machine


# KNN

# Bagging

# Random Forest

# ExtraTreesClassifier

# XGBoost