In [159]:
import ast

import pandas as pd
import plotly.express as pex
import sklearn.tree as tree
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV

# Accumulator for the cross-validation scores of every experiment in this notebook.
# computeModelStats receives it, and returns it extended with the rows of the new
# search; callers rebind `stats` to that return value.
stats = pd.DataFrame()

def computeModelStats(model,stats,dataset,hyperParams,randomizedSearch=False,random_state=None):
    """
    Run a cross-validated hyper-parameter search, plot its scores and record them.

    The search uses 5-fold stratified cross-validation scored on accuracy. The mean
    and standard deviation of the train and test scores of every candidate are
    displayed as a grouped bar chart and added to the results frame.

    :param model: model that will be assessed
    :type model: {tuple(predictor,name:str)}
    :param stats: Dataframe holding the results collected so far (may be empty or None); it is not modified
    :type stats: {Dataframe}
    :param dataset: Dataset that will be used
    :type dataset: {tuple(Dict(X:DataFrame,y:array), name)}
    :param hyperParams: HyperParams grid that will be assessed
    :type hyperParams: {Dict}
    :param randomizedSearch: falsy for an exhaustive grid search, otherwise the number of candidates sampled by a randomized search
    :type randomizedSearch: {bool|int}
    :param random_state: seed forwarded to the randomized search; unused by the grid search
    :type random_state: {int|None}
    :return: the previous results followed by one row per candidate and per score type ("test"/"train")
    :rtype: {Dataframe}
    """
    X=dataset[0]["X"]
    y=dataset[0]["y"]

    # Settings shared by both search strategies
    search_options = dict(cv=StratifiedKFold(n_splits=5),scoring='accuracy',verbose=2,n_jobs=-1,return_train_score=True)
    if not randomizedSearch:
        grid_search = GridSearchCV(model[0],param_grid=hyperParams,**search_options)
    else:
        # randomizedSearch doubles as the number of sampled candidates
        grid_search = RandomizedSearchCV(model[0],param_distributions=hyperParams,n_iter=randomizedSearch,random_state=random_state,**search_options)

    grid_search.fit(X,y)

    # One frame per score type. The "_test"/"_train" infix is dropped from the
    # cv_results_ keys so both frames share the columns mean_score/std_score/params.
    frames = []
    for score_type in ("test","train"):
        frame = pd.DataFrame({
            key.replace(f"_{score_type}",""): grid_search.cv_results_[key]
            for key in (f"mean_{score_type}_score",f"std_{score_type}_score","params")
        })
        frame["type"] = score_type
        frames.append(frame)

    # DataFrame.append was removed in pandas 2.0. pd.concat keeps each frame's own
    # index (as append did), so the test and train rows of a candidate share an
    # index value and end up side by side in the grouped bar chart.
    temp = pd.concat(frames)
    temp["dataset"] = dataset[1]
    temp["model"] = model[1]
    # Dicts are stringified so they can be used as hover text
    temp["params"] = temp["params"].astype(str)

    # Skip the concatenation when nothing has been recorded yet
    if stats is None or stats.empty:
        stats = temp
    else:
        stats = pd.concat([stats,temp])

    fig = pex.bar(temp,y="mean_score",hover_data=["params"],barmode="group",hover_name="params",color="type", title=f"Mean accuracy on testing and training set for each parameter combination <br> (Model: {model[1]}, Dataset: {dataset[1]})",labels={"mean_score": "Average accuracy","index": "Params"},error_y="std_score")

    fig.show()
    return stats

In [160]:
# Load the data set, discarding every row that contains a missing value
df = pd.read_csv('drinking_water_potability.csv').dropna(axis=0)
# Quick preprocessing: take the target column out of df, which then holds the features only
y = df.pop("Potability")


In [161]:
# Dataset tuple: (features X and target y, name of the preparation method)
dataset = (dict(X=df, y=y), "Listwise deletion")

# Model tuple: (estimator, display name)
model = (tree.DecisionTreeClassifier(), "Decision Tree")

# Hyper-parameter grid used to try a few values per parameter (about 3 each was the aim)
pgrid = dict(
    max_depth=[1, 2, 3, 4],
    min_samples_split=[2, 3, 5, 10],
)

# Run the search and add its scores to the results collected so far
stats = computeModelStats(model, stats, dataset, pgrid)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


# SVM

In [162]:
from sklearn.svm import SVC
from scipy.stats import reciprocal,uniform

# Model tuple: (estimator, display name). The name is stored in `stats` and shown
# in the plot titles. gamma="auto" is only the estimator's starting value: gamma
# is part of the searched grid below.
clf_svm = (SVC(gamma="auto"),"Support vector machine")
pgrid_svm = {"gamma": [0.5,0.1,0.01,0.001,0.0001], "C": [1,3,5,8,10]}
# Randomized search: 14 candidates are drawn from the 25 gamma/C combinations
stats = computeModelStats(clf_svm,stats,dataset,pgrid_svm,randomizedSearch=14)

Fitting 5 folds for each of 14 candidates, totalling 70 fits


In [None]:
# Standardise every feature column: subtract its mean and divide by its standard
# deviation. The arithmetic returns a new frame, so the original features are untouched.
# NOTE(review): the mean/std are computed on the whole data set, i.e. before the
# cross-validation split done inside computeModelStats - confirm this is intended.
features = dataset[0]["X"]
features_scaled = (features - features.mean()) / features.std()

# Dataset tuple: (features X and target y, name of the preparation method)
dataset_scaled = ({"X":features_scaled,"y":y}, "dataset scaled")
stats = computeModelStats(clf_svm,stats,dataset_scaled,pgrid_svm,randomizedSearch=14)

Fitting 5 folds for each of 14 candidates, totalling 70 fits


In [None]:
def plot_params_incidence(model,dataset,params):
    """
    Plot the mean test accuracy against the values of the given hyper-parameters.

    Reads the module-level ``stats`` frame and keeps the "test" rows recorded for
    this model name and dataset name.

    :param model: model whose results are plotted; only its name (second item) is used
    :type model: {tuple(predictor,name:str)}
    :param dataset: dataset whose results are plotted; only its name (second item) is used
    :type dataset: {tuple(Dict(X:DataFrame,y:array), name)}
    :param params: names of the hyper-parameters used to sort the rows and as x axis
    :type params: {list(str)}
    :return: nothing, the figure is displayed
    :rtype: None
    """
    selection = (
        (stats["dataset"] == dataset[1])
        & (stats["model"] == model[1])
        & (stats["type"] == "test")
    )

    # Renumber the rows 0..n-1: `stats` keeps the index of every recorded search, so
    # labels repeat when a model/dataset pair was recorded more than once, and a join
    # on those labels would attach the wrong parameter values to some rows.
    scores = stats[selection].reset_index(drop=True)

    # The "params" column holds stringified dicts. literal_eval parses them back
    # without executing code; it requires the values to be plain Python literals.
    parsed_params = pd.json_normalize(scores["params"].apply(ast.literal_eval).tolist())
    df_stats_temp = scores.join(parsed_params).sort_values(by=params)

    # NOTE(review): plotly documents `labels` as a dict; the list is passed as before.
    pex.line(df_stats_temp,x=params, labels=params,y="mean_score",title=f"Mean accuracy on testing depending on hyper prameter values (Model: {model[1]}, Dataset: {dataset[1]}, Params: {params}])").show()

In [None]:
plot_params_incidence(clf_svm, dataset_scaled,["C"])

In [None]:
plot_params_incidence(clf_svm, dataset_scaled,["gamma"])

# Linear Regression

As we saw in the exploratory phase, there does not seem to be any linear correlation in the data, but we are going to test it anyway.