In [11]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go

In [12]:
# Read the water-potability measurements from the CSV file and display the frame.
df = pd.read_csv(filepath_or_buffer='drinking_water_potability.csv')
df

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890456,20791.31898,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.05786,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.54173,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.41744,8.059332,356.886136,363.266516,18.436525,100.341674,4.628771,0
4,9.092223,181.101509,17978.98634,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681736,47580.99160,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.80216,8.061362,,392.449580,19.903225,,2.798243,1
3273,9.419510,175.762646,33155.57822,7.350233,,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.86938,6.303357,,402.883113,11.168946,77.488213,4.708658,1


In [13]:
# Flag every row that has at least one missing value in any column.
df["Missing"] = df.isnull().any(axis="columns")

# Feature columns of the dataset (the Potability target and the Missing flag are left out).
dimensions = [
    "ph", "Hardness", "Solids",
    "Chloramines", "Sulfate", "Conductivity",
    "Organic_carbon", "Trihalomethanes", "Turbidity",
]

In [14]:
#from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer

In [15]:
# 1: drop the rows with 2 or 3 missing values, then mean-impute the rows with a single missing value.
# thresh=10 keeps only the rows that have at least 10 non-null entries.
rows_for_df_1 = df.dropna(thresh=10)
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
# The imputer returns a bare array: hand the column labels back so df_1 does not
# end up with anonymous integer columns.
df_1 = pd.DataFrame(imp_mean.fit_transform(rows_for_df_1), columns=rows_for_df_1.columns)

In [16]:
# 2: drop the rows with 3 missing values and mean-impute all the others.
# thresh=9 keeps only the rows that have at least 9 non-null entries.
rows_for_df_2 = df.dropna(thresh=9)
# Re-fit the shared mean imputer on this larger subset; the imputer returns a bare
# array, so the column labels are passed back explicitly.
df_2 = pd.DataFrame(imp_mean.fit_transform(rows_for_df_2), columns=rows_for_df_2.columns)


### Avec ces imputations, on se rend compte que la méthode « most frequent value » n'est pas adaptée à notre dataset : la valeur la plus fréquente est un outlier (0 pour le pH, par exemple).
### De plus, la médiane est équivalente à la moyenne.
### On se concentre donc sur la valeur moyenne et on abandonne la « most frequent value » et la médiane.

# Iterative Imputer

### Nous allons ici utiliser une imputation multivariée sur les données ayant au maximum 1 valeur manquante (`thresh=10` sur 11 colonnes).

In [17]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# Keep only the rows with at least 10 non-null entries, then fill the remaining gaps
# with the iterative (multivariate) imputer.
rows_for_df_3 = df.dropna(thresh=10)
imp_iter = IterativeImputer(random_state=0)
# fit_transform returns a bare array: pass the column labels back so df_3 keeps them.
df_3 = pd.DataFrame(imp_iter.fit_transform(rows_for_df_3), columns=rows_for_df_3.columns)

# Stochastic regression - multivariate feature imputation

In [18]:
# Same row selection as the deterministic iterative imputation (at least 10 non-null entries).
rows_for_df_4 = df.dropna(thresh=10)
# sample_posterior=True draws the imputed values at random, so the seed is fixed to keep
# df_4 identical from one run to the next.
it_imp = IterativeImputer(sample_posterior=True, random_state=0)
# fit_transform returns a bare array: pass the column labels back so df_4 keeps them.
df_4 = pd.DataFrame(it_imp.fit_transform(rows_for_df_4), columns=rows_for_df_4.columns)

In [19]:
# Dictionary holding the imputed dataframes of interest
all_df = {"df_1": df_1, "df_2": df_2,
          "df_3": df_3, "df_4": df_4}

# Labels of the columns that went through the imputers, in their original order.
imputed_columns = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
                   'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability',
                   'Missing']

for dataframe in all_df.values():
    # Restore the column names lost when the imputer output was wrapped in a new frame.
    # Only relabel while the frame still has every column: once 'Missing' is gone the
    # labels are already in place, which keeps this cell safe to run more than once.
    if dataframe.shape[1] == len(imputed_columns):
        dataframe.columns = imputed_columns
    # Mutate in place so that df_1 ... df_4 keep referring to the cleaned frames.
    dataframe.drop(columns='Missing', inplace=True, errors='ignore')


# Model Testing


In [20]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
import plotly.express as px
import pandas as pd
import sklearn.tree as tree

# Accumulator for the cross-validation results: every computeModelStats call receives it
# and returns it with the rows of the new search appended.
stats = pd.DataFrame()


def computeModelStats(model, stats, dataset, hyperParams, randomizedSearch=False):
    """
    Run a cross-validated hyper-parameter search and append its scores to ``stats``.

    The search uses a 5-fold ``StratifiedKFold``, scores f1 / recall / precision and
    refits on f1. The mean train and test scores of every candidate are collected,
    plotted as a grouped bar chart (side effect: the figure is shown) and appended
    to ``stats``.

    :param model: model that will be assessed
    :type model: {tuple(predictor,name:str)}
    :param stats: Dataframe holding the results gathered so far (it is not modified)
    :type stats: {Dataframe}
    :param dataset: Dataset that will be used
    :type dataset: {tuple(Dict(X:DataFrame,y:array), name)}
    :param hyperParams: HyperParams grid that will be assessed
    :type hyperParams: {Dict}
    :param randomizedSearch: ``False`` runs an exhaustive ``GridSearchCV``; any truthy
        value switches to ``RandomizedSearchCV`` and is used as its ``n_iter``
    :type randomizedSearch: {bool|int}
    :return: ``stats`` followed by one row per candidate and per split ("train"/"test"),
        with the columns ``mean_f1``, ``std_f1``, ``mean_recall``, ``mean_precision``,
        ``params`` (as a string), ``type``, ``dataset`` and ``model``
    :rtype: {Dataframe}
    """
    X = dataset[0]["X"]
    y = dataset[0]["y"]

    # Settings shared by both kinds of search.
    search_options = dict(cv=StratifiedKFold(n_splits=5), refit='f1',
                          scoring=['f1', 'recall', 'precision'], verbose=3, n_jobs=-1,
                          return_train_score=True)
    if not randomizedSearch:
        grid_search = GridSearchCV(model[0], param_grid=hyperParams, **search_options)
    else:
        grid_search = RandomizedSearchCV(model[0], param_distributions=hyperParams,
                                         n_iter=randomizedSearch, **search_options)

    grid_search.fit(X, y)

    cv_results = grid_search.cv_results_
    test_keys = ["mean_test_f1", "std_test_f1", "mean_test_recall", "mean_test_precision", "params"]
    # NOTE(review): "std_test_f1" is also copied into the train rows under its unchanged
    # name; it looks like a leftover but is kept so the returned columns do not change.
    train_keys = ["mean_train_f1", "std_train_f1", "mean_train_recall", "mean_train_precision",
                  "std_test_f1", "params"]

    # Strip the split marker from the keys so train and test rows share column names.
    temp_test = pd.DataFrame({key.replace("_test", ""): cv_results[key] for key in test_keys})
    temp_train = pd.DataFrame({key.replace("_train", ""): cv_results[key] for key in train_keys})
    temp_train["type"] = "train"
    temp_test["type"] = "test"

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    # The index is deliberately kept: it is the candidate number used as the x axis below.
    temp = pd.concat([temp_test, temp_train])
    temp["dataset"] = dataset[1]
    temp["model"] = model[1]
    temp["params"] = temp["params"].astype(str)

    # Skip the concatenation while the accumulator is still empty.
    stats = temp if stats.empty else pd.concat([stats, temp])

    fig = px.bar(temp, y="mean_f1", hover_data=["params"], barmode="group", hover_name="params", color="type",
                 title=f"Mean f1 score on testing and training set for each parameter combination <br> (Model: {model[1]}, Dataset: {dataset[1]})",
                 labels={"mean_f1": "Average f1", "index": "Params"}, error_y="std_f1")

    fig.show()
    return stats


def extract_features_and_labels(df):
    """
    Split a frame into the model inputs and the ``Potability`` target.

    The column index of ``df`` is printed first. ``df`` itself is left untouched.

    :param df: frame containing a ``Potability`` column
    :return: dict with ``"X"`` (every column except ``Potability``) and ``"y"``
        (the ``Potability`` column)
    """
    print(df.columns)
    labels = df.Potability
    features = df.drop(columns="Potability")
    return dict(X=features, y=labels)


In [32]:
# Each dataset is a (features/labels dict, display name) pair, the shape expected by computeModelStats.
# NOTE(review): the display names mention missing-value (MV) counts; confirm that they match
# the thresh values used to build df_1 and df_2.

# Columns kept by the "subset selection" variants.
subset_columns = ["ph", "Hardness", "Chloramines", "Sulfate", "Potability"]

# Full-width datasets, one per imputation strategy.
mean_accurracy_3_missing_values = (
    extract_features_and_labels(df_1),
    "Mean imputation for individuals with 3 MV",
)
mean_accurracy_2_3_missing_values = (
    extract_features_and_labels(df_2),
    "Mean imputation for individuals with 2 or 3 MV",
)
multivariate_feature_imputation = (
    extract_features_and_labels(df_3),
    "Multivariate feature imputation",
)
multivariate_stochastic_feature_imputation = (
    extract_features_and_labels(df_4),
    "Multivariate stochastic feature imputation",
)

# Same imputations, restricted to the selected subset of columns.
multivariate_feature_imputation_subset_selection = (
    extract_features_and_labels(df_3[subset_columns]),
    "Multivariate feature imputation with subset selection",
)
multivariate_stochastic_feature_imputation_subset_selection = (
    extract_features_and_labels(df_4[subset_columns]),
    "Multivariate stochastic feature imputation with subset selection",
)
mean_accurracy_2_3_missing_values_subset_selection = (
    extract_features_and_labels(df_2[subset_columns]),
    "Mean imputation subset selection for individuals with 2 or 3 MV",
)

Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
      dtype='object')
Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
      dtype='object')
Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
      dtype='object')
Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
      dtype='object')
Index(['ph', 'Hardness', 'Chloramines', 'Sulfate', 'Potability'], dtype='object')
Index(['ph', 'Hardness', 'Chloramines', 'Sulfate', 'Potability'], dtype='object')
Index(['ph', 'Hardness', 'Chloramines', 'Sulfate', 'Potability'], dtype='object')


# Random Forest
We know that random forests are usually tried last. However, we wanted to compare several datasets first: since random forests cope well with unscaled data, they will help us prune some datasets before moving on to the next models.

In [25]:
from sklearn.ensemble import RandomForestClassifier
# (estimator, display name) pair, the shape expected by computeModelStats.
# The seed is fixed so that the forest gives the same scores from one run to the next.
model = (RandomForestClassifier(random_state=0), "RandomForest Classifier")

# Hyper-parameter grid explored for the random forest (3 * 4 * 4 * 2 = 96 candidates).
pgrid = {"max_depth": [3, 6, 9],
         "n_estimators": [50, 80, 110, 130],
         "min_samples_split": [3, 6, 9, 12],
         "max_features": ["sqrt", "log2"]}


# Grid-search the random forest on the mean-imputed dataset built from df_1.
stats = computeModelStats(model, stats, mean_accurracy_3_missing_values, pgrid)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


In [26]:
# Grid-search the random forest on the mean-imputed dataset built from df_2.
stats = computeModelStats(
    model=model,
    stats=stats,
    dataset=mean_accurracy_2_3_missing_values,
    hyperParams=pgrid,
)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


In [27]:
# Grid-search the random forest on the column subset of the mean-imputed df_2 dataset.
stats = computeModelStats(
    model=model,
    stats=stats,
    dataset=mean_accurracy_2_3_missing_values_subset_selection,
    hyperParams=pgrid,
)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


In [29]:
# Grid-search the random forest on the iteratively imputed dataset built from df_3.
stats = computeModelStats(
    model=model,
    stats=stats,
    dataset=multivariate_feature_imputation,
    hyperParams=pgrid,
)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


In [30]:
# Grid-search the random forest on the stochastically imputed dataset built from df_4.
stats = computeModelStats(
    model=model,
    stats=stats,
    dataset=multivariate_stochastic_feature_imputation,
    hyperParams=pgrid,
)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


In [31]:
# Grid-search the random forest on the column subset of the iteratively imputed df_3 dataset.
stats = computeModelStats(
    model=model,
    stats=stats,
    dataset=multivariate_feature_imputation_subset_selection,
    hyperParams=pgrid,
)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


In [33]:
# Grid-search the random forest on the column subset of the stochastically imputed df_4 dataset.
stats = computeModelStats(
    model=model,
    stats=stats,
    dataset=multivariate_stochastic_feature_imputation_subset_selection,
    hyperParams=pgrid,
)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


In [34]:
def plot_params_incidence(model=None, dataset=None, params=None, barchart=False):
    """
    Plot the average test f1 score for each value of one hyper-parameter.

    Reads the module-level ``stats`` frame, keeps its "test" rows (optionally
    restricted to one model and/or one dataset), groups them by the values of the
    hyper-parameter ``params`` and shows the mean of ``mean_f1`` per value.

    :param model: (predictor, name) pair to filter on, or ``None`` for every model
    :param dataset: (data, name) pair to filter on, or ``None`` for every dataset
    :param params: name of the hyper-parameter to group by (required)
    :param barchart: ``True`` for a bar chart with the mean ``std_f1`` as error bars,
        ``False`` for a line chart
    :return: ``None``; the figure is displayed as a side effect
    :raises ValueError: if ``params`` is not given
    """
    import ast  # only needed here, to parse the stringified parameter dicts

    if params is None:
        raise ValueError("params must name the hyper-parameter to group by")

    selection = stats["type"] == "test"
    if dataset is not None:
        selection = selection & (stats["dataset"] == dataset[1])
    if model is not None:
        selection = selection & (stats["model"] == model[1])

    # stats stacks one block of rows per search, so its index restarts for each of them.
    # Renumber the rows before joining, otherwise a row would be paired with the
    # parameters of whichever search happens to share its index value.
    test_rows = stats[selection].reset_index(drop=True)

    # "params" was stored as the string form of a dict of plain literals; literal_eval
    # rebuilds the dict without executing arbitrary code the way eval would.
    param_columns = pd.json_normalize(test_rows["params"].apply(ast.literal_eval).tolist())

    per_value = (test_rows.join(param_columns)
                 .groupby([params])
                 .agg({"mean_f1": "mean", "std_f1": "mean"}))

    title = (f"Mean f1 score on testing depending on hyper parameter values <br>"
             f"(Model: {model[1] if model else 'All'}, Dataset: {dataset[1] if dataset else 'All'})")

    if barchart:
        fig = px.bar(per_value, y="mean_f1", title=title, error_y="std_f1")
    else:
        fig = px.line(per_value, y="mean_f1", title=title)
    fig.show()

In [35]:
# Average test f1 per max_depth value for the random forest, over every dataset recorded in stats.
plot_params_incidence(model=model,params = "max_depth", barchart=True)

In [36]:
# Average test f1 per n_estimators value for the random forest, over every dataset recorded in stats.
plot_params_incidence(model=model,params = "n_estimators", barchart=True)

In [37]:
# Average test f1 per min_samples_split value for the random forest, over every dataset recorded in stats.
plot_params_incidence(model=model,params = "min_samples_split", barchart=True)

In [38]:
# Average test f1 per max_features value for the random forest, over every dataset recorded in stats.
plot_params_incidence(model=model,params = "max_features", barchart=True)