In [121]:
%pip install plotly
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go

Note: you may need to restart the kernel to use updated packages.


In [122]:
df = pd.read_csv('drinking_water_potability.csv')
df

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890456,20791.31898,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.05786,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.54173,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.41744,8.059332,356.886136,363.266516,18.436525,100.341674,4.628771,0
4,9.092223,181.101509,17978.98634,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681736,47580.99160,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.80216,8.061362,,392.449580,19.903225,,2.798243,1
3273,9.419510,175.762646,33155.57822,7.350233,,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.86938,6.303357,,402.883113,11.168946,77.488213,4.708658,1


In [123]:
df["Missing"] = df.isna().any(axis=1)
dimensions = ["ph",
              "Hardness",
              "Solids",
              "Chloramines",
              "Sulfate",
              "Conductivity",
              "Organic_carbon",
              "Trihalomethanes",
              "Turbidity"]

# Gestion des valeurs manquantes : Simple Imputer


In [124]:
#from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer

In [125]:
# 1 : Suppression de toutes les lignes avec 2 ou 3 valeurs manquantes et imputation pour les lignes avec 1 valeur manquante
df_1 = df.dropna(thresh=10)
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(df_1)
df_1=pd.DataFrame(imp_mean.transform(df_1))

In [126]:
# 2 : Suppression des lignes avec 3 valeurs manquantes et imputation pour les autres
df_2 = df.dropna(thresh=9)
imp_mean.fit(df_2)
df_2=pd.DataFrame(imp_mean.transform(df_2))


### On se rend compte avec ces imputations, du fait que la méthode "most frequent value" n'est pas adaptée à notre dataset. en effet, la valeur la plus fréquente est un outlier (0 pour le ph par exemple). 
### De plus la médiane est équivalente à la moyenne.
### On se concentre donc sur la valeur moyenne et on abandonne la "most frequent value" et la médiane.

# Iterative Imputer

### Nous allons ici utiliser une imputation multivariable sur les données ayant max 2 valeurs manquantes.

In [127]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

df_3 = df.dropna(thresh=10)
imp_iter = IterativeImputer(random_state=0)
df_3 = pd.DataFrame(imp_iter.fit_transform(df_3))

# Stochastic regression - multivariate feature imputation

In [188]:
df_4 = df.dropna(thresh=10)
it_imp = IterativeImputer(sample_posterior=True)
df_4 = pd.DataFrame(it_imp.fit_transform(df_4))

In [129]:
#Dictionnaire qui contient les dataframes d'intérêt
all_df={"df_1" : df_1, "df_2" : df_2, 
               "df_3" : df_3, "df_4" : df_4}

for dataframe in all_df.values() :
    #on redonne d'abord les noms des colonnes qui ont disparu
    dataframe.columns=['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability',
       'Missing'] 
    dataframe.drop('Missing', axis=1, inplace=True)
    #fig = px.scatter(y=dataframe.loc[:,"ph"])
    dataframe.columns
    #fig.show()

In [196]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
stats = pd.DataFrame()
scores = []
for df in all_df.values():
    y = df['Potability']
    X = df.drop('Potability', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
    clf = RandomForestClassifier(max_depth=2, random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    scores.append(f1_score(y_test, y_pred, average='macro'))
scores
    

KeyError: 'Potability'

### A ce stade, on a donc x datasets dont les valeurs manquantes ont été imputées selon différentes méthodes. Nous allons évaluer nos modèles sur chacun de ces datasets et garder celui pour lesquels les résultats sont les plus probants.

# Gestion des outliers

### Deux méthodes sont possibles : éliminer tous les outliers, ou bien les imputer par les méthodes vues ci-dessus. Nous allons successivement essayer les deux.

In [170]:
from sklearn.neighbors import LocalOutlierFactor
clf = LocalOutlierFactor(n_neighbors=20)
df_test = df_4
df_test= df_test.drop(columns=['Solids', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'])
is_outlier = pd.DataFrame(clf.fit_predict(df_test))
df_test["is_outlier"] = is_outlier
print(df_test.shape)
df_5 = df_test.loc[df_test["is_outlier"] == 1] #on supprime tous les outliers
df_5 = df_5.drop("is_outlier", axis=1)
all_df['df_5'] = df_5
print(df_5.shape)

(3116, 5)
(3088, 4)


In [187]:
#all_df
is_outlier_ph = clf.fit_predict(np.array(df_test["ph"]).reshape(-1,1))
is_outlier_sulfate = clf.fit_predict(np.array(df_test["Sulfate"]).reshape(-1,1))
is_outlier_hardness = clf.fit_predict(np.array(df_test["Hardness"]).reshape(-1,1))
is_outlier_chloramines = clf.fit_predict(np.array(df_test["Chloramines"]).reshape(-1,1))
imputed_df_6 = df_test
print(df_test.shape)
sums= [sum(is_outlier_ph), sum(is_outlier_sulfate), sum(is_outlier_chloramines), sum(is_outlier_hardness)]
stats = pd.DataFrame()
stats["Nombre d'outliers"] = [(3116 - k )/2 for k in sums]
stats.index = ["ph", "Sulfate", "Chloramines", "Hardness"]
#stats
imputed_df_6["is_outlier_ph"] = is_outlier_ph
imputed_df_6["is_outlier_sulfate"] = is_outlier_sulfate
imputed_df_6["is_outlier_chloramines"] = is_outlier_chloramines
imputed_df_6["is_outlier_hardness"] = is_outlier_hardness
#imputed_df_6[imputed_df_6["is_outlier_ph"] == -1].loc[:,("ph")] = "N/A"
new_column = imputed_df_6[imputed_df_6["is_outlier_ph"] == -1].loc[:,'ph'] = "N/A"

(3116, 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
