In [101]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import itertools
import time
import pandas as pd
import numpy as np

In [102]:
df = pd.read_csv('drinking_water_potability.csv')
df["Missing"] = df.isna().sum(axis=1)
dimensions = ["ph",
              "Hardness",
              "Solids",
              "Chloramines",
              "Sulfate",
              "Conductivity",
              "Organic_carbon",
              "Trihalomethanes",
              "Turbidity"]

### Création datasets

In [103]:
#from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer

In [104]:
# 1 : Suppression de toutes les lignes avec 2 ou 3 valeurs manquantes et imputation pour les lignes avec 1 valeur manquante
df_missing_1 = df.dropna(thresh=10)

In [105]:
# 1.1 : Mean-imputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(df_missing_1)
df_missing_11=pd.DataFrame(imp_mean.transform(df_missing_1))

#df_missing_11.head()

In [106]:
# 1.2 : Median-imputer
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
imp_median.fit(df_missing_1)
df_missing_12=pd.DataFrame(imp_median.transform(df_missing_1))


In [107]:
# 2 : Suppression des lignes avec 3 valeurs manquantes et imputation pour les autres
df_missing_2 = df.dropna(thresh=9)


In [108]:
# 2.1 : Mean
imp_mean.fit(df_missing_2)
df_missing_21=pd.DataFrame(imp_mean.transform(df_missing_2))


In [109]:
# 2.2 : Median
imp_median.fit(df_missing_2)
df_missing_22=pd.DataFrame(imp_median.transform(df_missing_2))

In [110]:
#Dictionnaire qui contient les dataframes d'intérêt
all_df_missing={"df_missing_11" : df_missing_11,"df_missing_12" : df_missing_12, 
                  "df_missing_21" : df_missing_21, "df_missing_22" : df_missing_22}

for dataframe in all_df_missing.values() :
   #on redonne d'abord les noms des colonnes qui ont disparu
   dataframe.columns=['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability',
       'Missing']

In [111]:
for dataframe in all_df_missing.values() :   
   dataframe.drop('Missing', axis=1, inplace=True)
    #fig = px.scatter(y=dataframe.loc[:,"ph"])
   dataframe.columns
    #fig.show()

In [112]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

df_missing_3 = df.dropna(thresh=10)
imp_iter = IterativeImputer(random_state=0)
df_missing_3 = pd.DataFrame(imp_iter.fit_transform(df_missing_3))
df_missing_3.columns = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability',
       'Missing']
df_missing_3.drop('Missing', axis=1, inplace=True)
df_missing_1.drop('Missing', axis=1, inplace=True)
df_missing_2.drop('Missing', axis=1, inplace=True)
all_df_missing["df_missing_3"] = df_missing_3

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [113]:
df_missing_0 = df.dropna().drop(['Missing'],axis=1)
all_df_missing["df_missing_0"] = df_missing_0

# Iterative Imputer

### Nous allons ici utiliser une imputation multivariable sur les données ayant max 2 valeurs manquantes.

In [None]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

df_4 = df.dropna(thresh=10)
imp_iter = IterativeImputer(random_state=0)
df_4 = pd.DataFrame(imp_iter.fit_transform(df_4))

### Forward subset selection

#### Définition modeles

In [129]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# Build RF classifier to use in feature selection
SVM_rbf = SVC(C=10, gamma=0.1)
clf = RandomForestClassifier()

def mean_norm(df_input):
    return df_input.apply(lambda x: (x-x.mean())/ x.std(), axis=0)

In [115]:
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.model_selection import train_test_split

def forward_selection(model,X,y):

        if scale:
                X_scaled = mean_norm(X)
                X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.5, random_state=42)
        else:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

        # Build step forward feature selection
        sfs1 = sfs(model,
                k_features=9,
                forward=True,
                floating=False,
                verbose=2,
                scoring='f1',
                cv=5,
                n_jobs=-1)

         # Perform SFFS on train set
        feature_names = X.columns
        sfs1 = sfs1.fit(X_train, y_train,custom_feature_names=feature_names)

        return sfs1.get_metric_dict()

In [116]:
def impute_sfs_one_dataset(model, dataset):

    X = dataset.drop(['Potability'],axis=1)
    y = dataset.Potability
    
    results_dict = forward_selection(model, X, y)
    results_df = pd.DataFrame.from_dict(results_dict).transpose()
    results_df.sort_values('avg_score', inplace=True, ascending=False)
    
    return results_df

In [123]:
def impute_sfs_all_datasets(model, datasets, scale = False):

    best_results = pd.DataFrame(columns = ['Dataset_name', 'Feature_names', 'f1_score'])
    keys = list(all_df_missing.keys())
    i = 0

    for dataset in datasets.values():
        results_df = impute_sfs_one_dataset(model, dataset)
        results_df.reset_index(drop=True, inplace=True)
        carac_dataset = [keys[i], results_df.loc[0, 'feature_names'], results_df.loc[0, 'avg_score']]
        best_results.loc[i] = carac_dataset
        i += 1
        
    best_results.sort_values('f1_score', inplace=True, ascending=False)
    
    return best_results

In [120]:
best_results_clf = impute_sfs_all_datasets(clf, all_df_missing)
best_results_clf

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:   14.3s remaining:   17.9s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   16.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   16.3s finished

[2021-11-04 02:53:04] Features: 1/9 -- score: 0.443660706986964[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    3.3s remaining:    5.5s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    4.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    4.4s finished

[2021-11-04 02:53:09] Features: 2/9 -- score: 0.4304008232139417[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   7 | elapsed:    4.0s remaining:    3.0s
[Parallel(n_jobs=-1)]: Done   7 out of   7 | elapsed:    4.3s finished

[2021-11-04 02:53:13] Fe

Unnamed: 0,Dataset_name,Feature_names,f1_score
5,df_missing_0,"(ph, Hardness, Solids, Chloramines, Sulfate)",0.519706
0,df_missing_11,"(ph, Hardness, Solids, Chloramines, Sulfate, C...",0.515756
4,df_missing_3,"(ph, Hardness, Solids, Chloramines, Sulfate)",0.51164
1,df_missing_12,"(ph, Hardness, Solids, Chloramines, Sulfate, C...",0.507628
3,df_missing_22,"(ph, Hardness, Solids, Sulfate, Trihalomethanes)",0.490221
2,df_missing_21,"(ph, Hardness, Solids, Chloramines, Sulfate, T...",0.479521


In [130]:
best_results_svm = impute_sfs_all_datasets(SVM_rbf, all_df_missing, scale=True)
best_results_svm

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:    2.9s remaining:    3.7s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    4.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    4.5s finished

[2021-11-04 03:09:49] Features: 1/9 -- score: 0.3392013075536392[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    3.1s remaining:    5.3s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    3.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    3.8s finished

[2021-11-04 03:09:53] Features: 2/9 -- score: 0.4250796134975275[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   7 | elapsed:    2.3s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done   7 out of   7 | elapsed:    3.3s finished

[2021-11-04 03:09:56] F

Unnamed: 0,Dataset_name,Feature_names,f1_score
5,df_missing_0,"(ph, Chloramines, Sulfate, Organic_carbon)",0.455908
4,df_missing_3,"(ph, Chloramines, Sulfate, Turbidity)",0.452179
1,df_missing_12,"(ph, Chloramines, Sulfate, Organic_carbon, Tur...",0.451742
0,df_missing_11,"(ph, Sulfate, Trihalomethanes)",0.42795
2,df_missing_21,"(ph, Chloramines, Sulfate, Trihalomethanes)",0.42337
3,df_missing_22,"(ph, Chloramines, Sulfate, Trihalomethanes)",0.422363
