In [2]:
%pip install plotly
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Load the raw water-quality measurements; the file name is resolved relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='drinking_water_potability.csv')
# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890456,20791.31898,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.05786,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.54173,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.41744,8.059332,356.886136,363.266516,18.436525,100.341674,4.628771,0
4,9.092223,181.101509,17978.98634,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681736,47580.99160,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.80216,8.061362,,392.449580,19.903225,,2.798243,1
3273,9.419510,175.762646,33155.57822,7.350233,,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.86938,6.303357,,402.883113,11.168946,77.488213,4.708658,1


In [4]:
# Flag each row that contains at least one missing value (True = incomplete row).
df["Missing"] = df.isna().any(axis="columns")

# Names of the nine measured water-quality variables (neither the Potability column
# nor the helper "Missing" flag is part of this list).
dimensions = [
    "ph", "Hardness", "Solids",
    "Chloramines", "Sulfate", "Conductivity",
    "Organic_carbon", "Trihalomethanes", "Turbidity",
]

# Gestion des valeurs manquantes : Simple Imputer


In [5]:
#from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer

In [6]:
# 1: Drop every row that has more than one missing value; the rows that are kept
# (at most 1 missing value) are imputed in the following cells.
# The NaNs are counted per row explicitly: a fixed `thresh=10` expresses the same rule
# only while df has exactly 11 columns (9 measurements + Potability + the helper
# "Missing" flag), and silently becomes "drop every incomplete row" otherwise.
df_missing_1 = df[df.isna().sum(axis=1) <= 1]

In [7]:
# 1.1: Mean imputation -- each NaN is replaced by the mean of its column.
# The frame is rebuilt from the imputer output without labels, so it carries default
# integer column names at this point.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
df_missing_11 = pd.DataFrame(imp_mean.fit_transform(df_missing_1))

#df_missing_11.head()

In [8]:
# 1.2: Median imputation -- each NaN is replaced by the median of its column.
# As with the mean variant, the rebuilt frame has default integer column names.
imp_median = SimpleImputer(strategy='median', missing_values=np.nan)
df_missing_12 = pd.DataFrame(imp_median.fit_transform(df_missing_1))


In [9]:
# 2: Drop only the rows with more than two missing values; every remaining row
# (0, 1 or 2 missing values) is imputed in the following cells.
# The NaNs are counted per row explicitly: a fixed `thresh=9` expresses the same rule
# only while df has exactly 11 columns (9 measurements + Potability + the helper
# "Missing" flag).
df_missing_2 = df[df.isna().sum(axis=1) <= 2]


In [10]:
# 2.1: Mean imputation on the second subset.
# This re-fits the existing imp_mean object, so from here on its learned column means
# come from df_missing_2.
df_missing_21 = pd.DataFrame(imp_mean.fit_transform(df_missing_2))


In [11]:
# 2.2: Median imputation on the second subset.
# This re-fits the existing imp_median object, so from here on its learned column
# medians come from df_missing_2.
df_missing_22 = pd.DataFrame(imp_median.fit_transform(df_missing_2))

In [12]:
# Dictionary holding the imputed dataframes of interest, keyed by their variable name.
all_df_missing = {
    "df_missing_11": df_missing_11,
    "df_missing_12": df_missing_12,
    "df_missing_21": df_missing_21,
    "df_missing_22": df_missing_22,
}

# Labels of the columns that went through the imputers, in their original order.
imputed_column_names = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
                        'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability',
                        'Missing']

for dataframe in all_df_missing.values():
    # The frames are modified in place on purpose: later cells access them through
    # their own variable names (e.g. df_missing_11), not only through this dictionary.
    # Both steps are guarded so that re-running the cell is a no-op instead of failing
    # with a length mismatch on frames whose "Missing" column was already dropped.
    if len(dataframe.columns) == len(imputed_column_names):
        # Restore the column names that were lost when the frames were rebuilt.
        dataframe.columns = imputed_column_names
    if 'Missing' in dataframe.columns:
        # The helper flag is not a measurement; remove it from the imputed data.
        dataframe.drop(columns='Missing', inplace=True)

On se rend compte, avec ces imputations, que la méthode "most frequent value" n'est pas adaptée à notre dataset. En effet, la valeur la plus fréquente est un outlier (0 pour le pH, par exemple).
On se concentre donc sur la moyenne et la médiane, et on abandonne la méthode "most frequent value".

In [13]:
# Spot-check one of the imputed frames: variant 2.2 (median imputation on the second subset).
checked_variant = "df_missing_22"
df_test = all_df_missing[checked_variant]
# First five rows, rendered by the notebook.
df_test.head(5)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,7.036752,204.890456,20791.31898,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0.0
1,3.71608,129.422921,18630.05786,6.635246,333.073546,592.885359,15.180013,56.329076,4.500656,0.0
2,8.099124,224.236259,19909.54173,9.275884,333.073546,418.606213,16.868637,66.420093,3.055934,0.0
3,8.316766,214.373394,22018.41744,8.059332,356.886136,363.266516,18.436525,100.341674,4.628771,0.0
4,9.092223,181.101509,17978.98634,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0.0


# Iterative Imputer

### Nous allons ici utiliser une imputation multivariable sur les données ayant max 2 valeurs manquantes.

In [38]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# Keep the rows with at most `max_missing_per_row` missing values.
# NOTE(review): a limit of 1 reproduces dropna(thresh=10) on the 11-column frame, whereas
# the section heading announces "max 2 valeurs manquantes" (that would be a limit of 2)
# -- confirm which one is intended before changing this constant.
max_missing_per_row = 1
rows_for_iterative = df[df.isna().sum(axis=1) <= max_missing_per_row]

# Impute from the measurement columns only. Feeding the target (Potability) and the
# helper "Missing" flag to the multivariate imputer would let the label drive the imputed
# feature values, i.e. leak the target into the predictors.
feature_columns = [col for col in rows_for_iterative.columns
                   if col not in ('Potability', 'Missing')]
imp_iter = IterativeImputer(random_state=0)
df_missing_3 = pd.DataFrame(
    imp_iter.fit_transform(rows_for_iterative[feature_columns]),
    columns=feature_columns,
)

# Re-attach the untouched target as the last column, as float, so the frame has the same
# layout as the other imputed frames.
# Assumes Potability itself has no missing values -- TODO confirm.
df_missing_3['Potability'] = rows_for_iterative['Potability'].to_numpy(dtype=float)
all_df_missing["df_missing_3"] = df_missing_3


## À ce stade, on a donc 5 datasets dont les valeurs manquantes ont été imputées selon différentes méthodes. Nous allons évaluer nos modèles sur chacun de ces datasets et garder celui pour lequel les résultats sont les plus probants.

# Best Subset Selection


In [15]:
# Subset Selection
from sklearn.preprocessing import scale 
from sklearn import model_selection
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.cross_decomposition import PLSRegression, PLSSVD
from sklearn.metrics import mean_squared_error
import itertools
import statsmodels.api as sm

# Some libraries for PCA visualization
import seaborn as sns 
#Make Plotly figure
import plotly.graph_objs as go
# Design matrix and target for the subset search, taken from the mean-imputed frame (variant 1.1).
# TODO (from the original author): point these at the right dataframe.
X = df_missing_11.drop(columns='Potability')
y = df_missing_11.loc[:, 'Potability']

def processSubset(feature_set):
    """Fit an OLS model of the global ``y`` on the selected columns of the global ``X``.

    No intercept column is added here: the design matrix is exactly
    ``X[list(feature_set)]``.

    Parameters
    ----------
    feature_set : iterable
        Column labels of ``X`` to use as predictors.

    Returns
    -------
    dict
        ``"model"``: the fitted results object; ``"RSS"``: the residual sum of squares
        computed on the same rows the model was fitted on.
    """
    design = X[list(feature_set)]
    regr = sm.OLS(y, design).fit()
    # In-sample residuals: predictions on the training design minus the observed target.
    residuals = regr.predict(design) - y
    return {"model": regr, "RSS": (residuals ** 2).sum()}

def getBest(k):
    """Return the best model among all subsets of exactly ``k`` predictors.

    Every combination of ``k`` columns of the global ``X`` is passed to
    ``processSubset``; the combination with the lowest RSS wins.

    Parameters
    ----------
    k : int
        Number of predictors in each candidate subset.

    Returns
    -------
    pandas.Series
        The winning row, with the entries ``"model"`` and ``"RSS"``.

    Raises
    ------
    ValueError
        If there is no combination of ``k`` columns (``k`` larger than the number of
        columns of ``X``).
    """
    results = [processSubset(combo) for combo in itertools.combinations(X.columns, k)]
    if not results:
        raise ValueError(
            "no subset of {} predictors can be built from {} columns".format(k, len(X.columns))
        )

    # Wrap everything up in a nice dataframe
    models = pd.DataFrame(results)

    # Choose the model with the LOWEST RSS. argmin() returns a position, so it is paired
    # with the positional indexer iloc rather than the label-based loc.
    best_model = models.iloc[models['RSS'].argmin()]

    print("Processed", models.shape[0], "models on", k, "predictors ")

    # Return the best model, along with some other useful information about the model
    return best_model


# Exhaustive search: with p predictors this fits 2**p - 1 OLS models in total, so it
# might take a while. As long as the star to the left of the cell is displayed, the cell
# is still running -- do not run it again.
models_best = pd.DataFrame(columns=["RSS", "model"])

# One row per subset size, indexed by the number of predictors. The upper bound is taken
# from X so the loop follows the design matrix instead of a hard-coded 9.
for i in range(1, X.shape[1] + 1):
    models_best.loc[i] = getBest(i)



Processed 9 models on 1 predictors 
Processed 36 models on 2 predictors 
Processed 84 models on 3 predictors 
Processed 126 models on 4 predictors 
Processed 126 models on 5 predictors 
Processed 84 models on 6 predictors 
Processed 36 models on 7 predictors 
Processed 9 models on 8 predictors 
Processed 1 models on 9 predictors 


In [16]:
# Regression report of the best model found with exactly five predictors.
five_predictor_fit = models_best.loc[5, "model"]
print(five_predictor_fit.summary())

                                 OLS Regression Results                                
Dep. Variable:             Potability   R-squared (uncentered):                   0.392
Model:                            OLS   Adj. R-squared (uncentered):              0.391
Method:                 Least Squares   F-statistic:                              400.8
Date:                Mon, 01 Nov 2021   Prob (F-statistic):                        0.00
Time:                        16:10:46   Log-Likelihood:                         -2187.0
No. Observations:                3116   AIC:                                      4384.
Df Residuals:                    3111   BIC:                                      4414.
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------

# Définition de la pipeline

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer


# Each step of the pipeline is identified by a key; any string can be used as the key:
#   'iterative_imputer' -> multivariate imputation of the missing values
#   'std_scaler'        -> standardisation of the features
#   'linreg'            -> linear model fitted by stochastic gradient descent
# Both the posterior-sampling imputer and SGD draw random numbers, so they are seeded
# to make the fitted model (and the error reported from it) reproducible across runs.
pipeline = Pipeline([
    ('iterative_imputer', IterativeImputer(sample_posterior=True, random_state=0)),
    ('std_scaler', StandardScaler()),
    ('linreg', SGDRegressor(random_state=0)),
])

# Train_test_split

In [18]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Fraction of the rows held out for evaluation.
test_size = 0.2

# The raw frame (NaNs included) is used here because the pipeline carries its own imputer.
# TODO (from the original author): point these at the right dataframe.
# Note: this rebinds the globals X and y that processSubset / getBest read.
# NOTE(review): only Potability is removed, so X still contains the helper "Missing"
# flag as a feature -- confirm that this is intended.
X = df.drop(columns='Potability')
y = df['Potability']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=0
)

# Fit every pipeline step on the training split only, then predict the held-out rows.
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)

# Root of the mean squared error between held-out targets and predictions.
mse = mean_squared_error(y_test, predictions)
print("The prediction error (RMSE) is {}".format(np.sqrt(mse)))


The prediction error (RMSE) is 0.4843710527056671
