## Imports

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#Scaleres
from sklearn.preprocessing import RobustScaler

#train
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

#Imputer
from sklearn.impute import SimpleImputer

#PipeLine
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

#Model
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

#Grid Search
from sklearn.model_selection import GridSearchCV

In [4]:
cooked_data = pd.read_pickle("/home/ecapi/code/sebvey/water_pollution/cooked_data/2011_2021_pc_saone_df.pickle")

##### cooked data

In [7]:
cooked_data.head(5)

Unnamed: 0,1295,1301,1302,1303,1305,1311,1312,1313,1314,1319,1350,1335,1339,1340,1433,1841,1342
0,2.4,6.6,8.2,70.0,2.0,10.7,90.0,0.9,6.5,1.0,0.04,0.05,0.02,4.5,0.09,1.4,8.908333
1,7.9,2.0,7.1,54.0,2.6,14.1,104.0,1.1,5.1,1.0,0.02,0.05,0.02,4.3,0.05,2.3,8.908333
2,17.0,6.7,8.4,58.0,11.0,13.1,108.0,2.8,9.7,1.1,0.03,0.08,0.02,3.4,0.02,2.6,8.908333
3,17.0,7.5,5.29,57.0,11.0,11.7,100.0,2.8,9.7,1.1,0.03,0.08,0.02,3.4,0.02,2.6,8.908333
4,13.0,12.1,7.7,60.0,6.8,9.0,85.0,0.7,7.8,1.0,0.03,0.05,0.02,2.7,0.03,3.5,8.908333


In [6]:
cooked_data.shape

(2043, 17)

## Create DF

In [3]:
data = pd.read_csv("/home/ecapi/code/sebvey/water_pollution/raw_data/2011_2021_caluire_phys_v0.csv")

In [4]:
data.DatePrel = pd.to_datetime(data.DatePrel)

In [5]:
df = data.set_index("DatePrel")

## Features and Data selection

In [6]:
X = df.drop(columns="1340") #drop the target
y = df["1340"] # Series of the target

## Train, test split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Linear Regresion

## Linear PipeLine

In [10]:
numerical_transformer = Pipeline([
    ("numerical_impute", SimpleImputer(strategy='mean')),
    ("scaler", RobustScaler()) #take out the outliers 
    ])

preprocessor = ColumnTransformer([
    ("num_transformer", numerical_transformer)
])

pipe_linear = Pipeline([
    ("num-transformer", numerical_transformer),
    ("model", LinearRegression())
])

#Initiate the model
baseline_linear_pipe = pipe_linear

## model test function

In [57]:
def model_testing(pipe_line, X, y, cv=5, scoring="r2"):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    
    test_model = pipe_line
    test_model.fit(X_train, y_train)
    test_score = test_model.score(X_test, y_test)
    
    cross_val_results = cross_val_score(test_model, X, y, cv=cv, scoring=scoring)
    
    
    
    return print(f"Results: \nCross Val Score: {cross_val_results} \nScore Baseline: {cross_val_results.mean()} \nModel Score: {test_score}\nScoring={scoring} & CV={cv}")
    

In [58]:
model_testing(baseline_linear_pipe,X,y,)

Results: 
Cross Val Score: [-1.99120883  0.44430933  0.39767356  0.78722584  0.63916276] 
Score Baseline: 0.05543253258729384 
Model Score: 0.6053762999229704
Scoring=r2 & CV=5


In [64]:
# Scoring RMSE for prediction with the same target value
model_testing(baseline_linear_pipe,X,y,scoring="neg_root_mean_squared_error")

Results: 
Cross Val Score: [-3.28967638 -2.00772747 -2.78438924 -2.12505462 -2.23377853] 
Score Baseline: -2.4881252465763994 
Model Score: 0.5403923165111582
Scoring=neg_root_mean_squared_error & CV=5


In [37]:
cv_results = cross_validate(baseline_linear_pipe, X, y, cv=3, 
                            scoring=['max_error',
                                     'r2', 
                                     'neg_mean_absolute_error',
                                     'neg_mean_squared_error']
                           )
pd.DataFrame(cv_results)

Unnamed: 0,fit_time,score_time,test_max_error,test_r2,test_neg_mean_absolute_error,test_neg_mean_squared_error
0,0.012134,0.00436,-10.010241,-0.478564,-2.386658,-9.764184
1,0.009854,0.002923,-8.534862,0.339298,-1.969927,-6.449402
2,0.00664,0.002568,-7.069878,0.645312,-2.046151,-7.161412


can't use an AdaBooster nor a forestregressor or a decision tree

# Gradient Boosting Model

## Gradient Boosting pipe

In [71]:
numerical_transformer_grad = Pipeline([
    ("numerical_impute", SimpleImputer(strategy='mean')),
    ("scaler", RobustScaler()) #take out the outliers 
    ])

preprocessor_grad = ColumnTransformer([
    ("num_transformer", numerical_transformer_grad)
])

pipe_gradient = Pipeline([
    ("num-transformer", numerical_transformer_grad),
    ("model", GradientBoostingRegressor())
])

#Initiate the model
baseline_gradient_pipe = pipe_gradient

## model test function

In [79]:
model_testing(baseline_gradient_pipe,X,y)

Results: 
Cross Val Score: [-0.08376101  0.22653292  0.59749264  0.71614535  0.72734861] 
Score Baseline: 0.4367517028364839 
Model Score: 0.560632744468252
Scoring=r2 & CV=5


In [73]:
model_testing(baseline_gradient_pipe,X,y,scoring="neg_root_mean_squared_error")

Results: 
Cross Val Score: [-1.94677258 -2.37450756 -2.50263868 -2.42576642 -1.98433367] 
Score Baseline: -2.2468037839508153 
Model Score: 0.5813658524431594
Scoring=neg_root_mean_squared_error & CV=5


# KNN

In [86]:
numerical_transformer_KNN = Pipeline([
    ("numerical_impute", SimpleImputer(strategy='mean')),
    ("scaler", RobustScaler()) #take out the outliers 
    ])

preprocessor_KNN = ColumnTransformer([
    ("num_transformer", numerical_transformer_KNN)
])

pipe_KNN = Pipeline([
    ("num-transformer", numerical_transformer_KNN),
    ("model", KNeighborsRegressor())
])

#Initiate the model
baseline_KNN_pipe = pipe_KNN

In [88]:
model_testing(baseline_KNN_pipe,X,y)

Results: 
Cross Val Score: [-0.70076038  0.22945237  0.24103641  0.62561658  0.32164065] 
Score Baseline: 0.14339712667068388 
Model Score: 0.5646174355688313
Scoring=r2 & CV=5


In [89]:
model_testing(baseline_KNN_pipe,X,y, scoring="neg_root_mean_squared_error")

Results: 
Cross Val Score: [-2.48056935 -2.36422034 -3.12553563 -2.8188305  -3.06276972] 
Score Baseline: -2.7703851096791245 
Model Score: 0.6215453917366439
Scoring=neg_root_mean_squared_error & CV=5


# SVR

In [93]:
numerical_transformer_svr = Pipeline([
    ("numerical_impute", SimpleImputer(strategy='mean')),
    ("scaler", RobustScaler()) #take out the outliers 
    ])

preprocessor_svr = ColumnTransformer([
    ("num_transformer", numerical_transformer_svr)
])

pipe_svr = Pipeline([
    ("num-transformer", numerical_transformer_svr),
    ("model", SVR())
])

#Initiate the model
baseline_svr_pipe = pipe_svr

In [94]:
model_testing(baseline_svr_pipe, X,y)

Results: 
Cross Val Score: [-0.21767033 -0.20983572 -0.07914441 -0.09657258 -0.10180656] 
Score Baseline: -0.141005918878336 
Model Score: 0.005408605540370481
Scoring=r2 & CV=5


In [95]:
model_testing(baseline_svr_pipe,X,y,scoring="neg_root_mean_squared_error")

Results: 
Cross Val Score: [-2.09891413 -2.96245259 -3.72695171 -4.82424494 -3.90334655] 
Score Baseline: -3.503181984374547 
Model Score: -0.0664076470698094
Scoring=neg_root_mean_squared_error & CV=5
