# Exercice 4

## Load data with the parameters kept on exercice 2

In [27]:
import pandas as pd
df_wind = pd.read_csv('df_wind_reduced.csv')

In [None]:
df_wind.columns

## K-fold cross validation

### Preprocessing

In [29]:
from sklearn.model_selection import train_test_split

X = df_wind[[col for col in df_wind.columns if col != 'Wind_speed']]
y = df_wind['Wind_speed']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Training

In [30]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

pipe_knn = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("model", KNeighborsRegressor())])

pipe_lr = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("model", LinearRegression())])

pipe_ridge = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("model", Ridge())])

pipe_lasso = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("model", Lasso())])


grid_knn = {
    'model__n_neighbors' : [k for k in range(1, 70)]
}

grid_lr = {}

grid_ridge = {
    'model__alpha' : [1e-9, 1e-6, 1e-3, 1, 10, 100, 1000],
    'model__max_iter' : [10, 100, 1000, 10000, 100000, 1000000]
}

grid_lasso = {
    'model__alpha' : [1e-9, 1e-6, 1e-3, 1, 10, 100, 1000],
    'model__max_iter' : [10, 100, 1000, 10000, 100000, 1000000]
}

models_name = ['K-NN', 'Linear regression', 'Ridge', 'Lasso']
grids = [grid_knn,grid_lr, grid_ridge, grid_lasso]

pipelines = []
models = [
    GridSearchCV(pipe_knn, grid_knn, cv=10, n_jobs=-1, scoring='neg_mean_squared_error'),
    GridSearchCV(pipe_lr, grid_lr, cv=10, n_jobs=-1),
    GridSearchCV(pipe_ridge, grid_ridge, cv=10, n_jobs=-1, scoring='neg_mean_squared_error'),
    GridSearchCV(pipe_lasso, grid_lasso, cv=10, n_jobs=-1, scoring='neg_mean_squared_error'),
]


In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
import numpy as np

import warnings
warnings.filterwarnings('ignore')

results = pd.DataFrame(columns=["Model", "MAPE", "r2", "MSE"])

for i, (model_name, model, grid) in enumerate(zip(models_name, models, grids)):
    print(f"Training {model_name}")
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mse = np.sqrt(mean_squared_error(y_test, y_pred))

    results.loc[i] = [model_name, f"{mape:.3f}", f"{r2:.3f}", f"{mse:.2f}"]

In [32]:
for model_name, model in zip(models_name, models):
    print(f"Best parameters for {model_name}: {model.best_params_}")

Best parameters for K-NN: {'model__n_neighbors': 60}
Best parameters for Linear regression: {}
Best parameters for Ridge: {'model__alpha': 0.001, 'model__max_iter': 10}
Best parameters for Lasso: {'model__alpha': 1e-06, 'model__max_iter': 10}


In [33]:
print(results.to_string(index=False))

            Model                MAPE    r2  MSE
             K-NN 178501472658740.062 0.375 1.03
Linear regression 223685969094966.375 0.145 1.21
            Ridge 223686003346738.250 0.145 1.21
            Lasso 223687068686517.375 0.145 1.21


We got a very high mape score and a very bad r2 so the training is not relevant here.

## TimeSeriesSplit cross validation

### Preprocessing (T = 7)

In [34]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.linear_model import LinearRegression

nb_data = df_wind.shape[0]
tscv_7 = TimeSeriesSplit(n_splits=nb_data//24//14, test_size=24)

### Training

In [35]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

pipe_knn_7 = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("model", KNeighborsRegressor())])

pipe_lr_7 = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("model", LinearRegression())])

pipe_ridge_7 = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("model", Ridge())])

pipe_lasso_7 = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("model", Lasso())])


grid_knn = {
    'model__n_neighbors' : [k for k in range(1, 70, 5)]
}

grid_lr = {}

grid_ridge = {
    'model__alpha' : [1e-9, 1e-6, 1e-3, 1, 10, 100, 1000],
    'model__max_iter' : [10, 100, 1000, 10000, 100000, 1000000]
}

grid_lasso = {
    'model__alpha' : [1e-9, 1e-6, 1e-3, 1, 10, 100, 1000],
    'model__max_iter' : [10, 100, 1000, 10000, 100000, 1000000]
}

models_name = ['K-NN', 'Linear regression', 'Ridge', 'Lasso']
grids = [grid_knn,grid_lr, grid_ridge, grid_lasso]

pipelines = []
models = [
    GridSearchCV(pipe_knn_7, grid_knn, cv=tscv_7, n_jobs=-1, scoring='neg_mean_squared_error'),
    GridSearchCV(pipe_lr_7, grid_lr, cv=tscv_7, n_jobs=-1),
    GridSearchCV(pipe_ridge_7, grid_ridge, cv=tscv_7, n_jobs=-1, scoring='neg_mean_squared_error'),
    GridSearchCV(pipe_lasso_7, grid_lasso, cv=tscv_7, n_jobs=-1, scoring='neg_mean_squared_error'),
]


In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
import numpy as np

import warnings
warnings.filterwarnings('ignore')

results = pd.DataFrame(columns=["Model", "MAPE", "r2", "MSE"])

for i, (model_name, model, grid) in enumerate(zip(models_name, models, grids)):
    print(f"Training {model_name}")
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mse = np.sqrt(mean_squared_error(y_test, y_pred))

    results.loc[i] = [model_name, f"{mape:.3f}", f"{r2:.3f}", f"{mse:.2f}"]

In [37]:
for model_name, model in zip(models_name, models):
    print(f"Best parameters for {model_name}: {model.best_params_}")

Best parameters for K-NN: {'model__n_neighbors': 56}
Best parameters for Linear regression: {}
Best parameters for Ridge: {'model__alpha': 1e-09, 'model__max_iter': 10}
Best parameters for Lasso: {'model__alpha': 1e-09, 'model__max_iter': 10}


In [38]:
print(results.to_string(index=False))

            Model                MAPE    r2  MSE
             K-NN 178742444994287.250 0.374 1.04
Linear regression 223685969094966.375 0.145 1.21
            Ridge 223685969095000.875 0.145 1.21
            Lasso 223685942694162.000 0.145 1.21


We don't have better results here. The prediction of the field Wind_speed is hard and require a deeper analysis of the variables and the data.

### Preprocessing (T = 14)

In [39]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.linear_model import LinearRegression

nb_data = df_wind.shape[0]
tscv_14 = TimeSeriesSplit(n_splits=nb_data//24//14, test_size=24)

### Training

In [40]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

pipe_knn_14 = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("model", KNeighborsRegressor())])

pipe_lr_14 = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("model", LinearRegression())])

pipe_ridge_14 = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("model", Ridge())])

pipe_lasso_14 = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("model", Lasso())])


grid_knn = {
    'model__n_neighbors' : [k for k in range(1, 70, 5)]
}

grid_lr = {}

grid_ridge = {
    'model__alpha' : [1e-9, 1e-6, 1e-3, 1, 10, 100, 1000],
    'model__max_iter' : [10, 100, 1000, 10000, 100000, 1000000]
}

grid_lasso = {
    'model__alpha' : [1e-9, 1e-6, 1e-3, 1, 10, 100, 1000],
    'model__max_iter' : [10, 100, 1000, 10000, 100000, 1000000]
}

models_name = ['K-NN', 'Linear regression', 'Ridge', 'Lasso']
grids = [grid_knn,grid_lr, grid_ridge, grid_lasso]

pipelines = []
models = [
    GridSearchCV(pipe_knn_14, grid_knn, cv=tscv_14, n_jobs=-1, scoring='neg_mean_squared_error'),
    GridSearchCV(pipe_lr_14, grid_lr, cv=tscv_14, n_jobs=-1),
    GridSearchCV(pipe_ridge_14, grid_ridge, cv=tscv_14, n_jobs=-1, scoring='neg_mean_squared_error'),
    GridSearchCV(pipe_lasso_14, grid_lasso, cv=tscv_14, n_jobs=-1, scoring='neg_mean_squared_error'),
]


In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
import numpy as np

import warnings
warnings.filterwarnings('ignore')

results = pd.DataFrame(columns=["Model", "MAPE", "r2", "MSE"])

for i, (model_name, model, grid) in enumerate(zip(models_name, models, grids)):
    print(f"Training {model_name}")
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mse = np.sqrt(mean_squared_error(y_test, y_pred))

    results.loc[i] = [model_name, f"{mape:.3f}", f"{r2:.3f}", f"{mse:.2f}"]

In [42]:
for model_name, model in zip(models_name, models):
    print(f"Best parameters for {model_name}: {model.best_params_}")

Best parameters for K-NN: {'model__n_neighbors': 56}
Best parameters for Linear regression: {}
Best parameters for Ridge: {'model__alpha': 1e-09, 'model__max_iter': 10}
Best parameters for Lasso: {'model__alpha': 1e-09, 'model__max_iter': 10}


In [43]:
print(results.to_string(index=False))

            Model                MAPE    r2  MSE
             K-NN 178742444994287.250 0.374 1.04
Linear regression 223685969094966.375 0.145 1.21
            Ridge 223685969095000.875 0.145 1.21
            Lasso 223685942694162.000 0.145 1.21


### Preprocessing (T = 21)

In [44]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.linear_model import LinearRegression

nb_data = df_wind.shape[0]
tscv_21 = TimeSeriesSplit(n_splits=nb_data//24//14, test_size=24)

### Training

In [45]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

pipe_knn_21 = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("model", KNeighborsRegressor())])

pipe_lr_21 = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("model", LinearRegression())])

pipe_ridge_21 = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("model", Ridge())])

pipe_lasso_21 = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("model", Lasso())])


grid_knn = {
    'model__n_neighbors' : [k for k in range(1, 70, 5)]
}

grid_lr = {}

grid_ridge = {
    'model__alpha' : [1e-9, 1e-6, 1e-3, 1, 10, 100, 1000],
    'model__max_iter' : [10, 100, 1000, 10000, 100000, 1000000]
}

grid_lasso = {
    'model__alpha' : [1e-9, 1e-6, 1e-3, 1, 10, 100, 1000],
    'model__max_iter' : [10, 100, 1000, 10000, 100000, 1000000]
}

models_name = ['K-NN', 'Linear regression', 'Ridge', 'Lasso']
grids = [grid_knn,grid_lr, grid_ridge, grid_lasso]

pipelines = []
models = [
    GridSearchCV(pipe_knn_21, grid_knn, cv=tscv_21, n_jobs=-1, scoring='neg_mean_squared_error'),
    GridSearchCV(pipe_lr_21, grid_lr, cv=tscv_21, n_jobs=-1),
    GridSearchCV(pipe_ridge_21, grid_ridge, cv=tscv_21, n_jobs=-1, scoring='neg_mean_squared_error'),
    GridSearchCV(pipe_lasso_21, grid_lasso, cv=tscv_21, n_jobs=-1, scoring='neg_mean_squared_error'),
]

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
import numpy as np

import warnings
warnings.filterwarnings('ignore')

results = pd.DataFrame(columns=["Model", "MAPE", "r2", "MSE"])

for i, (model_name, model, grid) in enumerate(zip(models_name, models, grids)):
    print(f"Training {model_name}")
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mse = np.sqrt(mean_squared_error(y_test, y_pred))

    results.loc[i] = [model_name, f"{mape:.3f}", f"{r2:.3f}", f"{mse:.2f}"]

In [47]:
for model_name, model in zip(models_name, models):
    print(f"Best parameters for {model_name}: {model.best_params_}")

Best parameters for K-NN: {'model__n_neighbors': 56}
Best parameters for Linear regression: {}
Best parameters for Ridge: {'model__alpha': 1e-09, 'model__max_iter': 10}
Best parameters for Lasso: {'model__alpha': 1e-09, 'model__max_iter': 10}


In [48]:
print(results.to_string(index=False))

            Model                MAPE    r2  MSE
             K-NN 178742444994287.250 0.374 1.04
Linear regression 223685969094966.375 0.145 1.21
            Ridge 223685969095000.875 0.145 1.21
            Lasso 223685942694162.000 0.145 1.21


Were are supposed to have better results with T = 7 because we predict with less time. Or we can imagine that as far as we are from the day of evaluation, the train set is less and less relevant.

We don't have better results here. The prediction of the field Wind_speed is hard and require a deeper analysis of the variables and the data.