<table><tr>
<td> <img src="https://upload.wikimedia.org/wikipedia/fr/thumb/e/e5/Logo_%C3%A9cole_des_ponts_paristech.svg/676px-Logo_%C3%A9cole_des_ponts_paristech.svg.png" width="200"  height="200" hspace="200"/> </td>
<td> <img src="https://pbs.twimg.com/profile_images/1156541928193896448/5ihYIbCQ_200x200.png" width="200" height="200" /> </td>
</tr></table>

<br/>

<h1><center>Session 7 - Model Evaluation and Selection</center></h1>



<font size="3">This session is divided into **2** parts:
- **Model selection**
- **Model optimization:**
>  * 1-Feature selection
>  * 2-Hyperparameter optimization

In each of these parts, some **guidelines** and **hints** are given for each task. 
Do not hesitate to check the links to documentation to understand the functions you use. 
    
The goal of this session is to **select a model** that you will use as your best candidate and optimize it to get the best out of it.
</font>

# Session 5 modelling wrap up 

In [None]:
#basics
import pandas as pd
import numpy as np

#processing
from sklearn.model_selection import train_test_split

#metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics._regression import _check_reg_targets

#models
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [None]:
def load_dataset(path):
    """Read the raw CSV located at ``path`` and return it as a DataFrame.

    A short progress message is printed before the file is read.
    """
    print("loading raw data..")
    return pd.read_csv(path)

def clean_data(data, drop_2020=True):
    """Return a cleaned copy of ``data`` indexed by release date.

    Steps:
    - drop every row that contains a missing value;
    - optionally drop the rows whose ``year`` column equals 2020;
    - parse ``release_date`` as datetimes, sort chronologically and use it
      as the index;
    - drop the helper columns ``index``, ``release_date`` and ``year`` when
      they are present.

    Parameters
    ----------
    data : pd.DataFrame
        Raw dataset; must contain a ``release_date`` column (and a ``year``
        column when ``drop_2020`` is true).
    drop_2020 : bool, default True
        Whether to remove the rows of year 2020.

    Returns
    -------
    pd.DataFrame
        New frame with an index named ``release_date``; the input frame is
        left untouched.
    """
    print("cleaning data..")
    cleaned = data.dropna()
    if drop_2020:
        cleaned = cleaned.query("year != 2020")
    # Parse the dates BEFORE sorting: sorting the raw strings is only
    # chronological for ISO-formatted dates. ``assign`` builds a new frame,
    # which also avoids writing into a filtered view of the input.
    cleaned = (
        cleaned
        .assign(release_date=pd.to_datetime(cleaned["release_date"]))
        .sort_values(by="release_date")
        .set_index("release_date")
    )
    return cleaned.drop(columns=["index", "year"], errors="ignore")

def train_test_split_by_date(df: pd.DataFrame, split_date: str):
    """Split a date-indexed dataset around a split date in format "YYYY-MM-DD".

    - train: [:split_date[  (strictly before the split date)
    - test:  [split_date:]  (split date and later)

    Parameters
    ----------
    df : pd.DataFrame
        Dataset whose index holds datetimes.
    split_date : str
        First date that belongs to the test set.

    Returns
    -------
    tuple of pd.DataFrame
        ``(train, test)`` as independent copies; every row of ``df`` ends up
        in exactly one of the two.
    """
    cutoff = pd.Timestamp(split_date)
    # Boolean masks give a true half-open split. Label slicing with
    # ``.loc[:split_date]`` is inclusive, so rows dated exactly on the split
    # date would otherwise appear in both train and test.
    in_test = df.index >= cutoff
    train = df.loc[~in_test].copy()
    test = df.loc[in_test].copy()
    return train, test

def get_x_y(dataset):
    """Separate a dataset into model inputs and the ``sales`` target.

    Returns a ``(features, target)`` pair: ``features`` is ``dataset``
    without the ``sales`` column and ``target`` is that column cast to float.
    """
    y = dataset.sales.astype(float)
    x = dataset.drop(columns=['sales'], errors='ignore')
    return x, y
          
def transform_target(target, forward = True):
    """Apply the log transform (or its inverse) to every target value.

    ``forward == True`` applies the natural logarithm; any other value
    applies the exponential. The result is a plain Python list with one
    entry per element of ``target``.
    """
    # The explicit ``== True`` comparison (rather than truthiness) is kept so
    # that non-boolean flags select the same branch as before.
    elementwise = np.log if forward == True else np.exp
    return [elementwise(value) for value in target]
          
def get_evaluation_metrics(y_test, y_pred, y_train=None) -> dict:
    """Compute the regression metrics used to compare models.

    Parameters
    ----------
    y_test : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_test``.
    y_train : array-like, optional
        Unused; kept so existing callers keep working.

    Returns
    -------
    dict
        ``{'mape': ..., 'rmse': ..., 'mae': ...}`` with MAPE in percent.
    """
    # RMSE is taken as the square root of the MSE instead of passing
    # ``squared=False``: that keyword is deprecated in recent scikit-learn
    # releases and rejected by the newest ones. For a single-output target
    # the two forms give the same value.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    metrics = {
        'mape': mean_absolute_percentage_error(y_test, y_pred),
        'rmse': rmse,
        'mae': mean_absolute_error(y_test, y_pred),
    }
    return metrics

def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error, expressed in percent.

    Each absolute error is divided by the corresponding true value, the
    ratios are averaged, and the mean is multiplied by 100.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return relative_errors.mean() * 100

def prettify_metrics(metrics: dict) -> str:
    """Render a metrics dictionary as a small printable report.

    The report starts with an ``Evaluation:`` header and a dashed rule,
    followed by one ``- NAME: value`` line per metric (name upper-cased,
    value rounded to 2 decimals), and ends with a newline.
    """
    header = f"Evaluation:\n{'-'*10}"
    rows = [f'- {name.upper()}: {round(metric, 2)}' for name, metric in metrics.items()]
    return '\n'.join([header, *rows]) + '\n'

def train(lr, features, target, transformer = None):
    """Fit ``lr`` on the training data and print its training metrics.

    Parameters
    ----------
    lr : estimator
        Object exposing ``fit(features, target)`` and ``predict(features)``.
    features : array-like
        Training inputs.
    target : array-like
        Training target in its original scale.
    transformer : callable, optional
        Called as ``transformer(values, forward=True)`` on the target before
        fitting and as ``transformer(values, forward=False)`` on the
        predictions, so that metrics are computed in the original scale.

    Returns
    -------
    estimator
        The value returned by ``lr.fit``.
    """
    print(f"start fitting a {lr.__class__}...")
    # Always fit the model: previously the fit only ran when a transformer
    # was supplied, so ``predict`` was called on an unfitted estimator
    # whenever ``transformer`` was None.
    fit_target = transformer(target, forward = True) if transformer else target
    lr = lr.fit(features, fit_target)
    predicted_target = lr.predict(features)
    if transformer:
        predicted_target = transformer(predicted_target, forward= False)
    print(prettify_metrics(get_evaluation_metrics(target, predicted_target)))

    return lr

def evaluate(lr, features, target, transformer=None, ret=False):
    """Print the evaluation report of a fitted model on a dataset.

    Predictions from ``lr.predict(features)`` are mapped back with
    ``transformer(..., forward=False)`` when a transformer is given, then
    scored against ``target``. The metrics dictionary is returned only when
    ``ret`` equals True; otherwise the function returns None.
    """
    predictions = lr.predict(features)
    if transformer:
        predictions = transformer(predictions, forward= False)

    scores = get_evaluation_metrics(target, predictions)
    print(prettify_metrics(scores))
    return scores if ret==True else None

# Model Selection

## Q1 - Train/Test --> Train/Validation/Test

#### What if we want to use 2020 as testing set ?
Update the split function so that we can keep 2020 as testing set
> -  Modify the function **train_test_split_by_date** so that it produces a **train/validation/test** split
> -  Hint: add another date split to the function or use the function twice
> - Your function should return 3 datasets

In [None]:
def train_test_split_by_date(df: pd.DataFrame, split_date: str, test_split_date: str):
    """Split a date-indexed dataset into train / validation / test sets.

    Dates are given in format "YYYY-MM-DD" and the three sets are disjoint:
    - train:      [:split_date[
    - validation: [split_date:test_split_date[
    - test:       [test_split_date:]

    Parameters
    ----------
    df : pd.DataFrame
        Dataset whose index holds datetimes.
    split_date : str
        First date of the validation set.
    test_split_date : str
        First date of the test set; must not be earlier than ``split_date``.

    Returns
    -------
    tuple of pd.DataFrame
        ``(train, validation, test)`` as independent copies.

    Raises
    ------
    ValueError
        If ``test_split_date`` is earlier than ``split_date``.
    """
    validation_start = pd.Timestamp(split_date)
    test_start = pd.Timestamp(test_split_date)
    if test_start < validation_start:
        raise ValueError("test_split_date must not be earlier than split_date")

    # Boolean masks keep the intervals half-open so no row is shared
    # between two sets (label slicing with .loc would include both bounds).
    before_validation = df.index < validation_start
    from_test = df.index >= test_start

    train = df.loc[before_validation].copy()
    validation = df.loc[~before_validation & ~from_test].copy()
    test = df.loc[from_test].copy()

    return train, validation, test

## Model training

In [None]:
# NOTE(review): absolute, machine-specific path — point it at your local copy of the CSV before running.
path = '/Users/yaguethiam/Ponts/data_prepared_ponts_v4.csv'
# Read the raw CSV into a DataFrame (load_dataset wraps pd.read_csv).
raw_data = load_dataset(path)

In [None]:
# Keep the 2020 rows (drop_2020=False): they are the ones selected as the
# test set by the second split date below. With the default drop_2020=True
# they would be removed during cleaning.
data = clean_data(raw_data, drop_2020=False)
train_data, validation_data, test_data = train_test_split_by_date(data,
                                                                  '2018-01-01',
                                                                  '2020-01-01')
# Separate features and target for each of the three sets.
train_x, train_y = get_x_y(train_data)
validation_x, validation_y = get_x_y(validation_data)
test_x, test_y = get_x_y(test_data)

# Models training
> - run a linear regression, a Random Forest and a LGBM on the dataset without testing
> - which metric is the most representative for our problem?
> - compare the models
> - select the best model

#### Linear regression

In [None]:
# Instantiate the learner, then fit it on the log-transformed target.
learner = LinearRegression()
lr = train(learner, train_x, train_y, transformer = transform_target)
print("Evaluate on validation set ...")
evaluate(lr, validation_x, validation_y, transformer = transform_target)

#### Random Forest 

#### LGBM Regressor

### What can you say about the performance of the different models on the 2 datasets?

## What are your thoughts?
> - Which model is overfitting ?
> - Which model is underfitting?
> - Which model is the best?

### Train the selected model, validate and test

# Model Optimization

> From this step onward we will only focus on the MAPE to monitor the performance of our model

In [None]:
def get_evaluation_metrics(y_test, y_pred, y_train=None) -> dict:
    """Return the evaluation metrics restricted to the MAPE.

    The result is a one-entry dictionary ``{'mape': value}``; ``y_train`` is
    accepted but not used.
    """
    mape = mean_absolute_percentage_error(y_test, y_pred)
    return {'mape': mape}

## 1 - Feature selection: importance-based

In [None]:
#save the importances in a dataframe in descending order (from most important to least important)
#plot the feature importance for the selected model 

In [None]:
# Loop on all features from 1 to n and keep track of the MAPE

In [None]:
#plot MAPE vs number of features 
#what do you observe

In [None]:
# What are the top k features given the MAPE on the validation set?

In [None]:
#Run the model with the K best features and check the results

## 2 - Select the best Hyperparameters

In this part we will implement the grid search hyperparameter algorithm from scratch

#### Grid Search from scratch

In [None]:
from sklearn.model_selection import ParameterGrid

In [None]:
#print lgbm parameters
#hint: to explore other attributes, type `lgbm_learner.` in a scratch cell and hit tab after the dot
# get_params() returns the estimator's hyperparameters as a dict; as the last
# expression of the cell it is displayed as the cell output.
lgbm_learner.get_params()

In [None]:
# Search space for the grid search: 2 hyperparameters with 3 candidate values
# each, i.e. 3 x 3 = 9 combinations once expanded.
param_grid = {'n_estimators': [50, 100, 150],
              'num_leaves': [27, 31, 35]}

In [None]:
#use ParameterGrid to print all the combinations of hyperparameters

In [None]:
## loop over this list of hyperparameter combinations and find the best one by logging the MAPE

In [None]:
# Run a model with the best set of features 

# Try other hyperparameter search algorithms

#### Randomized search

In [None]:
# how would you introduce randomization into the search?

#### Bayesian search