### Introduction
This notebook is designed to showcase the usage of the ML trainin workflow, while does not seek to fit the best model.
<br> 100K sample is sliced and used from both test and train datasets for the simplicity.

In [43]:
# Setting Up Directory

import os
from os.path import join, dirname

initial_dir = os.getcwd()

if initial_dir.endswith('notebooks'):
    proj_dir = dirname(initial_dir)

data_dir = join(proj_dir,'data')
os.chdir(proj_dir)


In [44]:
# Importing Libraries
import pandas as pd
import numpy as np

from model_development.utils import *
from model_development.FeatureEngineering import FeatureEngineering
from model_development.ModelTraining import ModelTraining

from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [45]:
# Importing Train and Test

train = pd.read_csv(join(data_dir,'train.csv'))
test =pd.read_csv(join(data_dir,'test.csv'))

train.columns = train.columns.str.lower()
train.drop(columns='id', inplace=True)

test.columns = test.columns.str.lower()
test_id = test.id
test.drop(columns='id', inplace=True)

# For the sake of simplicity, let's take 100K sample from train data
train = train.iloc[:100000,:]

X = train.drop(columns='floodprobability')
y = train.floodprobability

### Training a base model
Here we train a baseline model with original features and default parameters, to see whether our tuned with engineered features can make any difference.

In [46]:
# Split into train tesat
X_train, X_test, y_train, y_test = train_test_ind(np.array(X),np.array(y))

# Train a LGBM Regressor
base_catb = CatBoostRegressor(verbose=False)
base_catb.fit(X_train,y_train)
y_pred_train = base_catb.predict(X_train)
y_pred_test = base_catb.predict(X_test)

mse_train = mean_squared_error(y_pred_train, y_train)
mse_test = mean_squared_error(y_pred_test, y_test)

mae_train = mean_absolute_error(y_pred_train, y_train)
mae_test = mean_absolute_error(y_pred_test, y_test)

r2_train = r2_score(y_pred_train, y_train)
r2_test = r2_score(y_pred_test, y_test)

print(f'MSE Train: {mse_train:.4f} \nMSE Test: {mse_test:.4f} \n \nMAE Train: {mae_train:.4f} \nMAE Test: {mae_test:.4f} \n \nR2 Train: {r2_train:.4f} \nR2 Test: {r2_test:.4f}')

MSE Train: 0.0004 
MSE Test: 0.0004 
 
MAE Train: 0.0150 
MAE Test: 0.0158 
 
R2 Train: 0.8374 
R2 Test: 0.8129


### Feature Engineering

In [47]:
feat_engineering = FeatureEngineering()
feat_engineering.fit(X)

X_transformed = feat_engineering.transform(X)
X_test_transformed = feat_engineering.transform(test)

### Hyperparameter Tuning and Final Model Fitting
We feed the workflow with a set of parameters, and set feature engineering method to *pca*.

In [48]:
# Define the grid search parameters
grid_search_params = {
    'learning_rate': [0.05, 0.075, 0.1],
    'depth': [3,5],
    'iterations': [1000]
}


# Instantiate the ModelTraining class and fit
model_trainer = ModelTraining(
                            feature_selection=True,
                            grid_search_params=grid_search_params,
                            feature_selection_method='pca',
                            cv = 3)


model_trainer.fit(X_transformed, y)

Feature Selection has been started

Hyperparameter Tuning has been started
Fitting 3 folds for each of 6 candidates, totalling 18 fits

Hyperparameter tuning has been done

Best model is being fitted..


In [50]:
# Checking hyperparameter tuning results
model_trainer.grid_search_result.sort_values('rank_test_score')

Unnamed: 0,param_depth,param_iterations,param_learning_rate,mean_test_score,std_test_score,rank_test_score
2,3,1000,0.1,-0.000404,4e-06,1
4,5,1000,0.075,-0.000404,4e-06,2
5,5,1000,0.1,-0.000404,4e-06,3
3,5,1000,0.05,-0.000406,4e-06,4
1,3,1000,0.075,-0.000407,4e-06,5
0,3,1000,0.05,-0.000427,5e-06,6


In [51]:
# Getting train test performance of the final/best model (rank=1)
model_trainer.get_train_test_performance()

MSE Train: 0.0004 
MSE Test: 0.0004 
 
MAE Train: 0.0157 
MAE Test: 0.0159 
 
R2 Train: 0.8182 
R2 Test: 0.8089


In [52]:
# We change the final model to be the second best model according to grid search results
model_trainer.set_final_model(rank=2)

In [53]:
# Get train test performance for the updated final model
model_trainer.get_train_test_performance()

MSE Train: 0.0004 
MSE Test: 0.0004 
 
MAE Train: 0.0154 
MAE Test: 0.0159 
 
R2 Train: 0.8284 
R2 Test: 0.8115


In [54]:
# Make predictions
model_trainer.predict(X_test_transformed)[0:10]

array([0.5721399 , 0.45706709, 0.45203093, 0.46599608, 0.46881078,
       0.50567626, 0.53448679, 0.52936694, 0.47165999, 0.56920645])