### Introduction
This notebook is designed to showcase the usage of the ML trainin workflow, while does not seek to fit the best model.
<br> 5000 sample is sliced and used from both test and train datasets for the simplicity.

In [43]:
# Setting Up Directory

import os
from os.path import join, dirname

initial_dir = os.getcwd()

if initial_dir.endswith('notebooks'):
    proj_dir = dirname(initial_dir)

data_dir = join(proj_dir,'data')
os.chdir(proj_dir)


In [44]:
# Importing Libraries
import pandas as pd
import numpy as np

from model_development.utils import *
from model_development.FeatureEngineering import FeatureEngineering
from model_development.ModelTraining import ModelTraining

from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [45]:
# Importing Train and Test

train = pd.read_csv(join(data_dir,'train.csv'))
test =pd.read_csv(join(data_dir,'test.csv'))

train.columns = train.columns.str.lower()
train.drop(columns='id', inplace=True)

test.columns = test.columns.str.lower()
test_id = test.id
test.drop(columns='id', inplace=True)

# For the sake of simplicity, let's take 100K sample from train data
train = train.iloc[:100000,:]

X = train.drop(columns='floodprobability')
y = train.floodprobability

### Training a base model

In [46]:
# Split into train tesat
X_train, X_test, y_train, y_test = train_test_ind(np.array(X),np.array(y))

# Train a LGBM Regressor
base_catb = CatBoostRegressor(verbose=False)
base_catb.fit(X_train,y_train)
y_pred_train = base_catb.predict(X_train)
y_pred_test = base_catb.predict(X_test)

mse_train = mean_squared_error(y_pred_train, y_train)
mse_test = mean_squared_error(y_pred_test, y_test)

mae_train = mean_absolute_error(y_pred_train, y_train)
mae_test = mean_absolute_error(y_pred_test, y_test)

r2_train = r2_score(y_pred_train, y_train)
r2_test = r2_score(y_pred_test, y_test)

print(f'MSE Train: {mse_train:.4f} \nMSE Test: {mse_test:.4f} \n \nMAE Train: {mae_train:.4f} \nMAE Test: {mae_test:.4f} \n \nR2 Train: {r2_train:.4f} \nR2 Test: {r2_test:.4f}')

MSE Train: 0.0004 
MSE Test: 0.0004 
 
MAE Train: 0.0150 
MAE Test: 0.0158 
 
R2 Train: 0.8374 
R2 Test: 0.8129


### Feature Engineering

In [None]:
feat_engineering = FeatureEngineering()
feat_engineering.fit(X)

X_transformed = feat_engineering.transform(X)
X_test_transformed = feat_engineering.transform(test)

### Hyperparameter Tuning and Final Model Fitting

In [35]:
# Define the grid search parameters
grid_search_params = {
    'learning_rate': [0.05, 0.075, 0.1],
    'depth': [3,5],
    'iterations': [1000]
}


# Instantiate the ModelTraining class and fit
model_trainer = ModelTraining(
                            feature_selection=True,
                            grid_search_params=grid_search_params,
                            feature_selection_method='pca',
                            cv = 3)


model_trainer.fit(X_transformed, y)

Feature Selection has been started

Hyperparameter Tuning has been started
Fitting 3 folds for each of 6 candidates, totalling 18 fits

Hyperparameter tuning has been done

Best model is being fitted..


In [36]:
# Checking hyperparameter tuning results
model_trainer.grid_search_result.sort_values('rank_test_score')

Unnamed: 0,param_depth,param_iterations,param_learning_rate,mean_test_score,std_test_score,rank_test_score
1,3,1000,0.075,-0.000658,3.5e-05,1
2,3,1000,0.1,-0.000663,3.4e-05,2
0,3,1000,0.05,-0.000684,4.4e-05,3
3,5,1000,0.05,-0.000738,4.5e-05,4
4,5,1000,0.075,-0.000745,5.3e-05,5
5,5,1000,0.1,-0.000747,6.1e-05,6


In [37]:
# Getting train test performance of the best mode(rank=1)
model_trainer.get_train_test_performance()

MSE Train: 0.0001 
MSE Test: 0.0007 
 
MAE Train: 0.0071 
MAE Test: 0.0207 
 
R2 Train: 0.9612 
R2 Test: 0.5916


In [38]:
# Change the final model
model_trainer.set_final_model(rank=2)

In [39]:
# Get train test performance for the updated final model
model_trainer.get_train_test_performance()

MSE Train: 0.0001 
MSE Test: 0.0007 
 
MAE Train: 0.0058 
MAE Test: 0.0205 
 
R2 Train: 0.9732 
R2 Test: 0.6115


In [42]:
# Make predictions
model_trainer.predict(X_test_transformed)[0:10]

array([0.5545748 , 0.46230044, 0.48320268, 0.46598247, 0.46285634,
       0.50834408, 0.53977213, 0.52498314, 0.47598248, 0.57664392])