# Modeling

In [1]:
#Libs
import os
import warnings

import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from tpot import TPOTRegressor
from sklearn.linear_model import LinearRegression,QuantileRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error
from utils.features.build import build_distance, build_hour_group
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.model_selection import cross_validate
from dotenv import load_dotenv, find_dotenv
warnings.filterwarnings('ignore')



In [2]:
# Env variables and data
load_dotenv(find_dotenv())
DATA_INPUT_PATH = os.getenv('DATA_PROCESSED_PATH')
DATA_TRAIN_NAME = 'train_best_features'
DATA_TEST_NAME = 'test'
# Data
df_orders_train = pd.read_parquet(os.path.join(DATA_INPUT_PATH, DATA_TRAIN_NAME))
df_orders_test = pd.read_parquet(os.path.join(DATA_INPUT_PATH, DATA_TEST_NAME))

# 1) Modeling

In [3]:
X_train = df_orders_train.drop('total_minutes', axis=1)
y_train = df_orders_train['total_minutes']
all_columns = X_train.columns.tolist()

## 1.1) Preprocessing

In [4]:
# We will have 3 imputers: Median for some numerical, Mode for categorical and 0 for some numerical
median_impute_columns_indexes = [all_columns.index(x) for x in ['n_distinct_items', 'distance_km', 'found_rate']]
mode_impute_columns_indexes = [all_columns.index(x) for x in ['on_demand', 'hour_group']]
zero_impute_columns_indexes = [all_columns.index(x) for x in ['sum_kgs', 'sum_unities']]
cat_columns = df_orders_train.select_dtypes(include=['O']).columns.tolist()
cat_columns_indexes = [all_columns.index(x) for x in cat_columns]
num_columns = df_orders_train.drop('total_minutes', axis=1).select_dtypes(include=['int32', 'int64', 'float32', 'float64']).columns.tolist()
num_columns_indexes = [all_columns.index(x) for x in num_columns]
# Feature engineering steps
distance_transformer = FunctionTransformer(func=build_distance)
hour_group_transformer = FunctionTransformer(func=build_hour_group)
pipe_feature_engineering = Pipeline(steps=[('distance_transformer', distance_transformer),
                                           ('hor_group_transformer', hour_group_transformer)])
# Imputation Steps
impute_transformer = ColumnTransformer(
                     transformers=[
                         ('impute_median', SimpleImputer(strategy='median'), 
                                                        median_impute_columns_indexes),
                         ('impute_mode', SimpleImputer(strategy='most_frequent'),
                                                       mode_impute_columns_indexes),
                         ('impute_zero', SimpleImputer(strategy='constant', fill_value=0),
                                                       zero_impute_columns_indexes)
                                        ]
                                            )
# Categorical encoder step                                              
cat_column_transformer = ColumnTransformer(
                        transformers=[
                            ('cat_encoder', OneHotEncoder(handle_unknown='ignore'), cat_columns_indexes)
                        ])
preprocessor = Pipeline(steps=[('impute_transformer', impute_transformer),
                               ('cat_column_transformer', cat_column_transformer)
                            ])

## 1.1) Baseline

In [5]:
model_baseline = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('model', LinearRegression())])

In [6]:
df_baseline_cv_metrics = pd.DataFrame(cross_validate(model_baseline,
                                        X_train,
                                        y_train, 
                                        scoring=['neg_mean_absolute_error', 'neg_mean_absolute_percentage_error'], 
                                        return_train_score=True, cv=10))

In [16]:
df_baseline_cv_metrics

Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_mean_absolute_percentage_error,train_neg_mean_absolute_percentage_error
0,0.094491,0.012526,-22.598843,-15.139074,-0.318207,-0.225784
1,0.07995,0.013039,-22.25522,-15.21801,-0.319276,-0.227202
2,0.075916,0.010082,-23.447508,-15.023926,-0.321289,-0.224323
3,0.074584,0.011932,-21.815441,-15.12063,-0.295688,-0.226014
4,0.070109,0.009916,-23.117906,-14.931582,-0.332377,-0.222903
5,0.07106,0.009642,-23.361094,-14.928853,-0.310007,-0.224506
6,0.062238,0.009479,-23.994952,-15.060264,-0.329593,-0.224692
7,0.069963,0.00949,-24.381147,-14.862106,-0.331012,-0.222591
8,0.06323,0.010679,-22.656765,-15.194771,-0.320348,-0.227428
9,0.072297,0.011081,-24.394854,-14.842159,-0.3319,-0.222485


In [9]:
df_orders_test = pipe_feature_engineering.fit_transform(df_orders_test)

In [10]:
X_test = df_orders_test.loc[:, all_columns]
y_test = df_orders_test['total_minutes'] 

In [20]:
model_rf = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('model', RandomForestRegressor(random_state=123, max_depth=15, n_estimators=200, criterion='squared_error'))])

In [21]:
df_rf_cv_metrics = pd.DataFrame(cross_validate(model_rf,
                                        X_train,
                                        y_train, 
                                        scoring=['neg_mean_absolute_error', 'neg_mean_absolute_percentage_error'], 
                                        return_train_score=True, cv=10))

## 1.3) Quantile Regression