# **Modeling and Evaluation**

## Imports

In [105]:
# data analysis and data wrangling
import numpy as np
import pandas as pd

# plotting
import seaborn as sns
import matplotlib.pyplot as plt

# PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier  # enbedded method

# machine learning
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import xgboost as xgb

# metrics
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

# Other
from IPython.display import Image
import configparser
import subprocess
import warnings
import pprint
import time
import os

## Prepare Principal Directory

In [106]:
def path_to_work(end_directory: str='notebooks'):
    curr_dir = os.path.dirname(os.path.realpath ("__file__")) 
    
    if curr_dir.endswith(end_directory):
        os.chdir('..')
        return f'Change directory to: {curr_dir}'
    
    return f'Current working directory: {curr_dir}'

In [107]:
path_to_work(end_directory='notebooks')

'Current working directory: /home/campos/projects/allstate-claims-severity'

## Set Config 

In [108]:
# Visualization inside the jupyter
%matplotlib inline

# Load the "autoreload" extension so that code can change
%load_ext autoreload

# ----------
# Plot
# ----------
# graph style
sns.set_style("darkgrid")
plt.style.use('fivethirtyeight')

# ----------
# Pandas
# ----------
# Floating point
pd.options.display.float_format = '{:.2f}'.format

# Print xxxx rows and all columns
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', None)

# ----------
# Python
# ----------
# pretty print
pp = pprint.PrettyPrinter(indent=4)

# Supress unnecessary warnings so that presentation looks clean
warnings.filterwarnings('ignore')

# ----------
# XGB
# ----------
xgb.set_config(verbosity=0)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load dataset

In [109]:
%%time

df_train = pd.read_csv("data/cleansing/train.csv", 
                       encoding='utf-8')
df_test = pd.read_csv("data/cleansing/test.csv", 
                      encoding='utf-8')

CPU times: user 2.64 s, sys: 483 ms, total: 3.12 s
Wall time: 4.4 s


In [110]:
df_train.head(2)

Unnamed: 0,cat1,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,cat11,cat12,cat13,cat14,cat15,cat16,cat17,cat18,cat19,cat20,cat21,cat22,cat23,cat24,cat25,cat26,cat27,cat28,cat29,cat30,cat31,cat32,cat33,cat34,cat35,cat36,cat37,cat38,cat39,cat40,cat41,cat42,cat43,cat44,cat45,cat46,cat47,cat48,cat49,cat50,cat51,cat52,cat53,cat54,cat55,cat56,cat57,cat58,cat59,cat60,cat61,cat62,cat63,cat64,cat65,cat66,cat67,cat68,cat69,cat70,cat71,cat72,cat73,cat74,cat75,cat76,cat77,cat78,cat79,cat80,cat81,cat82,cat83,cat84,cat85,cat86,cat87,cat88,cat89,cat90,cat91,cat92,cat93,cat95,cat96,cat97,cat98,cat99,cat100,cat101,cat102,cat104,cat105,cat106,cat107,cat108,cat109,cat110,cat111,cat112,cat113,cat114,cat115,cat116,cont1,cont2,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont14,loss
0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,3,1,1,3,3,1,3,2,1,3,1,0,0,0,0,0,3,2,4,0,2,15,1,6,0,8,4,6,9,6,45,28,2,19,55,0,14,269,0.73,0.25,0.79,0.31,0.72,0.34,0.3,0.67,0.84,0.57,0.59,0.71,2213.18
1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,3,3,0,1,2,1,3,1,0,0,0,0,0,3,2,4,4,3,15,11,5,0,4,4,8,10,10,33,65,0,22,38,0,14,85,0.33,0.74,0.61,0.89,0.44,0.44,0.6,0.35,0.44,0.34,0.37,0.3,1283.6


### Global Variables

In [111]:
# Lists that will be manipulated in the data processing
list_columns = []
list_categorical_col = []
list_numerical_col = []

In [112]:
def get_col(df: 'dataframe', type_descr: 'numpy') -> list:
    """
    Function get list columns 
    
    Args:
    type_descr
        np.number, np.object -> return list with all columns
        np.number            -> return list numerical columns 
        np.object            -> return list object columns
    """
    try:
        col = (df.describe(include=type_descr).columns)  # pandas.core.indexes.base.Index  
    except ValueError:
        print(f'Dataframe not contains {type_descr} columns !', end='\n')    
    else:
        return col.tolist() 

In [113]:
list_numerical_col = get_col(df=df_train,
                             type_descr=np.number)
list_categorical_col = get_col(df=df_train,
                               type_descr=np.object)
list_columns = get_col(df=df_train,
                       type_descr=[np.object, np.number])

Dataframe not contains <class 'object'> columns !


---

## Kaggle: prepare submission file
Use function to submission. In this way it is guaranteed a default.

In [114]:
# First, check how is file sample
sample = pd.read_csv('data/raw/sample_submission.csv')
sample.head(10)

Unnamed: 0,id,loss
0,4,0
1,6,0
2,9,0
3,12,0
4,15,0
5,17,0
6,21,0
7,28,0
8,32,0
9,43,0


In [115]:
test_ids = df_test['id']

submissions_folder = 'data/submissions-kaggle/'

In [116]:
def save_predictions(ids, predictions, file) -> None:
    # prepare file
    submission = pd.DataFrame({'id': ids,
                               'loss': predictions})
    submission.to_csv(path_or_buf=file,
                      index=False,
                      encoding='utf8')
    return "saved data!"

---

## **Split in train and test**
- The variable Shift is applied to the log transformation.

In [117]:
list_columns.remove('loss')

In [118]:
df_train[list_columns].head(2)

Unnamed: 0,cat1,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,cat11,cat12,cat13,cat14,cat15,cat16,cat17,cat18,cat19,cat20,cat21,cat22,cat23,cat24,cat25,cat26,cat27,cat28,cat29,cat30,cat31,cat32,cat33,cat34,cat35,cat36,cat37,cat38,cat39,cat40,cat41,cat42,cat43,cat44,cat45,cat46,cat47,cat48,cat49,cat50,cat51,cat52,cat53,cat54,cat55,cat56,cat57,cat58,cat59,cat60,cat61,cat62,cat63,cat64,cat65,cat66,cat67,cat68,cat69,cat70,cat71,cat72,cat73,cat74,cat75,cat76,cat77,cat78,cat79,cat80,cat81,cat82,cat83,cat84,cat85,cat86,cat87,cat88,cat89,cat90,cat91,cat92,cat93,cat95,cat96,cat97,cat98,cat99,cat100,cat101,cat102,cat104,cat105,cat106,cat107,cat108,cat109,cat110,cat111,cat112,cat113,cat114,cat115,cat116,cont1,cont2,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont14
0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,3,1,1,3,3,1,3,2,1,3,1,0,0,0,0,0,3,2,4,0,2,15,1,6,0,8,4,6,9,6,45,28,2,19,55,0,14,269,0.73,0.25,0.79,0.31,0.72,0.34,0.3,0.67,0.84,0.57,0.59,0.71
1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,3,3,0,1,2,1,3,1,0,0,0,0,0,3,2,4,4,3,15,11,5,0,4,4,8,10,10,33,65,0,22,38,0,14,85,0.33,0.74,0.61,0.89,0.44,0.44,0.6,0.35,0.44,0.34,0.37,0.3


In [119]:
# split into training and test sets
shift = 200

# create target label
X_train = df_train[list_columns]
y_train = np.log(df_train['loss'] + shift)

In [120]:
X_test = df_test[list_columns]

In [121]:
X_train.head(2)

Unnamed: 0,cat1,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,cat11,cat12,cat13,cat14,cat15,cat16,cat17,cat18,cat19,cat20,cat21,cat22,cat23,cat24,cat25,cat26,cat27,cat28,cat29,cat30,cat31,cat32,cat33,cat34,cat35,cat36,cat37,cat38,cat39,cat40,cat41,cat42,cat43,cat44,cat45,cat46,cat47,cat48,cat49,cat50,cat51,cat52,cat53,cat54,cat55,cat56,cat57,cat58,cat59,cat60,cat61,cat62,cat63,cat64,cat65,cat66,cat67,cat68,cat69,cat70,cat71,cat72,cat73,cat74,cat75,cat76,cat77,cat78,cat79,cat80,cat81,cat82,cat83,cat84,cat85,cat86,cat87,cat88,cat89,cat90,cat91,cat92,cat93,cat95,cat96,cat97,cat98,cat99,cat100,cat101,cat102,cat104,cat105,cat106,cat107,cat108,cat109,cat110,cat111,cat112,cat113,cat114,cat115,cat116,cont1,cont2,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont14
0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,3,1,1,3,3,1,3,2,1,3,1,0,0,0,0,0,3,2,4,0,2,15,1,6,0,8,4,6,9,6,45,28,2,19,55,0,14,269,0.73,0.25,0.79,0.31,0.72,0.34,0.3,0.67,0.84,0.57,0.59,0.71
1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,3,3,0,1,2,1,3,1,0,0,0,0,0,3,2,4,4,3,15,11,5,0,4,4,8,10,10,33,65,0,22,38,0,14,85,0.33,0.74,0.61,0.89,0.44,0.44,0.6,0.35,0.44,0.34,0.37,0.3


In [122]:
X_test.head(2)

Unnamed: 0,cat1,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,cat11,cat12,cat13,cat14,cat15,cat16,cat17,cat18,cat19,cat20,cat21,cat22,cat23,cat24,cat25,cat26,cat27,cat28,cat29,cat30,cat31,cat32,cat33,cat34,cat35,cat36,cat37,cat38,cat39,cat40,cat41,cat42,cat43,cat44,cat45,cat46,cat47,cat48,cat49,cat50,cat51,cat52,cat53,cat54,cat55,cat56,cat57,cat58,cat59,cat60,cat61,cat62,cat63,cat64,cat65,cat66,cat67,cat68,cat69,cat70,cat71,cat72,cat73,cat74,cat75,cat76,cat77,cat78,cat79,cat80,cat81,cat82,cat83,cat84,cat85,cat86,cat87,cat88,cat89,cat90,cat91,cat92,cat93,cat95,cat96,cat97,cat98,cat99,cat100,cat101,cat102,cat104,cat105,cat106,cat107,cat108,cat109,cat110,cat111,cat112,cat113,cat114,cat115,cat116,cont1,cont2,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont14
0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,3,3,1,1,2,1,3,1,0,0,0,0,0,3,2,4,2,3,15,7,6,0,6,4,8,11,10,30,26,0,35,23,0,16,169,0.32,0.3,0.4,0.28,0.47,0.32,0.61,0.34,0.38,0.38,0.37,0.39
1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,3,1,1,3,3,1,1,2,1,1,1,0,0,0,0,0,3,3,4,0,0,12,1,3,0,6,6,6,5,1,30,58,3,32,58,0,11,173,0.63,0.62,0.95,0.84,0.48,0.44,0.71,0.52,0.6,0.69,0.68,0.21


In [123]:
y_train.head(2)

0   7.79
1   7.30
Name: loss, dtype: float64

In [124]:
# check distribuition
print ("Xtrain shape:", X_train.shape[0])
print ("ytrain shape:", X_train.shape[1])
print ("Xtest shape:", X_test.shape[0])
print ("ytest shape:", X_test.shape[1])

Xtrain shape: 188318
ytrain shape: 125
Xtest shape: 125546
ytest shape: 125


### K-Folds Cross Validation
KFold divides all the samples in  groups of samples, called folds, of equal sizes (if possible). The prediction function is learned using  folds, and the fold left out is used for test.

In [125]:
# replicate the results
random_state = 16

# folds
k = 5

---

## **Evaluation**

### Mean absolute error (MAE) 
- The models in this project use the mean absolute error (MAE) between the predicted loss and the actual loss for each claim in the test set.
- The goal was to minimize the MAE in our model’s predictions. 

In [126]:
# Custom eval metric
def eval_error(preds, dtrain):
    labels = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(preds), np.exp(labels))

---

## **Modeling**


### **Tranning**
- The traning and test it's in function because garanted reuse.
- The predictions running in validation set in each fold.
- Garanted with array is the inverse of the log transformation about column loss.
- Calculate time
- Calculate MAE

In [127]:
def train_model(model, num_folds):
    """Function by Train model"""
        
    print("Begin training")
    start = time.time()
    
    # declare a KFold instance
    kfold = KFold(n_splits = num_folds)
    
    # number of models
    num_models = 1
    
    # array to store results after each fold
    results = np.zeros((X_test.shape[0], k))
        
    # train K-1 Random Forests
    for i, (train, val) in enumerate(kfold.split(X_train)):
        # get smaller training set and create validation set
        X_train_mini, X_val = X_train.iloc[train], X_train.iloc[val]
        y_train_mini, y_val = y_train[train], y_train[val]

        # train model
        model.fit(X_train_mini, y_train_mini)

        # make predictions 
        preds = model.predict(X_val)
        
        # absolute error
        error = mean_absolute_error(np.exp(y_val) - shift, np.exp(preds) - shift)
        print("MAE on fold {} is {}".format(i, error))
        
        # Predict on test set
        test_predictions = np.exp(model.predict(X_test)) - shift
        
        # Sum predictions
        results[:,i] = test_predictions

    end = time.time()
    print("\nTraining done! Time Elapsed:", end - start, " seconds.")

    # Error over k folds
    avg_error = np.mean(results)

    return test_predictions

---

### **Select Modeling Techniques**

We will test and execute the models:
- Linear Regression
- Random Forest (Bagging)
- XGBoost

#### **Linear Regression**

In [128]:
# Visualize params
LinearRegression()

LinearRegression()

In [129]:
# Linear Regression
model_lr = LinearRegression(n_jobs=-1, normalize=False)

# training
result_lr = train_model(model = model_lr, num_folds=5)

Begin training
MAE on fold 0 is 1277.7284358977993
MAE on fold 1 is 1271.529825649213
MAE on fold 2 is 1292.0982766761954
MAE on fold 3 is 1291.5650798269508
MAE on fold 4 is 1270.1448923152147

Training done! Time Elapsed: 29.32361125946045  seconds.


In [130]:
# Linear Regression normalized
model_lr_normalized = LinearRegression(n_jobs=-1, normalize=True)

# training
result_lr_normalized = train_model(model = model_lr_normalized, num_folds = 5)

Begin training
MAE on fold 0 is 1277.7284358978006
MAE on fold 1 is 1271.5298256492126
MAE on fold 2 is 1292.098276676194
MAE on fold 3 is 1291.5650798269535
MAE on fold 4 is 1270.1448923152152

Training done! Time Elapsed: 30.96899700164795  seconds.


#### Analysis of Results
- Without difference data normalized and not normalized
- The best result MAE on fold 4: 1270

#### Save Predictions

In [131]:
save_predictions(ids=test_ids, 
                 predictions=result_lr_normalized, 
                 file=submissions_folder + 'lin_regression_submission.csv')

'saved data!'

#### View file

In [132]:
sub = pd.read_csv(submissions_folder + 'lin_regression_submission.csv')
sub.head()

Unnamed: 0,id,loss
0,4,1340.95
1,6,1602.34
2,9,12405.65
3,12,4465.59
4,15,776.43


#### **Random Forest**

#### Process trainning
- The number of estimators go is tested.
- Update the number of estimators to 20, 50 and 100 to see how the model performs. 

In [133]:
# 1st model Random Forest
rf_regressor_one = RandomForestRegressor(n_jobs = -1)

# training
result_rf_1 = train_model(model = rf_regressor_one, num_folds = 3)

Begin training
MAE on fold 0 is 1212.1560142767128
MAE on fold 1 is 1218.5278108485495
MAE on fold 2 is 1210.3258404477351

Training done! Time Elapsed: 434.2089123725891  seconds.


In [134]:
# 2st model Random Forest
rf_regressor_two = RandomForestRegressor(n_estimators = 50, 
                                         n_jobs = -1,
                                         max_depth = 30)

# training
result_rf_2 = train_model(model = rf_regressor_two, num_folds = 3)

Begin training
MAE on fold 0 is 1216.5483103113957
MAE on fold 1 is 1221.743177324072
MAE on fold 2 is 1215.8453331617086

Training done! Time Elapsed: 242.66162109375  seconds.


In [135]:
# 3st model Random Forest
rf_regressor_three = RandomForestRegressor(n_estimators = len(df_train.columns), 
                                          n_jobs = -1,
                                          verbose = 1, 
                                          max_depth = len(df_train.columns))

# training
result_rf_3 = train_model(model = rf_regressor_three, num_folds = 5)

Begin training


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 126 out of 126 | elapsed:  4.7min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 126 out of 126 | elapsed:    1.6s finished


MAE on fold 0 is 1209.9959361443862


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 126 out of 126 | elapsed:    5.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 126 out of 126 | elapsed:  4.1min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 126 out of 126 | elapsed:    1.1s finished


MAE on fold 1 is 1206.388042284142


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 126 out of 126 | elapsed:    4.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 126 out of 126 | elapsed:  3.7min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 126 out of 126 | elapsed:    1.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


MAE on fold 2 is 1217.930176704426


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 126 out of 126 | elapsed:    3.9s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 126 out of 126 | elapsed:  3.7min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 126 out of 126 | elapsed:    1.2s finished


MAE on fold 3 is 1216.6966442290088


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 126 out of 126 | elapsed:    4.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 126 out of 126 | elapsed:  3.5min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 126 out of 126 | elapsed:    1.2s finished


MAE on fold 4 is 1203.158204807229


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.4s



Training done! Time Elapsed: 1210.1899542808533  seconds.


[Parallel(n_jobs=8)]: Done 126 out of 126 | elapsed:    4.1s finished


#### Analysis of Results
- The increasing the number of estimators improved score.
- The best results produced with n_estimators = 100 on fold 1.
- The best result MAE on fold 1 is 1854.7308563695506
- The problem is time process which grows in the same proportion. 
- The model random forest were worse than linear regression

#### Submission

In [136]:
save_predictions(ids=test_ids, 
                 predictions=result_rf_3, 
                 file=submissions_folder + 'random_forest_submission.csv')

'saved data!'

#### View submission

In [137]:
sub = pd.read_csv(submissions_folder + 'random_forest_submission.csv')
sub.head()

Unnamed: 0,id,loss
0,4,1811.03
1,6,1867.34
2,9,9888.18
3,12,5172.35
4,15,724.65


#### **XGBoost**
- Model very robust
- descentent gradient 
- regularization parameter: help avoid overfitting
- parallelizable

The model XGBRegressor will be trained with threee model using difference parameters.

#### Otimize XGBoost

DMatrix is a internal data structure that used by XGBoost which is optimized for both memory efficiency and training speed.

In [138]:
# Data Matrix used in XGBoost.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

#### Function  train_test_xgboost
- The function process data to:
 - calculate time process
 - shuffle the data during each fold
 - run predictions
 - store these predictions in a numpy array
 - average the predictions over k number of folds.

In [139]:
def train_test_xgboost(model, early_stopping_rounds):
    kf = KFold(n_splits = k, shuffle = True, random_state = random_state)
    results = np.zeros((X_test.shape[0], k))
    
    print("Begin training")
    start = time.time()
    
    for i, (train_index, val_index) in enumerate(kf.split(X_train)):
        print("Begin training and testing base model on fold {}".format(i))
        start = time.time()
        
        X_train_mini, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_mini, y_val = y_train[train_index], y_train[val_index]

        # train model
        model.fit(X_train_mini, 
                   y_train_mini, 
                   eval_metric = eval_error, 
                   eval_set = [(X_train_mini, y_train_mini), (X_val, y_val)], 
                   early_stopping_rounds = early_stopping_rounds,
                   verbose = False)

        end = time.time()
        print("Training time elapsed on fold {} is {}".format(i, end - start))
        
        # Predict on validation set 
        val_predictions = model.predict(X_val, ntree_limit = model.best_ntree_limit)
        error = mean_absolute_error(np.exp(y_val) - shift, np.exp(val_predictions) - shift)
        print("Error on fold {} is {} \n".format(i, error))
                
        # Predict on test set
        test_predictions = np.exp(model.predict(X_test, ntree_limit = model.best_ntree_limit)) - shift
        # Sum predictions
        results[:,i] = test_predictions
        
        end = time.time()
        print("\nTraining done! Time Elapsed:", end - start, " seconds.")

    # Average predictions
    mean_results = results.mean(axis = 1)
    return mean_results

In [140]:
# Visualize params
XGBRegressor()

XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)

#### Chooce parameters
It´s possible reduce error and overfitting with analysis of parameters. Here is a list of the key parameters below:

- max_depth - Max tree depth for boosted trees
- gamma - Minimum loss reduction required to make a further partition on a leaf node of the tree.
- min_child_weight - Minimum sum of instance weight(hessian) needed in a child.

**NOTE**: keep someone parameters default, how by example, learning_rate=0.1.<br/>
**NOTE about n_estimators**: when tested there aren´t effect

In [141]:
# Model 1 XGB_regressor
xgb_one = XGBRegressor(nthread = -1)

# training
results_xgb_1 = train_test_xgboost(model = xgb_one,
                                   early_stopping_rounds = 50)

Begin training
Begin training and testing base model on fold 0
Training time elapsed on fold 0 is 28.29695177078247
Error on fold 0 is 1157.2838077878346 


Training done! Time Elapsed: 28.71126937866211  seconds.
Begin training and testing base model on fold 1
Training time elapsed on fold 1 is 27.926844358444214
Error on fold 1 is 1159.2839719068063 


Training done! Time Elapsed: 28.24869656562805  seconds.
Begin training and testing base model on fold 2
Training time elapsed on fold 2 is 27.090129137039185
Error on fold 2 is 1155.720143617745 


Training done! Time Elapsed: 27.431737661361694  seconds.
Begin training and testing base model on fold 3
Training time elapsed on fold 3 is 27.984951972961426
Error on fold 3 is 1163.8393367775336 


Training done! Time Elapsed: 28.30882740020752  seconds.
Begin training and testing base model on fold 4
Training time elapsed on fold 4 is 28.435564041137695
Error on fold 4 is 1168.2847724354365 


Training done! Time Elapsed: 28.74915552139

In [142]:
# Model 2 XGB_regressor
xgb_two = XGBRegressor(learning_rate=0.1,
                       n_estimators = 1000,
                       max_depth = 5,
                       min_child_weight = len(df_train.columns),
                       gamma = 1,
                       subsample = 1.0,
                       colsample_bytree = 1.0,
                       reg_alpha = 1.0,
                       silent = True, 
                       seed = random_state, 
                       nthread = -1)
# training
results_xgb_2 = train_test_xgboost(model = xgb_two,
                                   early_stopping_rounds = 50)

Begin training
Begin training and testing base model on fold 0
Training time elapsed on fold 0 is 71.26755952835083
Error on fold 0 is 1145.8912580200688 


Training done! Time Elapsed: 71.80622506141663  seconds.
Begin training and testing base model on fold 1
Training time elapsed on fold 1 is 87.26467537879944
Error on fold 1 is 1142.3725830216845 


Training done! Time Elapsed: 87.79010200500488  seconds.
Begin training and testing base model on fold 2
Training time elapsed on fold 2 is 70.73590016365051
Error on fold 2 is 1148.3008597994458 


Training done! Time Elapsed: 71.29445481300354  seconds.
Begin training and testing base model on fold 3
Training time elapsed on fold 3 is 88.92648458480835
Error on fold 3 is 1153.0072119727918 


Training done! Time Elapsed: 89.42409682273865  seconds.
Begin training and testing base model on fold 4
Training time elapsed on fold 4 is 91.67970085144043
Error on fold 4 is 1150.6308686780524 


Training done! Time Elapsed: 92.18710422515869 

In [143]:
# Model 3 XGB_regressor
xgb_three = XGBRegressor(learning_rate=0.1,
                        n_estimators = 1000,
                        max_depth = 9,
                        min_child_weight = 6,
                        gamma = 1,
                        subsample = 1.0,
                        colsample_bytree = 0.5,
                        reg_alpha = 1.0,
                        silent = True, 
                        seed = random_state, 
                        nthread = -1)

# training
results_xgb_3 = train_test_xgboost(model = xgb_three,
                                   early_stopping_rounds = 50)

Begin training
Begin training and testing base model on fold 0
Training time elapsed on fold 0 is 83.80049729347229
Error on fold 0 is 1135.904304830248 


Training done! Time Elapsed: 84.50379586219788  seconds.
Begin training and testing base model on fold 1
Training time elapsed on fold 1 is 78.96569442749023
Error on fold 1 is 1136.3357193137715 


Training done! Time Elapsed: 79.5474374294281  seconds.
Begin training and testing base model on fold 2
Training time elapsed on fold 2 is 112.82652449607849
Error on fold 2 is 1139.2954321801792 


Training done! Time Elapsed: 113.93322205543518  seconds.
Begin training and testing base model on fold 3
Training time elapsed on fold 3 is 133.1305968761444
Error on fold 3 is 1147.0293979535486 


Training done! Time Elapsed: 133.8790729045868  seconds.
Begin training and testing base model on fold 4
Training time elapsed on fold 4 is 105.58224964141846
Error on fold 4 is 1148.204167819036 


Training done! Time Elapsed: 106.985600233078  

#### Submission

In [144]:
save_predictions(ids = test_ids, 
                 predictions = results_xgb_1, 
                 file = submissions_folder + 'xgb_submission.csv')

'saved data!'

#### View submission

In [145]:
sub = pd.read_csv(submissions_folder + 'xgb_submission.csv')
sub.head()

Unnamed: 0,id,loss
0,4,1548.55
1,6,1779.07
2,9,11293.15
3,12,6102.1
4,15,834.7


---

## **Evaluate Results**
- The XGBoost have better performace in comparain Random Forest and Linear Regression
- The best result MAE on fold 0 is 1135 in three model
- I tested parameters difference in each model XGBoost
- So, I chooce file xgb_submission to submission in competition.

---