# Algoritm Selection

## Imports

In [45]:
# data analysis and data wrangling
import numpy as np
import pandas as pd

# plotting
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

# PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

# Preprocessing
from sklearn.preprocessing import LabelEncoder

# machine learning
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import xgboost as xgb

# metrics
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

# Other
from IPython.display import Image
import configparser
import subprocess
import warnings
import pprint
import time
import os

## Prepare Principal Directory

In [46]:
def exit_current_directory(end_directory: str='notebooks'):
    # PATH current
    curr_dir = os.path.dirname (os.path.realpath ("__file__")) 
    
    if curr_dir.endswith(end_directory):
        os.chdir('..')
        return curr_dir
    
    return f'Current working directory: {curr_dir}'

In [47]:
exit_current_directory(end_directory='notebooks')

'Current working directory: /home/campos/projetos/challenges/kaggle/allstate-claims-severity'

### Load dataset

In [48]:
%%time

# load data in dataset
df_train = pd.read_csv("data/cleansing/train.csv", 
                       encoding='utf-8')
df_test = pd.read_csv("data/cleansing/test.csv", 
                      encoding='utf-8')

CPU times: user 34 s, sys: 702 ms, total: 34.7 s
Wall time: 34.7 s


### Merge Data Sets

Merged the training and test sets temporarily.

In [49]:
# Merge datasets
frames = [df_train, df_test]
data = pd.concat(frames)

print("The merge datasets train and test:\n{} rows\n{} columns".format(data.shape[0],
                                                                      data.shape[1]))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


The merge datasets train and test:
313864 rows
132 columns


### Global Variables

In [50]:
# Lists that will be manipulated in the data processing
list_columns = []
list_categorical_col = []
list_numerical_col = []

In [51]:
def get_col(df: 'dataframe', type_descr: 'numpy') -> list:
    """
    Function get list columns 
    
    Args:
    type_descr
        np.number, np.object -> return list with all columns
        np.number            -> return list numerical columns 
        np.object            -> return list object columns
    """
    try:
        col = (df.describe(include=type_descr).columns)  # pandas.core.indexes.base.Index  
    except ValueError:
        print(f'Dataframe not contains {type_descr} columns !', end='\n')    
    else:
        return col.tolist() 

In [52]:
list_numerical_col = get_col(df=data,
                             type_descr=np.number)
list_categorical_col = get_col(df=data,
                               type_descr=np.object)
list_columns = get_col(df=data,
                       type_descr=[np.object, np.number])

---

## Prepare Submission File
Use function to submission. In this way it is guaranteed a default.

In [53]:
# First, check how is file sample
sample = pd.read_csv('data/raw/sample_submission.csv')
sample.head()

Unnamed: 0,id,loss
0,4,0
1,6,0
2,9,0
3,12,0
4,15,0


In [54]:
test_ids = df_test['id']

submissions_folder = 'data/submissions-kaggle/'

In [55]:
def save_predictions(ids = None, predictions = None, file = None):
    
    # prepare file
    submission = pd.DataFrame({'id': ids, 'loss': predictions})
    
    # CSV
    submission.to_csv(path_or_buf = file, index = False, encoding='utf8')
    print("Data storage!")

---

## Split train and test
- The variable Shift is applied to the log transformation.

In [56]:
list_numerical_col.remove('loss')

In [57]:
# split into training and test sets
shift = 200

# create target label
X_train = df_train[list_numerical_col]
y_train = np.log(df_train['loss'] + shift)

In [58]:
X_test = df_test[list_numerical_col]

In [59]:
X_train.head()

Unnamed: 0,cont1,cont10,cont11,cont12,cont13,cont14,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,id
0,0.7263,0.8351,0.569745,0.594646,0.822493,0.714843,0.245921,0.187583,0.789639,0.310061,0.718367,0.33506,0.3026,0.67135,1
1,0.330514,0.43919,0.338312,0.366307,0.611431,0.304496,0.737068,0.592681,0.614134,0.885834,0.438917,0.436585,0.60087,0.35127,2
2,0.261841,0.32446,0.381398,0.373424,0.195709,0.774425,0.358319,0.484196,0.236924,0.397069,0.289648,0.315545,0.2732,0.26076,5
3,0.321594,0.44467,0.327915,0.32157,0.605077,0.602642,0.555782,0.527991,0.373816,0.422268,0.440945,0.391128,0.31796,0.32128,10
4,0.273204,0.2123,0.204687,0.202213,0.246011,0.432606,0.15999,0.527991,0.473202,0.704268,0.178193,0.247408,0.24564,0.22089,11


In [60]:
X_test.head()

Unnamed: 0,cont1,cont10,cont11,cont12,cont13,cont14,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,id
0,0.321594,0.38016,0.377724,0.369858,0.704052,0.392562,0.299102,0.246911,0.402922,0.281143,0.466591,0.317681,0.61229,0.34365,4
1,0.634734,0.60401,0.689039,0.675759,0.453468,0.208045,0.620805,0.65431,0.946616,0.836443,0.482425,0.44376,0.7133,0.5189,6
2,0.290813,0.30529,0.24541,0.241676,0.258586,0.297232,0.737068,0.711159,0.412789,0.718531,0.212308,0.325779,0.29758,0.34365,9
3,0.268622,0.3148,0.348867,0.341872,0.592264,0.555955,0.681761,0.592681,0.354893,0.397069,0.36993,0.342355,0.40028,0.33237,12
4,0.553846,0.50556,0.359572,0.352251,0.301535,0.825823,0.299102,0.26357,0.696873,0.302678,0.398862,0.391833,0.23688,0.43731,15


In [61]:
display(y_train.head())

0    7.788701
1    7.302227
2    8.072495
3    7.038652
4    7.994244
Name: loss, dtype: float64

In [62]:
# check distribuition
print ("Xtrain shape:", X_train.shape[0])
print ("ytrain shape:", X_train.shape[1])
print ("Xtest shape:", X_test.shape[0])
print ("ytest shape:", X_test.shape[1])

Xtrain shape: 188318
ytrain shape: 15
Xtest shape: 125546
ytest shape: 15


## Mean absolute error (MAE) 
- The models in this project use the mean absolute error (MAE) between the predicted loss and the actual loss for each claim in the test set.
- The goal was to minimize the MAE in our model’s predictions. 

In [63]:
# Custom eval metric
def eval_error(preds, dtrain):
    """evaluation"""
    labels = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(preds), np.exp(labels))

## K-Folds Cross Validation
KFold divides all the samples in  groups of samples, called folds, of equal sizes (if possible). The prediction function is learned using  folds, and the fold left out is used for test.

In [64]:
# replicate the results
random_state = 16

# folds
k = 5

## Function tranning
- The traning and test it`s in function because garanted reuse.
- The predictions running in validation set in each fold.
- Garanted with array is the inverse of the log transformation about column loss.
- Calculate time
- Calculate MAE

In [65]:
def train_model(model, num_folds):
    """Function by Train model"""
        
    print("Begin training")
    start = time.time()
    
    # declare a KFold instance
    kfold = KFold(n_splits = num_folds, random_state = 10)
    
    # number of models
    num_models = 1
    
    # array to store results after each fold
    results = np.zeros((X_test.shape[0], k))
        
    # train K-1 Random Forests
    for i, (train, val) in enumerate(kfold.split(X_train)):
        # get smaller training set and create validation set
        X_train_mini, X_val = X_train.iloc[train], X_train.iloc[val]
        y_train_mini, y_val = y_train[train], y_train[val]

        # train model
        model.fit(X_train_mini, y_train_mini)

        # make predictions 
        preds = model.predict(X_val)
        
        # absolute error
        error = mean_absolute_error(np.exp(y_val) - shift, np.exp(preds) - shift)
        print("MAE on fold {} is {}".format(i, error))

        
        # Predict on test set
        test_predictions = np.exp(model.predict(X_test)) - shift
        
        # Sum predictions
        results[:,i] = test_predictions
        

    end = time.time()
    print("\nTraining done! Time Elapsed:", end - start, " seconds.")

    # Error over k folds
    avg_error = np.mean(results)

    return test_predictions

## Benchmarks

We will test and execute the models:
- Linear Regression
- Random Forest (Bagging)
- XGBoost

## Linear Regression

In [66]:
# Visualize params
LinearRegression()

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [67]:
# Linear Regression
model_lr = LinearRegression(n_jobs=-1, normalize=False)

# training
result_lr = train_model(model = model_lr, num_folds = 5)

Begin training
MAE on fold 0 is 1807.396658381662
MAE on fold 1 is 1800.4029992628616
MAE on fold 2 is 1813.0439601729026
MAE on fold 3 is 1807.6566114103166
MAE on fold 4 is 1791.162453327296

Training done! Time Elapsed: 0.5459587574005127  seconds.


In [68]:
# Linear Regression normalized
model_lr_normalized = LinearRegression(n_jobs=-1, normalize=True)

# training
result_lr_normalized = train_model(model = model_lr_normalized, num_folds = 5)

Begin training
MAE on fold 0 is 1807.3966583802503
MAE on fold 1 is 1800.4029992584976
MAE on fold 2 is 1813.0439601730936
MAE on fold 3 is 1807.656611415883
MAE on fold 4 is 1791.1624533259428

Training done! Time Elapsed: 0.5864481925964355  seconds.


#### Analysis of Results
- Without difference data normalized and not normalized
- The best result MAE on fold 4 is 1791.1369743102152

#### Submission

In [69]:
save_predictions(ids = test_ids, 
                 predictions = result_lr_normalized, 
                 file = submissions_folder + 'lin_regression_submission.csv')

Data storage!


#### View file

In [70]:
sub = pd.read_csv(submissions_folder + 'lin_regression_submission.csv')
sub.head()

Unnamed: 0,id,loss
0,4,1930.8614
1,6,2132.890589
2,9,2366.765041
3,12,2359.890051
4,15,1875.534329


## Random Forest

#### Process trainning
- The number of estimators go is tested.
- Update the number of estimators to 20, 50 and 100 to see how the model performs. 

In [71]:
# Visualize params
RandomForestRegressor()

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators='warn',
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [72]:
# 1st model Random Forest
rf_regressor_one = RandomForestRegressor(n_estimators = 20, 
                                         n_jobs = -1,
                                         verbose = 1, 
                                         max_depth = 30)

# training
result_rf_1 = train_model(model = rf_regressor_one, num_folds = 5)

Begin training


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    9.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  20 out of  20 | elapsed:    0.1s finished


MAE on fold 0 is 1873.543310215544


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  20 out of  20 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    7.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  20 out of  20 | elapsed:    0.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


MAE on fold 1 is 1831.2919442558862


[Parallel(n_jobs=8)]: Done  20 out of  20 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    8.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  20 out of  20 | elapsed:    0.1s finished


MAE on fold 2 is 1853.8327158887018


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  20 out of  20 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   12.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  20 out of  20 | elapsed:    0.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


MAE on fold 3 is 1848.2543931209425


[Parallel(n_jobs=8)]: Done  20 out of  20 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    9.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  20 out of  20 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


MAE on fold 4 is 1864.352650260471

Training done! Time Elapsed: 50.911779165267944  seconds.


[Parallel(n_jobs=8)]: Done  20 out of  20 | elapsed:    0.3s finished


In [73]:
# 2st model Random Forest
rf_regressor_two = RandomForestRegressor(n_estimators = 50, 
                                         n_jobs = -1,
                                         verbose = 1, 
                                         max_depth = 30)

# training
result_rf_2 = train_model(model = rf_regressor_two, num_folds = 5)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


Begin training


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   18.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


MAE on fold 0 is 1838.4192755063007


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   18.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


MAE on fold 1 is 1812.7877604276591


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   22.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


MAE on fold 2 is 1829.4944207424408


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    1.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   22.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


MAE on fold 3 is 1825.2128103758703


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   18.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


MAE on fold 4 is 1840.6596219641438


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.4s



Training done! Time Elapsed: 105.63935804367065  seconds.


[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.7s finished


In [74]:
# 3st model Random Forest
rf_regressor_three = RandomForestRegressor(n_estimators = 100, 
                                          n_jobs = -1,
                                          verbose = 1, 
                                          max_depth = 30)

# training
result_rf_3 = train_model(model = rf_regressor_three, num_folds = 5)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


Begin training


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   34.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


MAE on fold 0 is 1829.3372457151506


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.1s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   34.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


MAE on fold 1 is 1808.009241533649


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   35.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


MAE on fold 2 is 1825.4406777642123


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   35.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


MAE on fold 3 is 1821.8506207840203


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   35.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


MAE on fold 4 is 1830.7366784623036


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.4s



Training done! Time Elapsed: 183.84282684326172  seconds.


[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.4s finished


#### Analysis of Results
- The increasing the number of estimators improved score.
- The best results produced with n_estimators = 100 on fold 1.
- The best result MAE on fold 1 is 1854.7308563695506
- The problem is time process which grows in the same proportion. 
- The model random forest were worse than linear regression

#### Submission

In [75]:
save_predictions(ids = test_ids, 
                 predictions = result_rf_1, 
                 file = submissions_folder + 'random_forest_submission.csv')

Data storage!


#### View submission

In [76]:
sub = pd.read_csv(submissions_folder + 'random_forest_submission.csv')
sub.head()

Unnamed: 0,id,loss
0,4,1383.504434
1,6,3477.484493
2,9,1297.510206
3,12,2174.067219
4,15,2129.520373


## XGBoost
- Model very robust
- descentent gradient 
- regularization parameter: help avoid overfitting
- parallelizable

The model XGBRegressor will be trained with threee model using difference parameters.

#### Otimize XGBoost

DMatrix is a internal data structure that used by XGBoost which is optimized for both memory efficiency and training speed.

In [77]:
# Data Matrix used in XGBoost.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

#### Function  train_test_xgboost
- The function process data to:
 - calculate time process
 - shuffle the data during each fold
 - run predictions
 - store these predictions in a numpy array
 - average the predictions over k number of folds.

In [78]:
def train_test_xgboost(model, early_stopping_rounds):
    kf = KFold(n_splits = k, shuffle = True, random_state = random_state)
    results = np.zeros((X_test.shape[0], k))
    
    print("Begin training")
    start = time.time()
    
    for i, (train_index, val_index) in enumerate(kf.split(X_train)):
        print("Begin training and testing base model on fold {}".format(i))
        start = time.time()
        
        X_train_mini, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_mini, y_val = y_train[train_index], y_train[val_index]

        # train model
        model.fit(X_train_mini, 
                   y_train_mini, 
                   eval_metric = eval_error, 
                   eval_set = [(X_train_mini, y_train_mini), (X_val, y_val)], 
                   early_stopping_rounds = early_stopping_rounds,
                   verbose = False)

        end = time.time()
        print("Training time elapsed on fold {} is {}".format(i, end - start))
        
        # Predict on validation set 
        val_predictions = model.predict(X_val, ntree_limit = model.best_ntree_limit)
        error = mean_absolute_error(np.exp(y_val) - shift, np.exp(val_predictions) - shift)
        print("Error on fold {} is {} \n".format(i, error))
                
        # Predict on test set
        test_predictions = np.exp(model.predict(X_test, ntree_limit = model.best_ntree_limit)) - shift
        # Sum predictions
        results[:,i] = test_predictions
        
        end = time.time()
        print("\nTraining done! Time Elapsed:", end - start, " seconds.")

    # Average predictions
    mean_results = results.mean(axis = 1)
    return mean_results

In [79]:
# Visualize params
XGBRegressor()

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

#### Chooce parameters
It´s possible reduce error and overfitting with analysis of parameters. Here is a list of the key parameters below:

- max_depth - Max tree depth for boosted trees
- gamma - Minimum loss reduction required to make a further partition on a leaf node of the tree.
- min_child_weight - Minimum sum of instance weight(hessian) needed in a child.

**NOTE**: keep someone parameters default, how by example, learning_rate=0.1.<br/>
**NOTE about n_estimators**: when tested there aren´t effect

In [80]:
# Model 1 XGB_regressor
xgb_one = XGBRegressor(learning_rate=0.1,
                       n_estimators = 1000,
                       max_depth = 7,
                       min_child_weight = 5.0,
                       gamma = 0.0,
                       subsample = 1.0,
                       colsample_bytree = 1.0,
                       reg_alpha = 1.0,
                       silent = True, 
                       seed = random_state, 
                       nthread = -1)

# training
results_xgb_1 = train_test_xgboost(model = xgb_one,
                                   early_stopping_rounds = 50)

Begin training
Begin training and testing base model on fold 0
Training time elapsed on fold 0 is 19.946734189987183
Error on fold 0 is 1754.559046624507 


Training done! Time Elapsed: 20.529937505722046  seconds.
Begin training and testing base model on fold 1
Training time elapsed on fold 1 is 20.0942804813385
Error on fold 1 is 1755.429965583004 


Training done! Time Elapsed: 20.670930862426758  seconds.
Begin training and testing base model on fold 2
Training time elapsed on fold 2 is 19.636873960494995
Error on fold 2 is 1760.557956199484 


Training done! Time Elapsed: 20.192492485046387  seconds.
Begin training and testing base model on fold 3
Training time elapsed on fold 3 is 19.9440758228302
Error on fold 3 is 1782.0809438433262 


Training done! Time Elapsed: 20.47978925704956  seconds.
Begin training and testing base model on fold 4
Training time elapsed on fold 4 is 19.225709438323975
Error on fold 4 is 1778.0141856138237 


Training done! Time Elapsed: 19.73502945899963

In [81]:
# Model 2 XGB_regressor
xgb_two = XGBRegressor(learning_rate=0.1,
                       n_estimators = 1000,
                       max_depth = 5,
                       min_child_weight = 6.0,
                       gamma = 1,
                       subsample = 1.0,
                       colsample_bytree = 1.0,
                       reg_alpha = 1.0,
                       silent = True, 
                       seed = random_state, 
                       nthread = -1)
# training
results_xgb_2 = train_test_xgboost(model = xgb_two,
                                   early_stopping_rounds = 50)

Begin training
Begin training and testing base model on fold 0
Training time elapsed on fold 0 is 34.12290406227112
Error on fold 0 is 1755.3403018872984 


Training done! Time Elapsed: 35.414214849472046  seconds.
Begin training and testing base model on fold 1
Training time elapsed on fold 1 is 33.95343518257141
Error on fold 1 is 1755.8692282969848 


Training done! Time Elapsed: 35.29435753822327  seconds.
Begin training and testing base model on fold 2
Training time elapsed on fold 2 is 14.676625728607178
Error on fold 2 is 1761.7904716788576 


Training done! Time Elapsed: 15.072965860366821  seconds.
Begin training and testing base model on fold 3
Training time elapsed on fold 3 is 43.03805375099182
Error on fold 3 is 1781.5988537624557 


Training done! Time Elapsed: 44.98973250389099  seconds.
Begin training and testing base model on fold 4
Training time elapsed on fold 4 is 14.993620872497559
Error on fold 4 is 1779.9570332933317 


Training done! Time Elapsed: 15.46026873588

In [82]:
# Model 3 XGB_regressor
xgb_three = XGBRegressor(learning_rate=0.1,
                        n_estimators = 1000,
                        max_depth = 9,
                        min_child_weight = 6,
                        gamma = 1,
                        subsample = 1.0,
                        colsample_bytree = 0.5,
                        reg_alpha = 1.0,
                        silent = True, 
                        seed = random_state, 
                        nthread = -1)

# training
results_xgb_3 = train_test_xgboost(model = xgb_three,
                                   early_stopping_rounds = 50)

Begin training
Begin training and testing base model on fold 0
Training time elapsed on fold 0 is 15.513749361038208
Error on fold 0 is 1753.4517356248984 


Training done! Time Elapsed: 16.233611345291138  seconds.
Begin training and testing base model on fold 1
Training time elapsed on fold 1 is 15.292062044143677
Error on fold 1 is 1754.5780858068213 


Training done! Time Elapsed: 15.986721277236938  seconds.
Begin training and testing base model on fold 2
Training time elapsed on fold 2 is 15.103558778762817
Error on fold 2 is 1758.9631848066745 


Training done! Time Elapsed: 15.828425168991089  seconds.
Begin training and testing base model on fold 3
Training time elapsed on fold 3 is 21.901537656784058
Error on fold 3 is 1781.5551458401699 


Training done! Time Elapsed: 24.715834379196167  seconds.
Begin training and testing base model on fold 4
Training time elapsed on fold 4 is 14.840239524841309
Error on fold 4 is 1777.6049556045605 


Training done! Time Elapsed: 15.523191

#### Submission

In [83]:
save_predictions(ids = test_ids, 
                 predictions = results_xgb_1, 
                 file = submissions_folder + 'xgb_submission.csv')

Data storage!


#### View submission

In [84]:
sub = pd.read_csv(submissions_folder + 'xgb_submission.csv')
sub.head()

Unnamed: 0,id,loss
0,4,1919.221924
1,6,2585.073291
2,9,2193.696191
3,12,2240.229053
4,15,1994.406738


#### Analysis of Results
- The XGBoost have better performace in comparain Random Forest and Linear Regression
- The best result MAE on fold 0 is 1753.6550894377651 in three model
- I tested parameters difference in each model XGBoost
- So, I chooce file xgb_submission to submission in competition.

---

#### Copyright
<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">
    <img alt="Creative Commons License" align="right" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" />

</a><br />This work by 
    <span xmlns:cc="http://creativecommons.org/ns#" property="cc:attributionName">Bruno A. R. M. Campos</span> is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>.