# Algoritm Selection and Tranning Model

## Imports

In [34]:
# data analysis and data wrangling
import numpy as np
import pandas as pd

# plotting
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

# Preprocessing
from sklearn.preprocessing import LabelEncoder

# machine learning
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import xgboost as xgb

# metrics
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

# Other
from IPython.display import Image
import configparser
import subprocess
import warnings
import pprint
import time
import os

In [35]:
warnings.filterwarnings('ignore')

## Prepare Principal Directory

In [36]:
def exit_current_directory(end_directory: str='notebooks'):
    # PATH current
    curr_dir = os.path.dirname (os.path.realpath ("__file__")) 
    
    if curr_dir.endswith(end_directory):
        os.chdir('..')
        return curr_dir
    
    return f'Current working directory: {curr_dir}'

In [37]:
exit_current_directory(end_directory='notebooks')

'Current working directory: /home/campos/projetos/challenges/kaggle/allstate-claims-severity'

### Load dataset

In [38]:
%%time

# load data in dataset
df_train = pd.read_csv("data/cleansing/train.csv", 
                       encoding='utf-8')
df_test = pd.read_csv("data/cleansing/test.csv", 
                      encoding='utf-8')

CPU times: user 2.56 s, sys: 108 ms, total: 2.67 s
Wall time: 2.67 s


In [39]:
df_train.head()

Unnamed: 0,cat1,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,cat11,...,cat111,cat112,cat113,cat114,cat115,cat116,cont2,cont7,cont11,loss
0,0,0,1,0,0,0,0,1,0,1,...,2,19,55,0,14,269,0.245921,0.33506,0.569745,2213.18
1,0,0,0,0,0,0,0,1,1,0,...,0,22,38,0,14,85,0.737068,0.436585,0.338312,1283.6
2,0,0,0,1,0,0,0,1,1,1,...,0,28,5,0,8,153,0.358319,0.315545,0.381398,3005.09
3,1,0,1,0,0,0,0,1,0,0,...,2,39,4,0,14,79,0.555782,0.391128,0.327915,939.85
4,0,0,1,0,0,0,0,1,1,0,...,2,50,38,0,10,55,0.15999,0.247408,0.204687,2763.85


### Global Variables

In [40]:
# Lists that will be manipulated in the data processing
list_columns = []
list_categorical_col = []
list_numerical_col = []

In [41]:
def get_col(df: 'dataframe', type_descr: 'numpy') -> list:
    """
    Function get list columns 
    
    Args:
    type_descr
        np.number, np.object -> return list with all columns
        np.number            -> return list numerical columns 
        np.object            -> return list object columns
    """
    try:
        col = (df.describe(include=type_descr).columns)  # pandas.core.indexes.base.Index  
    except ValueError:
        print(f'Dataframe not contains {type_descr} columns !', end='\n')    
    else:
        return col.tolist() 

In [42]:
list_numerical_col = get_col(df=df_train,
                             type_descr=np.number)
list_categorical_col = get_col(df=df_train,
                               type_descr=np.object)
list_columns = get_col(df=df_train,
                       type_descr=[np.object, np.number])

Dataframe not contains <class 'object'> columns !


---

## Prepare Submission File
Use function to submission. In this way it is guaranteed a default.

In [43]:
# First, check how is file sample
sample = pd.read_csv('data/raw/sample_submission.csv')
sample.head(10)

Unnamed: 0,id,loss
0,4,0
1,6,0
2,9,0
3,12,0
4,15,0
5,17,0
6,21,0
7,28,0
8,32,0
9,43,0


In [44]:
test_ids = df_test['id']

submissions_folder = 'data/submissions-kaggle/'

In [45]:
def save_predictions(ids = None, predictions = None, file = None):
    
    # prepare file
    submission = pd.DataFrame({'id': ids, 'loss': predictions})
    
    # CSV
    submission.to_csv(path_or_buf = file, index = False, encoding='utf8')
    print("Data storage!")

---

## Split train and test
- The variable Shift is applied to the log transformation.

In [46]:
list_columns.remove('loss')

In [47]:
df_train[list_columns].head()

Unnamed: 0,cat1,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,cat11,...,cat110,cat111,cat112,cat113,cat114,cat115,cat116,cont2,cont7,cont11
0,0,0,1,0,0,0,0,1,0,1,...,28,2,19,55,0,14,269,0.245921,0.33506,0.569745
1,0,0,0,0,0,0,0,1,1,0,...,65,0,22,38,0,14,85,0.737068,0.436585,0.338312
2,0,0,0,1,0,0,0,1,1,1,...,85,0,28,5,0,8,153,0.358319,0.315545,0.381398
3,1,0,1,0,0,0,0,1,0,0,...,67,2,39,4,0,14,79,0.555782,0.391128,0.327915
4,0,0,1,0,0,0,0,1,1,0,...,50,2,50,38,0,10,55,0.15999,0.247408,0.204687


In [48]:
# split into training and test sets
shift = 200

# create target label
X_train = df_train[list_columns]
y_train = np.log(df_train['loss'] + shift)

In [49]:
X_test = df_test[list_columns]

In [50]:
X_train.head()

Unnamed: 0,cat1,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,cat11,...,cat110,cat111,cat112,cat113,cat114,cat115,cat116,cont2,cont7,cont11
0,0,0,1,0,0,0,0,1,0,1,...,28,2,19,55,0,14,269,0.245921,0.33506,0.569745
1,0,0,0,0,0,0,0,1,1,0,...,65,0,22,38,0,14,85,0.737068,0.436585,0.338312
2,0,0,0,1,0,0,0,1,1,1,...,85,0,28,5,0,8,153,0.358319,0.315545,0.381398
3,1,0,1,0,0,0,0,1,0,0,...,67,2,39,4,0,14,79,0.555782,0.391128,0.327915
4,0,0,1,0,0,0,0,1,1,0,...,50,2,50,38,0,10,55,0.15999,0.247408,0.204687


In [51]:
X_test.head()

Unnamed: 0,cat1,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,cat11,...,cat110,cat111,cat112,cat113,cat114,cat115,cat116,cont2,cont7,cont11
0,0,0,0,0,0,0,0,1,0,1,...,26,0,35,23,0,16,169,0.299102,0.317681,0.377724
1,0,0,1,0,0,0,0,1,0,0,...,58,3,32,58,0,11,173,0.620805,0.44376,0.689039
2,0,0,1,1,0,1,0,1,1,0,...,62,2,46,4,0,10,51,0.737068,0.325779,0.24541
3,0,0,0,1,0,0,0,0,0,0,...,61,0,25,9,0,15,76,0.681761,0.342355,0.348867
4,1,0,0,0,1,0,0,0,0,0,...,100,0,30,45,2,9,163,0.299102,0.391833,0.359572


In [52]:
y_train.head()

0    7.788701
1    7.302227
2    8.072495
3    7.038652
4    7.994244
Name: loss, dtype: float64

In [53]:
display(y_train.head())

0    7.788701
1    7.302227
2    8.072495
3    7.038652
4    7.994244
Name: loss, dtype: float64

In [54]:
# check distribuition
print ("Xtrain shape:", X_train.shape[0])
print ("ytrain shape:", X_train.shape[1])
print ("Xtest shape:", X_test.shape[0])
print ("ytest shape:", X_test.shape[1])

Xtrain shape: 188318
ytrain shape: 116
Xtest shape: 125546
ytest shape: 116


## Mean absolute error (MAE) 
- The models in this project use the mean absolute error (MAE) between the predicted loss and the actual loss for each claim in the test set.
- The goal was to minimize the MAE in our model’s predictions. 

In [55]:
# Custom eval metric
def eval_error(preds, dtrain):
    """evaluation"""
    labels = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(preds), np.exp(labels))

---

## K-Folds Cross Validation
KFold divides all the samples in  groups of samples, called folds, of equal sizes (if possible). The prediction function is learned using  folds, and the fold left out is used for test.

In [56]:
# replicate the results
random_state = 16

# folds
k = 5

---

## Function Tranning
- The traning and test it`s in function because garanted reuse.
- The predictions running in validation set in each fold.
- Garanted with array is the inverse of the log transformation about column loss.
- Calculate time
- Calculate MAE

In [57]:
def train_model(model, num_folds):
    """Function by Train model"""
        
    print("Begin training")
    start = time.time()
    
    # declare a KFold instance
    kfold = KFold(n_splits = num_folds, random_state = 10)
    
    # number of models
    num_models = 1
    
    # array to store results after each fold
    results = np.zeros((X_test.shape[0], k))
        
    # train K-1 Random Forests
    for i, (train, val) in enumerate(kfold.split(X_train)):
        # get smaller training set and create validation set
        X_train_mini, X_val = X_train.iloc[train], X_train.iloc[val]
        y_train_mini, y_val = y_train[train], y_train[val]

        # train model
        model.fit(X_train_mini, y_train_mini)

        # make predictions 
        preds = model.predict(X_val)
        
        # absolute error
        error = mean_absolute_error(np.exp(y_val) - shift, np.exp(preds) - shift)
        print("MAE on fold {} is {}".format(i, error))

        
        # Predict on test set
        test_predictions = np.exp(model.predict(X_test)) - shift
        
        # Sum predictions
        results[:,i] = test_predictions
        

    end = time.time()
    print("\nTraining done! Time Elapsed:", end - start, " seconds.")

    # Error over k folds
    avg_error = np.mean(results)

    return test_predictions

---

## Benchmarks

We will test and execute the models:
- Linear Regression
- Random Forest (Bagging)
- XGBoost

### Linear Regression

In [58]:
# Visualize params
LinearRegression()

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [59]:
# Linear Regression
model_lr = LinearRegression(n_jobs=-1, normalize=False)

# training
result_lr = train_model(model = model_lr, num_folds = 5)

Begin training
MAE on fold 0 is 1283.3236822179983
MAE on fold 1 is 1277.1983953096687
MAE on fold 2 is 1297.1363273336137
MAE on fold 3 is 1297.171697748075
MAE on fold 4 is 1276.4240479112714

Training done! Time Elapsed: 10.346903324127197  seconds.


In [60]:
# Linear Regression normalized
model_lr_normalized = LinearRegression(n_jobs=-1, normalize=True)

# training
result_lr_normalized = train_model(model = model_lr_normalized, num_folds = 5)

Begin training
MAE on fold 0 is 1283.3236822179965
MAE on fold 1 is 1277.1983953096653
MAE on fold 2 is 1297.1363273336128
MAE on fold 3 is 1297.1716977480721
MAE on fold 4 is 1276.4240479112705

Training done! Time Elapsed: 10.811718225479126  seconds.


#### Analysis of Results
- Without difference data normalized and not normalized
- The best result MAE on fold 4: 1267.692474560776

#### Submission

In [61]:
save_predictions(ids = test_ids, 
                 predictions = result_lr_normalized, 
                 file = submissions_folder + 'lin_regression_submission.csv')

Data storage!


#### View file

In [62]:
sub = pd.read_csv(submissions_folder + 'lin_regression_submission.csv')
sub.head()

Unnamed: 0,id,loss
0,4,1379.498338
1,6,1724.162112
2,9,13098.861284
3,12,4361.357058
4,15,742.746876


### Random Forest

#### Process trainning
- The number of estimators go is tested.
- Update the number of estimators to 20, 50 and 100 to see how the model performs. 

In [63]:
# Visualize params
RandomForestRegressor()

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators='warn',
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [64]:
# 1st model Random Forest
rf_regressor_one = RandomForestRegressor(n_jobs = -1)

# training
result_rf_1 = train_model(model = rf_regressor_one, num_folds = 3)

Begin training
MAE on fold 0 is 1268.7003218271013
MAE on fold 1 is 1275.8857494648464
MAE on fold 2 is 1264.6146415604733

Training done! Time Elapsed: 24.74856209754944  seconds.


In [65]:
# 2st model Random Forest
rf_regressor_two = RandomForestRegressor(n_estimators = 50, 
                                         n_jobs = -1,
                                         max_depth = 30)

# training
result_rf_2 = train_model(model = rf_regressor_two, num_folds = 3)

Begin training
MAE on fold 0 is 1221.7223838252594
MAE on fold 1 is 1225.255613080646
MAE on fold 2 is 1219.6717287332085

Training done! Time Elapsed: 99.46683812141418  seconds.


In [66]:
# 3st model Random Forest
rf_regressor_three = RandomForestRegressor(n_estimators = len(df_train.columns), 
                                          n_jobs = -1,
                                          verbose = 1, 
                                          max_depth = len(df_train.columns))

# training
result_rf_3 = train_model(model = rf_regressor_three, num_folds = 5)

Begin training


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.9s
[Parallel(n_jobs=-1)]: Done 117 out of 117 | elapsed:  1.5min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 117 out of 117 | elapsed:    0.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


MAE on fold 0 is 1215.2519690661982


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 117 out of 117 | elapsed:    2.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.9s
[Parallel(n_jobs=-1)]: Done 117 out of 117 | elapsed:  1.5min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 117 out of 117 | elapsed:    0.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


MAE on fold 1 is 1214.4773069628784


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 117 out of 117 | elapsed:    2.2s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done 117 out of 117 | elapsed:  1.6min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 117 out of 117 | elapsed:    0.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


MAE on fold 2 is 1222.6650974418244


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 117 out of 117 | elapsed:    1.9s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.7s
[Parallel(n_jobs=-1)]: Done 117 out of 117 | elapsed:  1.6min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 117 out of 117 | elapsed:    0.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


MAE on fold 3 is 1223.2426756470622


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 117 out of 117 | elapsed:    1.8s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.7s
[Parallel(n_jobs=-1)]: Done 117 out of 117 | elapsed:  1.6min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 117 out of 117 | elapsed:    0.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


MAE on fold 4 is 1206.3174311755456


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.6s



Training done! Time Elapsed: 483.4851098060608  seconds.


[Parallel(n_jobs=8)]: Done 117 out of 117 | elapsed:    1.8s finished


#### Analysis of Results
- The increasing the number of estimators improved score.
- The best results produced with n_estimators = 100 on fold 1.
- The best result MAE on fold 1 is 1854.7308563695506
- The problem is time process which grows in the same proportion. 
- The model random forest were worse than linear regression

#### Submission

In [169]:
save_predictions(ids = test_ids, 
                 predictions = result_rf_3, 
                 file = submissions_folder + 'random_forest_submission.csv')

Data storage!


#### View submission

In [170]:
sub = pd.read_csv(submissions_folder + 'random_forest_submission.csv')
sub.head()

Unnamed: 0,id,loss
0,4,1832.449311
1,6,1952.597419
2,9,7698.187451
3,12,5026.097134
4,15,701.259216


### XGBoost
- Model very robust
- descentent gradient 
- regularization parameter: help avoid overfitting
- parallelizable

The model XGBRegressor will be trained with threee model using difference parameters.

#### Otimize XGBoost

DMatrix is a internal data structure that used by XGBoost which is optimized for both memory efficiency and training speed.

In [171]:
# Data Matrix used in XGBoost.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

#### Function  train_test_xgboost
- The function process data to:
 - calculate time process
 - shuffle the data during each fold
 - run predictions
 - store these predictions in a numpy array
 - average the predictions over k number of folds.

In [172]:
def train_test_xgboost(model, early_stopping_rounds):
    kf = KFold(n_splits = k, shuffle = True, random_state = random_state)
    results = np.zeros((X_test.shape[0], k))
    
    print("Begin training")
    start = time.time()
    
    for i, (train_index, val_index) in enumerate(kf.split(X_train)):
        print("Begin training and testing base model on fold {}".format(i))
        start = time.time()
        
        X_train_mini, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_mini, y_val = y_train[train_index], y_train[val_index]

        # train model
        model.fit(X_train_mini, 
                   y_train_mini, 
                   eval_metric = eval_error, 
                   eval_set = [(X_train_mini, y_train_mini), (X_val, y_val)], 
                   early_stopping_rounds = early_stopping_rounds,
                   verbose = False)

        end = time.time()
        print("Training time elapsed on fold {} is {}".format(i, end - start))
        
        # Predict on validation set 
        val_predictions = model.predict(X_val, ntree_limit = model.best_ntree_limit)
        error = mean_absolute_error(np.exp(y_val) - shift, np.exp(val_predictions) - shift)
        print("Error on fold {} is {} \n".format(i, error))
                
        # Predict on test set
        test_predictions = np.exp(model.predict(X_test, ntree_limit = model.best_ntree_limit)) - shift
        # Sum predictions
        results[:,i] = test_predictions
        
        end = time.time()
        print("\nTraining done! Time Elapsed:", end - start, " seconds.")

    # Average predictions
    mean_results = results.mean(axis = 1)
    return mean_results

In [173]:
# Visualize params
XGBRegressor()

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

#### Chooce parameters
It´s possible reduce error and overfitting with analysis of parameters. Here is a list of the key parameters below:

- max_depth - Max tree depth for boosted trees
- gamma - Minimum loss reduction required to make a further partition on a leaf node of the tree.
- min_child_weight - Minimum sum of instance weight(hessian) needed in a child.

**NOTE**: keep someone parameters default, how by example, learning_rate=0.1.<br/>
**NOTE about n_estimators**: when tested there aren´t effect

In [174]:
# Model 1 XGB_regressor
xgb_one = XGBRegressor(nthread = -1)

# training
results_xgb_1 = train_test_xgboost(model = xgb_one,
                                   early_stopping_rounds = 50)

Begin training
Begin training and testing base model on fold 0
Training time elapsed on fold 0 is 39.96898078918457
Error on fold 0 is 1197.4674083660452 


Training done! Time Elapsed: 43.21244168281555  seconds.
Begin training and testing base model on fold 1
Training time elapsed on fold 1 is 40.5793342590332
Error on fold 1 is 1202.493184987978 


Training done! Time Elapsed: 41.42169761657715  seconds.
Begin training and testing base model on fold 2
Training time elapsed on fold 2 is 36.93010973930359
Error on fold 2 is 1205.5432956313762 


Training done! Time Elapsed: 37.781001567840576  seconds.
Begin training and testing base model on fold 3
Training time elapsed on fold 3 is 36.94909143447876
Error on fold 3 is 1220.0516001939693 


Training done! Time Elapsed: 37.73948526382446  seconds.
Begin training and testing base model on fold 4
Training time elapsed on fold 4 is 37.48702597618103
Error on fold 4 is 1211.609699922047 


Training done! Time Elapsed: 38.30442237854004  s

In [175]:
# Model 2 XGB_regressor
xgb_two = XGBRegressor(learning_rate=0.1,
                       n_estimators = 1000,
                       max_depth = 5,
                       min_child_weight = len(df_train.columns),
                       gamma = 1,
                       subsample = 1.0,
                       colsample_bytree = 1.0,
                       reg_alpha = 1.0,
                       silent = True, 
                       seed = random_state, 
                       nthread = -1)
# training
results_xgb_2 = train_test_xgboost(model = xgb_two,
                                   early_stopping_rounds = 50)

Begin training
Begin training and testing base model on fold 0
Training time elapsed on fold 0 is 193.17253065109253
Error on fold 0 is 1142.7964779392785 


Training done! Time Elapsed: 195.85378646850586  seconds.
Begin training and testing base model on fold 1
Training time elapsed on fold 1 is 181.6424901485443
Error on fold 1 is 1146.391869262073 


Training done! Time Elapsed: 184.14358115196228  seconds.
Begin training and testing base model on fold 2
Training time elapsed on fold 2 is 240.06945896148682
Error on fold 2 is 1142.2250607343233 


Training done! Time Elapsed: 243.42751932144165  seconds.
Begin training and testing base model on fold 3
Training time elapsed on fold 3 is 177.31632542610168
Error on fold 3 is 1156.8934365532687 


Training done! Time Elapsed: 179.72164702415466  seconds.
Begin training and testing base model on fold 4
Training time elapsed on fold 4 is 234.12422633171082
Error on fold 4 is 1151.4243277095923 


Training done! Time Elapsed: 237.4938635

In [176]:
# Model 3 XGB_regressor
xgb_three = XGBRegressor(learning_rate=0.1,
                        n_estimators = 1000,
                        max_depth = 9,
                        min_child_weight = 6,
                        gamma = 1,
                        subsample = 1.0,
                        colsample_bytree = 0.5,
                        reg_alpha = 1.0,
                        silent = True, 
                        seed = random_state, 
                        nthread = -1)

# training
results_xgb_3 = train_test_xgboost(model = xgb_three,
                                   early_stopping_rounds = 50)

Begin training
Begin training and testing base model on fold 0
Training time elapsed on fold 0 is 177.40572309494019
Error on fold 0 is 1135.0246459901953 


Training done! Time Elapsed: 182.23723602294922  seconds.
Begin training and testing base model on fold 1
Training time elapsed on fold 1 is 210.20324206352234
Error on fold 1 is 1136.8920325804445 


Training done! Time Elapsed: 216.19404697418213  seconds.
Begin training and testing base model on fold 2
Training time elapsed on fold 2 is 217.86508679389954
Error on fold 2 is 1141.1367079498368 


Training done! Time Elapsed: 224.11951684951782  seconds.
Begin training and testing base model on fold 3
Training time elapsed on fold 3 is 191.74747109413147
Error on fold 3 is 1144.891182674283 


Training done! Time Elapsed: 197.07398200035095  seconds.
Begin training and testing base model on fold 4
Training time elapsed on fold 4 is 197.99417996406555
Error on fold 4 is 1147.5982337357605 


Training done! Time Elapsed: 203.645170

#### Submission

In [177]:
save_predictions(ids = test_ids, 
                 predictions = results_xgb_1, 
                 file = submissions_folder + 'xgb_submission.csv')

Data storage!


#### View submission

In [178]:
sub = pd.read_csv(submissions_folder + 'xgb_submission.csv')
sub.head()

Unnamed: 0,id,loss
0,4,1768.795117
1,6,1872.308472
2,9,7818.926172
3,12,4803.654395
4,15,1007.5573


#### Analysis of Results
- The XGBoost have better performace in comparain Random Forest and Linear Regression
- The best result MAE on fold 0 is 1136.8920325804445 in three model
- I tested parameters difference in each model XGBoost
- So, I chooce file xgb_submission to submission in competition.

---

#### Copyright
<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">
    <img alt="Creative Commons License" align="right" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" />

This work by Bruno A. R. M. Campos is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>.