### Import packages

In [1]:
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides deprecation and convergence warnings from pandas/xgboost.
import warnings
warnings.filterwarnings('ignore')
# "all" asks IPython to display the value of every top-level expression in a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import numpy as np
import datetime
import itertools as it
import pickle
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression

### 1. Read data

In [2]:
# Load the pickled DataFrame used by the rest of the notebook; it holds both train and test rows,
# told apart later through its "train_or_test" column.
# NOTE(review): unpickling can execute arbitrary code - only load files produced by this project.
data = pd.read_pickle("Data/final_data_with_mean_encoding.pkl")

### 2. Lists of features by groups

In [3]:
# Columns that identify a row; they are not part of any feature group below
identifiers = [
    'card_id',
    'train_or_test',
    'first_active_month',
]

# Name of the column to predict
target = 'target'

# Features from original Train/Test.csv
features_train_csv = [
    'feature_1',
    'feature_2',
    'feature_3',
    'year',
    'month',
    'months_to_2018_02',
]

# Features from all_transactions_merchants_groupby_cards
# Naming convention visible in the list: "<source column>__<aggregate>", where the aggregate is one of
# mean/std/min/max/range/var/nunique/mode; names containing "_merchants" refer to merchant-side columns
# and "-9999" appears as one of the category levels (presumably the missing-value placeholder -
# the model is later configured with missing=-9999).
features_transactions = [
'month_lag__mean','month_lag__std','month_lag__min','month_lag__max','month_lag__range',
'purchase_date__range','purchase_date__days_diff_next_purchase_mean',
'purchase_date__days_diff_next_purchase_std','purchase_date_year__mean','purchase_date_year__std',
'purchase_date_year__min','purchase_date_year__max','purchase_date_year__range',
'purchase_date_month__mean','purchase_date_month__std','purchase_date_month__min',
'purchase_date_month__max','purchase_date_month__range','purchase_date_day__mean',
'purchase_date_day__std','purchase_date_day__min','purchase_date_day__max','purchase_date_day__range',
'purchase_date_hour__mean','purchase_date_hour__std','purchase_date_hour__min','purchase_date_hour__max',
'purchase_date_hour__range','purchase_date_days_to_2018_02__mean','purchase_date_days_to_2018_02__std',
'purchase_date_days_to_2018_02__min','purchase_date_days_to_2018_02__max',
'purchase_date_days_to_2018_02__range','authorized_flag_N__mean','authorized_flag_N__var',
'authorized_flag_Y__mean','authorized_flag_Y__var','authorized_flag_-9999__mean',
'authorized_flag_-9999__var','category_3_A__mean','category_3_A__var','category_3_B__mean',
'category_3_B__var','category_3_C__mean','category_3_C__var','category_3_-9999__mean',
'category_3_-9999__var','installments__mean','installments__std','installments__min','installments__max',
'installments__range','category_1_N__mean','category_1_N__var','category_1_Y__mean','category_1_Y__var',
'category_1_-9999__mean','category_1_-9999__var','merchant_category_id__nunique',
'merchant_category_id__mode','subsector_id__nunique','subsector_id__mode','merchant_id__nunique',
'merchant_id__mode','purchase_amount__mean','purchase_amount__std','purchase_amount__min',
'purchase_amount__max','purchase_amount__range','city_id__nunique','city_id__mode','state_id__nunique',
'state_id__mode','category_2_1__mean','category_2_1__var','category_2_2__mean','category_2_2__var',
'category_2_3__mean','category_2_3__var','category_2_4__mean','category_2_4__var','category_2_5__mean',
'category_2_5__var','category_2_-9999__mean','category_2_-9999__var',
'merchant_group_id_merchants__nunique','merchant_group_id_merchants__mode',
'merchant_category_id_merchants__nunique','merchant_category_id_merchants__mode',
'subsector_id_merchants__nunique','subsector_id_merchants__mode','numerical_1_merchants__mean',
'numerical_1_merchants__std','numerical_1_merchants__min','numerical_1_merchants__max',
'numerical_1_merchants__range','numerical_2_merchants__mean','numerical_2_merchants__std',
'numerical_2_merchants__min','numerical_2_merchants__max','numerical_2_merchants__range',
'category_1_merchants_N__mean','category_1_merchants_N__var','category_1_merchants_Y__mean',
'category_1_merchants_Y__var','category_1_merchants_-9999__mean','category_1_merchants_-9999__var',
'most_recent_sales_range_merchants_A__mean','most_recent_sales_range_merchants_A__var',
'most_recent_sales_range_merchants_B__mean','most_recent_sales_range_merchants_B__var',
'most_recent_sales_range_merchants_C__mean','most_recent_sales_range_merchants_C__var',
'most_recent_sales_range_merchants_D__mean','most_recent_sales_range_merchants_D__var',
'most_recent_sales_range_merchants_E__mean','most_recent_sales_range_merchants_E__var',
'most_recent_sales_range_merchants_-9999__mean','most_recent_sales_range_merchants_-9999__var',
'most_recent_purchases_range_merchants_A__mean','most_recent_purchases_range_merchants_A__var',
'most_recent_purchases_range_merchants_B__mean','most_recent_purchases_range_merchants_B__var',
'most_recent_purchases_range_merchants_C__mean','most_recent_purchases_range_merchants_C__var',
'most_recent_purchases_range_merchants_D__mean','most_recent_purchases_range_merchants_D__var',
'most_recent_purchases_range_merchants_E__mean','most_recent_purchases_range_merchants_E__var',
'most_recent_purchases_range_merchants_-9999__mean','most_recent_purchases_range_merchants_-9999__var',
'avg_sales_lag3_merchants__mean','avg_sales_lag3_merchants__std','avg_sales_lag3_merchants__min',
'avg_sales_lag3_merchants__max','avg_sales_lag3_merchants__range','avg_purchases_lag3_merchants__mean',
'avg_purchases_lag3_merchants__std','avg_purchases_lag3_merchants__min',
'avg_purchases_lag3_merchants__max','avg_purchases_lag3_merchants__range',
'active_months_lag3_merchants__mean','active_months_lag3_merchants__std',
'active_months_lag3_merchants__min','active_months_lag3_merchants__max',
'active_months_lag3_merchants__range','avg_sales_lag6_merchants__mean','avg_sales_lag6_merchants__std',
'avg_sales_lag6_merchants__min','avg_sales_lag6_merchants__max','avg_sales_lag6_merchants__range',
'avg_purchases_lag6_merchants__mean','avg_purchases_lag6_merchants__std',
'avg_purchases_lag6_merchants__min','avg_purchases_lag6_merchants__max',
'avg_purchases_lag6_merchants__range','active_months_lag6_merchants__mean',
'active_months_lag6_merchants__std','active_months_lag6_merchants__min',
'active_months_lag6_merchants__max','active_months_lag6_merchants__range',
'avg_sales_lag12_merchants__mean','avg_sales_lag12_merchants__std','avg_sales_lag12_merchants__min',
'avg_sales_lag12_merchants__max','avg_sales_lag12_merchants__range',
'avg_purchases_lag12_merchants__mean','avg_purchases_lag12_merchants__std',
'avg_purchases_lag12_merchants__min','avg_purchases_lag12_merchants__max',
'avg_purchases_lag12_merchants__range','active_months_lag12_merchants__mean',
'active_months_lag12_merchants__std','active_months_lag12_merchants__min',
'active_months_lag12_merchants__max','active_months_lag12_merchants__range','category_4_merchants_N__mean',
'category_4_merchants_N__var','category_4_merchants_Y__mean','category_4_merchants_Y__var',
'category_4_merchants_-9999__mean','category_4_merchants_-9999__var','city_id_merchants__nunique',
'city_id_merchants__mode','state_id_merchants__nunique','state_id_merchants__mode',
'category_2_merchants_1__mean','category_2_merchants_1__var','category_2_merchants_2__mean',
'category_2_merchants_2__var','category_2_merchants_3__mean','category_2_merchants_3__var',
'category_2_merchants_4__mean','category_2_merchants_4__var','category_2_merchants_5__mean',
'category_2_merchants_5__var','category_2_merchants_-9999__mean','category_2_merchants_-9999__var']

# Features from mean_encoding_all_transactions_merchants_groupby_card
# (one "<column>__mean_encoded" entry per encoded column)
features_mean_encoding = [
    'authorized_flag__mean_encoded',
    'category_3__mean_encoded',
    'category_1__mean_encoded',
    'merchant_category_id__mean_encoded',
    'subsector_id__mean_encoded',
    'merchant_id__mean_encoded',
    'city_id__mean_encoded',
    'state_id__mean_encoded',
    'category_2__mean_encoded',
    'merchant_group_id_merchants__mean_encoded',
    'merchant_category_id_merchants__mean_encoded',
    'subsector_id_merchants__mean_encoded',
    'category_1_merchants__mean_encoded',
    'most_recent_sales_range_merchants__mean_encoded',
    'most_recent_purchases_range_merchants__mean_encoded',
    'category_4_merchants__mean_encoded',
    'city_id_merchants__mean_encoded',
    'state_id_merchants__mean_encoded',
    'category_2_merchants__mean_encoded',
]

# Constant columns (a single unique value): they carry no information and are dropped
# when the model's feature list is assembled
features_1_unique = [
    "purchase_date_year__min",
    "authorized_flag_-9999__mean",
    "authorized_flag_-9999__var",
    "category_1_-9999__mean",
    "category_1_-9999__var",
    "active_months_lag3_merchants__max",
    "active_months_lag6_merchants__max",
]

# "Mode" features whose underlying ID is numeric
features_numeric_IDs_mode = [
    'merchant_category_id__mode',
    'subsector_id__mode',
    'city_id__mode',
    'state_id__mode',
    'merchant_group_id_merchants__mode',
    'merchant_category_id_merchants__mode',
    'subsector_id_merchants__mode',
    'city_id_merchants__mode',
    'state_id_merchants__mode',
]

# "Mode" feature whose underlying ID is not numeric
features_categorical_IDs_mode = [
    'merchant_id__mode',
]

### 3. For XGBRegressor data must be int, float or bool

Let's convert the features of type category or object to integers, except *merchant_id__mode*, which is the only ID feature that is not numeric; we will simply not include it in the model.

In [4]:
# Columns to cast to int: the three card features plus every numeric ID "mode" column
features_to_modify = ["feature_1", "feature_2", "feature_3", *features_numeric_IDs_mode]
for column_name in features_to_modify:
    data[column_name] = data[column_name].astype(int)

### 4. Select features for the model

We will exclude features with 1 unique value and *merchant_id__mode*.

In [5]:
# Pool the three feature groups, then leave out the constant columns and the non-numeric ID mode.
excluded_features = features_1_unique + features_categorical_IDs_mode
candidate_features = np.concatenate((features_train_csv, features_transactions, features_mean_encoding))
features = [name for name in candidate_features if name not in excluded_features]

### 5. Target metric: RMSE

In [6]:
def rmse(predictions, target):
    """Return the root mean squared error between ``predictions`` and ``target``.

    Both arguments are combined element-wise, so they must be array-likes that
    support subtraction and broadcasting against each other.
    """
    squared_errors = (predictions - target) ** 2
    return np.mean(squared_errors) ** 0.5

### 6. Split Train into 2 validation folds

First separate Train & Test.

In [7]:
# Partition the combined table on its train_or_test flag.
is_train_row = data["train_or_test"] == "train"
is_test_row = data["train_or_test"] == "test"
train = data[is_train_row]
test = data[is_test_row]

Then split Train into 2 validation folds.

In [8]:
def folds(data,k=2,seed=1):
    """Shuffle the rows of ``data`` and cut them into ``k`` consecutive folds.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split. Its index may hold any labels; only row positions are used.
    k : int, default 2
        Number of folds. Fold ``i`` holds the shuffled rows from position
        ``int(n*i/k)`` up to (not including) ``int(n*(i+1)/k)``.
    seed : int, default 1
        Value passed to ``np.random.seed`` so the shuffle is reproducible.
        Note that this reseeds NumPy's global random state.

    Returns
    -------
    list of pandas.DataFrame
        ``k`` disjoint frames that together contain every row of ``data`` once.
    """
    np.random.seed(seed)
    n_rows = data.shape[0]
    # Shuffle row *positions*. The previous code permuted ``data.index`` (labels) and fed
    # the result to ``iloc``, which is only correct when the index happens to be 0..n-1
    # and otherwise selects wrong rows or raises IndexError. For a 0..n-1 index the
    # shuffle produced here is the same as before.
    shuffled = data.iloc[np.random.permutation(n_rows),:]
    boundaries = [int(n_rows*i/k) for i in range(k+1)]
    data_folds_list = [shuffled.iloc[start:stop,:] for start, stop in zip(boundaries[:-1],boundaries[1:])]
    return data_folds_list

# Number of validation folds; the grid-search cell further down reuses k when looping over folds
k = 2
# Shuffle the training rows (default seed of folds) and cut them into k folds
train_folds_list = folds(train,k)

### 7. Validation with grid search

Define all parameter combinations for the grid search.

In [9]:
# Model classes to tune, keyed by a short name
models = {'XGB': XGBRegressor}

# Constructor keyword names for each model; position i here pairs with position i
# of the matching entry in parameters_values
parameters_names = {
    'XGB': [
        'max_depth', 'learning_rate', 'n_estimators', "booster", "n_jobs",
        "min_child_weight", "subsample", "colsample_bytree", "random_state", "missing",
    ],
}

# Candidate values for each keyword, in the same order as parameters_names
parameters_values = {
    'XGB': [
        [5, 10, 15, 20, 25],        # max_depth
        [0.01],                     # learning_rate
        [25, 50, 100, 250, 500],    # n_estimators
        ["gbtree", "gblinear"],     # booster
        [6],                        # n_jobs
        [10, 20, 30, 40],           # min_child_weight
        [0.75, 1],                  # subsample
        [0.75, 1],                  # colsample_bytree
        [1],                        # random_state
        [-9999],                    # missing
    ],
}

# Expand the value lists into one {keyword: value} dict per combination.
# The loop variable is deliberately called `model`: later cells read it after this loop ends.
grid = {}
for model in models:
    grid[model] = [
        dict(zip(parameters_names[model], combination))
        for combination in it.product(*parameters_values[model])
    ]

Grid search: calculate the RMSE on the validation splits for all parametrizations.

In [None]:
# Turn off automatic display of expression values before the long-running grid-search cell
InteractiveShell.ast_node_interactivity = 'none'

In [None]:
# Grid search: for every model and every parametrization, train on k-1 folds, validate on the
# remaining fold and record the RMSE per fold together with its mean and standard deviation.
# RMSEs[model] ends up as a DataFrame with one row per parametrization; the row label equals the
# position of that parametrization in grid[model].
RMSEs = {}
for model in models.keys():
    fold_columns = ["Fold_"+str(i) for i in range(k)]
    # The original call passed ["Mean","Std"] as a positional argument after columns=...,
    # which is a SyntaxError; the two names belong at the end of the column list.
    result_columns = parameters_names[model]+fold_columns+["Mean","Std"]
    # Rows are collected as dicts and turned into a DataFrame once, instead of enlarging the
    # frame through .loc on every fold. `records` stays in the namespace, so partial results
    # survive an interrupted run.
    records = []
    for idx in range(len(grid[model])):
        record = dict(grid[model][idx])
        for fold in range(k):
            # Define train and validation data for the ith fold
            data_train = pd.concat([train_folds_list[i] for i in range(k) if i!=fold],axis=0)
            data_val = train_folds_list[fold]
            x_train, y_train = data_train[features], data_train[target]
            x_validation, y_validation = data_val[features], data_val[target]
            # Train model
            print("Parametrization "+str(idx)+" for Fold_"+str(fold)+" starts at ",
                  datetime.datetime.now())
            regressor = models[model](**grid[model][idx])
            regressor.fit(x_train,y_train)
            rmse_validation = rmse(regressor.predict(x_validation),y_validation)
            print("and finishes at ",datetime.datetime.now()," with RMSE ",rmse_validation,"\n")
            # Add the RMSE to results
            record["Fold_"+str(fold)] = rmse_validation
        # Add the mean and (population, ddof=0) std of the parametrization RMSE
        fold_rmses = [record[column] for column in fold_columns]
        record["Mean"] = np.mean(fold_rmses)
        record["Std"] = np.std(fold_rmses)
        records.append(record)
    RMSEs[model] = pd.DataFrame(records,columns=result_columns)

# Restore display of every expression value
InteractiveShell.ast_node_interactivity = 'all'

Save the validation results in a *pickle* file.

In [None]:
# Sort the results by RMSE mean
# Sort every model's results by mean validation RMSE (best first), then persist them.
# np.argsort yields row *positions*, so they are applied with iloc; the earlier .loc call
# treated them as labels, which is only right while the index is 0..n-1. Looping over the
# dict also removes the dependence on a `model` variable left over from another cell.
for model_name in RMSEs:
    mean_rmse = RMSEs[model_name]["Mean"].values.astype(float)
    RMSEs[model_name] = RMSEs[model_name].iloc[np.argsort(mean_rmse),:]
# The context manager closes the file even if pickling fails
with open('Validations/Results_validation_XGBoost.dat','wb') as results_file:
    pickle.dump(RMSEs,results_file)

### 8. Train optimal model with all Train and create submission

In [10]:
# Reload the saved validation results (a dict of DataFrames keyed by model name) so this section
# can run without repeating the grid search.
# NOTE(review): unpickling can execute arbitrary code - only load files written by this notebook.
RMSEs = pd.read_pickle("Validations/Results_validation_XGBoost.dat")

In [11]:
# Show the first five rows; the results were sorted by mean RMSE before being saved,
# so these are the five best parametrizations
RMSEs["XGB"].head(5)

Unnamed: 0,max_depth,learning_rate,n_estimators,booster,n_jobs,min_child_weight,subsample,colsample_bytree,random_state,missing,Fold_0,Fold_1,Mean,Std
457,15,0.01,500,gbtree,6,30,0.75,1,1,-9999,2.76425,2.67121,2.71773,0.0465233
461,15,0.01,500,gbtree,6,40,0.75,1,1,-9999,2.76705,2.6744,2.72072,0.0463221
613,20,0.01,500,gbtree,6,20,0.75,1,1,-9999,2.76883,2.67314,2.72098,0.0478485
453,15,0.01,500,gbtree,6,20,0.75,1,1,-9999,2.76894,2.67455,2.72175,0.0471922
297,10,0.01,500,gbtree,6,30,0.75,1,1,-9999,2.77346,2.67827,2.72587,0.0475953


#### submission_XGBoost_parametrization#.csv

Train all chosen parametrizations with all Train data. *Note:* We will use the 5 parametrizations with lowest mean RMSE.

In [None]:
# Turn off automatic display of expression values before the long-running training cell
InteractiveShell.ast_node_interactivity = 'none'

In [None]:
# Retrain the five best parametrizations (lowest mean validation RMSE) on the whole training
# set and save each fitted model under Models/.
model = "XGB"
# Row labels of the sorted results double as positions in grid[model]
parametrization_indexes = RMSEs[model].index[0:5]
# Define train data
x_train, y_train = train[features], train[target]
# Train model for all chosen parametrizations
for idx in parametrization_indexes:
    print("Parametrization "+str(idx)+" with all Train starts at ",datetime.datetime.now())
    regressor = models[model](**grid[model][idx])
    regressor.fit(x_train,y_train)
    print("and finishes at ",datetime.datetime.now(),"\n")
    # Save model; the context manager closes the file (the bare open() call leaked the handle)
    with open('Models/Model_XGBoost_parametrization'+str(idx)+'.dat','wb') as model_file:
        pickle.dump(regressor,model_file)

# Restore display of every expression value
InteractiveShell.ast_node_interactivity = 'all'

Predict Test data and create *.csv* file for submitting to Kaggle for all chosen parametrizations.

In [12]:
submission = pd.read_csv('Data/sample_submission.csv')
# Predictions are written into the submission by position, so its card order has to match test.
submission_card_ids = submission["card_id"].values
test_card_ids = test["card_id"].values
same_card_order = np.mean(submission_card_ids == test_card_ids) == 1
print("Order of card_id in submission.csv is OK?", same_card_order)

Order of card_id in submission.csv is OK? True


In [13]:
# Predict the test set with each saved model and write one submission file per parametrization.
# `model` is set explicitly so this cell no longer depends on a loop variable left over from
# another cell.
model = "XGB"
parametrization_indexes = RMSEs[model].index[0:5]
x_test = test[features]
for idx in parametrization_indexes:
    # NOTE(review): pickle.load can execute arbitrary code - only load models saved by this notebook.
    # The context manager closes the file (the bare open() call leaked the handle).
    with open('Models/Model_XGBoost_parametrization'+str(idx)+'.dat','rb') as model_file:
        regressor = pickle.load(model_file)
    submission["target"] = regressor.predict(x_test)
    submission.to_csv('Submissions/Submission_XGBoost_parametrization'+str(idx)+'.csv',index=False)

#### submission_XGBoost_parametrization_wo_mean_encoding.csv

Let's try doing feature selection, excluding the mean encoded features.

In [14]:
# Same feature list as before, minus every mean-encoded column (original order kept)
mean_encoded_names = set(features_mean_encoding)
features_wo_mean_encoding = [name for name in features if name not in mean_encoded_names]

Train all chosen parametrizations with all Train data. *Note:* We will use the 5 parametrizations with lowest mean RMSE.

In [None]:
# Turn off automatic display of expression values before the long-running training cell
InteractiveShell.ast_node_interactivity = 'none'

In [None]:
# Retrain the five best parametrizations on the whole training set, this time without the
# mean-encoded features, and save each fitted model under Models/ with a _wo_mean_encoding suffix.
model = "XGB"
# Row labels of the sorted results double as positions in grid[model]
parametrization_indexes = RMSEs[model].index[0:5]
# Define train data
x_train, y_train = train[features_wo_mean_encoding], train[target]
# Train model for all chosen parametrizations
for idx in parametrization_indexes:
    print("Parametrization "+str(idx)+" with all Train starts at ",datetime.datetime.now())
    regressor = models[model](**grid[model][idx])
    regressor.fit(x_train,y_train)
    print("and finishes at ",datetime.datetime.now(),"\n")
    # Save model; the context manager closes the file (the bare open() call leaked the handle)
    with open('Models/Model_XGBoost_parametrization'+str(idx)+'_wo_mean_encoding.dat',
              'wb') as model_file:
        pickle.dump(regressor,model_file)

# Restore display of every expression value
InteractiveShell.ast_node_interactivity = 'all'

Predict Test data and create *.csv* file for submitting to Kaggle for all chosen parametrizations without mean encoded features.

In [15]:
submission = pd.read_csv('Data/sample_submission.csv')
# Predictions are written into the submission by position, so its card order has to match test.
submission_card_ids = submission["card_id"].values
test_card_ids = test["card_id"].values
same_card_order = np.mean(submission_card_ids == test_card_ids) == 1
print("Order of card_id in submission.csv OK?", same_card_order)

Order of card_id in submission.csv OK? True


In [16]:
# Predict the test set with each model trained without mean-encoded features and write one
# submission file per parametrization. The indexes are recomputed here so the cell does not
# depend on variables left over from an earlier prediction cell.
model = "XGB"
parametrization_indexes = RMSEs[model].index[0:5]
x_test = test[features_wo_mean_encoding]
for idx in parametrization_indexes:
    # NOTE(review): pickle.load can execute arbitrary code - only load models saved by this notebook.
    # The context manager closes the file (the bare open() call leaked the handle).
    with open('Models/Model_XGBoost_parametrization'+str(idx)+'_wo_mean_encoding.dat',
              'rb') as model_file:
        regressor = pickle.load(model_file)
    submission["target"] = regressor.predict(x_test)
    submission.to_csv('Submissions/Submission_XGBoost_parametrization'+str(idx)+'_wo_mean_encoding.csv',
                      index=False)