# Elo Merchant Category Recommendation - LightGBM with outlier detection
End date: _2019. february 19._<br/>

This tutorial notebook is the second part of a seriers for [Elo Mechant Category Recommendation](https://www.kaggle.com/c/elo-merchant-category-recommendation) contest organized by Elo, one of the largest payment brands in Brazil. It has built partnerships with merchants in order to offer promotions or discounts to cardholders. The objective of the competition is to identify and serve the most relevant opportunities to individuals, by uncovering signals in customer loyalty. The input files are available from the [download](https://www.kaggle.com/c/elo-merchant-category-recommendation/data) section of the contest:

- **train.csv**,  **test.csv**: list of `card_ids` that can be used for training and testing
- **historical_transactions.csv**: contains up to 3 months' worth of transactions for every card at any of the provided `merchant_ids`
- **new_merchant_transactions.csv**: contains the transactions at new merchants (`merchant_ids` that this particular `card_id` 
has not yet visited) over a period of two months
- **merchants.csv**: contains aggregate information for each `merchant_id` represented in the data set

In [1]:
import os
import gc
import math
import random
import warnings
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from keras.models import load_model
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

warnings.filterwarnings("ignore")

random.seed(1)
threshold = 0.5

Using TensorFlow backend.


In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Starting memory usage: {:5.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min >= np.iinfo(np.int64).min and c_max <= np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Reduced memory usage: {:5.2f} MB ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

## Merchants

In [3]:
df_merch = pd.read_csv("input/merchants.csv")
print("{:,} records and {} features in merchant set.".format(df_merch.shape[0], df_merch.shape[1]))

334,696 records and 22 features in merchant set.


In [4]:
df_merch[:3]

Unnamed: 0,merchant_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
0,M_ID_838061e48c,8353,792,9,-0.057471,-0.057471,N,E,E,-0.4,...,-2.25,18.666667,6,-2.32,13.916667,12,N,242,9,1.0
1,M_ID_9339d880ad,3184,840,20,-0.057471,-0.057471,N,E,E,-0.72,...,-0.74,1.291667,6,-0.57,1.6875,12,N,22,16,1.0
2,M_ID_e726bbae1e,447,690,1,-0.057471,-0.057471,N,E,E,-82.13,...,-82.13,260.0,2,-82.13,260.0,2,N,-1,5,5.0


In [5]:
df_merch['category_1'] = df_merch['category_1'].map({'N': 0, 'Y': 1})
df_merch['category_2'] = pd.to_numeric(df_merch['category_2'])
df_merch['category_4'] = df_merch['category_4'].map({'N': 0, 'Y': 1})
df_merch['most_recent_sales_range'] = df_merch['most_recent_sales_range'].map({'E': 0, 'D': 1, 'C': 2, 'B': 3, 'A': 4})
df_merch['most_recent_purchases_range'] = df_merch['most_recent_purchases_range'].map({'E': 0, 'D': 1, 'C': 2, 'B': 3, 'A': 4})

In [6]:
df_merch[['merchant_id', 'category_1', 'category_2', 'category_4', 'most_recent_sales_range', 'most_recent_purchases_range']][:3]

Unnamed: 0,merchant_id,category_1,category_2,category_4,most_recent_sales_range,most_recent_purchases_range
0,M_ID_838061e48c,0,1.0,0,0,0
1,M_ID_9339d880ad,0,1.0,0,0,0
2,M_ID_e726bbae1e,0,5.0,0,0,0


In [7]:
dropping = ['city_id', 'state_id']
for var in dropping:
    df_merch = df_merch.drop(var, axis=1)

In [8]:
df_merch = reduce_mem_usage(df_merch)

Starting memory usage: 51.07 MB
Reduced memory usage: 20.43 MB (60.0% reduction)


## Transactions

In [9]:
df_new_trans = pd.read_csv("input/trans_merch_new_agg.csv", index_col=0)
df_new_trans = reduce_mem_usage(df_new_trans)

df_hist_trans = pd.read_csv("input/trans_merch_hist_agg.csv", index_col=0)
df_hist_trans = reduce_mem_usage(df_hist_trans)

Starting memory usage: 354.01 MB
Reduced memory usage: 84.35 MB (76.2% reduction)
Starting memory usage: 397.39 MB
Reduced memory usage: 112.08 MB (71.8% reduction)


In [10]:
df_new_trans[:3]

Unnamed: 0,card_id,new_transactions_count,new_authorized_flag_sum,new_authorized_flag_mean,new_active_months_lag3_sum,new_active_months_lag3_mean,new_active_months_lag6_sum,new_active_months_lag6_mean,new_active_months_lag12_sum,new_active_months_lag12_mean,...,new_purchase_dayofweek_max,new_purchase_dayofweek_min,new_purchase_dayofweek_std,new_purchase_dayofweek_mode,new_purchase_quarter_mean,new_purchase_quarter_median,new_purchase_quarter_max,new_purchase_quarter_min,new_purchase_quarter_std,new_purchase_quarter_mode
0,C_ID_00007093c1,1,3,1,9.0,3.0,18.0,6.0,31.0,10.335938,...,1,0,0.577148,0,2.0,2.0,2,2,0.0,2
1,C_ID_0001238066,1,27,1,78.0,3.0,156.0,6.0,304.0,11.695312,...,6,0,1.764648,4,1.333008,1.0,2,1,0.480469,1
2,C_ID_0001506ef0,1,2,1,3.0,3.0,6.0,6.0,12.0,12.0,...,4,3,0.707031,3,1.0,1.0,1,1,0.0,1


In [11]:
df_hist_trans[:3]

Unnamed: 0,card_id,hist_transactions_count,hist_authorized_flag_sum,hist_authorized_flag_mean,hist_active_months_lag3_sum,hist_active_months_lag3_mean,hist_active_months_lag6_sum,hist_active_months_lag6_mean,hist_active_months_lag12_sum,hist_active_months_lag12_mean,...,hist_purchase_dayofweek_max,hist_purchase_dayofweek_min,hist_purchase_dayofweek_std,hist_purchase_dayofweek_mode,hist_purchase_quarter_mean,hist_purchase_quarter_median,hist_purchase_quarter_max,hist_purchase_quarter_min,hist_purchase_quarter_std,hist_purchase_quarter_mode
0,C_ID_00007093c1,1,114.0,0.765137,447.0,3.0,894.0,6.0,1776.0,11.921875,...,6,0,1.869141,0,2.476562,2.0,4,1,1.100586,2
1,C_ID_0001238066,1,120.0,0.975586,369.0,3.0,738.0,6.0,1476.0,12.0,...,6,0,1.90918,5,2.763672,4.0,4,1,1.47168,4
2,C_ID_0001506ef0,1,64.0,0.941406,204.0,3.0,408.0,6.0,806.0,11.851562,...,6,0,1.787109,5,2.558594,3.0,4,1,1.407227,4


## Train and test data

In [12]:
df_train = pd.read_csv("input/train.csv")
df_train = reduce_mem_usage(df_train)

df_test = pd.read_csv("input/test.csv")
df_test = reduce_mem_usage(df_test)

print("{:,} records and {} features in train set.".format(df_train.shape[0], df_train.shape[1]))
print("{:,} records and {} features in test set.".format(df_test.shape[0], df_test.shape[1]))

Starting memory usage:  9.24 MB
Reduced memory usage:  4.04 MB (56.2% reduction)
Starting memory usage:  4.72 MB
Reduced memory usage:  2.24 MB (52.5% reduction)
201,917 records and 6 features in train set.
123,623 records and 5 features in test set.


In [13]:
df_train[:3]

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06,C_ID_92a2005557,5,2,1,-0.820312
1,2017-01,C_ID_3d0044924f,4,1,0,0.392822
2,2016-08,C_ID_d639edf6cd,2,2,0,0.687988


In [14]:
df_test[:3]

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3
0,2017-04,C_ID_0ab67a22ab,3,3,1
1,2017-01,C_ID_130fd0cbdd,2,3,0
2,2017-08,C_ID_b709037bc5,5,1,1


## Merging

Join the data of the merchants and the transactions to the training and test set.

In [15]:
df_train = pd.merge(df_train, df_hist_trans, on='card_id',how='left')
df_train = pd.merge(df_train, df_new_trans, on='card_id', how='left')

In [16]:
df_test = pd.merge(df_test, df_hist_trans, on='card_id',how='left')
df_test = pd.merge(df_test, df_new_trans, on='card_id', how='left')

In [17]:
del df_hist_trans
del df_new_trans
gc.collect()

35

In [18]:
df_test[:3]

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,hist_transactions_count,hist_authorized_flag_sum,hist_authorized_flag_mean,hist_active_months_lag3_sum,hist_active_months_lag3_mean,...,new_purchase_dayofweek_max,new_purchase_dayofweek_min,new_purchase_dayofweek_std,new_purchase_dayofweek_mode,new_purchase_quarter_mean,new_purchase_quarter_median,new_purchase_quarter_max,new_purchase_quarter_min,new_purchase_quarter_std,new_purchase_quarter_mode
0,2017-04,C_ID_0ab67a22ab,3,3,1,1,47.0,0.662109,213.0,3.0,...,5.0,2.0,1.527344,2.0,1.0,1.0,1.0,1.0,0.0,1.0
1,2017-01,C_ID_130fd0cbdd,2,3,0,1,77.0,0.987305,234.0,3.0,...,6.0,0.0,2.359375,0.0,1.400391,1.0,2.0,1.0,0.516602,1.0
2,2017-08,C_ID_b709037bc5,5,1,1,1,9.0,0.692383,39.0,3.0,...,3.0,1.0,1.414062,1.0,1.0,1.0,1.0,1.0,0.0,1.0


## Outlier detection

In [19]:
cols = ['feature_1', 'feature_2', 'feature_3', 'hist_authorized_flag_sum', 'hist_active_months_lag3_sum', 'hist_active_months_lag6_sum', 'hist_active_months_lag12_sum', 'hist_avg_sales_lag3_sum', 'hist_avg_sales_lag3_mean', 'hist_avg_sales_lag6_sum', 'hist_avg_sales_lag6_mean', 'hist_avg_sales_lag12_sum', 'hist_avg_sales_lag12_mean', 'hist_category_1_trans_sum', 'hist_category_1_merch_sum', 'hist_category_2_trans_sum', 'hist_category_2_trans_mean', 'hist_category_2_trans_nancnt', 'hist_category_2_trans_nanperc', 'hist_category_2_merch_sum', 'hist_category_2_merch_mean', 'hist_category_2_merch_nancnt', 'hist_category_2_merch_nanperc', 'hist_category_3_sum', 'hist_category_3_nanperc', 'hist_category_4_sum', 'hist_city_id_nunique', 'hist_city_id_mode', 'hist_installments_sum', 'hist_installments_mean', 'hist_installments_max', 'hist_installments_min', 'hist_installments_std', 'hist_installments_mode', 'hist_merchant_id_nunique', 'hist_merchant_id_nancnt', 'hist_merchant_category_id_trans_nunique', 'hist_merchant_category_id_trans_mode', 'hist_merchant_group_id_nunique', 'hist_merchant_group_id_mode', 'hist_merchant_category_id_merch_nunique', 'hist_merchant_category_id_merch_mode', 'hist_month_lag_min', 'hist_month_lag_max', 'hist_month_lag_mean', 'hist_most_recent_sales_range_sum', 'hist_most_recent_sales_range_max', 'hist_most_recent_sales_range_mode', 'hist_most_recent_purchases_range_sum', 'hist_most_recent_purchases_range_max', 'hist_most_recent_purchases_range_mode', 'hist_numerical_1_mean', 'hist_numerical_1_std', 'hist_numerical_1_mode', 'hist_numerical_2_mean', 'hist_numerical_2_std', 'hist_state_id_nunique', 'hist_state_id_mode', 'hist_subsector_id_trans_nunique', 'hist_subsector_id_trans_mode', 'hist_subsector_id_merch_nunique', 'hist_subsector_id_merch_mode', 'hist_purchase_amount_sum', 'hist_purchase_amount_mean', 'hist_purchase_amount_max', 'hist_purchase_amount_std', 'hist_purchase_year_mean', 'hist_purchase_year_median', 'hist_purchase_year_max', 'hist_purchase_year_mode', 'hist_purchase_month_mean', 'hist_purchase_month_median', 'hist_purchase_month_max', 'hist_purchase_month_min', 'hist_purchase_month_std', 'hist_purchase_month_mode', 'hist_purchase_day_mean', 'hist_purchase_day_median', 'hist_purchase_day_max', 'hist_purchase_day_min', 'hist_purchase_day_std', 'hist_purchase_day_mode', 'hist_purchase_hour_mean', 'hist_purchase_hour_median', 'hist_purchase_hour_max', 'hist_purchase_hour_min', 'hist_purchase_hour_std', 'hist_purchase_hour_mode', 'hist_purchase_weekofyear_mean', 'hist_purchase_weekofyear_median', 'hist_purchase_weekofyear_max', 'hist_purchase_weekofyear_min', 'hist_purchase_weekofyear_std', 'hist_purchase_weekofyear_mode', 'hist_purchase_dayofweek_median', 'hist_purchase_dayofweek_max', 'hist_purchase_dayofweek_min', 'hist_purchase_dayofweek_mode', 'hist_purchase_quarter_median', 'hist_purchase_quarter_max', 'hist_purchase_quarter_min', 'hist_purchase_quarter_mode', 'new_authorized_flag_sum', 'new_active_months_lag3_sum', 'new_active_months_lag3_mean', 'new_active_months_lag6_sum', 'new_active_months_lag12_sum', 'new_avg_sales_lag3_sum', 'new_avg_sales_lag3_mean', 'new_avg_sales_lag6_sum', 'new_avg_sales_lag6_mean', 'new_avg_sales_lag12_sum', 'new_avg_sales_lag12_mean', 'new_avg_purchases_lag3_sum', 'new_avg_purchases_lag3_mean', 'new_avg_purchases_lag6_sum', 'new_avg_purchases_lag6_mean', 'new_avg_purchases_lag12_sum', 'new_avg_purchases_lag12_mean', 'new_category_1_trans_sum', 'new_category_1_merch_sum', 'new_category_2_trans_sum', 'new_category_2_trans_mean', 'new_category_2_trans_nanperc', 'new_category_2_merch_sum', 'new_category_2_merch_nancnt', 'new_category_2_merch_nanperc', 'new_category_3_sum', 'new_category_3_nanperc', 'new_category_4_sum', 'new_city_id_nunique', 'new_city_id_mode', 'new_installments_sum', 'new_installments_max', 'new_installments_min', 'new_installments_mode', 'new_merchant_id_nunique', 'new_merchant_id_nancnt', 'new_merchant_id_nanperc', 'new_merchant_category_id_trans_nunique', 'new_merchant_category_id_trans_mode', 'new_merchant_group_id_nunique', 'new_merchant_group_id_mode', 'new_merchant_group_id_nanperc', 'new_merchant_category_id_merch_nunique', 'new_merchant_category_id_merch_mode', 'new_merchant_category_id_merch_nanperc', 'new_month_lag_min', 'new_month_lag_max', 'new_most_recent_sales_range_sum', 'new_most_recent_sales_range_max', 'new_most_recent_sales_range_mode', 'new_most_recent_purchases_range_sum', 'new_most_recent_purchases_range_max', 'new_most_recent_purchases_range_min', 'new_most_recent_purchases_range_mode', 'new_numerical_1_mean', 'new_numerical_1_median', 'new_numerical_1_max', 'new_numerical_1_std', 'new_numerical_2_mean', 'new_numerical_2_median', 'new_numerical_2_max', 'new_numerical_2_std', 'new_state_id_nunique', 'new_state_id_mode', 'new_subsector_id_trans_nunique', 'new_subsector_id_trans_mode', 'new_subsector_id_merch_nunique', 'new_subsector_id_merch_mode', 'new_subsector_id_merch_nanperc', 'new_purchase_amount_sum', 'new_purchase_year_mean', 'new_purchase_year_median', 'new_purchase_year_max', 'new_purchase_year_min', 'new_purchase_year_mode', 'new_purchase_month_mean', 'new_purchase_month_median', 'new_purchase_month_max', 'new_purchase_month_min', 'new_purchase_month_mode', 'new_purchase_day_mean', 'new_purchase_day_median', 'new_purchase_day_max', 'new_purchase_day_min', 'new_purchase_day_std', 'new_purchase_day_mode', 'new_purchase_hour_mean', 'new_purchase_hour_median', 'new_purchase_hour_max', 'new_purchase_hour_min', 'new_purchase_hour_std', 'new_purchase_hour_mode', 'new_purchase_weekofyear_mean', 'new_purchase_weekofyear_median', 'new_purchase_weekofyear_max', 'new_purchase_weekofyear_min', 'new_purchase_weekofyear_std', 'new_purchase_weekofyear_mode', 'new_purchase_dayofweek_mean', 'new_purchase_dayofweek_median', 'new_purchase_dayofweek_max', 'new_purchase_dayofweek_min', 'new_purchase_dayofweek_mode', 'new_purchase_quarter_mean', 'new_purchase_quarter_median', 'new_purchase_quarter_max', 'new_purchase_quarter_min', 'new_purchase_quarter_mode']

df_test_o = df_test.copy(deep=True)
df_test_o = df_test_o[cols].dropna(how='any', axis=0)

In [20]:
df_test[cols][:3]

Unnamed: 0,feature_1,feature_2,feature_3,hist_authorized_flag_sum,hist_active_months_lag3_sum,hist_active_months_lag6_sum,hist_active_months_lag12_sum,hist_avg_sales_lag3_sum,hist_avg_sales_lag3_mean,hist_avg_sales_lag6_sum,...,new_purchase_dayofweek_mean,new_purchase_dayofweek_median,new_purchase_dayofweek_max,new_purchase_dayofweek_min,new_purchase_dayofweek_mode,new_purchase_quarter_mean,new_purchase_quarter_median,new_purchase_quarter_max,new_purchase_quarter_min,new_purchase_quarter_mode
0,3,3,1,47.0,213.0,426.0,837.0,100.729996,1.418732,110.57,...,3.666016,4.0,5.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0
1,2,3,0,77.0,234.0,468.0,936.0,79.639999,1.021026,80.120003,...,2.699219,3.0,6.0,0.0,0.0,1.400391,1.0,2.0,1.0,1.0
2,5,1,1,9.0,39.0,78.0,156.0,13.67,1.051538,12.94,...,2.0,2.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
df_test_o[:3]

Unnamed: 0,feature_1,feature_2,feature_3,hist_authorized_flag_sum,hist_active_months_lag3_sum,hist_active_months_lag6_sum,hist_active_months_lag12_sum,hist_avg_sales_lag3_sum,hist_avg_sales_lag3_mean,hist_avg_sales_lag6_sum,...,new_purchase_dayofweek_mean,new_purchase_dayofweek_median,new_purchase_dayofweek_max,new_purchase_dayofweek_min,new_purchase_dayofweek_mode,new_purchase_quarter_mean,new_purchase_quarter_median,new_purchase_quarter_max,new_purchase_quarter_min,new_purchase_quarter_mode
0,3,3,1,47.0,213.0,426.0,837.0,100.729996,1.418732,110.57,...,3.666016,4.0,5.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0
1,2,3,0,77.0,234.0,468.0,936.0,79.639999,1.021026,80.120003,...,2.699219,3.0,6.0,0.0,0.0,1.400391,1.0,2.0,1.0,1.0
2,5,1,1,9.0,39.0,78.0,156.0,13.67,1.051538,12.94,...,2.0,2.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [27]:
df_test['hist_category_1_trans_sum'].max()

5580.0

In [98]:
def safeCalculateMeanStd(series):
    c = len(series)
    s = 0

    for i in range(0, c):
        if math.isfinite(series.iloc[i]):
            s = s + series.iloc[i]

    mean = s/c
    std = np.sqrt(np.sum(np.power(series-mean, 2))/c)
    return mean, std

In [100]:
df_test['hist_authorized_flag_sum'][:3]

0    47.0
1    77.0
2     9.0
Name: hist_authorized_flag_sum, dtype: float16

In [99]:
safeCalculateMeanStd(df_test['hist_authorized_flag_sum'])

(85.34093979275701, inf)

In [None]:
for f in df_test_o.columns:
    m, s = safeCalculateMeanStd()
    df_test_o[f] = (df_test_o[f] - m)/s
    print('{}: {:.4f} ({:.4f}), min: {:,.4f}, max:{:,}'.format(f, m, s, df_test_o[f].min(), df_test_o[f].max()))

In [23]:
for f in df_test_o.columns:
    mean = df_test_o[f].mean()
    std = df_test_o[f].std()
    if math.isnan(mean):
        mean = df_test_o[f].astype(int).mean()
        std = df_test_o[f].astype(int).std()
    
    df_test_o[f] = (df_test_o[f] - mean)/std
    print('{}: {:.4f} ({:.4f}), min: {:,.4f}, max:{:,}'.format(f, mean, std, df_test_o[f].min(), df_test_o[f].max()))

feature_1: 0.0000 (1.0000), min: -1.7465, max:1.5902910120465035
feature_2: 0.0000 (1.0000), min: -0.9736, max:1.700517940241105
feature_3: -0.0000 (1.0000), min: -1.1158, max:0.8961863550258807
hist_authorized_flag_sum: 0.0000 (0.0000), min: -inf, max:inf
hist_active_months_lag3_sum: 0.0000 (0.0000), min: -inf, max:inf
hist_active_months_lag6_sum: 0.0000 (0.0000), min: -inf, max:inf
hist_active_months_lag12_sum: 0.0000 (1.0000), min: -0.8696, max:14.036707020068151
hist_avg_sales_lag3_sum: -0.0000 (1.0000), min: -0.0804, max:87.19811248779297
hist_avg_sales_lag3_mean: -0.0000 (1.0000), min: -0.0549, max:142.02964782714844
hist_avg_sales_lag6_sum: -0.0000 (1.0000), min: -0.0821, max:85.44084930419922
hist_avg_sales_lag6_mean: 0.0000 (1.0000), min: -0.0533, max:129.2336883544922
hist_avg_sales_lag12_sum: 0.0000 (1.0000), min: -0.0816, max:82.40811920166016
hist_avg_sales_lag12_mean: 0.0000 (1.0000), min: -0.0474, max:126.52336883544922
hist_category_1_trans_sum: -0.0000 (0.0000), min: -

new_category_2_trans_mean: 2.0550 (1.3539), min: -0.7793, max:2.17578125
new_category_2_trans_nanperc: 4.9450 (12.8257), min: -0.3855, max:7.23046875
new_category_2_merch_sum: 15.7729 (19.9443), min: -0.7910, max:22.578125
new_category_2_merch_nancnt: 0.9300 (1.4278), min: -0.6514, max:29.46875
new_category_2_merch_nanperc: 13.1097 (19.6527), min: -0.6670, max:4.41796875
new_category_3_sum: 4.5891 (6.9276), min: -0.6626, max:12.484375
new_category_3_nanperc: 3.3046 (9.8295), min: -0.3362, max:9.8359375
new_category_4_sum: 3.9952 (5.1821), min: -0.7710, max:14.2734375
new_city_id_nunique: 2.5574 (1.6966), min: -0.9179, max:17.353973327552534
new_city_id_mode: 128.2978 (99.5543), min: -1.2988, max:2.196812536355067
new_installments_sum: 5.4142 (9.1464), min: -0.9199, max:109.06851601373235
new_installments_max: 1.5680 (4.1087), min: -0.6250, max:242.76109989987685
new_installments_min: 0.2205 (0.7470), min: -1.6339, max:15.769633318693558
new_installments_mode: 0.5091 (0.7395), min: -2.0

In [24]:
df_test[cols][:3]

Unnamed: 0,feature_1,feature_2,feature_3,hist_authorized_flag_sum,hist_active_months_lag3_sum,hist_active_months_lag6_sum,hist_active_months_lag12_sum,hist_avg_sales_lag3_sum,hist_avg_sales_lag3_mean,hist_avg_sales_lag6_sum,...,new_purchase_dayofweek_mean,new_purchase_dayofweek_median,new_purchase_dayofweek_max,new_purchase_dayofweek_min,new_purchase_dayofweek_mode,new_purchase_quarter_mean,new_purchase_quarter_median,new_purchase_quarter_max,new_purchase_quarter_min,new_purchase_quarter_mode
0,3,3,1,47.0,213.0,426.0,837.0,100.729996,1.418732,110.57,...,3.666016,4.0,5.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0
1,2,3,0,77.0,234.0,468.0,936.0,79.639999,1.021026,80.120003,...,2.699219,3.0,6.0,0.0,0.0,1.400391,1.0,2.0,1.0,1.0
2,5,1,1,9.0,39.0,78.0,156.0,13.67,1.051538,12.94,...,2.0,2.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [25]:
df_test_o[:3]

Unnamed: 0,feature_1,feature_2,feature_3,hist_authorized_flag_sum,hist_active_months_lag3_sum,hist_active_months_lag6_sum,hist_active_months_lag12_sum,hist_avg_sales_lag3_sum,hist_avg_sales_lag3_mean,hist_avg_sales_lag6_sum,...,new_purchase_dayofweek_mean,new_purchase_dayofweek_median,new_purchase_dayofweek_max,new_purchase_dayofweek_min,new_purchase_dayofweek_mode,new_purchase_quarter_mean,new_purchase_quarter_median,new_purchase_quarter_max,new_purchase_quarter_min,new_purchase_quarter_mode
0,-0.078127,1.700518,0.896186,-inf,-inf,-inf,-0.282577,-0.079763,-0.054461,-0.081428,...,0.886719,0.668457,0.003354,0.738822,-0.341886,-0.468018,-0.702637,-1.270005,-0.439056,-0.698217
1,-0.912336,1.700518,-1.115827,-inf,-inf,-inf,-0.2111,-0.07989,-0.054627,-0.081627,...,-0.005405,-0.059753,0.80628,-0.736183,-1.377277,0.020248,-0.702637,-0.012957,-0.439056,-0.698217
2,1.590291,-0.973629,0.896186,-inf,-inf,-inf,-0.774254,-0.080287,-0.054614,-0.082065,...,-0.650391,-0.788086,-1.602496,0.001319,-0.859581,-0.468018,-0.702637,-1.270005,-0.439056,-0.698217


In [None]:
autoencoder = load_model('models/autoencoder_3relu_30-15_t1.8.h5')

In [None]:
pred_o = autoencoder.predict(df_test_o)

In [None]:
def createReconstructionError(autoencoder, X_test):
    predictions = autoencoder.predict(X_test)
    mse = np.mean(np.power(X_test - predictions, 2), axis=1)
    error_df = pd.DataFrame({
        'reconstruction_error': mse
    })

    return error_df

In [None]:
len(df_test_o), len(pred_o)

In [None]:
pd.set_option('display.float_format', '{:.2f}'.format)

error_df = createReconstructionError(autoencoder, df_test_o)
error_df.describe()

In [None]:
df_test_o[:10]

In [None]:
df_test.shape, df_test_o.shape, pred_o.shape, error_df.shape

In [None]:
df_test[cols][:3]

In [None]:
df_test_o[:3]

## Training
### LightGBM
For more details click [here](https://lightgbm.readthedocs.io/en/latest/).

In [None]:
df_train.drop(columns=['new_merchant_id_mode', 'hist_merchant_id_mode'], inplace=True, axis=1)
df_test.drop(columns=['new_merchant_id_mode', 'hist_merchant_id_mode'], inplace=True, axis=1)

In [None]:
target = df_train['target']
drops = ['card_id', 'first_active_month', 'target']
use_cols = [c for c in df_train.columns if c not in drops]
features = list(df_train[use_cols].columns)
df_train[features][:3]

In [None]:
param = {
    'num_leaves': 50,
    'min_data_in_leaf': 30, 
    'objective':'regression',
    'max_depth': -1,
    'learning_rate': 0.005,
    "boosting": "gbdt",
    "feature_fraction": 0.9,
    "bagging_freq": 1,
    "bagging_fraction": 0.9,
    "bagging_seed": 11,
    "metric": 'rmse',
    "lambda_l1": 0.1,
    "verbosity": -1
}

folds = KFold(n_splits=5, shuffle=True, random_state=15)

oof = np.zeros(len(df_train))
predictions = np.zeros(len(df_test))

feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train.values, target.values)):
    print('-')
    print("Fold {}".format(fold_ + 1))
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(df_train.iloc[val_idx][features], label=target.iloc[val_idx])

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds=100)
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    predictions += clf.predict(df_test[features], num_iteration=clf.best_iteration) / folds.n_splits

In [None]:
cross_validation_lgb = np.sqrt(mean_squared_error(target, oof))
print('Cross-validation score: ' + str(cross_validation_lgb))

In [None]:
cols = (feature_importance_df[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:1000].index)

best_features = feature_importance_df.loc[feature_importance_df['feature'].isin(cols)]

plt.figure(figsize=(14, 40))
sns.barplot(x="importance",
            y="feature",
            data=best_features.sort_values(by="importance", ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.savefig('lgbm_importances.png')

In [None]:
df_sub = pd.DataFrame({"card_id":df_test["card_id"].values})
df_sub["target"] = predictions
df_sub.to_csv("output/lgbm_{}.csv".format(cross_validation_lgb), index=False)