This was a team project. The team members include : Harsh Patel, Deep Talati, Akshat Vaidya, Xingyi Wang and Lei Xia.
<br>
Topic: Prediction of Customer Loyalty Score for Elo (Kaggle Competition)

# Table of Contents:

<a href='#section1'>1. Importing needed libraries</a>
<br>
<a href='#section2'>2. Data Visualization and EDA</a>
<br>
<a href='#section3'>3. Data Pre-Processing</a>
<br>
<a href='#section5'>5. Boosting</a>
<br>
<a href='#section6'>6. Random Forests</a>
<br>
<a href='#section7'>7. References </a>
<br>

<a id='section1'></a>

# Importing the necessary libraries

In [None]:
import numpy as np
import pandas as pd
import warnings
import datetime
warnings.filterwarnings('ignore')
from sklearn import preprocessing
import gc
import matplotlib.pyplot as plt

The path variable stores the path of the data.

In [None]:
path = 'original/'

The reduce_mem_usage function downcasts numeric columns to smaller dtypes, which reduces the memory footprint of the DataFrames after they are read.

In [None]:
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Signed-integer columns are narrowed to int8/int16/int32 and floating-point
    columns to float16/float32 when every value lies inside the target dtype's
    range.  Every other column (object, bool, datetime, unsigned integer,
    pandas extension dtypes) is left untouched, and a column is never widened.

    Note: float16 keeps only about three significant decimal digits, so float
    values that fit its range may still be rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy dtypes are downcast; extension dtypes are skipped.
        if not isinstance(col_type, np.dtype):
            continue
        if col_type.kind == 'i':
            candidates, limits = int_candidates, np.iinfo
        elif col_type.kind == 'f':
            candidates, limits = float_candidates, np.finfo
        else:
            # bool, datetime, timedelta, unsigned, object, ...: leave as-is.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break  # already this narrow: nothing to gain
            bounds = limits(candidate)
            # NaN bounds (all-missing column) fail both comparisons, so such
            # columns keep their dtype.
            if c_min >= bounds.min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
train = reduce_mem_usage(pd.read_csv(path+'train.csv'))
test = reduce_mem_usage(pd.read_csv(path+'test.csv'))

In [None]:
train.head()

In [None]:
train.dtypes

In [None]:
train['first_active_month'] = pd.to_datetime(train['first_active_month'])
test['first_active_month'] = pd.to_datetime(test['first_active_month'])
train.shape

Converted the first_active_month into a date from object datatype.

<a id='section2'></a>

# Data Visualization and Exploratory Data Analysis

In [None]:
plt.figure(figsize=[4,3])
plt.bar([0, 1], [train.shape[0], test.shape[0]], edgecolor=[0.2]*3, color=(1,0,0,0.5))
plt.xticks([0,1], ['train rows', 'test rows'], fontsize=13)
plt.title('Number of rows in train.csv and test.csv', fontsize=15)
plt.tight_layout()
plt.show()

In [None]:
%matplotlib inline

# Compare the distribution of each categorical feature between train and test,
# and show the histogram of the target (which exists only in train).
plt.figure(figsize=[15,5])
plt.suptitle('Feature distributions in train.csv and test.csv', fontsize=20, y=1.1)
for num, col in enumerate(['feature_1', 'feature_2', 'feature_3', 'target']):
    plt.subplot(2, 4, num+1)
    # Compare by value: `is not` tests object identity, which is not a
    # reliable way to compare strings.
    if col != 'target':
        # Relative frequencies; the negative/positive bar widths put the train
        # and test bars on either side of each category value.
        v_c = train[col].value_counts() / train.shape[0]
        plt.bar(v_c.index, v_c, label=('train'), align='edge', width=-0.3, edgecolor=[0.2]*3)
        v_c = test[col].value_counts() / test.shape[0]
        plt.bar(v_c.index, v_c, label=('test'), align='edge', width=0.3, edgecolor=[0.2]*3)
        plt.title(col)
        plt.legend()
    else:
        plt.hist(train[col], bins = 100)
        plt.title(col)
    plt.tight_layout()
plt.tight_layout()
plt.show()

In [None]:
# Absolute pairwise correlations between the numeric columns of train.
# Non-numeric columns (card_id, first_active_month) are excluded explicitly so
# the result does not depend on DataFrame.corr's default for numeric_only.
corrs = train.select_dtypes(include=[np.number]).corr().abs()
# Work on a copy of the values: zeroing the diagonal hides the trivial
# self-correlations without writing into the DataFrame's own buffer.
corr_values = corrs.values.copy()
np.fill_diagonal(corr_values, 0)
plt.figure(figsize=[5,5])
plt.imshow(corr_values, cmap='plasma', vmin=0, vmax=1)
plt.colorbar(shrink=0.7)
plt.xticks(range(corrs.shape[0]), list(corrs.columns))
plt.yticks(range(corrs.shape[0]), list(corrs.columns))
plt.title('Correlations between target and user\'s features', fontsize=17)
plt.show()

<a id='section3'></a>

# Data Preprocessing

In [None]:
# Derive calendar features from first_active_month for both frames.
# time_diff is the number of days between the card's first active month and
# the fixed reference date 2018-02-01.  The subtraction stays in datetime64 /
# timedelta64, so `.dt.days` is always available (no object-dtype round trip
# through datetime.date values).
reference_date = pd.Timestamp(2018, 2, 1)
for frame in (train, test):
    first_active = frame['first_active_month']
    frame['month'] = first_active.dt.month
    frame['year'] = first_active.dt.year
    frame['time_diff'] = (reference_date - first_active).dt.days

train.head()

In [None]:
train = pd.get_dummies(train, columns=['feature_1', 'feature_2'])
test = pd.get_dummies(test, columns=['feature_1', 'feature_2'])
train.head()

In [None]:
hist = reduce_mem_usage(pd.read_csv(path+'historical_transactions.csv'))
hist.head()

In [None]:
hist['authorized_flag'] = hist['authorized_flag'].map({'Y':1, 'N':0})
hist['category_1'] = hist['category_1'].map({'Y':1, 'N':0})
hist = pd.get_dummies(hist, columns=['category_2', 'category_3'])
hist.head()

The new_aggregate_cols function is used for creating the aggregated fields. It groups transactions on card_ids.

In [None]:
def new_aggregate_cols(df, prefix):
    """Aggregate a transactions frame to one row per ``card_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions with ``card_id``, the 0/1 flag columns
        (``authorized_flag``, ``category_1``, the ``category_2_*`` and
        ``category_3_*`` dummies), ``merchant_id``, ``purchase_amount``,
        ``installments`` and ``month_lag``.
    prefix : str
        Text prepended to every generated column name.

    Returns
    -------
    pandas.DataFrame
        ``card_id``, ``<prefix>transactions_count`` and one
        ``<prefix><column>_<statistic>`` column per aggregation.
    """
    flag_stats = ['sum', 'mean']
    numeric_stats = ['sum', 'mean', 'min', 'max', 'std']

    # Order matters: it fixes the order of the generated feature columns.
    flag_columns = (
        ['authorized_flag', 'category_1']
        + ['category_2_{}.0'.format(level) for level in range(1, 6)]
        + ['category_3_{}'.format(level) for level in 'ABC']
    )
    aggregations = {name: list(flag_stats) for name in flag_columns}
    aggregations['merchant_id'] = ['nunique', 'count']
    for name in ('purchase_amount', 'installments', 'month_lag'):
        aggregations[name] = list(numeric_stats)

    per_card = df.groupby(['card_id']).agg(aggregations)
    # Flatten the (column, statistic) MultiIndex into "<prefix><column>_<statistic>".
    per_card.columns = [prefix + '_'.join(pair).strip() for pair in per_card.columns.values]
    per_card = per_card.reset_index()

    # Number of transaction rows per card, joined in front of the statistics.
    counts = df.groupby('card_id').size().reset_index(name='{}transactions_count'.format(prefix))
    return pd.merge(counts, per_card, on='card_id', how='left')

In [None]:
import gc
history_trans = new_aggregate_cols(hist, prefix='hist_')

In [None]:
del hist

In [None]:
gc.collect()

gc is Python's garbage collector. It is invoked here to free memory: the historical transactions file is large, and keeping it in memory causes performance issues!

In [None]:
train = pd.merge(train, history_trans, on='card_id', how='left')
train.head()

In [None]:
test = pd.merge(test, history_trans, on='card_id', how='left')
del history_trans
gc.collect()

In [None]:
new_merch_trans = reduce_mem_usage(pd.read_csv(path+'new_merchant_transactions.csv'))
new_merch_trans.head()

#### Train and test are merged with the aggregated features of the historical transactions. This is a left join on card_id.

In [None]:
new_merch_trans.shape

In [None]:
new_merch_trans = pd.get_dummies(new_merch_trans, columns=['category_2','category_3'])
new_merch_trans['authorized_flag'] = new_merch_trans['authorized_flag'].map({'Y':1, 'N':0})
new_merch_trans['category_1'] = new_merch_trans['category_1'].map({'Y':1, 'N':0})
new_merch_trans.head()

In [None]:
merch_new = new_aggregate_cols(new_merch_trans, 'new_merch_')

In [None]:
del new_merch_trans
gc.collect()

In [None]:
train = pd.merge(train, merch_new, on='card_id', how='left')
test = pd.merge(test, merch_new, on='card_id', how='left')
train.head()

In [None]:
del merch_new
gc.collect()

Training dataset and target score labels are created here. 

In [None]:
target = train['target']
drop_cols = ['card_id','first_active_month','target']
use_cols = [c for c in train.columns if c not in drop_cols]
features = list(train[use_cols].columns)
len(features)

In [None]:
train[features].head()

<a id='section5'></a>

# Boosting

<a href='#section5.1'>Baseline Model</a><br>
<a href='#section5.2'>Identifying top features</a><br>
<a href='#section5.3'>Tuning of Hyperparameters</a><br>
<a href='#section5.4'>Best Model</a><br>
<a href='#section5.5'>Feature importance</a><br>
<a href='#section5.6'>SHAP Feature Importance</a><br>
<a href='#section5.7'>Boosting with top 10 features (Tuned Model)</a>

<a id='section5.1'></a>

## Baseline Model

In [None]:
from sklearn import model_selection, metrics, preprocessing
from sklearn.model_selection import KFold
import lightgbm as lgb

In [None]:
param = {
    'num_leaves':50,
    'min_data_in_leaf':30,
    'objective':'regression',
    'max_depth':-1,
    'learning_rate':0.005,
    'boosting':'gbdt',
    'feature_fraction':0.9,
    'bagging_freq':1,
    'bagging_fraction':0.9,
    'bagging_seed':46,
    'metric':'rmse',
    'lambda_l1':0.1,
    'verbosity':-1
}

folds = KFold(n_splits=5, shuffle=True, random_state=23)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))

The parameter values here are chosen arbitrarily (they are not tuned)! <br>
K-Fold cross validation is performed with K=5

In [None]:
# K-fold cross-validation of the baseline parameters: one LightGBM model per
# fold, out-of-fold predictions for the CV score, fold-averaged test predictions.
# The loop variables (clf, val_idx) stay defined after the loop and hold the
# values of the last fold.
for fold_, (train_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print('-')
    print('Fold {}'.format(fold_ + 1))
    train_data = lgb.Dataset(train.iloc[train_idx][features], label=target.iloc[train_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx])
    
    # Upper bound on boosting rounds; early_stopping_rounds may stop sooner.
    num_round = 10000
    clf = lgb.train(param, train_data, num_round, valid_sets=[train_data, val_data], verbose_eval=100, 
                    early_stopping_rounds=100)
    # Each training row is predicted by the model of the fold that held it out.
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    # Dividing by n_splits turns the running sum into the mean over folds.
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

### RMSE Score for Baseline Model

In [None]:
from sklearn.metrics import mean_squared_error
val_score = np.sqrt(mean_squared_error(target,oof))
val_score

<a id='section5.2'></a>

## Top 50 most important features

In [None]:
import seaborn as sns

In [None]:
fig, ax = plt.subplots(figsize=(13,11))
lgb.plot_importance(clf, max_num_features=50, height=0.5, ax=ax, title='Feature Importance', xlabel='Importance',
                   ylabel='Features')

In [None]:
# (importance, feature name) pairs of the last fold's model, most important
# first.  Kept in f_list because the following cells unpack it to pick the
# top features.
f_list = sorted(zip(clf.feature_importance(), features), reverse=True)
f_list

#### Selecting the top 30 features

In [None]:
f_score, f_names = zip(*f_list)

In [None]:
top30 = list(f_names[:30])

In [None]:
f_names[:30]

### Boosting with top 30 features (untuned model)

In [None]:
param = {
    'num_leaves':50,
    'min_data_in_leaf':30,
    'objective':'regression',
    'max_depth':-1,
    'learning_rate':0.005,
    'boosting':'gbdt',
    'feature_fraction':0.9,
    'bagging_freq':1,
    'bagging_fraction':0.9,
    'bagging_seed':46,
    'metric':'rmse',
    'lambda_l1':0.1,
    'verbosity':-1
}

folds = KFold(n_splits=5, shuffle=True, random_state=23)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))

In [None]:
# Same K-fold procedure as the baseline, restricted to the top30 feature list.
for fold_, (train_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print('-')
    print('Fold {}'.format(fold_ + 1))
    train_data = lgb.Dataset(train.iloc[train_idx][top30], label=target.iloc[train_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][top30], label=target.iloc[val_idx])
    
    # Upper bound on boosting rounds; early_stopping_rounds may stop sooner.
    num_round = 10000
    clf = lgb.train(param, train_data, num_round, valid_sets=[train_data, val_data], verbose_eval=100, 
                    early_stopping_rounds=100)
    # Out-of-fold predictions for the CV score; test predictions averaged over folds.
    oof[val_idx] = clf.predict(train.iloc[val_idx][top30], num_iteration=clf.best_iteration)
    predictions += clf.predict(test[top30], num_iteration=clf.best_iteration) / folds.n_splits

#### RMSE Score

In [None]:
val_score = np.sqrt(mean_squared_error(target,oof))
val_score

In [None]:
fig, ax = plt.subplots(figsize=(13,11))
lgb.plot_importance(clf, max_num_features=50, height=0.5, ax=ax, title='Feature Importance', xlabel='Importance',
                   ylabel='Features')

<a id='section5.3'></a>

## Tuning the hyperparameters for the best model

This takes a <strong><u>long time to run (~3 hours)</u></strong> but identifies the best set of hyperparameter values for the model. It uses a Bayesian Optimization framework, which is particularly well suited when the cost of exploration is high. 

In [None]:
from bayes_opt import BayesianOptimization

In [None]:
def LGB_CV(
          max_depth,
          num_leaves,
          min_data_in_leaf,
          feature_fraction,
          bagging_fraction,
          lambda_l1
         ):
    """Objective for the Bayesian optimiser: negative out-of-fold RMSE.

    Trains a gradient-boosted LightGBM regressor with the proposed
    hyperparameters in a 5-fold cross-validation over the global ``train`` /
    ``target`` / ``features`` and scores the out-of-fold predictions.

    Parameters
    ----------
    max_depth, num_leaves, min_data_in_leaf : float
        Tree-shape parameters; truncated to ``int`` before use.
    feature_fraction, bagging_fraction, lambda_l1 : float
        Passed to LightGBM unchanged.

    Returns
    -------
    float
        ``-RMSE`` of the out-of-fold predictions (negated so that a
        maximiser minimises the error).
    """
    splitter = KFold(n_splits=5, shuffle=True, random_state=15)
    oof_pred = np.zeros(train.shape[0])

    for fold_no, (fit_rows, holdout_rows) in enumerate(splitter.split(train.values, target.values)):
        print("fold n°{}".format(fold_no))

        fit_set = lgb.Dataset(train.iloc[fit_rows][features], label=target.iloc[fit_rows])
        holdout_set = lgb.Dataset(train.iloc[holdout_rows][features], label=target.iloc[holdout_rows])

        # A fresh parameter dict per fold, exactly as proposed by the optimiser.
        fold_params = {
            'num_leaves': int(num_leaves),
            'min_data_in_leaf': int(min_data_in_leaf),
            'objective': 'regression',
            'max_depth': int(max_depth),
            'learning_rate': 0.01,
            'boosting': 'gbdt',
            'feature_fraction': feature_fraction,
            'bagging_freq': 1,
            'bagging_fraction': bagging_fraction,
            'bagging_seed': 11,
            'metric': 'rmse',
            'lambda_l1': lambda_l1,
            'verbosity': -1,
        }

        booster = lgb.train(
            fold_params,
            fit_set,
            10000,
            valid_sets=[fit_set, holdout_set],
            verbose_eval=500,
            early_stopping_rounds=200,
        )
        oof_pred[holdout_rows] = booster.predict(
            train.iloc[holdout_rows][features],
            num_iteration=booster.best_iteration,
        )

        # Drop the fold's model before the next one is trained.
        del booster, fit_rows, holdout_rows
        gc.collect()

    rmse = mean_squared_error(oof_pred, target) ** 0.5
    return -rmse

In [None]:
LGB_BO = BayesianOptimization(LGB_CV, {
    'max_depth': (4, 10),
    'num_leaves': (5, 130),
    'min_data_in_leaf': (10, 150),
    'feature_fraction': (0.7, 1.0),
    'bagging_fraction': (0.7, 1.0),
    'lambda_l1': (0, 6)
    })

In [None]:
print('<','-'*80,'>')
LGB_BO.maximize(init_points=2, n_iter=20, acq='ei', xi=0.0)


<a id='section5.4'></a>

### Best Model with tuned hyperparameters

In [None]:
best_param = {
    'num_leaves':111,
    'min_data_in_leaf':149,
    'objective':'regression',
    'max_depth':9,
    'learning_rate':0.005,
    'boosting':'gbdt',
    'feature_fraction':0.7522,
    'bagging_freq':1,
    'bagging_fraction':0.7083,
    'bagging_seed':11,
    'metric':'rmse',
    'lambda_l1':0.2634,
    'random_state':133,
    'verbosity':-1
}

In [None]:
# Cross-validate the tuned parameters (best_param) on all features and record
# the per-fold feature importances in feature_imp.
folds = KFold(n_splits=5, shuffle=True, random_state=21)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))
feature_imp = pd.DataFrame()

# bclf and val_idx stay defined after the loop with the last fold's values.
for fold_, (train_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print('<------------------------------------------------------------->')
    print('Fold {}'.format(fold_ + 1))
    train_data = lgb.Dataset(train.iloc[train_idx][features], target.iloc[train_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][features], target.iloc[val_idx])
    # Upper bound on boosting rounds; early_stopping_rounds may stop sooner.
    num_round = 10000
    bclf = lgb.train(best_param, train_data, num_round, valid_sets=[train_data, val_data], verbose_eval=100,
                   early_stopping_rounds=200)
    oof[val_idx] = bclf.predict(train.iloc[val_idx][features], num_iteration=bclf.best_iteration)
    
    # One row per (feature, fold) with the importance reported by this fold's model.
    fold_imp = pd.DataFrame()
    fold_imp['feature'] = features
    fold_imp['importance'] = bclf.feature_importance()
    fold_imp['fold'] = fold_ + 1
    feature_imp = pd.concat([feature_imp, fold_imp], axis=0, ignore_index=True)
    
    # Dividing by n_splits turns the running sum into the mean over folds.
    predictions+= bclf.predict(test[features], num_iteration=bclf.best_iteration) / folds.n_splits

In [None]:
print('CV Score ', np.sqrt(mean_squared_error(target, oof)))

<a id='section5.5'></a>

## Feature Importance

### Top 30 features by Split

In [None]:
fig, ax = plt.subplots(figsize=(13,11))
lgb.plot_importance(bclf, max_num_features=30, ax=ax, height=0.5, xlabel='Importance',
                   ylabel='Feature', title='Feature Importance')

### Top 30 features by Gain

In [None]:
fig, ax = plt.subplots(figsize=(13,11))
lgb.plot_importance(bclf, max_num_features=30, ax=ax, height=0.5, xlabel='Importance',
                   ylabel='Feature', title='Feature Importance', importance_type='gain')

<a id='section5.6'></a>

## Feature importance by SHAP

In [None]:
import shap

Computing SHAP values from the validation data

In [None]:
shap_values = shap.TreeExplainer(bclf).shap_values(train.iloc[val_idx][features])
shap_values.shape

In [None]:
# Mean absolute SHAP value per column, with the last column dropped.
# NOTE(review): presumably the dropped column is a bias/expected-value term
# that some shap versions append -- confirm against shap_values.shape, since
# the plots below receive shap_values unsliced.
global_importances = np.abs(shap_values).mean(0)[:-1]

In [None]:
shap.summary_plot(shap_values, train.iloc[val_idx][features])

In [None]:
shap.dependence_plot('time_diff', shap_values, train.iloc[val_idx][features])

In [None]:
shap.dependence_plot('new_merch_purchase_amount_max', shap_values, train.iloc[val_idx][features])

<a id='section5.7'></a>

## Boosting with Top 10 features (Tuned Model)

This is the model we recommend

In [None]:
top10 = feature_imp.groupby(['feature']).mean()[['importance']].sort_values(by='importance', ascending=False)
top10 = top10.reset_index()
top10f = list(top10['feature'][:10])

In [None]:
top10f

In [None]:
# Cross-validate the tuned parameters (best_param) on the top10f features only.
folds = KFold(n_splits=5, shuffle=True, random_state=21)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))

for fold_, (train_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print('<------------------------------------------------------------->')
    print('Fold {}'.format(fold_ + 1))
    train_data = lgb.Dataset(train.iloc[train_idx][top10f], target.iloc[train_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][top10f], target.iloc[val_idx])
    # Upper bound on boosting rounds; early_stopping_rounds may stop sooner.
    num_round = 10000
    clf = lgb.train(best_param, train_data, num_round, valid_sets=[train_data, val_data], verbose_eval=100,
                   early_stopping_rounds=200)
    # Out-of-fold predictions feed the CV score computed in the next cell.
    oof[val_idx] = clf.predict(train.iloc[val_idx][top10f], num_iteration=clf.best_iteration)
    
    # Test predictions are averaged over the folds.
    predictions+= clf.predict(test[top10f], num_iteration=clf.best_iteration) / folds.n_splits

#### RMSE Score

In [None]:
print('CV Score ', np.sqrt(mean_squared_error(target, oof)))

<a id='section6'></a>

# Random Forest

<a href='#section6.1'>Untuned Random Forest with all features</a><br>
<a href='#section6.2'>Hyperparameter Tuning for Random Forest</a><br>
<a href='#section6.3'>Tuned Random Forest with all features</a><br>
<a href='#section6.4'>Feature Importance</a><br>
<a href='#section6.5'>Tuned Random Forest with top 10 features</a><br>

<a id='section6.1'></a>

## Untuned Random Forest with all the features

In [None]:
# Untuned parameters for LightGBM's random-forest mode.
# 'boosting' must be 'rf' here: with 'gbdt' this section would simply repeat
# the boosting baseline.  LightGBM's rf mode requires bagging to be enabled
# (bagging_freq > 0 and bagging_fraction < 1), which the values below satisfy.
param = {
    'num_leaves':50,
    'min_data_in_leaf':30,
    'objective':'regression',
    'max_depth':-1,
    'learning_rate':0.005,
    'boosting':'rf',
    'feature_fraction':0.9,
    'bagging_freq':1,
    'bagging_fraction':0.9,
    'bagging_seed':46,
    'metric':'rmse',
    'lambda_l1':0.1,
    'verbosity':-1
}


In [None]:
# Cross-validate the untuned `param` configuration on all features.
folds = KFold(n_splits=5, shuffle=True, random_state=21)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))

for fold_, (train_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print('<------------------------------------------------------------->')
    print('Fold {}'.format(fold_ + 1))
    train_data = lgb.Dataset(train.iloc[train_idx][features], target.iloc[train_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][features], target.iloc[val_idx])
    # Upper bound on the number of rounds; early_stopping_rounds may stop sooner.
    num_round = 10000
    clf = lgb.train(param, train_data, num_round, valid_sets=[train_data, val_data], verbose_eval=100,
                   early_stopping_rounds=200)
    # Out-of-fold predictions feed the CV score computed in the next cell.
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    # Test predictions are averaged over the folds.
    predictions+= clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

In [None]:
print('CV Score ', np.sqrt(mean_squared_error(target, oof)))

<a id='section6.2'></a>

## Hyperparameter Tuning for Random Forests

This takes a long time to run! The learning rate value can be increased for faster computation.

In [None]:
from bayes_opt import BayesianOptimization
from sklearn.metrics import mean_squared_error

In [None]:
def LGB_CV(
          max_depth,
          num_leaves,
          min_data_in_leaf,
          feature_fraction,
          lambda_l1
         ):
    """Objective for the Bayesian optimiser (random-forest mode): negative OOF RMSE.

    Trains LightGBM with ``boosting='rf'`` and the proposed hyperparameters in
    a 5-fold cross-validation over the global ``train`` / ``target`` /
    ``features`` and scores the out-of-fold predictions.  ``bagging_fraction``
    is fixed at 0.76 and is not part of the search.

    Parameters
    ----------
    max_depth, num_leaves, min_data_in_leaf : float
        Tree-shape parameters; truncated to ``int`` before use.
    feature_fraction, lambda_l1 : float
        Passed to LightGBM unchanged.

    Returns
    -------
    float
        ``-RMSE`` of the out-of-fold predictions (negated so that a
        maximiser minimises the error).
    """
    # shuffle=True so that random_state is meaningful (scikit-learn rejects a
    # random_state when shuffle is False) and the splits match the shuffled
    # K-fold scheme used by the other cross-validation runs in this notebook.
    folds = KFold(n_splits=5, shuffle=True, random_state=15)
    oof = np.zeros(len(train))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
        print("fold n°{}".format(fold_))
        trn_data = lgb.Dataset(train.iloc[trn_idx][features],
                               label=target.iloc[trn_idx])
        val_data = lgb.Dataset(train.iloc[val_idx][features],
                               label=target.iloc[val_idx])

        param = {
            'num_leaves': int(num_leaves),
            'min_data_in_leaf': int(min_data_in_leaf),
            'objective':'regression',
            'max_depth': int(max_depth),
            'learning_rate': 0.01,
            "boosting": "rf",
            "feature_fraction": feature_fraction,
            "bagging_freq": 1,
            "bagging_fraction": 0.76 ,
            "bagging_seed": 11,
            "metric": 'rmse',
            "lambda_l1": lambda_l1,
            "verbosity": -1
        }

        clf = lgb.train(param,
                        trn_data,
                        10000,
                        valid_sets = [trn_data, val_data],
                        verbose_eval=500,
                        early_stopping_rounds = 200)

        # Each training row is predicted by the model of the fold that held it out.
        oof[val_idx] = clf.predict(train.iloc[val_idx][features],
                                   num_iteration=clf.best_iteration)

        # Drop the fold's model before the next one is trained.
        del clf, trn_idx, val_idx
        gc.collect()

    return -mean_squared_error(oof, target)**0.5

In [None]:
LGB_BO = BayesianOptimization(LGB_CV, {
    'max_depth': (4, 10),
    'num_leaves': (5, 130),
    'min_data_in_leaf': (10, 150),
    'feature_fraction': (0.7, 1.0),
    'lambda_l1': (0, 1)
    })

In [None]:
print('<','-'*80,'>')
LGB_BO.maximize(init_points=2, n_iter=10, acq='ei', xi=0.0)

<a id='section6.3'></a>

## Tuned Random Forests

In [None]:
best_param = {
    'num_leaves':130,
    'min_data_in_leaf':117,
    'objective':'regression',
    'max_depth':10,
    'learning_rate':0.005,
    'boosting':'rf',
    'feature_fraction':0.7522,
    'bagging_freq':1,
    'bagging_fraction':0.9132,
    'bagging_seed':11,
    'metric':'rmse',
    'lambda_l1':4.925,
    'random_state':133,
    'verbosity':-1
}


In [None]:
# Cross-validate the tuned random-forest parameters (best_param) on all features.
# clf stays defined after the loop and holds the last fold's model.
folds = KFold(n_splits=5, shuffle=True, random_state=21)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))

for fold_, (train_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print('<------------------------------------------------------------->')
    print('Fold {}'.format(fold_ + 1))
    train_data = lgb.Dataset(train.iloc[train_idx][features], target.iloc[train_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][features], target.iloc[val_idx])
    # Upper bound on the number of rounds; early_stopping_rounds may stop sooner.
    num_round = 10000
    clf = lgb.train(best_param, train_data, num_round, valid_sets=[train_data, val_data], verbose_eval=100,
                   early_stopping_rounds=200)
    # Out-of-fold predictions feed the RMSE computed in the next cell.
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    # Test predictions are averaged over the folds.
    predictions+= clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

#### RMSE Score

In [None]:
val_score = np.sqrt(mean_squared_error(target, oof))
val_score

<a id='section6.4'></a>

## Feature Importance

### Top 30 features by split

In [None]:
fig, ax = plt.subplots(figsize=(11,13))
lgb.plot_importance(clf, max_num_features=30, ax=ax, height=0.5, xlabel='Importance', ylabel='Feature',
                   title='Feature Importance')

### Top 30 features by gain

In [None]:
# Importance measured by total gain; without importance_type='gain' the plot
# would repeat the split-count ranking shown in the previous cell.
fig, ax = plt.subplots(figsize=(11,13))
lgb.plot_importance(clf, max_num_features=30, ax=ax, height=0.5, xlabel='Importance', ylabel='Feature',
                   title='Feature Importance', importance_type='gain')

<a id='section6.5'></a>

## Tuned Random Forests with top 10 features

In [None]:
l1 = sorted(list(zip(clf.feature_importance(), clf.feature_name())), reverse=True)
_, names = zip(*l1)
top10rf = list(names[:10])
top10rf

In [None]:
# Cross-validate the tuned random-forest parameters (best_param) on the
# top10rf features only.
folds = KFold(n_splits=5, shuffle=True, random_state=21)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))

for fold_, (train_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print('<------------------------------------------------------------->')
    print('Fold {}'.format(fold_ + 1))
    train_data = lgb.Dataset(train.iloc[train_idx][top10rf], target.iloc[train_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][top10rf], target.iloc[val_idx])
    # Upper bound on the number of rounds; early_stopping_rounds may stop sooner.
    num_round = 10000
    clf = lgb.train(best_param, train_data, num_round, valid_sets=[train_data, val_data], verbose_eval=100,
                   early_stopping_rounds=200)
    # Out-of-fold predictions feed the RMSE computed in the next cell.
    oof[val_idx] = clf.predict(train.iloc[val_idx][top10rf], num_iteration=clf.best_iteration)
    
    # Test predictions are averaged over the folds.
    predictions+= clf.predict(test[top10rf], num_iteration=clf.best_iteration) / folds.n_splits

#### RMSE Score

In [None]:
val_score = np.sqrt(mean_squared_error(target, oof))
val_score

<a id='section7'></a>

## References

For Model
1. https://www.kaggle.com/fabiendaniel/elo-world
2. https://www.kaggle.com/peterhurford/you-re-going-to-want-more-categories-lb-3-737

For SHAP
3. http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions
4. https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27
5. https://meichenlu.com/2018-11-10-SHAP-explainable-machine-learning/