# MIDS w207 - Final Project
## Elo Merchant Category Recommendation Kaggle Challenge

### Team 3
- Vinicio De Sola
- Kevin Hanna
- Pri Nonis
- Bradley Nott

### Presenting
#### Intro:
- Use the main file for presenting
- Remind class what Elo is and what our task was
- Tell the the difference from Baseline to the winning submission is quite small
- We did feature engineering and extraction
- Sparce PCA overview
- Since marginal improvements were difficult we ended up spilling off trying different approaches independently unintentionally, but effectively creating our own Kaggle competition inside the team.
- Pri designed a framework that would score our models and submit the winning model's prediction to Kaggle.

#### Models:
- Pri: Engineering, Framework and Baseline
- Brad: XGboost
- Kevin: LightGMB
- Vinicio: NN

<b>Common Imports</b>

In [1]:
import numpy   as np
import pandas  as pd
import os.path as op
import os      as os
import gc      as gc
import time    as ti

<b>Utility Functions</b>

In [2]:
def root_mean_squared_error(y_true, y_pred) :
    return np.sqrt(((y_pred - y_true) ** 2).mean())

def mean_squared_error(y_true, y_pred) :
    return ((y_pred - y_true) ** 2).mean()

def setup_environment() :
    globals()['csv_base'] = '../input'  if 'working' in os.getcwd() else './input'
    globals()['pkl_base'] = '../pickle' if 'working' in os.getcwd() else './pickle'

    os.makedirs(csv_base, exist_ok = True)
    os.makedirs(pkl_base, exist_ok = True)

    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)

    import warnings
    warnings.filterwarnings('ignore')

setup_environment()

# Data Loading and Cleanup

In [3]:
def compress(df, verbose = True) :
    smu = df.memory_usage().sum() / 1024**2 / 8
    con = {'f' : {                                   np.finfo(np.float16).max : np.float16, np.finfo(np.float32).max : np.float32, np.finfo(np.float64).max : np.float64},
           'u' : {np.iinfo(np.uint8).max : np.uint8, np.iinfo(np.uint16).max  : np.uint16,  np.iinfo(np.uint32).max  : np.uint32,  np.iinfo(np.uint64).max  : np.uint64},
           'i' : {np.iinfo(np.int8).max  : np.int8,  np.iinfo(np.int16).max   : np.int16,   np.iinfo(np.int32).max   : np.int32,   np.iinfo(np.int64).max   : np.int64}}

    for c in df.columns :
        if  con.get(df[c].dtype.kind) :
            df[c] = df[c].astype(con[df[c].dtype.kind].get(min((n for n in con[df[c].dtype.kind].keys() if n > max(df[c].max(), abs(df[c].min()))))))

    emu = df.memory_usage().sum() / 1024**2 / 8

    if  verbose :
        print(f'Memory Use Decreased to {emu:5.2f} MB [{100 * (smu - emu) / emu:5.1f}% Reduction]')

    return df, 100 * (smu - emu) / emu

def read(csv_path, dates = [], brize = [], dummy = [], delna = False, index = None, regen = False) :
    pkl_path = op.join(pkl_base, csv_path).replace('.csv', '.pkl')
    csv_path = op.join(csv_base, csv_path)
    srt_time = ti.time()
    
    if  op.exists(pkl_path) and not regen :
        df       = pd.read_pickle(pkl_path)
        csv_path = pkl_path
        rp       = 0.0
    else                    :
        df = pd.read_csv(csv_path, parse_dates = dates, memory_map = True)
        if  index :
            df = df.set_index(index)
        df, rp = compress(df, verbose = False)

        df.to_pickle(pkl_path)

    mu = df.memory_usage().sum() / 1024**2
    print(f'Loading : {op.basename(csv_path):>29} in {ti.time()-srt_time:5.1f} Seconds, ' +
          f'Shape is {str(df.shape):>14}, Memory Usage is {mu / 8:6.2f} MB ' +
          f'[Reduction of {rp:5.1f} %].' +
          (' 🥒' if not regen else ''))

    return df

def load() :
    data           = {}
    data['train' ] = read('train.csv', dates = ['first_active_month'], index = 'card_id', regen = True)
    data['test'  ] = read('test.csv',  dates = ['first_active_month'], index = 'card_id', regen = True)
  # data['mercs' ] = read('merchants.csv', index = 'merchant_id')
    data['tx_new'] = read('new_merchant_transactions.csv', dates = ['purchase_date'])
    data['tx_old'] = read('historical_transactions.csv',   dates = ['purchase_date'])
    data['target'] = data['train'].pop('target')
    print(f'\nLoading : Done.')

    return data

def clean(data) :
  # replace missing first active month
    data['test'].loc['C_ID_c27b4f80f7', 'first_active_month'] = data['test']['first_active_month'].min()
    
    return data

In [4]:
data = clean(load())

Loading :                     train.csv in   0.7 Seconds, Shape is    (201917, 5), Memory Usage is   0.51 MB [Reduction of 128.6 %].
Loading :                      test.csv in   0.3 Seconds, Shape is    (123623, 4), Memory Usage is   0.28 MB [Reduction of 110.5 %].
Loading : new_merchant_transactions.pkl in   0.8 Seconds, Shape is  (1963031, 14), Memory Usage is  14.27 MB [Reduction of   0.0 %]. 🥒
Loading :   historical_transactions.pkl in  10.1 Seconds, Shape is (29112361, 14), Memory Usage is 218.64 MB [Reduction of   0.0 %]. 🥒

Loading : Done.


# Feature Engineering

<b>Draft a Plan - Feature Engineering and Training Framework</b>

In [5]:
def draft(data) :

    np.random.seed(0)
    
    plan = {}
    
    plan['scorer'        ] = root_mean_squared_error
    plan['baseline_model'] = None
    plan['best_model'    ] = None
    plan['baseline_score'] =   0.0
    plan['best_score'    ] = 100.0
    plan['baseline_feats'] = []
    plan['best_feats'    ] = []

  # split training data into 80/20 training and development buckets
    plan['train_size'] = len(data['train' ])
    plan['train_pcnt'] = 0.8
    plan['train_mask'] = np.random.rand(plan['train_size']) < plan['train_pcnt']
    plan['devel_mask'] =               ~plan['train_mask']

    plan['target' ] = data['target']
    plan['y_train'] = data['target'][plan['train_mask']]
    plan['y_devel'] = data['target'][plan['devel_mask']]

    plan['first_active_month_max'] = max(data['train' ].first_active_month.max(),
                                         data['test'  ].first_active_month.max())
    plan['purchase_date_max'     ] = max(data['tx_old'].purchase_date.max(),
                                         data['tx_new'].purchase_date.max())
    plan['purchase_date_ref'     ] =     data['tx_old'].purchase_date.max()

    return plan

In [6]:
plan = draft(data)

<b>Engineer Transactions - Aggregate Secondary Training Data<b>

In [7]:
def engineer_transactions(tf, prefix) :

  # binarize boolean Y/N flag variables
    tf['category_1'     ] = tf['category_1'     ].eq('Y').mul(1)
    tf['authorized_flag'] = tf['authorized_flag'].eq('Y').mul(1)

  # convert categorical variables to dummy/indicator, preserve original categorical variable
    tf = pd.concat([tf, pd.get_dummies(tf[['category_2', 'category_3']],
                                columns = ['category_2', 'category_3'])], axis = 1)

  # recover purchase history by denormolizing
    tf['month_diff'    ] = (plan['purchase_date_ref'] - tf['purchase_date']) \
                         // np.timedelta64(1, 'M') + (tf['month_lag'])

  # extract purchase_month from date
    tf['purchase_month'] = tf['purchase_date'].dt.month

  # convert datetime to numerical
    tf['purchase_ndate'] = tf['purchase_date'].astype(np.int64) * 1e-9

    return tf
    
def engineer_transactions_aggregated(tf, prefix, regen = False) :

    pkl_path = op.join(pkl_base, f'engineered_{prefix}_transactions_aggregated.pkl')
    srt_time = ti.time()
    
    if  op.exists(pkl_path) and not regen :

        df = pd.read_pickle(pkl_path)
        
    else :

        tf = engineer_transactions(tf, tf)

      # ───────────────────────────────────────────────────────────────────────────────────────────────
      # aggregate transactions per card
      # ───────────────────────────────────────────────────────────────────────────────────────────────

        aggregations_win = \
        {
            'category_1'           : ['mean', 'sum'],

            'category_2_1.0'       : ['mean'],
            'category_2_2.0'       : ['mean'],
            'category_2_3.0'       : ['mean'],
            'category_2_4.0'       : ['mean'],
            'category_2_5.0'       : ['mean'],
            'category_3_A'         : ['mean'],
            'category_3_B'         : ['mean'],
            'category_3_C'         : ['mean'],

            'merchant_id'          : ['nunique'],
            'merchant_category_id' : ['nunique'],
            'state_id'             : ['nunique'],
            'city_id'              : ['nunique'],
            'subsector_id'         : ['nunique'],

            'purchase_amount'      : ['mean', 'sum', 'max', 'min', 'std'],
            'installments'         : ['mean', 'sum', 'max', 'min', 'std'],
            'purchase_month'       : ['mean',        'max', 'min', 'std'],
            'purchase_ndate'       : [np.ptp,        'max', 'min'       ],
            'month_lag'            : ['mean',        'max', 'min', 'std'],
            'month_diff'           : ['mean'                            ]
        }

        aggregations_kev = \
        {
            'purchase_amount'      : ['sum', 'mean', 'median', 'min', 'max', 'std'],
            'subsector_id'         : ['nunique'],
            'merchant_category_id' : ['nunique'],
            'merchant_id'          : ['nunique'],
            'installments'         : ['sum', 'mean'],
            'city_id'              : ['nunique'],
            'state_id'             : ['nunique'], 
            'category_1'           : ['sum'],
            'category_2_1.0'       : ['sum'],
            'category_2_2.0'       : ['sum'],
            'category_2_3.0'       : ['sum'],
            'category_2_4.0'       : ['sum'],
            'category_2_5.0'       : ['sum'],
            'category_3_A'         : ['sum'],
            'category_3_B'         : ['sum'],
            'category_3_C'         : ['sum'],

            'month_lag'           : ['mean', 'min', 'max', 'std'],
            'authorized_flag'     : ['sum'],
            'month_diff'          : ['mean', 'min', 'max', 'std'],
            'purchase_date'       : [np.ptp, 'min', 'max']
        }

      # add aggregations
        df = tf.groupby(['card_id']).agg(aggregations_kev)
        df.columns = ['_'.join((prefix,) + c) for c in df.columns.values]
        df.reset_index(inplace = True)
       
      # add aggregated transaction count
        tc = tf.groupby('card_id').size().reset_index(name = f'{prefix}_transaction_count')
        df = pd.merge(tc, df, on = 'card_id', how = 'left')

        df.to_pickle(pkl_path)
    
    print(f'Engineering : {prefix:>5} Transactions in {ti.time()-srt_time:5.1f} Seconds.' +
          (' 🥒' if not regen else ''))
    
    return df

<b>Principal Component Analysis<b>

In [8]:
def engineer_cards_pca(df, aggs, plan, prefix) :
    
    from sklearn.decomposition import PCA, SparsePCA, MiniBatchSparsePCA
    from sklearn.preprocessing import StandardScaler

  # normalize the features
    df_nrm = StandardScaler().fit_transform(df._get_numeric_data().fillna(0))
  # calculate the principal components
    df_pca = MiniBatchSparsePCA(ridge_alpha = 10, n_jobs = -1).fit_transform(df_nrm)
  # create sparse representation  
    df_cop   = np.where(df_pca > 0.0001, df_pca, 0)
    sparsity = 100 - 100 * df_cop[df_cop != 0].shape[0] / (df_pca.shape[0] * df_pca.shape[1])

    print(f'Engineering : {prefix:>5} Compressed   by {sparsity:.02f}% using Sparse Representation.')

  # pick components that explain 80% of variance
    for n in range(10) :
        df['pca_{n}'] = df_pca[:,n]
        df['cop_{n}'] = df_cop[:,n]
        
    return df

<b>Engineer Cards - Primary Training Data</b>

In [9]:
def engineer_cards(df, aggs, plan, prefix, regen = False) :

    pkl_path = op.join(pkl_base, f'engineered_{prefix}_cards.pkl')
    srt_time = ti.time()
    
    if  op.exists(pkl_path) and not regen :

        df = pd.read_pickle(pkl_path)

    else :

      # convert categorical variables to dummy/indicator, preserve original categorical vars
        df = pd.concat([df, pd.get_dummies(df[['feature_1', 'feature_2']],
                                    columns = ['feature_1', 'feature_2'])], axis = 1)

      # normalized active days of card from first date the shopper made a purchase through Elo
        df['active_days'] = (plan['first_active_month_max'] - df['first_active_month']).dt.days

        for agg in aggs :
            df = pd.merge( df, aggs[agg], on = 'card_id', how = 'left')
            
        df = engineer_cards_pca(df, aggs, plan, prefix)

        df.to_pickle(pkl_path)
            
    print(f'Engineering : {prefix:>5} Cards        in {ti.time()-srt_time:5.1f} Seconds.' +
          (' 🥒' if not regen else ''))

    return df

In [10]:
def engineer(data, plan, delete = False) :

  # extract approved old transactions
    data['tx_app'] = data['tx_old'][data['tx_old']['authorized_flag'] == 'Y']

  # aggregate transaction features
    aggs           = {}
    aggs['tx_new'] = engineer_transactions_aggregated(data['tx_new'], prefix = 'new')
    aggs['tx_old'] = engineer_transactions_aggregated(data['tx_old'], prefix = 'old')
    aggs['tx_app'] = engineer_transactions_aggregated(data['tx_app'], prefix = 'app')

  # join aggregated features to train and test sets
    feat           = {}
    feat['train' ] = engineer_cards(data['train'], aggs, plan, prefix = 'train', regen = False)
    feat['test'  ] = engineer_cards(data['test' ], aggs, plan, prefix = 'test' , regen = False)

    if  delete :
        del aggs
        del data
        del globals()['data']
        d = gc.collect()
    
    print(f'\nEngineering : Done.')

    return feat

<b>Engineered Data</b>

In [11]:
feat = engineer(data, plan)

Engineering :   new Transactions in   0.1 Seconds. 🥒
Engineering :   old Transactions in   0.1 Seconds. 🥒
Engineering :   app Transactions in   0.1 Seconds. 🥒
Engineering : train Cards        in   0.2 Seconds. 🥒
Engineering :  test Cards        in   0.1 Seconds. 🥒

Engineering : Done.


In [12]:
feat['train'].head()

Unnamed: 0,card_id,first_active_month,feature_1,feature_2,feature_3,feature_1_1,feature_1_2,feature_1_3,feature_1_4,feature_1_5,feature_2_1,feature_2_2,feature_2_3,active_days,new_transaction_count,new_category_1_mean,new_category_1_sum,new_category_2_1.0_mean,new_category_2_2.0_mean,new_category_2_3.0_mean,new_category_2_4.0_mean,new_category_2_5.0_mean,new_category_3_A_mean,new_category_3_B_mean,new_category_3_C_mean,new_merchant_id_nunique,new_merchant_category_id_nunique,new_state_id_nunique,new_city_id_nunique,new_subsector_id_nunique,new_purchase_amount_mean,new_purchase_amount_sum,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_std,new_installments_mean,new_installments_sum,new_installments_max,new_installments_min,new_installments_std,new_purchase_month_mean,new_purchase_month_max,new_purchase_month_min,new_purchase_month_std,new_purchase_ndate_ptp,new_purchase_ndate_max,new_purchase_ndate_min,new_month_lag_mean,new_month_lag_max,new_month_lag_min,new_month_lag_std,new_month_diff_mean,old_transaction_count,old_category_1_mean,old_category_1_sum,old_category_2_1.0_mean,old_category_2_2.0_mean,old_category_2_3.0_mean,old_category_2_4.0_mean,old_category_2_5.0_mean,old_category_3_A_mean,old_category_3_B_mean,old_category_3_C_mean,old_merchant_id_nunique,old_merchant_category_id_nunique,old_state_id_nunique,old_city_id_nunique,old_subsector_id_nunique,old_purchase_amount_mean,old_purchase_amount_sum,old_purchase_amount_max,old_purchase_amount_min,old_purchase_amount_std,old_installments_mean,old_installments_sum,old_installments_max,old_installments_min,old_installments_std,old_purchase_month_mean,old_purchase_month_max,old_purchase_month_min,old_purchase_month_std,old_purchase_ndate_ptp,old_purchase_ndate_max,old_purchase_ndate_min,old_month_lag_mean,old_month_lag_max,old_month_lag_min,old_month_lag_std,old_month_diff_mean,app_transaction_count,app_category_1_mean,app_category_1_sum,app_category_2_1.0_mean,app_category_2_2.0_mean,app_category_2_3.0_mean,app_category_2_4.0_mean,app_category_2_5.0_mean,app_category_3_A_mean,app_category_3_B_mean,app_category_3_C_mean,app_merchant_id_nunique,app_merchant_category_id_nunique,app_state_id_nunique,app_city_id_nunique,app_subsector_id_nunique,app_purchase_amount_mean,app_purchase_amount_sum,app_purchase_amount_max,app_purchase_amount_min,app_purchase_amount_std,app_installments_mean,app_installments_sum,app_installments_max,app_installments_min,app_installments_std,app_purchase_month_mean,app_purchase_month_max,app_purchase_month_min,app_purchase_month_std,app_purchase_ndate_ptp,app_purchase_ndate_max,app_purchase_ndate_min,app_month_lag_mean,app_month_lag_max,app_month_lag_min,app_month_lag_std,app_month_diff_mean,pca_{n},cop_{n}
0,C_ID_92a2005557,2017-06-01,5,2,1,0,0,0,0,1,0,1,0,245,23.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,23.0,14.0,1.0,3.0,10.0,-0.575684,-13.242188,-0.296143,-0.724609,0.135742,0.0,0.0,0.0,0.0,0.0,3.478261,4.0,3.0,0.510754,4742309.0,1525001000.0,1520259000.0,1.478261,2.0,1.0,0.510754,-0.043478,260,0.0,0,0.988462,0.0,0.0,0.0,0.011538,0.984615,0.015385,0.0,94,41,3,7,21,-0.638341,-165.968735,2.258394,-0.739395,0.212139,0.015385,4,1,0,0.123314,8.057692,12,1,3.474193,20977987.0,1519551000.0,1498573000.0,-3.911538,0,-8,2.397687,-0.05,247,0.0,0,0.987854,0.0,0.0,0.0,0.012146,1.0,0.0,0.0,93,41,3,7,21,-0.637235,-157.397018,2.258394,-0.739395,0.216518,0.0,0,0,0,0.0,7.979757,12,1,3.52857,20977987.0,1519551000.0,1498573000.0,-3.882591,0,-8,2.429155,-0.048583,0.002595,0.002595
1,C_ID_3d0044924f,2017-01-01,4,1,0,0,0,0,1,0,1,0,0,396,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6.0,5.0,1.0,1.0,4.0,-0.726074,-4.355469,-0.70166,-0.739258,0.014381,1.0,6.0,1.0,1.0,0.0,2.5,3.0,2.0,0.547723,4887632.0,1522393000.0,1517505000.0,1.5,2.0,1.0,0.547723,1.0,350,0.088571,31,0.911429,0.0,0.0,0.0,0.0,0.0,0.788571,0.205714,142,57,3,9,24,-0.600018,-210.006332,4.6303,-0.7424,0.384967,1.551429,543,10,-1,1.510777,6.22,12,1,3.848142,33717687.0,1517438000.0,1483720000.0,-5.031429,0,-12,3.804934,0.957143,339,0.085546,29,0.914454,0.0,0.0,0.0,0.0,0.0,0.80236,0.19174,141,57,3,9,24,-0.616175,-208.883453,4.6303,-0.7424,0.355554,1.477876,501,10,-1,1.350634,6.144543,12,1,3.859177,33717687.0,1517438000.0,1483720000.0,-5.050147,0,-12,3.836969,0.961652,-0.00128,0.0
2,C_ID_d639edf6cd,2016-08-01,2,2,0,0,1,0,0,0,0,1,0,549,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,-0.700195,-0.700195,-0.700195,-0.700195,,0.0,0.0,0.0,0.0,,4.0,4.0,4.0,,0.0,1524937000.0,1524937000.0,2.0,2.0,2.0,,0.0,43,0.0,0,0.093023,0.0,0.0,0.0,0.906977,1.0,0.0,0.0,13,8,2,5,7,-0.678311,-29.167391,-0.145847,-0.730138,0.08738,0.0,0,0,0,0.0,4.55814,12,1,3.275467,35635623.0,1519759000.0,1484123000.0,-8.604651,0,-13,3.842987,-0.023256,41,0.0,0,0.097561,0.0,0.0,0.0,0.902439,1.0,0.0,0.0,13,8,2,5,7,-0.678742,-27.828424,-0.145847,-0.730138,0.08923,0.0,0,0,0,0.0,4.634146,12,1,3.329836,35635623.0,1519759000.0,1484123000.0,-8.487805,0,-13,3.893083,-0.02439,0.000266,0.000266
3,C_ID_186d6a6901,2017-09-01,4,3,0,0,0,0,1,0,0,0,1,153,7.0,0.142857,1.0,0.0,0.0,0.0,0.857143,0.0,0.0,0.857143,0.0,7.0,6.0,2.0,2.0,5.0,-0.665039,-4.65625,-0.566895,-0.734375,0.065918,0.714286,5.0,1.0,-1.0,0.755929,3.714286,4.0,3.0,0.48795,3625505.0,1524049000.0,1520424000.0,1.714286,2.0,1.0,0.48795,0.0,77,0.155844,12,0.155844,0.0,0.0,0.688312,0.0,0.0,0.883117,0.090909,50,25,5,7,13,-0.642745,-49.491364,1.445596,-0.740897,0.261624,1.090909,84,3,-1,0.588974,7.74026,12,1,3.904797,13375339.0,1519818000.0,1506443000.0,-2.831169,0,-5,1.802065,-0.038961,77,0.155844,12,0.155844,0.0,0.0,0.688312,0.0,0.0,0.883117,0.090909,50,25,5,7,13,-0.642745,-49.491364,1.445596,-0.740897,0.261624,1.090909,84,3,-1,0.588974,7.74026,12,1,3.904797,13375339.0,1519818000.0,1506443000.0,-2.831169,0,-5,1.802065,-0.038961,-0.00185,0.0
4,C_ID_cdbd2c0db2,2017-11-01,1,3,0,1,0,0,0,0,0,0,1,92,36.0,0.055556,2.0,0.055556,0.0,0.194444,0.694444,0.0,0.0,0.944444,0.027778,36.0,17.0,5.0,5.0,10.0,-0.553711,-19.921875,0.450928,-0.739258,0.223877,0.972222,35.0,2.0,-1.0,0.376913,3.555556,4.0,3.0,0.503953,4949682.0,1524941000.0,1519992000.0,1.555556,2.0,1.0,0.503953,-0.055556,133,0.112782,15,0.075188,0.0,0.0,0.804511,0.007519,0.0,0.947368,0.052632,66,26,6,6,17,-0.366073,-48.687656,7.193041,-0.746156,1.352094,1.368421,182,12,1,1.896862,5.406015,12,1,5.003086,9405641.0,1519850000.0,1510445000.0,-1.285714,0,-3,1.0267,-0.022556,128,0.09375,12,0.078125,0.0,0.0,0.820312,0.007812,0.0,0.96875,0.03125,65,26,6,6,17,-0.539379,-69.040466,6.992617,-0.746156,0.737087,1.125,144,12,1,1.003929,5.554688,12,1,5.041261,9405641.0,1519850000.0,1510445000.0,-1.320312,0,-3,1.02668,-0.023438,-0.002476,0.0


# Training

<b>Utility Functions - Data Preperation and Scoring Framework</b>

In [13]:
def select(feat, include = [], exclude = [], verbose = False) :
    columns = feat['train']._get_numeric_data().columns.values
    
    from re import match
    
    if  include :
        columns = [c for c in columns if any([    match(f'^{i}$', c) for i in include])]
    if  exclude :
        columns = [c for c in columns if all([not match(f'^{e}$', c) for e in exclude])]

    for c in columns :
        t  = str(feat['train'][c].dtype)
        if 'int' not in t and \
           'flo' not in t     :
            print('Feature Selection Error : Column {c} Type {t} is Not Numeric!')
            raise
        
    if  verbose :
        print(f'include = {include}')
        print(f'exclude = {exclude}')
        print(f'columns = {columns}\n')

    return columns

def prep(plan, feat, include = [], exclude = []) :
    play            = {}
    play['feats'  ] = select(feat, include, exclude)

    play['target' ] = plan['target']                          # full train labels
    play['train'  ] = feat['train' ][play['feats']].fillna(0) # full train data
    play['test'   ] = feat['test'  ][play['feats']].fillna(0) # full test  data

    play['x_train'] = feat['train' ][play['feats']][plan['train_mask']].fillna(0) # train x 80%
    play['y_train'] = plan['target'][plan['train_mask']]                          # train y 80%

    play['x_devel'] = feat['train' ][play['feats']][plan['devel_mask']].fillna(0) # train x 20%
    play['y_devel'] = plan['target'][plan['devel_mask']]                          # train y 20%

    play['x_test' ] = feat['test'  ][play['feats'  ]].fillna(0)
    
    return play

def grade(plan, kind, y_pred, y_test, tag = '', baseline = False) :
    if  baseline :
        plan[    'best_score'] = \
        plan['baseline_score'] = plan['scorer'](plan['y_devel'], y_pred)
        plan[    'best_ytest'] = \
        plan['baseline_ytest'] = y_test
        tag                   += '⭕'

    score   =  plan['scorer'](plan['y_devel'], y_pred)
    improve = (plan['baseline_score'] - score) / plan['baseline_score'] * 100

    if  score <= plan['best_score'] and not baseline :
        plan['best_ytest'] = y_test
        plan['best_score'] = score
        tag               += '⭐'

    print(f'{kind:<25} : Score is {score:6.3f} [{improve:+6.3f}%]' +
         (f' {tag}' if tag else ''))

    return plan

## Linear Regression Model - Baseline

In [14]:
def regression(plan, play, opts = {}, baseline = False) :
    kind    = 'Linear Regression'

    from sklearn.linear_model  import LinearRegression

    model   = LinearRegression() \
                .fit(play['x_train'], play['y_train'])

    return grade(plan, kind, model.predict(play['x_devel']), model.predict(play['x_test']), baseline = baseline)

In [15]:
def regression_play(plan, feat) :
    plan = regression(plan, play = prep(plan, feat, include = ['feature_1', 'feature_2', 'feature_3'               ]), baseline = True ) # base features in train          - baseline
    plan = regression(plan, play = prep(plan, feat, include = ['old_purchase_amount_sum', 'new_purchase_amount_sum']), baseline = False) # old vs new purchase amounts     - experiment
    plan = regression(plan, play = prep(plan, feat, exclude = ['card_id', 'first_active_month'                     ]), baseline = False) # all numeric engineered features - kitchen sink

    return plan

plan = regression_play(plan, feat)

Linear Regression         : Score is  3.777 [+0.000%] ⭕
Linear Regression         : Score is  3.789 [-0.306%]
Linear Regression         : Score is  3.775 [+0.067%] ⭐


## Lasso Model

In [27]:
def lasso(plan, play, opts) :
    kind    = 'Linear Lasso'

    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import KFold
    from sklearn.linear_model    import Lasso

    params  = [{'alpha' : opts}]
    folds   = 5

    grid    = GridSearchCV(Lasso(random_state = 0), params, cv = folds,
                           scoring = 'neg_mean_squared_error') \
                .fit(play['x_train'], play['y_train'])
    model   = grid.best_estimator_

    return grade(plan, kind, model.predict(play['x_devel']), model.predict(play['x_test']),
                 tag = f'(Alpha = {grid.best_estimator_.alpha:8.5f}) ')

In [17]:
def lasso_play(plan, feat) :
    plan = lasso(plan, play = prep(plan, feat,
                       include = ['feature_1', 'feature_2', 'feature_3'               ]),
                       opts = [0.00001, 0.001, 0.5, 10])
    plan = lasso(plan, play = prep(plan, feat,
                       include = ['old_purchase_amount_sum', 'new_purchase_amount_sum']),
                       opts = np.logspace(-4, -0.5, 30))

    return plan

plan = lasso_play(plan, feat)

Linear Lasso              : Score is  3.777 [-0.000%] (Alpha =  0.00100) 
Linear Lasso              : Score is  3.789 [-0.307%] (Alpha =  0.00010) 


## Gradient Boosting - LightGBM Library
We followed the advice of a Kaggler who had completed the competition about using lightgbm for training: 
https://www.kaggle.com/konradb/lgb-fe-lb-3-707<br>
We run the lightgbm using multiple hyperparameters, one of which, named `opts_win` we copied from a Kaggler to see how we were comparing:
https://www.kaggle.com/peterhurford/you-re-going-to-want-more-categories-lb-3-737

In [31]:
def lgbm(plan, play, opts) :
    kind = 'Gradient Boosting - LGBM'

    from sklearn.model_selection import KFold
    from lightgbm                import Dataset, train
    
    rmse    = 0
    folds   = KFold(n_splits     = opts['n_splits'     ],
                    shuffle      = opts['shuffle'      ],
                    random_state = opts['random_state' ])

    y_train = np.zeros(len(play['train']))
    y_test  = np.zeros(len(play['test' ]))

    for f, (tin, vin) in enumerate(folds.split(play['train'].values, play['target'].values)) :
        
        tdf = Dataset(play['train'].iloc[tin], label = play['target'].iloc[tin]) # train data fold
        vdf = Dataset(play['train'].iloc[vin], label = play['target'].iloc[vin]) # valid data fold
    
        clf = train(params                = opts['params'],
                    train_set             = tdf,
                    valid_sets            = [tdf, vdf],
                    verbose_eval          = opts['verbose_eval'],
                    learning_rates        = opts['learning_rates'],
                    num_boost_round       = opts['num_boost_round'],
                    keep_training_booster = opts['keep_training_booster'],
                    early_stopping_rounds = opts['early_stopping_rounds'])

        y_train[vin] = clf.predict(play['train'].iloc[vin], num_iteration = clf.best_iteration)
        y_test      += clf.predict(play['test' ], num_iteration = clf.best_iteration) / folds.n_splits
      # rmse        += root_mean_squared_error(play['target'].iloc[vin], y_pred) / folds.n_splits

    y_pred = y_train[plan['devel_mask']]
    
    rmse   = root_mean_squared_error(y_train, play['target'])

  # print(f'KFold Score on Full Training : {rmse:<8.5f}')

    return grade(plan, kind, y_pred, y_test)

In [33]:
def lgbm_play(plan, feat) :

    opts_win = \
    {   'n_splits'              : 5,
        'shuffle'               : True,
        'random_state'          : 15,

        'verbose_eval'          : 0,
        'learning_rates'        : None,
        'num_boost_round'       : 10000,
        'categorical_feature'   : 'auto',
        'keep_training_booster' : False,
        'early_stopping_rounds' : 200,

        'params'                :
         {  'num_leaves'        : 111,
            'min_data_in_leaf'  : 149, 
            'objective'         :'regression',
            'max_depth'         : 9,
            'boosting'          : 'gbdt',
            'feature_fraction'  : 0.7522,
            'learning_rate'     : 0.005,          
            'bagging_freq'      : 1,
            'bagging_fraction'  : 0.7083,
            'bagging_seed'      : 11,
            'metric'            : 'rmse',
            'lambda_l1'         : 0.2634,
            'random_state'      : 133,
            'verbosity'         : -1
         }
    }

    opts_kev = \
    {   'n_splits'              : 10,
        'shuffle'               : True,
        'random_state'          : 15,

        'verbose_eval'          : 0,
        'learning_rates'        : None,
        'num_boost_round'       : 5000,
        'categorical_feature'   : 'auto',
        'keep_training_booster' : True,
        'early_stopping_rounds' : 100,

        'params'  :
        {   'num_leaves'        : 125,
            'num_trees'         : 150,
            'objective'         : 'regression',
            'metric'            : 'rmse'
        }
    }
    
    plan = lgbm(plan, play = prep(plan, feat, exclude = ['cop_.*', 'pca_.*']),
                      opts = opts_win)
    plan = lgbm(plan, play = prep(plan, feat, exclude = ['cop_.*', 'pca_.*']),
                      opts = opts_kev)
    plan = lgbm(plan, play = prep(plan, feat, include = ['cop_.*']),
                      opts = opts_kev)

    return plan

plan = lgbm_play(plan, feat)

Gradient Boosting - LGBM  : Score is  3.612 [+4.388%] ⭐
Gradient Boosting - LGBM  : Score is  3.636 [+3.743%]
Gradient Boosting - LGBM  : Score is  3.778 [-0.006%]


# Gradient Boosting - XGBoost Library

In [34]:
def xgboost(plan, play, opts) :
    kind    = 'Gradient Boosting - XGB'

    from xgboost import train, DMatrix
    
    x_train = DMatrix(play['x_train'], label = play['y_train'])
    x_devel = DMatrix(play['x_devel'], label = play['y_devel'])
    x_test  = DMatrix(play['x_test' ]                         )

    model   = train(params                = opts['params'],
                    dtrain                = x_train,
                    verbose_eval          = opts['verbose_eval'],
                    num_boost_round       = opts['num_boost_round'],
                    early_stopping_rounds = opts['early_stopping_rounds'],
                    evals                 = [(x_devel, 'devel')])
    
    y_pred  = model.predict(x_devel)
    y_test  = model.predict(x_test )

    return grade(plan, kind, y_pred, y_test)

In [35]:
def xgboost_play(plan, feat) :

    opts_new = \
    {
        'num_boost_round' : 999,
        'early_stopping_rounds' : 10,
        'verbose_eval' : False,
        'params' :
        {
            'eval_metric'      : 'rmse',
            'max_depth'        : 5,
            'min_child_weight' : 35,
            'eta'              : 0.05,
            'subsample'        : 0.7,
            'colsample_bytree' : 0.8,
            'objective'        :'reg:linear',
        }
    }

    plan = xgboost(plan, play = prep(plan, feat, exclude = ['card_id', 'first_active_month']),
                         opts = opts_new)

    return plan

plan = xgboost_play(plan, feat)

Gradient Boosting - XGB   : Score is  3.620 [+4.173%]


# Neural Network

In [22]:
def neural(plan, play, opts = {}) :
    kind    = 'Neural Network'

    y_pred  = np.zeros(len(play['x_devel']))
    y_test  = np.zeros(len(play['x_test' ]))
    
    if  op.exists(opts['y_pred']) : y_pred = pd.read_pickle(opts['y_pred'])
    if  op.exists(opts['y_test']) : y_test = pd.read_pickle(opts['y_test'])

    return grade(plan, kind, y_pred, y_test)

In [23]:
def neural_play(plan, feat) :
    plan = neural(plan, play = prep(plan, feat), opts = {'y_pred' : 'nn1_y_pred.pkl', 'y_test' : 'nn1_y_test.pkl'})
    plan = neural(plan, play = prep(plan, feat), opts = {'y_pred' : 'nn2_y_pred.pkl', 'y_test' : 'nn2_y_test.pkl'})

    return plan

plan = neural_play(plan, feat)

Neural Network            : Score is  3.798 [-0.554%]
Neural Network            : Score is  3.798 [-0.554%]


# Make Submission

In [24]:
def submit(plan, feat) :

  # model  = plan['best_model']
    feats  = plan['best_feats']
    score  = plan['best_score']
    y_test = plan['best_ytest']

    submission = pd.DataFrame({ 'card_id' : feat['test']['card_id'].values,
                                'target'  : y_test })
    
    display(submission.head())
    
    submission.to_csv('submission.csv', index = False)  
    
    print('Making Kaggle Submission! - Thanks for Playing 💯')

In [25]:
submit(plan, feat)

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-3.291329
1,C_ID_130fd0cbdd,-0.334645
2,C_ID_b709037bc5,-1.168029
3,C_ID_d27d835a9f,-0.169879
4,C_ID_2b5e3df5c2,-1.248358


Making Kaggle Submission! - Thanks for Playing 💯


<b>Find Memory Hogs</b>

In [26]:
def hogs(n_largest = 10) :
    
    from operator import itemgetter
    from pympler  import tracker
    from gc       import collect, get_objects

    collect()

    memory = pd.DataFrame(mem.create_summary(), columns = ['object', 'number_of_objects', 'memory'])
    memory['mem_per_object'] = memory['memory'] / memory['number_of_objects']
    memory['name'] = memory.index
    
    display(memory.sort_values('memory', ascending = False).head(n_largest))

  # import multiprocessing
  # result = multiprocessing.Pool(1).map(huge_intermediate_calc, [something_])[0]
