In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import KFold,StratifiedKFold
import warnings
import time
import sys
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
warnings.simplefilter(action='ignore', category=FutureWarning)

import gc

pd.options.display.max_rows = 999
pd.options.display.max_columns  = 999

<a id="1"></a> <br>
## 1. Loading the data

First, we load `new_merchant_transactions.csv` and `historical_transactions.csv`. In practice, these two files contain the same variables, and the only difference between the two tables is the position of their transactions with respect to a reference date. Also, boolean features are made numeric:

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column (strings, datetimes, int8, ...) is left
    untouched. Integer columns are cast to the smallest of int8/int16/int32/
    int64 whose range contains the column's min and max (bounds inclusive).
    Float columns are cast to float16 or float32 when their range fits, and to
    float64 otherwise.

    Note: the float16/float32 choice looks only at the value *range*, so a
    downcast float column can lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column peaking exactly at 127 still fits int8.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range too wide for float32, or min/max are NaN (comparisons are False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid a NaN/inf percentage when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

<a id="2"></a> <br>
## 2. Feature engineering

Then I define two functions that aggregate the information contained in these two tables. The first function aggregates the transactions by grouping on `card_id`:

In [None]:
def aggregate_transactions(history,agg_func):
    """Aggregate ``history`` per ``card_id`` according to ``agg_func``.

    Parameters
    ----------
    history : pandas.DataFrame
        Must contain ``card_id`` and every column named as a key of ``agg_func``.
    agg_func : dict
        Maps a column name to the list of aggregations to apply to it.

    Returns
    -------
    pandas.DataFrame
        One row per ``card_id``. Aggregated columns are named
        ``<column>_<aggregation>`` and ``card_id`` is a regular column.
    """
    # Keep only the grouping key and the columns that are actually aggregated.
    wanted_cols = ['card_id'] + [name for name in agg_func]
    per_card = history[wanted_cols].groupby(['card_id'])
    agg_history = per_card.agg(agg_func)
    print('groupby complete')

    # Flatten the (column, aggregation) MultiIndex into "column_aggregation".
    flat_names = []
    for pair in agg_history.columns.values:
        flat_names.append('_'.join(pair).strip())
    agg_history.columns = flat_names

    agg_history = agg_history.reset_index()
    print('reset index complete')

    return agg_history

<a id="3"></a> <br>
## 3. Repeat Purchase Features

In [None]:
def aggregate_transactions_by_merchant(history):
    """Build repeat-purchase statistics for every (card_id, merchant_id) pair.

    For each pair, the earliest ``purchase_date`` is taken as the first
    purchase. Every transaction gets a ``purchase_duration``: the number of
    calendar days between its date and the date of that first purchase.
    Transactions made on the first day (duration 0) contribute 0 to
    ``repeat_purchase_amount``; later ones contribute their ``purchase_amount``.

    Side effects: ``history`` is sorted in place by card, merchant and date,
    and gains the columns ``min_purchase_date``, ``purchase_duration`` and
    ``repeat_purchase_amount``.

    Parameters
    ----------
    history : pandas.DataFrame
        Needs ``card_id``, ``merchant_id``, ``purchase_amount`` and a
        datetime-typed ``purchase_date`` column (the ``.dt`` accessor is used).

    Returns
    -------
    pandas.DataFrame
        One row per (card_id, merchant_id) with the columns
        ``repeat_purchase_amount_sum``, ``purchase_duration_max``,
        ``purchase_duration_min`` and ``purchase_date_count``.
    """
    group_cols = ['card_id','merchant_id']

    history.sort_values(by=group_cols + ['purchase_date'],inplace=True)
    # Series-level transform: yields exactly one column aligned with `history`.
    history["min_purchase_date"] = history.groupby(group_cols)['purchase_date'].transform('min')
    # Truncate both timestamps to midnight so the difference counts calendar
    # days; this stays vectorised instead of going through Python date objects.
    history['purchase_duration'] = (
        history['purchase_date'].dt.normalize() - history["min_purchase_date"].dt.normalize()
    ).dt.days
    history['repeat_purchase_amount'] = history['purchase_amount']
    # Purchases on the first day are not "repeat" purchases.
    history.loc[history['purchase_duration']==0,'repeat_purchase_amount'] = 0

    agg_func_merch = {
        'repeat_purchase_amount' : ['sum'],  # total amount spent after the first day
        # purchase_duration max: days from the first purchase to the last one
        # purchase_duration min: smallest day offset in the group
        'purchase_duration' : ['max','min'],
        'purchase_date' : ['count'] # number of transactions with a purchase date
    }

    agg_history = history[group_cols + list(agg_func_merch.keys())].groupby(group_cols).agg(agg_func_merch)
    agg_history.columns = ['_'.join(col).strip() for col in agg_history.columns.values]
    agg_history.reset_index(inplace=True)
    return agg_history

In [None]:
# Read the historical transactions from the converted-data input folder.
# The first CSV column is used as the row index.
Path = '../input/elo-ref-2-data-conversion/'
# historical_transactions = pd.read_hdf(Path+'historical_transactions.hdf')
historical_transactions = pd.read_csv(Path+'historical_transactions.csv',index_col=0)
print('transactions read complete')

In [None]:
# unnamedcols =[col for col in historical_transactions.columns if 'Unnamed' in col]
# print(unnamedcols)
# historical_transactions.drop(unnamedcols,axis=1,inplace=True)
# # historical_transactions_sample = historical_transactions.sample(frac=0.0001)

In [None]:
# Downcast numeric columns to smaller dtypes before the heavy group-bys.
historical_transactions = reduce_mem_usage(historical_transactions)

In [None]:
# purchase_date must be datetime-typed: the per-merchant aggregation uses the .dt accessor.
# NOTE(review): `infer_datetime_format` is deprecated in recent pandas releases - confirm
# against the installed version.
historical_transactions['purchase_date'] = pd.to_datetime(historical_transactions['purchase_date'],infer_datetime_format=True)
# This call sorts historical_transactions in place and adds helper columns to it.
hist_agg_by_merchant = aggregate_transactions_by_merchant(historical_transactions)
print('aggregate_transactions_by_merchant complete')

In [None]:
# del historical_transactions
# gc.collect()

In [None]:
# Second aggregation level: summarise the per-(card, merchant) repeat-purchase
# statistics across all merchants of each card.
per_merchant_stats = ['mean','min','max','std']
agg_func = {
    'repeat_purchase_amount_sum' : list(per_merchant_stats),
    'purchase_duration_max' : list(per_merchant_stats),
    'purchase_duration_min' : list(per_merchant_stats),
    'purchase_date_count' : list(per_merchant_stats),
}

historical_transactions_repeats = aggregate_transactions(hist_agg_by_merchant,agg_func)
# Prefix every feature (but not the key) so it is recognisable as history-derived.
historical_transactions_repeats.columns = [
    c if c == 'card_id' else 'hist_' + c
    for c in historical_transactions_repeats.columns
]
print('aggregate_transactions complete')

# The intermediate per-merchant table is no longer needed.
del hist_agg_by_merchant
gc.collect()

In [None]:
# Path = '../input/elo-ref-3-preproc/'
# train = pd.read_hdf(Path+'train_preproc.hdf')
# print('train read complete')
# test = pd.read_hdf(Path+'test_preproc.hdf')
# print('test read complete')

In [None]:
# Read the pre-processed train and test tables; the first CSV column is used as the index.
Path = '../input/elo-ref-3-preproc/'
train = pd.read_csv(Path+'train_preproc.csv',index_col=0)
print('train read complete')
test = pd.read_csv(Path+'test_preproc.csv',index_col=0)
print('test read complete')

In [None]:
# unnamedcols =[col for col in train.columns if 'Unnamed' in col]
# print(unnamedcols)
# train.drop(unnamedcols,axis=1,inplace=True)
# unnamedcols =[col for col in test.columns if 'Unnamed' in col]
# print(unnamedcols)
# test.drop(unnamedcols,axis=1,inplace=True)

In [None]:
# Attach the card-level repeat-purchase features. A left join keeps every
# train/test row, including cards without a match.
train = train.merge(historical_transactions_repeats, how='left', on='card_id')
test = test.merge(historical_transactions_repeats, how='left', on='card_id')
print('history merge complete')

del historical_transactions_repeats
gc.collect()

**Merchant table features**

In [None]:
merchant = pd.read_csv('../input/elo-merchant-category-recommendation/merchants.csv')
# The merchant table is left-joined onto the transactions by merchant_id further
# down. Any repeated merchant_id here would duplicate transaction rows in those
# joins and inflate the sum/mean aggregates, so keep one row per merchant.
# NOTE(review): keeping the first occurrence is arbitrary - confirm which row
# should win if the file really contains repeated ids.
merchant = merchant.drop_duplicates(subset='merchant_id').reset_index(drop=True)

In [None]:
# unnamedcols =[col for col in new_transactions.columns if 'Unnamed' in col]
# print(unnamedcols)
# new_transactions.drop(unnamedcols,axis=1,inplace=True)

In [None]:
# Combine the 3-, 6- and 12-month lag averages into single sales / purchases
# totals, then add the two totals together.
merchant['sum_sales_lag'] = (
    merchant['avg_sales_lag3']
    .add(merchant['avg_sales_lag6'])
    .add(merchant['avg_sales_lag12'])
)
merchant['sum_purchases_lag'] = (
    merchant['avg_purchases_lag3']
    .add(merchant['avg_purchases_lag6'])
    .add(merchant['avg_purchases_lag12'])
)
merchant['sum_sales_p_purchases_lag'] = merchant['sum_sales_lag'].add(merchant['sum_purchases_lag'])

In [None]:
# outlier inf in avg_purchases_lag3, avg_purchases_lag6, avg_purchases_lag12
# (the replacement covers the whole frame, including the sums derived from them)
merchant.replace([np.inf, -np.inf], np.nan,inplace=True)

# Declared ordering of the range labels; position in this list becomes the code.
range_order = ['E','D','C','B','A']
cat_cols = ['most_recent_sales_range','most_recent_purchases_range','category_4']
for col in cat_cols:
    print(col)
    if col in ['most_recent_sales_range','most_recent_purchases_range']:
        # pd.factorize numbers values by order of first appearance, even for an
        # ordered Categorical, so it would not honour `range_order`. Use the
        # category codes instead: 'E' -> 0 ... 'A' -> 4, missing/unknown -> -1.
        cat = pd.Categorical(merchant[col], categories=range_order, ordered=True)
        merchant[col] = cat.codes
    else:
        # No declared order for this column: plain appearance-order codes
        # (missing -> -1).
        merchant[col], indexer = pd.factorize(merchant[col])

# merchant = pd.get_dummies(merchant, columns=['category_4'])

# null in avg_sales_lag3, avg_sales_lag6, avg_sales_lag12, category_2
# merchant.fillna()

In [None]:
# Merge merchant attributes onto the transactions. Only the two key columns of
# the transactions are kept, and the merchant columns listed in exclude_cols
# are left out.
exclude_cols = ['city_id','state_id','category_1','category_2','category_3','subsector_id','merchant_category_id']
merch_cols = list(filter(lambda name: name not in exclude_cols, merchant.columns))
historical_transactions_merch = historical_transactions[['merchant_id','card_id']].merge(
    merchant[merch_cols],
    on='merchant_id',
    how='left',
)
print('historical_transactions complete')
del historical_transactions
gc.collect()

In [None]:
# Downcast the merged transaction/merchant frame before aggregating it.
historical_transactions_merch = reduce_mem_usage(historical_transactions_merch)

In [None]:
# gc.collect() runs a full collection synchronously and returns when it is
# done, so there is nothing to wait for: collect explicitly instead of sleeping.
gc.collect()

In [None]:
# First batch of merchant-derived card features: statistics of the lag sums.
lag_stats = ['sum','mean','min','max','std']
agg_func = {
    'sum_sales_lag': list(lag_stats),
    'sum_sales_p_purchases_lag': list(lag_stats),
    'sum_purchases_lag': list(lag_stats),
}

agg_cols = list(agg_func)

history_merch1 = aggregate_transactions(historical_transactions_merch[['card_id']+agg_cols],agg_func)
print('historical_transactions complete')
history_merch1.columns = [
    c if c == 'card_id' else 'hist_' + c
    for c in history_merch1.columns
]

# historical_transactions_merch is still needed for the second batch below.
print(history_merch1.head(5))

In [None]:
# Spot-check a few card ids. `history_merch` does not exist yet at this point
# of a top-to-bottom run (it is only built later), so look at the frame that
# has just been computed.
history_merch1['card_id'][10050:10055] #325540,27

In [None]:
# Second batch of merchant-derived card features: range codes, category_4 and
# the number of distinct merchant groups.
range_stats = ['mean','std','min','max']
agg_func = {
    'most_recent_sales_range': list(range_stats),
    'most_recent_purchases_range': list(range_stats),
    'category_4': ['mean'],
    'merchant_group_id': ['nunique'],
}

agg_cols = list(agg_func)

history_merch2 = aggregate_transactions(historical_transactions_merch[['card_id']+agg_cols],agg_func)
print('historical_transactions complete')
history_merch2.columns = [
    c if c == 'card_id' else 'hist_' + c
    for c in history_merch2.columns
]

# Both batches are done; release the large merged frame.
del historical_transactions_merch
gc.collect()
print(history_merch2.head(5))

In [None]:
# Join the two feature batches on the key instead of gluing them side by side:
# concat(axis=1) pairs rows purely by index label, which is only correct while
# both frames list the cards in exactly the same order.
history_merch = pd.merge(history_merch1, history_merch2, on='card_id', how='left')
assert len(history_merch) == len(history_merch1), 'card_id is not unique in history_merch2'

In [None]:
# The two partial feature tables have been combined; free them.
del history_merch1, history_merch2;gc.collect()

In [None]:
# Read the new-merchant transactions from the converted-data input folder.
# The first CSV column is used as the row index.
Path = '../input/elo-ref-2-data-conversion/'
# new_transactions = pd.read_hdf(Path+'new_transactions.hdf')
new_transactions = pd.read_csv(Path+'new_transactions.csv',index_col=0)
print('new transactions read complete')

In [None]:
# Downcast numeric columns to smaller dtypes before merging.
new_transactions = reduce_mem_usage(new_transactions)

In [None]:
# Same merchant join as for the historical transactions, applied to the new ones.
new_transactions_merch = new_transactions[['merchant_id','card_id']].merge(
    merchant[merch_cols],
    on='merchant_id',
    how='left',
)
print('new_transactions complete')
del new_transactions
gc.collect()

In [None]:

# Spell out the aggregation used for the new transactions instead of relying on
# whichever `agg_func` / `agg_cols` cell happened to run last.
# NOTE(review): this reproduces the spec that is in effect on a top-to-bottom
# run, so the sum_*_lag statistics are NOT computed for new transactions -
# confirm whether that is intended.
new_agg_func = {
    'most_recent_sales_range': ['mean','std','min','max'],
    'most_recent_purchases_range': ['mean','std','min','max'],
    'category_4': ['mean'],
    'merchant_group_id': ['nunique']
    }
new_agg_cols = list(new_agg_func.keys())

new_merch = aggregate_transactions(new_transactions_merch[['card_id']+new_agg_cols],new_agg_func)
print('new_transactions complete')
new_merch.columns = ['new_' + c if c != 'card_id' else c for c in new_merch.columns]
# A bare expression in the middle of a cell displays nothing; print the preview.
print(new_merch[:5])

del new_transactions_merch
gc.collect()

In [None]:
# history_merch = history_merch.loc[:,~history_merch.columns.duplicated()]

In [None]:
# pd.merge returns a frame with a fresh 0..n-1 index, so remember the original
# indices and put them back once all merges are done.
train_index = train.index
test_index = test.index

# Merchant features derived from historical transactions.
train = train.merge(history_merch, how='left', on='card_id')
test = test.merge(history_merch, how='left', on='card_id')
print('history merge complete')

# Merchant features derived from new transactions.
train = train.merge(new_merch, how='left', on='card_id')
test = test.merge(new_merch, how='left', on='card_id')
print('new merge complete')

train.index = train_index
test.index = test_index

del history_merch, new_merch
gc.collect()

Save Data

In [None]:
#Caution: data should be saved before deleting target
# Write the enriched train/test tables to HDF files in the working directory.
train.to_hdf('train_preproc.hdf',key='data')
test.to_hdf('test_preproc.hdf',key='data')
print('save hdf file complete')

In [None]:
#Caution: data should be saved before deleting target
# Write the same tables as CSV; the index is written as the first column.
train.to_csv('train_preproc.csv')
test.to_csv('test_preproc.csv')
print('save csv file complete')

In [None]:
# Sanity check: (rows, columns) of both tables after all feature merges.
print(train.shape)
print(test.shape)

In [None]:
# # train_temp = pd.read_csv('../input/elo-merchant-category-recommendation/train.csv')
# target_temp = pd.read_hdf('../input/elo-ref-2-data-conversion/target.hdf')
# target = target_temp['target']

In [None]:
# train['target'] = target
train['outliers'] = 0
train.loc[train['target'] < -30, 'outliers'] = 1
train['outliers'].value_counts()

In [None]:
# Regression label used by the training loop (the column also stays in `train`).
target= train['target']

In [None]:
# del train['target']

In [None]:
# Every column except the identifier, the activation month, the label and the
# outlier flag is used as a model feature.
excluded_cols = ['card_id', 'first_active_month','target','outliers']

features = list(filter(lambda name: name not in excluded_cols, train.columns))
print(excluded_cols)
print(features)

We then set the hyperparameters of the LGBM model:

In [None]:
# LightGBM hyperparameters for the regression model.
# `min_child_samples` is an alias of `min_data_in_leaf` in LightGBM, so the dict
# used to set the same parameter twice with conflicting values (30 and 20).
# Only the primary name is kept.
# NOTE(review): this assumes LightGBM honoured min_data_in_leaf=30 when both
# were given - confirm against the alias warning in an earlier run log.
param = {'num_leaves': 31,
         'min_data_in_leaf': 30,
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 4,
         "random_state": 4590}

We now train the model. Here, we use a StratifiedKFold split of the dataset (stratified on the `outliers` flag) in order to validate the results and to stop the training early. Interestingly, during the writing of this kernel, the model was enriched by adding new features, which improved the CV score. The variations observed on the CV were found to be quite similar to the variations on the LB: it seems that the current competition won't give us headaches when defining the correct validation scheme:

In [None]:
# Cross-validated LightGBM training.
# Folds are stratified on the binary `outliers` flag (not on the continuous target).
n_splits=5
folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=4590)
oof = np.zeros(len(train))            # out-of-fold prediction for every training row
predictions = np.zeros(len(test))     # test prediction, averaged over the folds
feature_importance_df = pd.DataFrame()  # only referenced by commented-out code in this cell
valid_scores =[]                      # best validation RMSE of each fold
# One row per feature; importance is accumulated as the mean over folds.
fold_importance_df = pd.DataFrame()
fold_importance_df["feature"] = features
fold_importance_df["importance"] = 0

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train[features],train['outliers'].values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx])#, categorical_feature=categorical_feats)
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx])#, categorical_feature=categorical_feats)

    # Upper bound on boosting rounds; early stopping on the validation fold cuts it short.
    # NOTE(review): the `verbose_eval` / `early_stopping_rounds` keyword arguments are not
    # accepted by newer LightGBM releases (callbacks are used instead) - confirm against
    # the installed version.
    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [val_data], verbose_eval=100, early_stopping_rounds = 100)
    # Predict the held-out rows with the best iteration found by early stopping.
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    # Each fold contributes 1/n_splits of its importance, giving the mean over folds.
    fold_importance_df["importance"] += clf.feature_importance() / n_splits
#     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    valid_scores+=[clf.best_score['valid_0']['rmse']]
    
    # Average the test predictions of the fold models.
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

print('valid scores:',valid_scores)
# Overall RMSE of the out-of-fold predictions against the target.
print("CV score: {:<8.5f}".format(mean_squared_error(oof, target)**0.5))

In [None]:
# n_splits=5
# folds = KFold(n_splits=n_splits, shuffle=True, random_state=15)
# oof = np.zeros(len(train))
# predictions = np.zeros(len(test))
# start = time.time()
# feature_importance_df = pd.DataFrame()
# valid_scores =[]
# fold_importance_df = pd.DataFrame()
# fold_importance_df["feature"] = features
# fold_importance_df["importance"] = 0
# for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
#     print("fold n°{}".format(fold_))
#     trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
#     val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats)

#     num_round = 10000
#     clf = lgb.train(param, trn_data, num_round, valid_sets = [val_data], verbose_eval=100, early_stopping_rounds = 200)
#     oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
#     fold_importance_df["importance"] += clf.feature_importance() / n_splits
# #     fold_importance_df["fold"] = fold_ + 1
#     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
#     print(clf.best_score)
#     valid_scores+=[clf.best_score['valid_0']['rmse']]
#     predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

# print('valid scores:',valid_scores)
# print("CV score: {:<8.5f}".format(mean_squared_error(oof, target)**0.5))

<a id="4"></a> <br>
## 4. Feature importance
Finally, we can have a look at the features that were used by the model:

In [None]:
# print(len(np.unique(oof)))
# print(train.shape)

In [None]:
# Persist the out-of-fold predictions.
# NOTE(review): np.savetxt writes plain text, so despite the .npy extension this file
# has to be read back with np.loadtxt, not np.load.
np.savetxt('LGB_ref3_p_repeat.npy',oof)

In [None]:
# Rank the features by mean importance and keep at most the top 1000.
mean_importance = fold_importance_df[["feature", "importance"]].groupby("feature").mean()
ranked_importance = mean_importance.sort_values(by="importance", ascending=False)
cols = ranked_importance[:1000].index

best_features = fold_importance_df.loc[fold_importance_df.feature.isin(cols)]

# Horizontal bar chart, most important feature at the top; also saved as PNG.
plt.figure(figsize=(14,25))
plot_data = best_features.sort_values(by="importance", ascending=False)
sns.barplot(x="importance", y="feature", data=plot_data)
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.savefig('lgbm_importances.png')

In [None]:
# Store the ranked importance table and show the 100 strongest features.
best_features = best_features.sort_values(by="importance", ascending=False)
best_features.to_csv('best_features.csv')
print(best_features.head(100))

<a id="5"></a> <br>
## 5. Submission
Now, we just need to prepare the submission file:

In [None]:
# Submission file: one row per test card with the fold-averaged prediction.
sub_df = pd.DataFrame({
    "card_id": test["card_id"].values,
    "target": predictions,
})
sub_df.to_csv("submit_ref3_p_repeat.csv", index=False)