# Machine Learning for the Elo Merchant Category Recommendation Competition

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import time
import datetime

import warnings
warnings.filterwarnings("ignore")

from sklearn import model_selection, preprocessing, metrics
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Ridge, BayesianRidge
import lightgbm as lgb
import gc

## Data loading

In [None]:
def _load_cards(csv_path):
    """Read a card-level CSV, parsing ``first_active_month`` as a datetime."""
    return pd.read_csv(csv_path, parse_dates=["first_active_month"])


train_df = _load_cards('../train.csv')
test_df = _load_cards('../test.csv')

# Report (rows, columns) of both tables.
train_shape = train_df.shape
test_shape = test_df.shape
print("Training data size", train_shape)
print("Testing data size", test_shape)

In [None]:
# Days between each card's first active month and the 2018-02-01 reference date.
# The subtraction is done against a pandas Timestamp so the result stays a
# timedelta64 Series and `.dt.days` is always available; subtracting
# `datetime.date` objects from `.dt.date` values can yield an object-dtype
# Series (depending on the pandas version) that has no `.dt` accessor.
# NOTE: the 'elasped_time' spelling is kept as-is because it is the column name
# written to the exported CSVs and fed to the model as a feature.
reference_date = pd.Timestamp('2018-02-01')
train_df['elasped_time'] = (reference_date - train_df['first_active_month']).dt.days
test_df['elasped_time'] = (reference_date - test_df['first_active_month']).dt.days

# Calendar year / month of the first active month.
train_df["year"] = train_df["first_active_month"].dt.year
test_df["year"] = test_df["first_active_month"].dt.year
train_df["month"] = train_df["first_active_month"].dt.month
test_df["month"] = test_df["first_active_month"].dt.month

# Flag rows whose target lies below -30 as outliers.
train_df['outliers'] = 0
train_df.loc[train_df['target'] < -30, 'outliers'] = 1
train_df['outliers'].value_counts()

# Keep the label in its own Series and remove it from the feature frame.
target = train_df['target']
del train_df['target']
# The original referenced `gc.collect` without calling it, which is a no-op.
gc.collect()

In [None]:
# Load the two transaction logs: historical and new-merchant transactions.
hist_path = '../historical_transactions.csv'
new_path = "../new_merchant_transactions.csv"
hist_df = pd.read_csv(hist_path)
new_df = pd.read_csv(new_path)

# Report (rows, columns) of both logs.
hist_shape, new_shape = hist_df.shape, new_df.shape
print("Historical transactions data size", hist_shape)
print("New transactions data size", new_shape)

In [None]:
# Per-column overview of the historical transactions: number of distinct
# values and whether the column contains any missing value.
unique_counts = hist_df.nunique(axis=0)
has_missing = hist_df.isnull().any()
hd = {'Unique Entry': unique_counts, 'Nan Entry': has_missing}
pd.DataFrame(data=hd, index=hist_df.columns.values)

## Feature engineering

In [None]:
# Optional imputation of missing categorical values, left disabled:
#for df in [hist_df,new_df]:
#    df['category_2'].fillna(1.0,inplace=True)
#    df['category_3'].fillna('A',inplace=True)
#    df['merchant_id'].fillna('M_ID_00a6ca8a8a',inplace=True)

# Derive calendar features and binary flags on both transaction logs, in place.
for txn_df in [hist_df, new_df]:
    txn_df['purchase_date'] = pd.to_datetime(txn_df['purchase_date'])
    txn_df['year'] = txn_df['purchase_date'].dt.year
    # `Series.dt.weekofyear` was removed in pandas 2.0; `isocalendar().week`
    # returns the same ISO week number. Fall back to the old accessor on
    # pandas versions that predate `isocalendar`.
    if hasattr(txn_df['purchase_date'].dt, 'isocalendar'):
        txn_df['weekofyear'] = txn_df['purchase_date'].dt.isocalendar().week
    else:
        txn_df['weekofyear'] = txn_df['purchase_date'].dt.weekofyear
    txn_df['month'] = txn_df['purchase_date'].dt.month
    txn_df['dayofweek'] = txn_df['purchase_date'].dt.dayofweek
    # weekday() is 0 (Monday) .. 6 (Sunday); 5 and 6 are the weekend.
    txn_df['weekend'] = (txn_df['purchase_date'].dt.weekday >= 5).astype(int)
    txn_df['weekday'] = (txn_df['purchase_date'].dt.weekday < 5).astype(int)
    txn_df['hour'] = txn_df['purchase_date'].dt.hour
    # Encode the Y/N flags as 1/0 (any other value becomes NaN).
    txn_df['authorized_flag'] = txn_df['authorized_flag'].map({'Y': 1, 'N': 0})
    txn_df['category_1'] = txn_df['category_1'].map({'Y': 1, 'N': 0})
    #https://www.kaggle.com/c/elo-merchant-category-recommendation/discussion/73244
    # NOTE(review): this uses the wall-clock date at run time, so `month_diff`
    # values differ between runs made on different days. Consider pinning a
    # fixed reference date if reproducibility matters.
    txn_df['month_diff'] = ((datetime.datetime.today() - txn_df['purchase_date']).dt.days) // 30
    txn_df['month_diff'] += txn_df['month_lag']

# Drop the loop variable so it does not keep the last frame alive after the
# frames are rebound or deleted further down.
del txn_df

In [None]:
# One-hot encode the two multi-valued category columns in both logs.
dummy_columns = ['category_2', 'category_3']
hist_df = pd.get_dummies(hist_df, columns=dummy_columns)
new_df = pd.get_dummies(new_df, columns=dummy_columns)

# Per-card mean of authorized_flag over the whole history; the resulting
# column is named 'authorized_flag_mean'.
agg_fun = {'authorized_flag': ['mean']}
auth_mean = hist_df.groupby('card_id').agg(agg_fun)
auth_mean.columns = [column + '_' + stat for column, stat in auth_mean.columns.values]
auth_mean = auth_mean.reset_index()

# Split the history: authorized rows go to auth_df, and hist_df keeps only the
# rows with authorized_flag == 0.
auth_df = hist_df.loc[hist_df['authorized_flag'] == 1]
hist_df = hist_df.loc[hist_df['authorized_flag'] == 0]

In [None]:
def aggregate_historical_transactions(history):
    """Aggregate a transaction log into one feature row per ``card_id``.

    Parameters
    ----------
    history : pandas.DataFrame
        Transaction rows. Must contain ``card_id``, a datetime-like
        ``purchase_date`` and every column named in ``agg_func`` below
        (including the one-hot ``category_2_*`` / ``category_3_*`` columns).

    Returns
    -------
    pandas.DataFrame
        One row per card with ``card_id``, ``hist_transactions_count`` (number
        of rows for the card) and one ``<column>_<statistic>`` column per
        requested aggregate, including ``purchase_date_ptp`` (max - min of the
        purchase timestamps, in seconds).

    Notes
    -----
    The input frame is not modified. Earlier versions overwrote
    ``history['purchase_date']`` with float epoch seconds, which changed the
    caller's data (a second call on the same frame would reinterpret those
    floats as nanoseconds) and wrote into frames that are slices of another
    frame.
    """
    # Purchase timestamps as epoch seconds, kept in a side Series instead of
    # being written back into ``history``.
    epoch_ns = (pd.DatetimeIndex(history['purchase_date'])
                .astype('datetime64[ns]')
                .astype(np.int64))
    purchase_seconds = pd.Series(np.asarray(epoch_ns) * 1e-9)

    agg_func = {
        'category_1': ['sum', 'mean'],
        'category_2_1.0': ['mean'],
        'category_2_2.0': ['mean'],
        'category_2_3.0': ['mean'],
        'category_2_4.0': ['mean'],
        'category_2_5.0': ['mean'],
        'category_3_A': ['mean'],
        'category_3_B': ['mean'],
        'category_3_C': ['mean'],
        'merchant_id': ['nunique'],
        'merchant_category_id': ['nunique'],
        'state_id': ['nunique'],
        'city_id': ['nunique'],
        'subsector_id': ['nunique'],
        'year': ['nunique'],
        'month': ['nunique'],
        'weekofyear': ['nunique'],
        'weekend': ['sum', 'mean'],
        'weekday': ['sum', 'mean'],
        # non-categorical features
        'purchase_amount': ['sum', 'median', 'max', 'min', 'std'],
        'installments': ['sum', 'median', 'max', 'min', 'std'],
        # 'purchase_date' peak-to-peak is computed separately and inserted
        # right after the installments columns (see below).
        'month_lag': ['min', 'max', 'mean', 'std'],
        'month_diff': ['mean'],
        'authorized_flag': ['sum', 'mean'],
        }
    agg_history = history.groupby(['card_id']).agg(agg_func)
    agg_history.columns = ['_'.join(col).strip()
                           for col in agg_history.columns.values]

    # Peak-to-peak (max - min) of the purchase timestamps per card. Grouping
    # by the raw card_id values keeps this independent of the frame's index.
    seconds_by_card = purchase_seconds.groupby(history['card_id'].values)
    purchase_span = seconds_by_card.max() - seconds_by_card.min()
    # Insert at the position the column had when it was part of agg_func, so
    # the output column order is unchanged.
    agg_history.insert(agg_history.columns.get_loc('installments_std') + 1,
                       'purchase_date_ptp',
                       purchase_span)
    agg_history.reset_index(inplace=True)

    # Number of transactions per card, placed right after card_id.
    df = (history.groupby('card_id')
          .size()
          .reset_index(name='hist_transactions_count'))

    agg_history = pd.merge(df, agg_history, on='card_id', how='left')

    return agg_history

In [None]:
# Aggregate hist_df per card and prefix every feature column (all except the
# card_id join key) with 'hist_'.
history = aggregate_historical_transactions(hist_df)
history = history.rename(columns=lambda name: name if name == 'card_id' else 'hist_' + name)
history.head(5)

In [None]:
# Aggregate auth_df per card and prefix every feature column (all except the
# card_id join key) with 'auth_'.
authorized = aggregate_historical_transactions(auth_df)
authorized = authorized.rename(columns=lambda name: name if name == 'card_id' else 'auth_' + name)
authorized.head(5)

In [None]:
# Aggregate new_df per card and prefix every feature column (all except the
# card_id join key) with 'new_'.
new = aggregate_historical_transactions(new_df)
new = new.rename(columns=lambda name: name if name == 'card_id' else 'new_' + name)
new.head(5)

In [None]:
def aggregate_per_month(history):
    """Summarise per-month transaction statistics for every card.

    The transactions are first aggregated per (``card_id``, ``month_lag``)
    pair: count, sum, mean, min, max and std of ``purchase_amount`` and
    ``installments``. Those monthly rows are then reduced per ``card_id`` by
    taking the mean and std of every monthly column, ``month_lag`` included.

    Returns a DataFrame with ``card_id`` followed by ``<column>_mean`` /
    ``<column>_std`` pairs.
    """
    monthly_stats = ['count', 'sum', 'mean', 'min', 'max', 'std']
    per_month = history.groupby(['card_id', 'month_lag']).agg({
        'purchase_amount': monthly_stats,
        'installments': monthly_stats,
    })
    # Flatten the (column, statistic) MultiIndex into 'column_statistic'.
    per_month.columns = ['_'.join(pair).strip() for pair in per_month.columns.values]
    per_month = per_month.reset_index()

    per_card = per_month.groupby('card_id').agg(['mean', 'std'])
    per_card.columns = ['_'.join(pair).strip() for pair in per_card.columns.values]
    return per_card.reset_index()
#___________________________________________________________
# Per-card summary of the monthly statistics of auth_df; preview 10 rows.
final_group = aggregate_per_month(auth_df)
final_group.head(10)


In [None]:
def successive_aggregates(df, field1, field2):
    """Two-stage aggregate of ``field2`` per card.

    Stage 1 takes the mean of ``field2`` for every (``card_id``, ``field1``)
    pair. Stage 2 reduces those means per ``card_id`` to their mean, min, max
    and std. The result has ``card_id`` plus four columns named
    ``<field1>_<field2>_<statistic>``.
    """
    level_means = df.groupby(['card_id', field1])[field2].mean().reset_index()
    summary = level_means.groupby('card_id')[field2].agg(['mean', 'min', 'max', 'std'])
    summary = summary.rename(columns=lambda stat: field1 + '_' + field2 + '_' + stat)
    return summary.reset_index()

In [None]:
# (grouping field, value field) pairs for the two-stage aggregates computed on
# the new-merchant transactions; the results are left-joined on card_id in
# this order.
field_pairs = [
    ('category_1', 'purchase_amount'),
    ('installments', 'purchase_amount'),
    ('city_id', 'purchase_amount'),
    ('category_1', 'installments'),
]
first_pair, *remaining_pairs = field_pairs
additional_fields = successive_aggregates(new_df, *first_pair)
for grouping_field, value_field in remaining_pairs:
    additional_fields = additional_fields.merge(
        successive_aggregates(new_df, grouping_field, value_field),
        on='card_id', how='left')

# The raw transaction frames are not referenced by name after this point.
del new_df, auth_df, hist_df

In [None]:
# Left-join the three per-card aggregate tables onto train and test, in the
# order history -> authorized -> new.
for feature_table in (history, authorized, new):
    train_df = train_df.merge(feature_table, on='card_id', how='left')
    test_df = test_df.merge(feature_table, on='card_id', how='left')

In [None]:
# Left-join the per-month summary and the two-stage aggregates on card_id.
for feature_table in (final_group, additional_fields):
    train_df = train_df.merge(feature_table, on='card_id', how='left')
    test_df = test_df.merge(feature_table, on='card_id', how='left')

In [None]:
# Left-join the per-card authorized_flag mean onto both tables.
train_df, test_df = (
    frame.merge(auth_mean, on='card_id', how='left')
    for frame in (train_df, test_df)
)

## Training the model

In [None]:
# Preview the first five rows of the assembled training table.
train_df.head(n=5)

In [None]:
# Remove the outlier flag (it is derived from the target) so it is not picked
# up as a model feature.
train_df.drop(columns=['outliers'], inplace=True)

In [None]:
# Persist the engineered feature tables next to the input data.
test_features_path = '../test_v5_stack.csv'
train_features_path = '../train_v5_stack.csv'
test_df.to_csv(test_features_path)
train_df.to_csv(train_features_path)

In [None]:
# Every column except the join key and the raw date is used as a model input.
non_feature_columns = ('card_id', 'first_active_month')
features = [column for column in train_df.columns if column not in non_feature_columns]
#features = [f for f in features if f not in unimportant_features]
# Columns passed to LightGBM as categorical features.
categorical_feats = [
    'feature_1',
    'feature_2',
    'feature_3',
    'year',
    'month',
]

In [None]:
# Baseline LightGBM regression parameters.
# `min_child_samples` is a LightGBM alias of `min_data_in_leaf`; the original
# dict set both with different values (32 and 20). Only the primary name is
# kept so the dict no longer contradicts itself.
param = {'num_leaves': 31,
         'min_data_in_leaf': 32,
         'objective': 'regression',
         'max_depth': -1,
         'learning_rate': 0.005,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "nthread": 4,
         "verbosity": -1}

Hyperparameter reference: https://www.kaggle.com/fabiendaniel/hyperparameter-tuning

In [None]:
# LightGBM regression parameters used for training.
param = dict(
    num_leaves=111,
    min_data_in_leaf=149,
    objective='regression',
    max_depth=9,
    learning_rate=0.005,
    boosting="gbdt",
    feature_fraction=0.7522,
    bagging_freq=1,
    bagging_fraction=0.7083,
    bagging_seed=11,
    metric='rmse',
    lambda_l1=0.2634,
    random_state=133,
    verbosity=-1,
)

In [None]:
# 5-fold cross-validated LightGBM training. Collects out-of-fold predictions
# (`oof`), fold-averaged test predictions (`predictions`) and per-fold feature
# importances (`feature_importance_df`).
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4590)
#from sklearn.model_selection import StratifiedKFold,KFold,RepeatedKFold
#from sklearn.model_selection import RepeatedKFold
#folds = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4520)

# StratifiedKFold needs discrete class labels and rejects a continuous
# regression target. Stratify on the outlier flag instead (target < -30, the
# same rule used for the `outliers` column), so every fold gets a similar
# share of outliers.
outlier_flag = (target < -30).astype(int).values

oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))
start = time.time()
feature_importance_df = pd.DataFrame()

num_round = 10000
# LightGBM 4 removed the `verbose_eval` / `early_stopping_rounds` arguments of
# `lgb.train` in favour of callbacks. Use callbacks when `log_evaluation` is
# available and keep the legacy arguments on older releases.
if hasattr(lgb, 'log_evaluation'):
    train_kwargs = {'callbacks': [lgb.early_stopping(stopping_rounds=200),
                                  lgb.log_evaluation(period=100)]}
else:
    train_kwargs = {'verbose_eval': 100, 'early_stopping_rounds': 200}

# Only the number of rows of the first argument matters for the split, so the
# frame is passed directly instead of materialising `train_df.values`.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df, outlier_flag)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][features],
                           label=target.iloc[trn_idx],
                           categorical_feature=categorical_feats
                          )
    val_data = lgb.Dataset(train_df.iloc[val_idx][features],
                           label=target.iloc[val_idx],
                           categorical_feature=categorical_feats
                          )

    clf = lgb.train(param,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    **train_kwargs)

    # Out-of-fold predictions at the early-stopped iteration.
    oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    # Each fold contributes an equal share to the final test prediction.
    predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits

# RMSE of the out-of-fold predictions.
print("CV score: {:<8.5f}".format(mean_squared_error(oof, target)**0.5))

## Feature importance

In [None]:
# Rank features by their importance averaged over the folds and keep at most
# the top 1000 names.
mean_importance = feature_importance_df[["feature", "importance"]].groupby("feature").mean()
cols = mean_importance.sort_values(by="importance", ascending=False).head(1000).index

# Per-fold rows of the selected features, ordered by importance for plotting.
best_features = feature_importance_df[feature_importance_df["feature"].isin(cols)]
ranked_rows = best_features.sort_values(by="importance", ascending=False)

plt.figure(figsize=(14, 25))
sns.barplot(x="importance", y="feature", data=ranked_rows)
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
# Save before show() so the written image contains the rendered figure.
plt.savefig('lgbm_importances_v5.png')
plt.show()

## Submission

In [None]:
# Submission file: one row per test card with its fold-averaged prediction.
sub_df = pd.DataFrame({
    "card_id": test_df["card_id"].values,
    "target": predictions,
})
sub_df.to_csv("../submit_v4_strat.csv", index=False)