**FEEL FREE TO UPVOTE**  （＾ｖ＾）

In [0]:
import os
import numpy as np
import pandas as pd
import datetime
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

from contextlib import contextmanager
from pandas.core.common import SettingWithCopyWarning
import warnings
# Silence every warning for the rest of the notebook (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Seed NumPy's global random number generator.
np.random.seed(4590)

In [0]:
# Seeds NumPy's global RNG with 4590 (the import cell already applies the same seed).
np.random.seed(4590)

In [0]:
# Show the entries available under ../input.
print(os.listdir('../input'))

['fork-of-continued', 'elo-blend', 'elo-dataset', 'continued', 'elo-merchant-category-recommendation']


In [0]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype that fits.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched. Integer columns are tried against int8, int16, int32, int64;
    float columns against float16, float32 and otherwise become float64.

    Bounds are inclusive, so a column whose values sit exactly on a dtype limit
    (e.g. -128 or 127) still gets the narrow dtype.

    NOTE: float16 keeps only about three significant decimal digits, so values
    that merely *fit the range* can still lose precision after the cast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = {'int16', 'int32', 'int64', 'float16', 'float32', 'float64'}
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type) not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type).startswith('int'):
            # Smallest integer type whose range contains [c_min, c_max].
            # An empty column yields NaN bounds, matches nothing and is left as is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or NaN bounds (all-NaN / empty column).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid a 0/0 percentage when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [0]:
# Card-level tables. They are bound to df_train / df_test because every later cell
# (aggregate merges, feature engineering, model training) refers to those names;
# binding them to train / test left df_train undefined on a fresh kernel.
df_train = pd.read_csv('../input/elo-merchant-category-recommendation/train.csv', parse_dates=["first_active_month"])
df_test = pd.read_csv('../input/elo-merchant-category-recommendation/test.csv', parse_dates=["first_active_month"])
# Transaction-level tables, aggregated per card_id further down.
df_hist_trans = pd.read_csv('../input/elo-merchant-category-recommendation/historical_transactions.csv')
df_new_merchant_trans = pd.read_csv('../input/elo-merchant-category-recommendation/new_merchant_transactions.csv')

In [0]:
def get_new_columns(name,aggs):
    """Return flat column names of the form ``<name>_<column>_<agg>``.

    ``aggs`` maps a column name to an iterable of aggregation names; the result
    follows the mapping's iteration order, then each column's aggregation order.
    """
    flat_names = []
    for column, funcs in aggs.items():
        for func in funcs:
            flat_names.append(name + '_' + column + '_' + func)
    return flat_names

In [0]:
# Fill missing categorical values in both transaction tables with fixed defaults.
# The result is assigned back explicitly: calling fillna(inplace=True) on a column
# selected with df[col] mutates an intermediate object and is not guaranteed to
# reach the parent frame (it does not under pandas copy-on-write).
fill_values = {'category_2': 1.0, 'category_3': 'A', 'merchant_id': 'M_ID_00a6ca8a8a'}
for df in [df_hist_trans,df_new_merchant_trans]:
    for col, value in fill_values.items():
        df[col] = df[col].fillna(value)

In [0]:
# Downcast the numeric columns of both transaction tables (historical first, then new-merchant).
df_hist_trans, df_new_merchant_trans = [
    reduce_mem_usage(frame) for frame in (df_hist_trans, df_new_merchant_trans)
]

In [0]:
# Derive calendar, flag and ratio features on both transaction tables.
# NOTE: this cell is not idempotent - the 'Y'/'N' maps turn already-mapped columns
# into NaN if it is run a second time on the same frames.
# NOTE(review): month_diff is measured against datetime.today(), so its values
# depend on the day the notebook is executed.
for df in [df_hist_trans,df_new_merchant_trans]:
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    purchase_dt = df['purchase_date'].dt
    df['year'] = purchase_dt.year
    df['day'] = purchase_dt.day
    if hasattr(purchase_dt, 'isocalendar'):
        # .dt.weekofyear no longer exists in recent pandas; isocalendar().week is
        # its replacement but returns a nullable UInt32 column, so convert it back
        # to the plain dtype weekofyear used to produce (float only if NaT is present).
        iso_week = purchase_dt.isocalendar().week
        df['weekofyear'] = iso_week.astype('float64') if iso_week.isna().any() else iso_week.astype('int64')
    else:
        df['weekofyear'] = purchase_dt.weekofyear
    df['month'] = purchase_dt.month
    df['dayofweek'] = purchase_dt.dayofweek
    # Saturday/Sunday are weekday 5 and 6.
    df['weekend'] = (purchase_dt.weekday >=5).astype(int)
    df['hour'] = purchase_dt.hour
    df['authorized_flag'] = df['authorized_flag'].map({'Y':1, 'N':0})
    df['category_1'] = df['category_1'].map({'Y':1, 'N':0})
    # Whole 30-day periods between the purchase and today, shifted by month_lag.
    df['month_diff'] = ((datetime.datetime.today() - df['purchase_date']).dt.days)//30
    df['month_diff'] += df['month_lag']
    # Rows with installments == 0 produce +/-inf here.
    df['price']=df['purchase_amount']/df['installments']
    # duplicates='drop' merges coinciding quantile edges; without it qcut raises
    # ValueError when a value is so frequent that two edges are equal.
    df['purchase_amount_quantiles']= pd.qcut(df['purchase_amount'], 5, labels=False, duplicates='drop')
    df['installments_quantiles']= pd.qcut(df['installments'], 5, labels=False, duplicates='drop')
gc.collect()

In [0]:
# Per-card aggregates of the historical transactions, merged into df_train / df_test.
# aggs maps a transaction column to the list of aggregations applied to it; the
# insertion order matters because get_new_columns() rebuilds the flat column names
# in the same order and they are assigned positionally to the aggregated frame.
aggs = {}
for col in ['month','hour','weekofyear','dayofweek','year','subsector_id','merchant_id','merchant_category_id']:
    aggs[col] = ['nunique']

aggs['purchase_amount'] = ['sum','max','min','mean','var']
aggs['installments'] = ['sum','max','min','mean','var']
aggs['purchase_date'] = ['max','min']
aggs['month_lag'] = ['max','min','mean','var']
aggs['month_diff'] = ['mean']
aggs['authorized_flag'] = ['sum', 'mean']
aggs['weekend'] = ['sum', 'mean']
aggs['category_1'] = ['sum', 'mean']
# Must be a plain list: a trailing comma here makes the value a tuple wrapping the
# list, and get_new_columns() then fails trying to concatenate str + list.
aggs['card_id'] = ['size']
aggs['installments_quantiles']=['var', 'mean', 'skew']
aggs['purchase_amount_quantiles']=['var', 'mean', 'skew']

# Mean purchase_amount of each category value, broadcast to every row, then averaged per card.
for col in ['category_2','category_3']:
    df_hist_trans[col+'_mean'] = df_hist_trans.groupby([col])['purchase_amount'].transform('mean')
    aggs[col+'_mean'] = ['mean']

new_columns = get_new_columns('hist',aggs)
df_hist_trans_group = df_hist_trans.groupby('card_id').agg(aggs)
df_hist_trans_group.columns = new_columns
df_hist_trans_group.reset_index(drop=False,inplace=True)
# Days between first and last purchase, and that span divided by the transaction count.
df_hist_trans_group['hist_purchase_date_diff'] = (df_hist_trans_group['hist_purchase_date_max'] - df_hist_trans_group['hist_purchase_date_min']).dt.days
df_hist_trans_group['hist_purchase_date_average'] = df_hist_trans_group['hist_purchase_date_diff']/df_hist_trans_group['hist_card_id_size']
# NOTE(review): measured against datetime.today(), so it changes with the run date.
df_hist_trans_group['hist_purchase_date_uptonow'] = (datetime.datetime.today() - df_hist_trans_group['hist_purchase_date_max']).dt.days
df_hist_trans_group['hist_of_hist_purchase_date_average']=df_hist_trans_group['hist_purchase_date_diff']*df_hist_trans_group['hist_purchase_date_average']

# Left merges keep every card; cards without historical rows get NaN aggregates.
df_train = df_train.merge(df_hist_trans_group,on='card_id',how='left')
df_test = df_test.merge(df_hist_trans_group,on='card_id',how='left')
del df_hist_trans_group
gc.collect()

In [0]:
# Per-card aggregates of the new-merchant transactions, merged into df_train / df_test
# under the 'new_hist' prefix. aggs maps a transaction column to the aggregations
# applied to it; insertion order matters because get_new_columns() rebuilds the flat
# column names in the same order and they are assigned positionally.
aggs = {}
for col in ['month','hour','weekofyear','dayofweek','year','subsector_id','merchant_id','merchant_category_id']:
    aggs[col] = ['nunique']

aggs['purchase_amount'] = ['sum','max','min','mean','var']
aggs['installments'] = ['sum','max','min','mean','var']
aggs['purchase_date'] = ['max','min']
aggs['month_lag'] = ['max','min','mean','var']
aggs['month_diff'] = ['mean']
aggs['authorized_flag'] = ['sum', 'mean']
aggs['weekend'] = ['sum', 'mean']
aggs['category_1'] = ['sum', 'mean']
# Must be a plain list: a trailing comma here makes the value a tuple wrapping the
# list, and get_new_columns() then fails trying to concatenate str + list.
aggs['card_id'] = ['size']
aggs['installments_quantiles']=['var', 'mean', 'skew']
aggs['purchase_amount_quantiles']=['var', 'mean', 'skew']


# Mean purchase_amount of each category value, broadcast to every row, then averaged per card.
for col in ['category_2','category_3']:
    df_new_merchant_trans[col+'_mean'] = df_new_merchant_trans.groupby([col])['purchase_amount'].transform('mean')
    aggs[col+'_mean'] = ['mean']

new_columns = get_new_columns('new_hist',aggs)
df_hist_trans_group = df_new_merchant_trans.groupby('card_id').agg(aggs)
df_hist_trans_group.columns = new_columns
df_hist_trans_group.reset_index(drop=False,inplace=True)
# Days between first and last purchase, and that span divided by the transaction count.
df_hist_trans_group['new_hist_purchase_date_diff'] = (df_hist_trans_group['new_hist_purchase_date_max'] - df_hist_trans_group['new_hist_purchase_date_min']).dt.days
df_hist_trans_group['new_hist_purchase_date_average'] = df_hist_trans_group['new_hist_purchase_date_diff']/df_hist_trans_group['new_hist_card_id_size']
# NOTE(review): measured against datetime.today(), so it changes with the run date.
df_hist_trans_group['new_hist_purchase_date_uptonow'] = (datetime.datetime.today() - df_hist_trans_group['new_hist_purchase_date_max']).dt.days
df_hist_trans_group['new_hist_of_new_hist_purchase_date_average']=df_hist_trans_group['new_hist_purchase_date_diff']*df_hist_trans_group['new_hist_purchase_date_average']

# Left merges keep every card; cards without new-merchant rows get NaN aggregates.
df_train = df_train.merge(df_hist_trans_group,on='card_id',how='left')
df_test = df_test.merge(df_hist_trans_group,on='card_id',how='left')
del df_hist_trans_group;gc.collect()

In [0]:
# Binary outlier flag: 1 where target is below -30, 0 everywhere else (NaN targets included).
df_train['outliers'] = (df_train['target'] < -30).astype('int64')
df_train['outliers'].value_counts()

In [0]:
# Card-level features built from first_active_month and the merged aggregates.
# NOTE: this cell is not idempotent - the feature_1..3 encoding at the end maps the
# already-encoded values to NaN if the cell is run twice.
for df in [df_train,df_test]:
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])
    first_active_dt = df['first_active_month'].dt
    df['dayofweek'] = first_active_dt.dayofweek
    if hasattr(first_active_dt, 'isocalendar'):
        # .dt.weekofyear no longer exists in recent pandas; isocalendar().week is
        # its replacement but returns a nullable UInt32 column. Convert it back to
        # a plain NumPy dtype (float with NaN when a date is missing) so the model
        # sees the same kind of column as before.
        iso_week = first_active_dt.isocalendar().week
        df['weekofyear'] = iso_week.astype('float64') if iso_week.isna().any() else iso_week.astype('int64')
    else:
        df['weekofyear'] = first_active_dt.weekofyear
    df['month'] = first_active_dt.month
    # NOTE(review): measured against datetime.today(), so it changes with the run date.
    df['elapsed_time'] = (datetime.datetime.today() - df['first_active_month']).dt.days
    df['hist_first_buy'] = (df['hist_purchase_date_min'] - df['first_active_month']).dt.days
    df['new_hist_first_buy'] = (df['new_hist_purchase_date_min'] - df['first_active_month']).dt.days
    # Turn the datetime aggregates into numbers: nanoseconds since epoch scaled to seconds.
    for f in ['hist_purchase_date_max','hist_purchase_date_min','new_hist_purchase_date_max',\
                     'new_hist_purchase_date_min']:
        df[f] = df[f].astype(np.int64) * 1e-9
    df['card_id_total'] = df['new_hist_card_id_size']+df['hist_card_id_size']
    df['purchase_amount_total'] = df['new_hist_purchase_amount_sum']+df['hist_purchase_amount_sum']
    # Uses the raw feature_1 value; the encoding below happens afterwards.
    df['days_feature1'] = df['elapsed_time'] * df['feature_1']
# Replace each feature_N category by the training-set outlier rate of that category.
for f in ['feature_1','feature_2','feature_3']:
    order_label = df_train.groupby([f])['outliers'].mean()
    df_train[f] = df_train[f].map(order_label)
    df_test[f] = df_test[f].map(order_label)

In [0]:
# Columns of df_train that are NOT used as model features: identifiers, labels and
# a hand-picked set of engineered columns.
excluded_columns = [
    'card_id', 'first_active_month', 'target', 'outliers',
    'hist_category_1_sum', 'new_hist_authorized_flag_mean',
    'new_hist_merchant_id_nunique', 'new_hist_month_lag_min',
    'new_hist_authorized_flag_sum', 'new_hist_month_nunique',
    'new_hist_card_id_size', 'hist_dayofweek_nunique',
    'new_hist_dayofweek_nunique', 'new_hist_weekofyear_nunique',
    'new_hist_weekend_sum', 'hist_year_nunique', 'new_hist_hour_nunique',
    'new_hist_year_nunique', 'new_hist_subsector_id_nunique',
    'new_hist_month_wise_purchase_amount_mean_mean', 'hist_installments_min',
    'new_hist_installments_max', 'hist_month_lag_min',
    'new_hist_installments_min', 'feature_2', 'feature_3',
    'hist_month_lag_max', 'new_hist_month_lag_max',
    'new_hist_category_2_mean_mean', 'new_hist_installments_sum',
    'new_hist_merchant_category_id_nunique', 'new_hist_weekend_mean',
    'new_hist_installments_var', 'new_hist_category_3_mean_mean',
    'new_hist_month_lag_var', 'dayofweek', 'feature_1',
    'hist_installments_max', 'month', 'hist_hour_nunique', 'card_id_total',
    'new_hist_month_diff_mean', 'hist_subsector_id_nunique',
    'hist_card_id_size', 'hist_weekend_sum',
]
# Every remaining column, in df_train's column order, is a model feature.
df_train_columns = []
for c in df_train.columns:
    if c not in excluded_columns:
        df_train_columns.append(c)

In [0]:
def score_feature_selection(df_train=None, df_train_columns=None, target=None, test_df=None):
    """Train a 10-fold XGBoost regressor and return its out-of-fold RMSE and test predictions.

    Folds are stratified on ``df_train['outliers']``. Each fold trains on the
    selected feature columns with early stopping on the held-out part, fills the
    out-of-fold vector for that part, and adds 1/n_splits of its test prediction
    to the running average.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame; must contain ``df_train_columns`` and an ``outliers`` column.
    df_train_columns : list of str
        Feature columns used for fitting and predicting.
    target : pandas.Series
        Regression target, positionally aligned with ``df_train``.
    test_df : pandas.DataFrame, optional
        Frame to predict on. Defaults to the notebook-level ``df_test`` so
        existing three-argument calls keep working.

    Returns
    -------
    (float, numpy.ndarray)
        RMSE of the out-of-fold predictions against ``target``, and the
        fold-averaged predictions for ``test_df``.
    """
    if test_df is None:
        # Backward-compatible fallback to the frame the notebook defines globally.
        test_df = df_test

    params = {
            'gpu_id': 0, 
            'objective': 'reg:linear', 
            'eval_metric': 'rmse', 
            'silent': True, 
            'booster': 'gbtree', 
            'n_jobs': 4, 
            'n_estimators': 2500, 
            'tree_method': 'hist', 
            'grow_policy': 'lossguide', 
            'max_depth': 12, 
            'seed': 538, 
            'colsample_bylevel': 0.9, 
            'colsample_bytree': 0.8, 
            'gamma': 0.0001, 
            'learning_rate': 0.006150886706231842, 
            'max_bin': 128, 
            'max_leaves': 47, 
            'min_child_weight': 40, 
            'reg_alpha': 10.0, 
            'reg_lambda': 10.0, 
            'subsample': 0.95}

    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=4590)
    oof = np.zeros(len(df_train))
    predictions = np.zeros(len(test_df))
    # Select the feature columns once instead of once per fold.
    train_features = df_train[df_train_columns]
    test_features = test_df[df_train_columns]

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train,df_train['outliers'])):
        print("fold {}".format(fold_))
        X_train, y_train = train_features.iloc[trn_idx], target.iloc[trn_idx]
        X_valid, y_valid = train_features.iloc[val_idx], target.iloc[val_idx]
        model = xgb.XGBRegressor(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=100, eval_metric='rmse',
            early_stopping_rounds=500)
        # Predict with the best iteration found by early stopping.
        oof[val_idx] = model.predict(X_valid, ntree_limit=model.best_ntree_limit)
        predictions += model.predict(test_features, ntree_limit=model.best_ntree_limit) / folds.n_splits

    k=np.sqrt(mean_squared_error(target, oof))
    return k, predictions

In [0]:
# Regression on the full training set (outliers included); the printed value is the
# first element returned by score_feature_selection.
print('no. of features:', len(df_train_columns))
gain_results, predictions = score_feature_selection(
    df_train=df_train,
    df_train_columns=df_train_columns,
    target=df_train['target'],
)
print('\t GAIN  : %.6f' % gain_results)

# Test predictions of this model, keyed by card_id.
best_sub = pd.DataFrame({"card_id": df_test["card_id"].values, "target": predictions})

no. of features: 73
fold 0
[08:40:55] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	validation_0-rmse:3.95795
Will train until validation_0-rmse hasn't improved in 500 rounds.
[100]	validation_0-rmse:3.78384
[200]	validation_0-rmse:3.72139
[300]	validation_0-rmse:3.69632
[400]	validation_0-rmse:3.68557
[500]	validation_0-rmse:3.67918
[600]	validation_0-rmse:3.67523
[700]	validation_0-rmse:3.67241
[800]	validation_0-rmse:3.67037
[900]	validation_0-rmse:3.6684
[1000]	validation_0-rmse:3.66711
[1100]	validation_0-rmse:3.66614
[1200]	validation_0-rmse:3.66508
[1300]	validation_0-rmse:3.66483
[1400]	validation_0-rmse:3.66458
[1500]	validation_0-rmse:3.6646
[1600]	validation_0-rmse:3.66479
[1700]	validation_0-rmse:3.66471
[1800]	validation_0-rmse:3.66474
[1900]	validation_0-rmse:3.66477
Stopping. Best iteration:
[1431]	validation_0-rmse:3.66442

fold 1
[08:43:26] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker

In [0]:
# Keep handles to the complete frames, then narrow df_train to the non-outlier rows
# for the next model. A later cell restores df_train / df_test from train / test.
train, test = df_train, df_test
df_train = df_train.loc[df_train['outliers'] == 0]

In [0]:
def score_feature_selection(df_train=None, df_train_columns=None, target=None, test_df=None):
    """Train a 10-fold XGBoost regressor and return its out-of-fold RMSE and test predictions.

    NOTE: this cell defines ``score_feature_selection`` again; a function of the
    same name is already defined in an earlier cell and is shadowed by this one.

    Folds are stratified on ``df_train['outliers']``. Each fold trains on the
    selected feature columns with early stopping on the held-out part, fills the
    out-of-fold vector for that part, and adds 1/n_splits of its test prediction
    to the running average.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame; must contain ``df_train_columns`` and an ``outliers`` column.
    df_train_columns : list of str
        Feature columns used for fitting and predicting.
    target : pandas.Series
        Regression target, positionally aligned with ``df_train``.
    test_df : pandas.DataFrame, optional
        Frame to predict on. Defaults to the notebook-level ``df_test`` so
        existing three-argument calls keep working.

    Returns
    -------
    (float, numpy.ndarray)
        RMSE of the out-of-fold predictions against ``target``, and the
        fold-averaged predictions for ``test_df``.
    """
    if test_df is None:
        # Backward-compatible fallback to the frame the notebook defines globally.
        test_df = df_test

    params = {
            'gpu_id': 0, 
            'objective': 'reg:linear', 
            'eval_metric': 'rmse', 
            'silent': True, 
            'booster': 'gbtree', 
            'n_jobs': 4, 
            'n_estimators': 2500, 
            'tree_method': 'hist', 
            'grow_policy': 'lossguide', 
            'max_depth': 12, 
            'seed': 538, 
            'colsample_bylevel': 0.9, 
            'colsample_bytree': 0.8, 
            'gamma': 0.0001, 
            'learning_rate': 0.006150886706231842, 
            'max_bin': 128, 
            'max_leaves': 47, 
            'min_child_weight': 40, 
            'reg_alpha': 10.0, 
            'reg_lambda': 10.0, 
            'subsample': 0.95}

    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=4590)
    oof = np.zeros(len(df_train))
    predictions = np.zeros(len(test_df))
    # Select the feature columns once instead of once per fold.
    train_features = df_train[df_train_columns]
    test_features = test_df[df_train_columns]

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train,df_train['outliers'])):
        print("fold {}".format(fold_))
        X_train, y_train = train_features.iloc[trn_idx], target.iloc[trn_idx]
        X_valid, y_valid = train_features.iloc[val_idx], target.iloc[val_idx]
        model = xgb.XGBRegressor(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=100, eval_metric='rmse',
            early_stopping_rounds=500)
        # Predict with the best iteration found by early stopping.
        oof[val_idx] = model.predict(X_valid, ntree_limit=model.best_ntree_limit)
        predictions += model.predict(test_features, ntree_limit=model.best_ntree_limit) / folds.n_splits

    k=np.sqrt(mean_squared_error(target, oof))
    return k, predictions

In [0]:
# Same regression, run on whatever df_train currently holds (the preceding cell
# restricts it to rows with outliers == 0).
print('no. of features:', len(df_train_columns))
gain_results, predictions = score_feature_selection(
    df_train=df_train,
    df_train_columns=df_train_columns,
    target=df_train['target'],
)
print('\t GAIN  : %.6f' % gain_results)

# Test predictions of this model, keyed by card_id.
model_without_outlier = pd.DataFrame({"card_id": df_test["card_id"].values, "target": predictions})

no. of features: 73
fold 0
[09:10:02] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	validation_0-rmse:1.80878
Will train until validation_0-rmse hasn't improved in 500 rounds.
[100]	validation_0-rmse:1.67213
[200]	validation_0-rmse:1.62227
[300]	validation_0-rmse:1.60187
[400]	validation_0-rmse:1.59174
[500]	validation_0-rmse:1.58612
[600]	validation_0-rmse:1.5824
[700]	validation_0-rmse:1.57985
[800]	validation_0-rmse:1.57803
[900]	validation_0-rmse:1.57683
[1000]	validation_0-rmse:1.57588
[1100]	validation_0-rmse:1.57516
[1200]	validation_0-rmse:1.57452
[1300]	validation_0-rmse:1.5741
[1400]	validation_0-rmse:1.57371
[1500]	validation_0-rmse:1.57337
[1600]	validation_0-rmse:1.57309
[1700]	validation_0-rmse:1.57288
[1800]	validation_0-rmse:1.57275
[1900]	validation_0-rmse:1.57257
[2000]	validation_0-rmse:1.57242
[2100]	validation_0-rmse:1.57222
[2200]	validation_0-rmse:1.57216
[2300]	validation_0-rmse:1.5721
[2400]	validation_0-rmse:1.57202

In [0]:
# Point df_train / df_test back at the frames stored in train / test.
df_train, df_test = train, test

In [0]:
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import log_loss

In [0]:
def score_feature_selecti_2(df_train=None, df_train_columns=None, target=None, test_df=None):
    """Train a 10-fold XGBoost model with a logistic objective; return log-loss and test scores.

    The model is an ``XGBRegressor`` configured with ``objective='binary:logistic'``
    and evaluated with log-loss. Each fold fits with early stopping on its held-out
    part, fills the out-of-fold vector, and adds 1/n_splits of its test prediction
    to the running average.

    NOTE(review): the splitter is a plain ``KFold``; the ``df_train['outliers']``
    argument handed to ``split`` does not stratify the folds. Confirm whether
    ``StratifiedKFold`` was intended, as in the regression scorer.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame; must contain ``df_train_columns`` and an ``outliers`` column.
    df_train_columns : list of str
        Feature columns used for fitting and predicting.
    target : pandas.Series
        Binary labels, positionally aligned with ``df_train``.
    test_df : pandas.DataFrame, optional
        Frame to predict on. Defaults to the notebook-level ``df_test`` so
        existing three-argument calls keep working.

    Returns
    -------
    (float, numpy.ndarray)
        Log-loss of the out-of-fold predictions against ``target``, and the
        fold-averaged predictions for ``test_df``.
    """
    if test_df is None:
        # Backward-compatible fallback to the frame the notebook defines globally.
        test_df = df_test

    params = {
            'gpu_id': 0, 
            'objective': 'binary:logistic', 
            'eval_metric': 'logloss', 
            'silent': True, 
            'booster': 'gbtree', 
            'n_jobs': 4, 
            'n_estimators': 2500, 
            'tree_method': 'hist', 
            'grow_policy': 'lossguide', 
            'max_depth': 12, 
            'seed': 538, 
            'colsample_bylevel': 0.9, 
            'colsample_bytree': 0.8, 
            'gamma': 0.0001, 
            'learning_rate': 0.006150886706231842, 
            'max_bin': 128, 
            'max_leaves': 47, 
            'min_child_weight': 40, 
            'reg_alpha': 10.0, 
            'reg_lambda': 10.0, 
            'subsample': 0.95}

    folds = KFold(n_splits=10, shuffle=True, random_state=4590)
    oof = np.zeros(len(df_train))
    predictions = np.zeros(len(test_df))
    # Select the feature columns once instead of once per fold.
    train_features = df_train[df_train_columns]
    test_features = test_df[df_train_columns]

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train,df_train['outliers'])):
        print("fold {}".format(fold_))
        X_train, y_train = train_features.iloc[trn_idx], target.iloc[trn_idx]
        X_valid, y_valid = train_features.iloc[val_idx], target.iloc[val_idx]
        model = xgb.XGBRegressor(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=100, eval_metric='logloss',
            early_stopping_rounds=500)
        # Predict with the best iteration found by early stopping.
        oof[val_idx] = model.predict(X_valid, ntree_limit=model.best_ntree_limit)
        predictions += model.predict(test_features, ntree_limit=model.best_ntree_limit) / folds.n_splits

    k=log_loss(target, oof)
    return k, predictions

In [0]:
# Fit the logistic model on the 'outliers' label; the printed value is the first
# element returned by score_feature_selecti_2.
print('no. of features:', len(df_train_columns))
gain_results, predictions = score_feature_selecti_2(
    df_train=df_train,
    df_train_columns=df_train_columns,
    target=df_train['outliers'],
)
print('\t GAIN  : %.6f' % gain_results)

# Per-card score of the classifier on the test set, stored in the 'target' column.
df_outlier_prob = pd.DataFrame({"card_id": df_test["card_id"].values, "target": predictions})

no. of features: 73
fold 0
[09:45:36] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	validation_0-logloss:0.687291
Will train until validation_0-logloss hasn't improved in 500 rounds.
[100]	validation_0-logloss:0.330256
[200]	validation_0-logloss:0.184013
[300]	validation_0-logloss:0.115438
[400]	validation_0-logloss:0.081665
[500]	validation_0-logloss:0.064783
[600]	validation_0-logloss:0.056341
[700]	validation_0-logloss:0.052098
[800]	validation_0-logloss:0.049974
[900]	validation_0-logloss:0.048927
[1000]	validation_0-logloss:0.048379
[1100]	validation_0-logloss:0.048076
[1200]	validation_0-logloss:0.047903
[1300]	validation_0-logloss:0.047815
[1400]	validation_0-logloss:0.047763
[1500]	validation_0-logloss:0.04775
[1600]	validation_0-logloss:0.047733
[1700]	validation_0-logloss:0.047717
[1800]	validation_0-logloss:0.047729
[1900]	validation_0-logloss:0.047745
[2000]	validation_0-logloss:0.047765
[2100]	validation_0-logloss:0.047781
Stopp

In [0]:
# Rows of df_outlier_prob whose score exceeds 0.5, and the card ids of those rows.
outlier_id = df_outlier_prob.loc[df_outlier_prob['target'] > 0.5]
outliers = outlier_id['card_id']

In [0]:
# For every card in outlier_id, fetch its prediction from best_sub.
# The join must be restricted to card_id: outlier_id also has a 'target' column
# (the classifier score), and a merge without `on=` joins on every shared column,
# which would match on (card_id, target) and return the classifier scores instead
# of best_sub's predictions.
most_likely_liers = best_sub.merge(outlier_id[['card_id']], on='card_id', how='right')
most_likely_liers.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-2.080751
1,C_ID_b709037bc5,-1.010545
2,C_ID_f7cada36d3,0.452038
3,C_ID_6d8dba8475,-0.770577
4,C_ID_7f1041e8e1,-3.868524


In [0]:
%%time
for card_id in most_likely_liers['card_id']:
    model_without_outlier.loc[model_without_outlier['card_id']==card_id,'target']\
    = most_likely_liers.loc[most_likely_liers['card_id']==card_id,'target'].values

CPU times: user 6min 20s, sys: 332 ms, total: 6min 21s
Wall time: 6min 21s


In [0]:
# Write the combined predictions (card_id, target) to the submission file, without the index column.
model_without_outlier.to_csv("combining_submission_xgb.csv", index=False)