In [39]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime
from datetime import datetime as dt
import os
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
# print(os.listdir("../input"))


#### Assumes all input CSV files are saved in an `all/` folder in the same directory as this notebook

In [3]:
def read_train(folder):
    """Load the card-level training table and derive date features.

    Parameters
    ----------
    folder : str
        Directory containing ``train.csv``. An empty/falsy value falls back
        to ``'all'`` so the existing ``read_train('')`` call keeps reading
        ``all/train.csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus three derived columns:
        ``fdt`` (``first_active_month`` parsed as datetime),
        ``yr`` (calendar year of ``fdt``) and
        ``tenure`` (days between ``fdt`` and 2018-02-01, divided by 30).
    """
    data_dir = folder if folder else 'all'
    df = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    df['fdt'] = pd.to_datetime(df['first_active_month'])
    df['yr'] = df['fdt'].dt.year
    # Vectorized datetime arithmetic yields a timedelta64 Series, so `.dt.days`
    # is always available (object-dtype date subtraction does not guarantee it).
    df['tenure'] = (pd.Timestamp(2018, 2, 1) - df['fdt']).dt.days / 30
    return df

def read_txns(fname):
    """Read a transactions CSV and add date-derived and binary columns.

    ``purchase_date`` is parsed to datetime; ``purchase_month`` holds its
    month number and ``purchase_firstday`` is True when the day of month is 1.
    ``authorized_flag`` and ``category_1`` are mapped from 'Y'/'N' to 1/0.
    """
    txns = pd.read_csv(fname)

    # Parse once and reuse the parsed Series for the derived columns.
    stamps = pd.to_datetime(txns['purchase_date'])
    txns['purchase_date'] = stamps
    txns['purchase_month'] = stamps.dt.month
    txns['purchase_firstday'] = stamps.dt.day.eq(1)

    # Y/N flags become 1/0; any other value maps to NaN.
    yes_no = {'Y': 1, 'N': 0}
    txns['authorized_flag'] = txns['authorized_flag'].map(yes_no)
    txns['category_1'] = txns['category_1'].map(yes_no)

    return txns


def read_merchant(fname):
    """Read the merchants CSV without its ``category_1``/``category_2`` columns.

    Those two columns already exist in the transaction data, so they are
    dropped here to avoid duplicated column names when the merchant table is
    merged into the transactions.

    Parameters
    ----------
    fname : str
        Path to the merchants CSV file.

    Returns
    -------
    pandas.DataFrame
        The merchant table minus ``category_1`` and ``category_2``.
    """
    # `columns=` instead of a positional axis: the positional form was
    # deprecated in pandas 1.x and no longer works in pandas 2.x.
    df = pd.read_csv(fname).drop(columns=['category_1', 'category_2'])
    return df


def evaluate(y, pred):
    """Score predictions against targets.

    Returns a ``(rmse, summary)`` tuple: ``rmse`` is the root mean squared
    error over all rows, and ``summary`` is a DataFrame grouped by
    ``is_outlier`` (True where ``y < -33``) holding the mean/min/max/median
    absolute error and the row count of each group.
    """
    scored = (
        pd.DataFrame({'y': y, 'pred': pred})
        .assign(diff=lambda t: t['y'] - t['pred'])
        .assign(
            ab_diff=lambda t: t['diff'].abs(),
            diff_sq=lambda t: t['diff'] ** 2,
            is_outlier=lambda t: t['y'] < -33,
        )
    )

    # Per-group error statistics; key order fixes the output column order.
    stats = {
        'ab_diff': ['mean', 'min', 'max', 'median'],
        'y': ['count'],
    }
    summary = scored.groupby('is_outlier').agg(stats)

    rmse = np.sqrt(np.mean(scored['diff_sq']))
    return rmse, summary

# 1. Read data

In [4]:
# Load both transaction logs and the card-level training table.
# The three reads are independent of one another.
h_txs = read_txns('all/historical_transactions.csv')
n_txs = read_txns('all/new_merchant_transactions.csv')
cards = read_train('')

In [5]:
# Merchant attributes, loaded via read_merchant (which drops category_1/category_2).
merchant = read_merchant('all/merchants.csv')

In [6]:
# Stack historical and new-merchant transactions into one frame with a fresh
# 0..n-1 index. pd.concat replaces DataFrame.append, which was removed in
# pandas 2.0.
txs = pd.concat([h_txs, n_txs], ignore_index=True)

In [7]:
# Attach merchant attributes to every transaction; the left join keeps
# transactions whose merchant_id has no match in the merchant table.
# NOTE(review): txs_with_merchant is not referenced by the later cells visible here.
txs_with_merchant = txs.merge(merchant, how='left', on='merchant_id')

# Features

In [24]:
def agg_tx_features(df, prefix=""):
    """Aggregate transaction rows into one feature row per ``card_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions containing ``card_id`` and every column named in the
        aggregation spec below.
    prefix : str, optional
        Text prepended to every generated feature name.

    Returns
    -------
    pandas.DataFrame
        One row per card with ``card_id`` followed by columns named
        ``<prefix><column>_<statistic>``.
    """
    spec = {
        'purchase_amount': ['count', 'sum', 'mean', 'min', 'max', 'var', 'skew'],
        'merchant_id': ['nunique'],
        'installments': ['sum'],
        'authorized_flag': ['mean'],
        'category_1': ['count'],
        'category_2': ['count'],
        'category_3': ['count'],
        'state_id': ['nunique'],
        'city_id': ['nunique'],
        'purchase_date': ['min', 'max'],
        'purchase_month': ['nunique'],
        'month_lag': ['nunique', 'min', 'max'],
        'purchase_firstday': ['sum', 'mean'],
    }
    agg = df.groupby(by='card_id').agg(spec).reset_index()

    # Flatten the (column, statistic) MultiIndex into single-level names;
    # the first entry is the card_id key restored by reset_index().
    flat_names = ["card_id"]
    for column, stat in agg.columns.values[1:]:
        flat_names.append(prefix + f"{column}_{stat}".rstrip('_'))
    agg.columns = flat_names
    return agg

In [25]:
# Per-card aggregates over `txs`, i.e. the combined historical + new-merchant
# transactions; the "new_" prefix is only a column-name prefix.
agg_new = agg_tx_features(txs, "new_")

# Combine

In [26]:
# Left-join the per-card aggregates onto the training cards so that cards
# without any aggregated transactions are kept (with NaN features).
c1 = pd.merge(cards, agg_new, how='left', on='card_id')

# True for cards that received no aggregate row from the merge.
c1['no_new_tx'] = c1['new_purchase_amount_min'].isnull()

# True when the card's first active month is not later than the earliest
# purchase in the new-merchant transactions.
first_new_purchase = n_txs['purchase_date'].min()
c1['active_pre_newtx'] = c1['fdt'] <= first_new_purchase

In [27]:
# Replace every remaining missing value with 0 (e.g. the NaN features that
# the left merge produces for cards without aggregated transactions).
c1 = c1.fillna(0)

In [28]:
# Columns of `c1` used as model inputs.
feat = [
    # card-level attributes
    'yr',
    'tenure',
    # purchase amount statistics
    'new_purchase_amount_count',
    'new_purchase_amount_sum',
    'new_purchase_amount_mean',
    'new_purchase_amount_min',
    'new_purchase_amount_max',
    'new_purchase_amount_var',
    'new_purchase_amount_skew',
    # merchant / installment / authorization aggregates
    'new_merchant_id_nunique',
    'new_installments_sum',
    'new_authorized_flag_mean',
    # category counts
    'new_category_1_count',
    'new_category_2_count',
    'new_category_3_count',
    # geography
    'new_state_id_nunique',
    'new_city_id_nunique',
    # timing
    'new_purchase_month_nunique',
    'new_month_lag_nunique',
    'new_month_lag_min',
    'new_month_lag_max',
    'new_purchase_firstday_sum',
    'new_purchase_firstday_mean',
    # flags added after the merge
    'no_new_tx',
    'active_pre_newtx',
]

### Train with RandomForestRegressor & grid search CV

In [None]:
# Target vector and feature matrix for the card-level model.
y = c1.target
X = c1[feat]

# Hold out 20% of the cards for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

# Grid search over hyperparameters that RandomForestRegressor actually accepts.
# (colsample_bytree / min_child_weight are XGBoost parameters; passing them to
# a random forest makes the search fail with an invalid-parameter error.)
rf_param_grid = {
    'max_depth': [3, 5, 10],
    'n_estimators': [500, 1000, 3000],
}
# Seed the forest so repeated runs give the same model.
clf_cv = GridSearchCV(RandomForestRegressor(random_state=1), rf_param_grid, verbose=1)
clf_cv.fit(X_train, y_train)

# With the default refit=True, GridSearchCV has already retrained the best
# configuration on all of X_train, so reuse it instead of fitting again.
# (Uses clf_cv, not reg_cv: reg_cv belongs to the XGBoost search and is not
# defined at this point on a fresh kernel.)
clf = clf_cv.best_estimator_

In [None]:
# Score the random forest on the held-out split.
y_pred = clf.predict(X_test)
perf = evaluate(y_test, y_pred)

# evaluate() returns (rmse, per-group summary table).
rf_rmse, rf_summary = perf
print('rmse:' + str(rf_rmse))
print(rf_summary)

In [None]:
# Random-forest feature importances, labelled by feature name, largest first.
feature_importances = (
    pd.DataFrame({'importance': clf.feature_importances_}, index=X_train.columns)
    .sort_values(by='importance', ascending=False)
)

feature_importances

### Train with xgb & grid search CV

In [41]:
# Hyperparameter grid searched for the XGBoost regressor.
xgb_param_grid = {
    "colsample_bytree": [1.0],
    "min_child_weight": [1.0],
    'max_depth': [5, 10],
    'n_estimators': [1000, 3000],
}
gbm = xgb.XGBRegressor()
reg_cv = GridSearchCV(gbm, xgb_param_grid, verbose=1)
reg_cv.fit(X_train, y_train)

# Train a fresh regressor with the winning parameter combination.
gbm = xgb.XGBRegressor(**reg_cv.best_params_)
gbm.fit(X_train, y_train)

# Score it on the held-out split.
y_pred = gbm.predict(X_test)
perf = evaluate(y_test, y_pred)

print('rmse:' + str(perf[0]))
print(perf[1])

rmse:3.84834011613
                y    ab_diff                                 
            count       mean        min        max     median
is_outlier                                                   
False       39919   1.279213   0.000001  16.743164   0.906206
True          465  31.735289  27.394949  34.406123  31.868089


In [42]:
# XGBoost feature importances, labelled by feature name, largest first.
feature_importances = (
    pd.DataFrame({'importance': gbm.feature_importances_}, index=X_train.columns)
    .sort_values(by='importance', ascending=False)
)

feature_importances

Unnamed: 0,importance
new_authorized_flag_mean,0.146763
tenure,0.130935
new_month_lag_max,0.129496
new_month_lag_min,0.120863
new_installments_sum,0.080576
new_purchase_amount_skew,0.063309
new_merchant_id_nunique,0.048921
new_purchase_amount_min,0.047482
new_month_lag_nunique,0.035971
new_purchase_amount_max,0.030216
