In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import pickle

from scipy.stats import pointbiserialr, kruskal, ttest_ind, f_oneway, shapiro, mannwhitneyu, levene
import pingouin as pg

import warnings
warnings.filterwarnings('ignore')

from category_encoders import WOEEncoder, BaseNEncoder

In [2]:
# Load two pickled objects saved by an earlier session.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load these files if they come from a trusted source.
# Context managers close the file handles, which the bare open() calls leaked.
with open('fImpReal.fn', 'rb') as fh:
    fImpReal = pickle.load(fh)
with open('getTransactions.fn', 'rb') as fh:
    getTransactions = pickle.load(fh)

In [3]:
# Read the training set (first CSV column becomes the index) and parse the
# 'YYYY-MM-DD' transaction dates in one vectorized call instead of running
# datetime.strptime once per row.
train = pd.read_csv('Data/Train.csv', index_col=[0])
trans_date = pd.to_datetime(train['transaction_val_dt'], format='%Y-%m-%d')
train['transaction_val_dt'] = trans_date

# Same treatment for the test set.
test = pd.read_csv('Data/Test_fin.csv', index_col=0)
trans_date = pd.to_datetime(test['transaction_val_dt'], format='%Y-%m-%d')
test['transaction_val_dt'] = trans_date

In [4]:
# Day of month of each transaction, taken through the vectorized .dt accessor
# rather than a row-by-row DataFrame.apply.
train['dd'] = train['transaction_val_dt'].dt.day
# Group rows by remitting company; reused by the per-company aggregates below.
comp_gps = train.groupby('rem_company_id_dummy')

In [5]:
# Company ids present in the grouping (iterating the groups mapping yields its keys).
comp_names = [*comp_gps.groups]

In [6]:
def fn(x):
    """Count the distinct values in ``x``, not counting the 'Unknown' placeholder.

    Parameters
    ----------
    x : array-like
        Values to count (e.g. the array produced by a grouped ``.unique()``).

    Returns
    -------
    int
        Number of distinct values. If 'Unknown' is present alongside at least
        one other value it is excluded from the count; if it is the only
        value the count stays 1.
    """
    # Flatten to a 1-D object array so lists, ndarrays and Series all behave
    # the same. A bare `'Unknown' in series` would test the index, not the values.
    flat = np.asarray(x, dtype=object).ravel()
    # pd.unique is hash-based: unlike np.unique it does not sort, so it does
    # not raise on object arrays that mix strings with NaN.
    values = pd.unique(flat)
    n_inds = len(values)
    has_unknown = any(isinstance(v, str) and v == 'Unknown' for v in values)
    if n_inds == 1 or not has_unknown:
        return n_inds
    return n_inds - 1
    
# Per-company count of distinct 'rem_company_ind' values, as computed by fn.
n_ind = comp_gps['rem_company_ind'].unique().apply(fn)
# Previously the identical groupby/unique/apply pipeline was run a second time;
# reuse the result instead. The copy keeps the two names independent objects.
multi_ind = n_ind.copy()

In [7]:
def get_n_ind(x):
    """Look up the entry of the module-level ``multi_ind`` for this row's company.

    ``x`` is a row-like mapping with a 'rem_company_id_dummy' key; the value
    stored in ``multi_ind`` under that company id is returned.
    """
    return multi_ind[x['rem_company_id_dummy']]

# Attach each row's per-company value from multi_ind. Series.map does the
# lookup in one vectorized pass instead of calling a Python function per row.
# A company id missing from multi_ind would yield NaN here rather than a KeyError.
train['comp_n_ind'] = train['rem_company_id_dummy'].map(multi_ind)

In [8]:
# Mean day-of-month ('dd') per remitting company.
comp_dd_means = comp_gps['dd'].agg('mean')

def get_dd_props(x):
    """Return how far this row's day-of-month is from its company's mean.

    ``x`` is a row-like mapping with 'rem_company_id_dummy' and 'dd' keys.
    The company's mean is read from the module-level ``comp_dd_means`` and the
    absolute difference to ``x['dd']`` is returned.
    """
    # The old `global comp_dd_means, comp_dd_vars` line is gone: the lookup is
    # read-only, and `comp_dd_vars` is not defined anywhere in the visible notebook.
    comp = x['rem_company_id_dummy']
    return abs(comp_dd_means[comp] - x['dd'])

# Absolute gap between each transaction's day-of-month and its company's mean
# day, computed with a vectorized map and subtraction instead of a per-row apply.
dd_props = (train['rem_company_id_dummy'].map(comp_dd_means) - train['dd']).abs()
train['dd_diff_comp'] = dd_props

In [9]:
# Mean transaction amount ('txn_amt') per remitting company.
comp_txn_means = comp_gps['txn_amt'].agg('mean')

def get_txn_props(x):
    """Return the company's mean transaction amount minus this row's amount.

    ``x`` is a row-like mapping with 'rem_company_id_dummy' and 'txn_amt' keys.
    The result is signed: negative when the row's amount exceeds the mean held
    in the module-level ``comp_txn_means``.
    """
    company_mean = comp_txn_means[x['rem_company_id_dummy']]
    return company_mean - x['txn_amt']

# Signed gap (company mean minus row amount) for every transaction, computed
# with a vectorized map and subtraction instead of a per-row apply.
txn_props = train['rem_company_id_dummy'].map(comp_txn_means) - train['txn_amt']
train['txn_amt_diff_comp'] = txn_props

In [10]:
# Whole days elapsed since the earliest transaction date in the training set.
# Series.min() and .dt.days stay vectorized; the builtin min() and the
# per-element lambda iterated over every row in Python.
train['time'] = (train['transaction_val_dt'] - train['transaction_val_dt'].min()).dt.days
# Encode the two labels as 1/0.
# NOTE(review): this replace runs over every column of the frame, not just
# one; if only a single column is meant to carry these labels, consider
# scoping the replace to that column.
train.replace({'DOMESTIC':1,'CROSS BORDER':0}, inplace=True)

In [None]:
# Identifier columns that get replaced by weight-of-evidence scores.
woe_cols = ['rem_company_id_dummy', 'rem_act_no_dummy', 'bene_act_no_dummy']

# Base-2 encoder for the 'rem_company_ind' column.
enc1 = BaseNEncoder(cols=['rem_company_ind'], base=2).fit(train['rem_company_ind'])

# Weight-of-evidence encoder for the id columns, fitted against 'payroll_ind'.
# NOTE(review): the encoder is fitted and applied on the same rows, so each
# encoded value was derived with that row's own label included; confirm this
# is acceptable for whatever consumes the output.
X = train[woe_cols]
y = train['payroll_ind']
enc2 = WOEEncoder(cols=list(woe_cols)).fit(X, y)

# Append the encoded 'rem_company_ind' columns by index alignment.
encoded = enc1.transform(train['rem_company_ind'])
train = train.merge(encoded, left_index=True, right_index=True)

# Overwrite the raw id columns with their encoded values.
train[woe_cols] = enc2.transform(train[woe_cols])

# Drop the columns that are no longer needed once the features are derived.
train.drop(columns=['transaction_val_dt', 'yearmonth', 'rem_company_ind'], inplace=True)

In [26]:
# Display the training frame after encoding (the rendered output below is
# truncated to the first and last rows).
train

Unnamed: 0_level_0,rem_company_id_dummy,rem_act_no_dummy,bene_act_no_dummy,txn_amt,txn_type,payroll_ind,dd,comp_n_ind,dd_diff_comp,txn_amt_diff_comp,time,rem_company_ind_0,rem_company_ind_1,rem_company_ind_2,rem_company_ind_3,rem_company_ind_4
txn_refr_key_dummy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,-1.269904,-4.232049,0.986955,1.458730,0,0,17,1,0.680341,3.401921,16,0,0,0,0,1
4,-3.598650,-10.456369,0.986955,1.269716,0,0,30,1,3.671543,0.796466,29,0,0,0,1,0
5,-4.630725,-4.627996,0.000000,1.190682,0,0,22,1,10.410000,0.204006,21,0,0,0,1,1
7,-5.651940,-5.449196,0.293808,5.657271,0,0,29,1,15.574935,4.730780,28,0,0,1,0,0
8,-3.598650,-10.456369,0.581490,1.239628,0,0,29,1,2.671543,0.826553,28,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3220210,-3.905272,-0.024646,0.293808,24383.318059,1,0,11,1,5.372180,-22129.005488,10,0,1,0,0,0
3220217,-3.905272,-1.875246,-0.334801,1374.240770,1,0,29,1,12.627820,880.071800,28,0,1,0,0,0
3220218,-2.159350,-2.159350,-1.290312,4701.683817,0,0,28,1,12.619565,-3487.027372,149,0,0,1,1,0
3220222,2.843253,-1.153111,-0.922588,1664.592870,0,0,27,1,7.426087,-1528.723407,87,0,0,0,1,1


In [27]:
# Persist the engineered training frame; the index is written as the first column.
train.to_csv('trainClean.csv', index=True)