In [1]:
import ast
import pdb
import pickle
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
from sklearn.metrics import matthews_corrcoef

In [2]:
# Single place to repoint the notebook at another copy of the dataset.
DATA_DIR = 'D:/_Citi Hackathon/Dataset'

# Raw train/test data; the first CSV column becomes the index.
train = pd.read_csv(DATA_DIR + '/Train.csv', index_col=[0])
test = pd.read_csv(DATA_DIR + '/Test_fin.csv', index_col=0)
# Pre-computed pair-level feature table, indexed by the pair key stored in the
# first CSV column.
pair_train = pd.read_csv(DATA_DIR + '/groupDF.csv', index_col=0)

  mask |= (ar1 == a)


In [3]:
# Preview the first rows of the pair-level feature table; the index holds the pair key.
pair_train.head()

Unnamed: 0,n_txn,payroll_ind,rem_company_id_dummy,txn_amt_var,txn_amt_mean,txn_rate,avg_txn_time,varTxnGap,avgTxnPerDay,dd_mean,dd_var,comp_n_ind,dd_diff_comp,dd_var_ratio_comp,txn_amt_diff_comp,txn_amt_var_ratio_comp,txnPerDay_ratio_comp,txnGap_diff_comp,txnGap_var_ratio_comp
"(5000000, 8000000)",3,0,1000000000,0.000954,1.480575,0.025641,58.5,90.25,1.0,11.333333,24.222222,1,4.986326,0.418322,3.380077,1e-06,0.090557,-58.362122,329.636961
"(5000000, 8538500)",4,0,1000000000,0.038843,1.337178,0.25,5.333333,11.555556,1.0,11.0,38.0,1,5.319659,0.656267,3.523474,5.5e-05,0.090557,-5.195456,42.206518
"(5000000, 8813800)",3,0,1000000000,0.465905,1.553873,0.081081,18.5,72.25,1.0,15.666667,16.888889,1,0.652993,0.291674,3.306778,0.00066,0.090557,-18.362122,263.892193
"(5000000, 9768200)",7,0,1000000000,0.003921,1.081052,0.041667,28.0,73.0,1.0,21.285714,47.346939,1,4.966055,0.817691,3.7796,6e-06,0.090557,-27.862122,266.631559
"(5000000, 9784300)",2,0,1000000000,0.0,1.218439,0.181818,11.0,0.0,1.0,9.5,30.25,1,6.819659,0.522423,3.642212,0.0,0.090557,-10.862122,0.0


In [4]:
# pair_train is loaded from CSV, so each index label is the *string* form of the
# rem/bene pair, e.g. "(5000000, 8000000)". Indexing that string with [1] yields
# the character '5' instead of the beneficiary id, so parse the label back into
# a tuple first. Labels that already are tuples are used unchanged.
pair_train['bene_act_no_dummy'] = [
    (ast.literal_eval(pair_key) if isinstance(pair_key, str) else pair_key)[1]
    for pair_key in pair_train.index
]

In [5]:
def date_formatting(data):
    '''
    Converts date from str to datetime, adds DD column (both in place)

    Input:
    -------
    data: raw test/train data with a 'transaction_val_dt' column holding
          '%Y-%m-%d' dates; the frame is modified in place

    Returns:
    -------
    None. After the call 'transaction_val_dt' is datetime, 'transaction_DD'
    holds the day of the month, and the 'yearmonth' column (if present) is gone.
    '''
    data['transaction_val_dt'] = pd.to_datetime(data['transaction_val_dt'], format='%Y-%m-%d')
    # Vectorised day-of-month accessor instead of a Python lambda per row.
    data['transaction_DD'] = data['transaction_val_dt'].dt.day
    # errors='ignore' makes the function safe to re-run on the same frame and
    # usable on data that never had a 'yearmonth' column.
    data.drop('yearmonth', axis=1, inplace=True, errors='ignore')
    
def getGapVar(x):
    '''
    Population variance (ddof=0) of the gaps, in whole days, between
    consecutive transaction dates.

    Input:
    -------
    x: DataFrame (e.g. one groupby group) with a datetime 'transaction_val_dt' column

    Returns 0 when x has at most one row.
    '''
    if len(x) <= 1:
        return 0
    ordered_dates = x['transaction_val_dt'].sort_values()
    # The first difference has no predecessor (NaT), so skip it positionally.
    day_gaps = ordered_dates.diff().iloc[1:].dt.days
    return day_gaps.var(ddof=0)
    
def getAvgTxnPerDay(x):
    '''
    Average number of rows per distinct 'transaction_val_dt' value.

    Input:
    -------
    x: DataFrame (e.g. one groupby group) with a 'transaction_val_dt' column
    '''
    n_txn = len(x)
    n_active_dates = x['transaction_val_dt'].nunique()
    return n_txn / n_active_dates

In [6]:
def add_bene_features_pair(data, pair_data):
    '''
    Creates beneficiary related features for rem_bene pair data

    Each feature compares a pair-level statistic already present in pair_data
    with the same statistic computed over all rows of data that share the
    pair's bene_act_no_dummy. The new columns are written to pair_data in
    place; nothing is returned.

    Input:
    -------
    data :      train/test data with DD column added
    pair_data:  pair test/train data (with DD column added, without dropping bene_act_no_dummy)

    Columns added to pair_data:
    -------
    bene_ind_count, bene_DD_mode_diff, bene_DD_var_ratio, avg_bene_txn_diff,
    bene_txn_var_ratio, txnGap_diff_bene, txnGap_var_ratio_bene,
    txnPerDay_ratio_bene

    Notes:
    -------
    Every bene_act_no_dummy in pair_data is expected to occur in data
    (recent pandas versions raise KeyError on a missing label).
    Errors are propagated to the caller instead of opening a debugger.
    '''

    bene = data.groupby('bene_act_no_dummy')
    bene_keys = pair_data['bene_act_no_dummy'].values

    def per_pair(bene_stat):
        # Look up a per-beneficiary statistic for every pair row and return the
        # raw values. Stripping the beneficiary index is essential: pair_data
        # is indexed by pair key, so index-aligned arithmetic between the two
        # would match no labels (and fails on repeated beneficiaries).
        return bene_stat.loc[bene_keys].values

    # number of industries bene is mapped to
    pair_data['bene_ind_count'] = per_pair(bene['rem_company_ind'].nunique())

    # DD abs difference with mean DD (column name says "mode" but the mean is
    # used; the name is kept so downstream code keeps working)
    bene_DD_mean = per_pair(bene['transaction_DD'].mean())
    pair_data['bene_DD_mode_diff'] = abs(bene_DD_mean - pair_data['dd_mean'])

    # Ratio of pair variance of DD and bene variance of DD
    bene_DD_var = per_pair(bene['transaction_DD'].var(ddof=0))
    pair_data['bene_DD_var_ratio'] = pair_data['dd_var'] / bene_DD_var

    # diff of mean txn_amount for bene
    bene_txn_mean = per_pair(bene['txn_amt'].mean())
    pair_data['avg_bene_txn_diff'] = bene_txn_mean - pair_data['txn_amt_mean']

    # Ratio of pair variance of txn and bene variance of txn
    bene_txn_var = per_pair(bene['txn_amt'].var(ddof=0))
    pair_data['bene_txn_var_ratio'] = pair_data['txn_amt_var'] / bene_txn_var

    # Gaps info: average gap = (last date - first date) / (number of txns - 1).
    # A beneficiary with a single transaction gives 0/0, i.e. NaN.
    txn_dates = bene['transaction_val_dt']
    delta = (txn_dates.max() - txn_dates.min()).dt.days
    avgTxnGap = delta / (bene['payroll_ind'].count() - 1)
    varTxnGap = bene.apply(getGapVar)

    pair_data['txnGap_diff_bene'] = per_pair(avgTxnGap) - pair_data['avg_txn_time']
    pair_data['txnGap_var_ratio_bene'] = pair_data['varTxnGap'] / per_pair(varTxnGap)

    # avg transactions per day of pair/avg transactions per day of bene
    bene_avgTxnPerDay = bene.apply(getAvgTxnPerDay)
    pair_data['txnPerDay_ratio_bene'] = pair_data['avgTxnPerDay'] / per_pair(bene_avgTxnPerDay)
        
    

In [7]:
# Mutates train in place: parses transaction_val_dt and adds the transaction_DD column.
date_formatting(train)

In [8]:
# List the pair-level columns; bene_act_no_dummy should now appear at the end.
pair_train.columns

Index(['n_txn', 'payroll_ind', 'rem_company_id_dummy', 'txn_amt_var',
       'txn_amt_mean', 'txn_rate', 'avg_txn_time', 'varTxnGap', 'avgTxnPerDay',
       'dd_mean', 'dd_var', 'comp_n_ind', 'dd_diff_comp', 'dd_var_ratio_comp',
       'txn_amt_diff_comp', 'txn_amt_var_ratio_comp', 'txnPerDay_ratio_comp',
       'txnGap_diff_comp', 'txnGap_var_ratio_comp', 'bene_act_no_dummy'],
      dtype='object')

In [None]:
# Adds the beneficiary comparison columns to pair_train in place. train must already
# have been passed through date_formatting, since the function reads transaction_DD.
add_bene_features_pair(train, pair_train)