In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import pickle

from scipy.stats import pointbiserialr, kruskal, ttest_ind, f_oneway, shapiro, mannwhitneyu, levene
# !pip install pingouin
# import pingouin as pg

import warnings
warnings.filterwarnings('ignore')

# !pip install category_encoders
from category_encoders import WOEEncoder, BaseNEncoder
from sklearn.impute import KNNImputer

import os
import pdb

### Data Loading

In [2]:
# Raw transaction tables; the first CSV column becomes the row index in both frames.
train = pd.read_csv('Data/Train.csv', index_col=[0])
test = pd.read_csv('Data/Test_fin.csv', index_col=0)

### Useful Functions

In [3]:
def getTransactions(rem_act_no, bene_act_no, data=None):
    """Return all transactions between one remitter account and one beneficiary account.

    Parameters
    ----------
    rem_act_no : value matched against the 'rem_act_no_dummy' column.
    bene_act_no : value matched against the 'bene_act_no_dummy' column.
    data : pandas.DataFrame, optional
        Frame to search. Defaults to the notebook-level ``train`` frame, so
        existing two-argument calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` where both account columns match (may be empty).
    """
    if data is None:
        # Fall back to the module-level training frame (original behaviour).
        data = train
    is_pair = (data['rem_act_no_dummy'] == rem_act_no) & (data['bene_act_no_dummy'] == bene_act_no)
    return data[is_pair]


def getGapVar(x):
    """Population variance (ddof=0) of the day gaps between consecutive transactions.

    ``x`` is a frame with a 'transaction_val_dt' column of timestamps.
    Frames with fewer than two rows have no gap, so 0 is returned.
    """
    if len(x) <= 1:
        return 0
    ordered_dates = x['transaction_val_dt'].sort_values()
    # The first difference has no predecessor (NaT), so it is skipped.
    gaps = ordered_dates.diff().iloc[1:]
    gap_days = gaps.map(lambda gap: gap.days)
    return gap_days.var(ddof=0)

    
def getAvgTxnPerDay(x):
    """Average number of transactions per distinct transaction date in ``x``."""
    n_txn = len(x)
    n_active_days = x['transaction_val_dt'].nunique()
    return n_txn / n_active_days


def ddProps(x):
    """Return (mean, population variance) of the day-of-month over a Series of timestamps."""
    day_of_month = x.map(lambda stamp: stamp.day)
    avg_day = np.mean(day_of_month)
    var_day = np.var(day_of_month, ddof=0)
    return avg_day, var_day


def countNumInd(x):
    """Count the distinct industry labels in ``x``, not counting 'Unknown'.

    'Unknown' is only counted when it is the sole label present.
    """
    distinct = len(np.unique(x))
    if distinct != 1 and 'Unknown' in x:
        # 'Unknown' sits next to at least one real label: leave it out.
        return distinct - 1
    return distinct
    
def fn(x):
    """Pick one representative label from ``x``, preferring a label other than 'Unknown'.

    When ``x`` has several entries and contains 'Unknown', the first 'Unknown'
    is discarded and the first remaining entry is returned; otherwise the
    first entry of ``x`` is returned as is.
    """
    if len(x) > 1 and 'Unknown' in x:
        candidates = list(x)
        del candidates[candidates.index('Unknown')]
        return candidates[0]
    return x[0]

def get_n_ind(x, multi_ind_comp, multi_ind_rem, multi_ind_bene):
    """Look up the industry counts for the company, remitter and beneficiary of row ``x``.

    Each ``multi_ind_*`` argument maps an id to its industry count; the result
    is the 3-tuple (company, remitter, beneficiary).
    """
    company_id = x['rem_company_id_dummy']
    remitter_id = x['rem_act_no_dummy']
    beneficiary_id = x['bene_act_no_dummy']
    lookups = (
        (multi_ind_comp, company_id),
        (multi_ind_rem, remitter_id),
        (multi_ind_bene, beneficiary_id),
    )
    return tuple(table[key] for table, key in lookups)


def get_dd_global_props(x, comp_dd_means, comp_dd_vars, rem_dd_means, rem_dd_vars, bene_dd_means, bene_dd_vars):
    """Compare a pair's day-of-month statistics with those of its company, remitter and beneficiary.

    For each of the three entities (in that order) two values are produced:
    |entity mean - x['dd_mean']| and x['dd_var'] / entity variance.
    Returns the six values as a flat tuple.
    """
    pair_mean = x['dd_mean']
    pair_var = x['dd_var']
    entities = (
        ('rem_company_id_dummy', comp_dd_means, comp_dd_vars),
        ('rem_act_no_dummy', rem_dd_means, rem_dd_vars),
        ('bene_act_no_dummy', bene_dd_means, bene_dd_vars),
    )
    features = []
    for id_col, means, variances in entities:
        key = x[id_col]
        features.append(abs(means[key] - pair_mean))
        features.append(pair_var / variances[key])
    return tuple(features)

def get_dd_props_txn(x, comp_dd_means, rem_dd_means, bene_dd_means):
    """Absolute gap between a transaction's day-of-month and each entity's mean day.

    Returns (company, remitter, beneficiary) distances for the row ``x``.
    """
    txn_day = x['dd']
    company_gap = abs(comp_dd_means[x['rem_company_id_dummy']] - txn_day)
    remitter_gap = abs(rem_dd_means[x['rem_act_no_dummy']] - txn_day)
    beneficiary_gap = abs(bene_dd_means[x['bene_act_no_dummy']] - txn_day)
    return company_gap, remitter_gap, beneficiary_gap

def get_txn_props(x, comp_txn_means, comp_txn_vars, rem_txn_means, rem_txn_vars, bene_txn_means, bene_txn_vars):
    """Compare a pair's transaction-amount statistics with its company, remitter and beneficiary.

    For each entity (in that order) yields the signed difference
    entity mean - x['txn_amt_mean'] and the ratio x['txn_amt_var'] / entity
    variance. Returns the six values as a flat tuple.
    """
    pair_mean = x['txn_amt_mean']
    pair_var = x['txn_amt_var']
    entities = (
        ('rem_company_id_dummy', comp_txn_means, comp_txn_vars),
        ('rem_act_no_dummy', rem_txn_means, rem_txn_vars),
        ('bene_act_no_dummy', bene_txn_means, bene_txn_vars),
    )
    features = []
    for id_col, means, variances in entities:
        key = x[id_col]
        features.extend((means[key] - pair_mean, pair_var / variances[key]))
    return tuple(features)

def get_txn_props_txn(x, comp_txn_means, rem_txn_means, bene_txn_means):
    """Signed difference between each entity's mean amount and this transaction's amount.

    Returns (company, remitter, beneficiary) differences for the row ``x``.
    """
    amount = x['txn_amt']
    sources = (
        (comp_txn_means, 'rem_company_id_dummy'),
        (rem_txn_means, 'rem_act_no_dummy'),
        (bene_txn_means, 'bene_act_no_dummy'),
    )
    keyed = [(means, x[id_col]) for means, id_col in sources]
    return tuple(means[key] - amount for means, key in keyed)

def get_txn_per_day_ratio(x, avgTxnPerDayComp, avgTxnPerDayRem, avgTxnPerDayBene):
    """Ratio of the pair's transactions-per-day to each entity's transactions-per-day.

    Returns (company, remitter, beneficiary) ratios for the row ``x``.
    """
    pair_rate = x['txn_per_day_mean']
    company_rate = avgTxnPerDayComp[x['rem_company_id_dummy']]
    remitter_rate = avgTxnPerDayRem[x['rem_act_no_dummy']]
    beneficiary_rate = avgTxnPerDayBene[x['bene_act_no_dummy']]
    return pair_rate / company_rate, pair_rate / remitter_rate, pair_rate / beneficiary_rate


def get_txn_gaps(x, avgTxnGapComp, varTxnGapComp, avgTxnGapRem, varTxnGapRem, avgTxnGapBene, varTxnGapBene):
    """Compare a pair's transaction-gap statistics with its company, remitter and beneficiary.

    For each entity (in that order) yields entity mean gap - x['txn_gap_mean']
    and x['txn_gap_var'] / entity gap variance. Returns the six values as a
    flat tuple.
    """
    pair_gap_mean = x['txn_gap_mean']
    pair_gap_var = x['txn_gap_var']

    def _compare(id_col, gap_means, gap_vars):
        key = x[id_col]
        return gap_means[key] - pair_gap_mean, pair_gap_var / gap_vars[key]

    company = _compare('rem_company_id_dummy', avgTxnGapComp, varTxnGapComp)
    remitter = _compare('rem_act_no_dummy', avgTxnGapRem, varTxnGapRem)
    beneficiary = _compare('bene_act_no_dummy', avgTxnGapBene, varTxnGapBene)
    return company + remitter + beneficiary

### Preprocessing

In [15]:
def preprocess(data, date_path, baseEnc_path, woeEnc_path, train=True):
    """Build the pair-level and transaction-level feature tables from raw transactions.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw transactions. Columns read here: 'txn_amt', 'transaction_val_dt'
        ('%Y-%m-%d' strings), 'rem_company_id_dummy', 'rem_act_no_dummy',
        'bene_act_no_dummy', 'rem_company_ind', 'yearmonth' and, when
        ``train`` is true, 'payroll_ind'. The caller's frame is NOT modified;
        all work happens on a copy.
    date_path : str
        Pickle holding the reference date used for the 'time' feature.
        Created in training mode when missing.
    baseEnc_path : str
        Pickle holding the fitted BaseNEncoder for 'rem_company_ind'.
        Created in training mode when missing.
    woeEnc_path : str
        Currently unused (the WOE encoding step is disabled); kept so existing
        calls keep working.
    train : bool, default True
        True: rows with txn_amt > 100 are dropped and missing artifacts are
        fitted and saved. False: txn_amt is clipped to 100 and the saved
        artifacts must already exist.

    Returns
    -------
    (groupDF, data) : tuple of pandas.DataFrame
        ``groupDF`` has one row per (remitter, beneficiary) pair, indexed by
        that tuple; ``data`` is the transaction-level table with the added
        features and encoded industry columns.

    Raises
    ------
    FileNotFoundError
        In scoring mode (``train=False``) when the date or encoder pickle is
        missing.
    """

    def _unpack(tuples, n):
        # Split a Series/Index whose elements are n-tuples into n component
        # columns. `.str[i]` is the element-wise getter that the old
        # `a, b = obj.str` iteration relied on; that iteration was removed
        # from pandas, the getter was not.
        return [tuples.str[i] for i in range(n)]

    ########################## Cleaning ###########################

    # Outlier handling: training rows above the cap are dropped, scoring rows
    # are clipped to the cap. Both branches work on a copy so the caller's
    # frame stays untouched and the function can be re-run safely.
    if train:
        print('Fraction of rows removed:', (data['txn_amt'] > 100).sum()/len(data))
        data = data[data['txn_amt'] <= 100].copy()
    else:
        data = data.copy()
        data.loc[data['txn_amt'] > 100, 'txn_amt'] = 100

    # Encode dates
    data['transaction_val_dt'] = pd.to_datetime(data['transaction_val_dt'], format='%Y-%m-%d')

    # Add dd column (day of month)
    data['dd'] = data['transaction_val_dt'].dt.day

    ########################## Paired Data #########################

    # Get pair groups
    rem_bene = data.groupby(['rem_act_no_dummy', 'bene_act_no_dummy'])

    ###################### Features of paired data ##################

    ###################### 1. Stand Alone Features ##################

    print('Stand alone features')

    # column: n_txn
    counts = rem_bene['txn_amt'].count()
    groupDF = pd.DataFrame({'n_txn': counts.values}, index=counts.index.to_flat_index())
    if train:
        # column: payroll_ind (per-pair mean of the label, truncated to int).
        # The label column is selected before aggregating so non-numeric
        # columns are never averaged.
        groupDF['payroll_ind'] = rem_bene['payroll_ind'].mean().values
        groupDF = groupDF.astype({'n_txn': int, 'payroll_ind': int})
    else:
        groupDF = groupDF.astype({'n_txn': int})

    # column: rem_company_id_dummy, rem_act_no_dummy, bene_act_no_dummy (removed again below)
    groupDF['rem_company_id_dummy'] = rem_bene['rem_company_id_dummy'].first().values
    groupDF['rem_act_no_dummy'], groupDF['bene_act_no_dummy'] = _unpack(groupDF.index, 2)

    # column: txn_amt_var, txn_amt_mean
    groupDF['txn_amt_var'] = rem_bene['txn_amt'].var(ddof=0)
    groupDF['txn_amt_mean'] = rem_bene['txn_amt'].mean()

    # column: txn_rate, txn_gap_mean
    delta = (rem_bene['transaction_val_dt'].max() - rem_bene['transaction_val_dt'].min()).dt.days
    groupDF['txn_rate'] = groupDF['n_txn']/delta
    groupDF['txn_gap_mean'] = delta/(groupDF['n_txn']-1)
    groupDF.loc[groupDF['txn_rate'] == np.inf, 'txn_rate'] = np.nan
    # Fill na with column means (numeric columns only; the id columns may not be numeric)
    groupDF = groupDF.fillna(groupDF.mean(numeric_only=True))

    # column: txn_gap_var
    groupDF['txn_gap_var'] = rem_bene.apply(getGapVar)

    # column: txn_per_day_mean
    groupDF['txn_per_day_mean'] = rem_bene.apply(getAvgTxnPerDay)

    # column: dd_mean, dd_var
    day_stats = rem_bene['transaction_val_dt'].apply(ddProps)
    groupDF['dd_mean'], groupDF['dd_var'] = _unpack(day_stats, 2)

    ####################### 2. Global Features ##################

    print('Global features')

    comp_gps = data.groupby('rem_company_id_dummy')
    rem_gps = data.groupby('rem_act_no_dummy')
    bene_gps = data.groupby('bene_act_no_dummy')

    # column: xxxx_n_ind
    multi_ind_comp = comp_gps['rem_company_ind'].unique().apply(countNumInd)
    multi_ind_rem = rem_gps['rem_company_ind'].unique().apply(countNumInd)
    multi_ind_bene = bene_gps['rem_company_ind'].unique().apply(countNumInd)

    n_ind = groupDF.apply(get_n_ind, axis=1, multi_ind_comp=multi_ind_comp, multi_ind_rem=multi_ind_rem, multi_ind_bene=multi_ind_bene)
    groupDF['comp_n_ind'], groupDF['rem_n_ind'], groupDF['bene_n_ind'] = _unpack(n_ind, 3)

    # column: xxxx_dd_diff, xxxx_dd_var_ratio
    comp_dd_means = comp_gps['dd'].mean()
    comp_dd_vars = comp_gps['dd'].var(ddof=0)
    rem_dd_means = rem_gps['dd'].mean()
    rem_dd_vars = rem_gps['dd'].var(ddof=0)
    bene_dd_means = bene_gps['dd'].mean()
    bene_dd_vars = bene_gps['dd'].var(ddof=0)

    dd_props = groupDF.apply(get_dd_global_props, axis=1, comp_dd_means=comp_dd_means, comp_dd_vars=comp_dd_vars, rem_dd_means=rem_dd_means, rem_dd_vars=rem_dd_vars, bene_dd_means=bene_dd_means, bene_dd_vars=bene_dd_vars)
    (groupDF['comp_dd_diff'], groupDF['comp_dd_var_ratio'],
     groupDF['rem_dd_diff'], groupDF['rem_dd_var_ratio'],
     groupDF['bene_dd_diff'], groupDF['bene_dd_var_ratio']) = _unpack(dd_props, 6)

    # column: xxxx_txn_diff, xxxx_txn_var_ratio
    comp_txn_means = comp_gps['txn_amt'].mean()
    comp_txn_vars = comp_gps['txn_amt'].var(ddof=0)
    rem_txn_means = rem_gps['txn_amt'].mean()
    rem_txn_vars = rem_gps['txn_amt'].var(ddof=0)
    bene_txn_means = bene_gps['txn_amt'].mean()
    bene_txn_vars = bene_gps['txn_amt'].var(ddof=0)

    txn_props = groupDF.apply(get_txn_props, axis=1, comp_txn_means=comp_txn_means, comp_txn_vars=comp_txn_vars, rem_txn_means=rem_txn_means, rem_txn_vars=rem_txn_vars, bene_txn_means=bene_txn_means, bene_txn_vars=bene_txn_vars)
    (groupDF['comp_txn_diff'], groupDF['comp_txn_var_ratio'],
     groupDF['rem_txn_diff'], groupDF['rem_txn_var_ratio'],
     groupDF['bene_txn_diff'], groupDF['bene_txn_var_ratio']) = _unpack(txn_props, 6)

    # column: xxxx_txn_per_day_ratio
    avgTxnPerDayComp = comp_gps.apply(getAvgTxnPerDay)
    avgTxnPerDayRem = rem_gps.apply(getAvgTxnPerDay)
    avgTxnPerDayBene = bene_gps.apply(getAvgTxnPerDay)

    txn_per_day_ratio = groupDF.apply(get_txn_per_day_ratio, axis=1, avgTxnPerDayComp=avgTxnPerDayComp, avgTxnPerDayRem=avgTxnPerDayRem, avgTxnPerDayBene=avgTxnPerDayBene)
    (groupDF['comp_txn_per_day_ratio'],
     groupDF['rem_txn_per_day_ratio'],
     groupDF['bene_txn_per_day_ratio']) = _unpack(txn_per_day_ratio, 3)

    # column: xxxx_txn_gap_diff, xxxx_txn_gap_var_ratio
    deltaComp = (comp_gps['transaction_val_dt'].max() - comp_gps['transaction_val_dt'].min()).dt.days
    avgTxnGapComp = deltaComp/(comp_gps['txn_amt'].count()-1)
    varTxnGapComp = comp_gps.apply(getGapVar)

    deltaRem = (rem_gps['transaction_val_dt'].max() - rem_gps['transaction_val_dt'].min()).dt.days
    avgTxnGapRem = deltaRem/(rem_gps['txn_amt'].count()-1)
    varTxnGapRem = rem_gps.apply(getGapVar)

    deltaBene = (bene_gps['transaction_val_dt'].max() - bene_gps['transaction_val_dt'].min()).dt.days
    avgTxnGapBene = deltaBene/(bene_gps['txn_amt'].count()-1)
    varTxnGapBene = bene_gps.apply(getGapVar)

    txn_gap_info = groupDF.apply(get_txn_gaps, axis=1, avgTxnGapComp=avgTxnGapComp, varTxnGapComp=varTxnGapComp, avgTxnGapRem=avgTxnGapRem, varTxnGapRem=varTxnGapRem, avgTxnGapBene=avgTxnGapBene, varTxnGapBene=varTxnGapBene)
    (groupDF['comp_txn_gap_diff'], groupDF['comp_txn_gap_var_ratio'],
     groupDF['rem_txn_gap_diff'], groupDF['rem_txn_gap_var_ratio'],
     groupDF['bene_txn_gap_diff'], groupDF['bene_txn_gap_var_ratio']) = _unpack(txn_gap_info, 6)

    groupDF = groupDF.drop(columns=['rem_company_id_dummy', 'rem_act_no_dummy', 'bene_act_no_dummy'])

    ############### Transaction Data ######################

    # column: xxxx_n_ind
    n_ind_txn = data.apply(get_n_ind, axis=1, multi_ind_comp=multi_ind_comp, multi_ind_rem=multi_ind_rem, multi_ind_bene=multi_ind_bene)
    data['comp_n_ind'], data['rem_n_ind'], data['bene_n_ind'] = _unpack(n_ind_txn, 3)

    # column: xxxx_dd_diff
    dd_props_txn = data.apply(get_dd_props_txn, axis=1, comp_dd_means=comp_dd_means, rem_dd_means=rem_dd_means, bene_dd_means=bene_dd_means)
    data['comp_dd_diff'], data['rem_dd_diff'], data['bene_dd_diff'] = _unpack(dd_props_txn, 3)

    # column: xxxx_txn_diff
    txn_props_txn = data.apply(get_txn_props_txn, axis=1, comp_txn_means=comp_txn_means, rem_txn_means=rem_txn_means, bene_txn_means=bene_txn_means)
    data['comp_txn_diff'], data['rem_txn_diff'], data['bene_txn_diff'] = _unpack(txn_props_txn, 3)

    ############### Encoding ######################

    # Reference date for the 'time' feature: reuse the saved one when present,
    # otherwise derive it from the training data and save it.
    # NOTE(security): pickle.load can execute arbitrary code; only point these
    # paths at artifacts produced by this notebook.
    if os.path.exists(date_path):
        with open(date_path, 'rb') as fh:
            min_date = pickle.load(fh)
    elif train:
        min_date = data['transaction_val_dt'].min()
        with open(date_path, 'wb') as fh:
            pickle.dump(min_date, fh)
    else:
        raise FileNotFoundError(f'Date file not found: {date_path}')

    data['time'] = (data['transaction_val_dt'] - min_date).dt.days
    data = data.replace({'DOMESTIC': 1, 'CROSS BORDER': 0})

    # Binary (base-2) encoder for the industry column, same load-or-fit policy.
    if os.path.exists(baseEnc_path):
        with open(baseEnc_path, 'rb') as fh:
            enc1 = pickle.load(fh)
    elif train:
        enc1 = BaseNEncoder(cols=['rem_company_ind'], base=2).fit(data['rem_company_ind'])
        with open(baseEnc_path, 'wb') as fh:
            pickle.dump(enc1, fh)
    else:
        raise FileNotFoundError(f'BaseEnc file not found: {baseEnc_path}')

    encoded = enc1.transform(data['rem_company_ind'])
    data = pd.merge(data, encoded, left_index=True, right_index=True)

    if train:
        data = data.drop(columns=['transaction_val_dt', 'yearmonth', 'rem_company_ind'])
    else:
        # 'rem_plus_bene' is only present when the raw frame has already been
        # through extra_featues_pair_data, so its absence must not be an error.
        data = data.drop(columns=['rem_plus_bene'], errors='ignore')
        data = data.drop(columns=[
            'transaction_val_dt', 'yearmonth', 'rem_company_ind',
            'rem_n_ind', 'rem_dd_diff', 'rem_txn_diff', 'rem_company_ind_0',
            'rem_company_id_dummy', 'rem_act_no_dummy', 'bene_act_no_dummy',
        ])

    return groupDF, data

In [5]:
def extra_featues_pair_data(groupDF, data, company_enc_path, baseEnc_pair_path, imputer_path, train=True):
    """Add transaction-type and industry features to the pair table, then impute and prune it.

    Parameters
    ----------
    groupDF : pandas.DataFrame
        Pair-level table returned by ``preprocess``. The columns
        'txn_type_mean' and 'rem_company_ind' are added to it in place.
    data : pandas.DataFrame
        Raw train/test transactions. NOTE: modified in place, as before: a
        'rem_plus_bene' column is added and 'DOMESTIC'/'CROSS BORDER' values
        are replaced by 1/0.
    company_enc_path : str
        Currently unused (company-id WOE encoding is disabled); kept so
        existing calls keep working.
    baseEnc_pair_path : str
        Pickle of the fitted BaseNEncoder for the pair-level
        'rem_company_ind'. When missing, an encoder is fitted on ``groupDF``;
        it is saved only in training mode.
    imputer_path : str
        Pickle of the fitted KNNImputer. Fitted and saved in training mode
        when missing.
    train : bool, default True
        Whether ``groupDF`` carries the 'payroll_ind' label (kept out of the
        imputation and re-attached afterwards).

    Returns
    -------
    pandas.DataFrame or None
        The cleaned pair-level table. ``None`` is returned (after printing a
        message) when ``train`` is false and no imputer pickle exists.
    """
    # Pair key column on the raw frame (one tuple per transaction).
    data['rem_plus_bene'] = pd.Series(
        list(zip(data['rem_act_no_dummy'], data['bene_act_no_dummy'])),
        index=data.index,
        dtype=object,
    )
    data.replace({'DOMESTIC': 1, 'CROSS BORDER': 0}, inplace=True)
    rem_bene = data.groupby(['rem_act_no_dummy', 'bene_act_no_dummy'])

    # txn_type_mean
    groupDF['txn_type_mean'] = rem_bene['txn_type'].mean()

    # company_ind: one representative industry label per pair
    groupDF['rem_company_ind'] = rem_bene['rem_company_ind'].unique().apply(fn)

    # Industry encoder: load when available, otherwise fit on this table.
    # NOTE(security): pickle.load can execute arbitrary code; only point these
    # paths at artifacts produced by this notebook.
    if os.path.exists(baseEnc_pair_path):
        with open(baseEnc_pair_path, 'rb') as fh:
            enc1 = pickle.load(fh)
    else:
        enc1 = BaseNEncoder(cols=['rem_company_ind'], base=2).fit(groupDF['rem_company_ind'])
        if train:
            with open(baseEnc_pair_path, 'wb') as fh:
                pickle.dump(enc1, fh)
        else:
            # Best-effort fallback kept from the original: the encoder is
            # fitted on the scoring data and NOT saved.
            print('Binary Encoder not found!')

    encoded = enc1.transform(groupDF['rem_company_ind'])
    groupDF = pd.merge(groupDF, encoded, left_index=True, right_index=True)
    # 'rem_company_ind_0' is dropped since its std dev was found to be zero.
    groupDF = groupDF.drop(columns=['rem_company_ind', 'rem_company_ind_0'])

    # Convert +/-inf to NaN so the imputer can treat them as missing.
    groupDF = groupDF.replace([np.inf, -np.inf], np.nan)

    # Columns with too many NaNs
    c_drop = ['bene_dd_var_ratio', 'bene_txn_var_ratio', 'bene_txn_gap_diff', 'bene_txn_gap_var_ratio']
    groupDF = groupDF.drop(columns=c_drop)

    # The label must not take part in the imputation.
    features = groupDF.drop(columns='payroll_ind') if train else groupDF

    if os.path.exists(imputer_path):
        with open(imputer_path, 'rb') as fh:
            imputer = pickle.load(fh)
    elif train:
        imputer = KNNImputer(n_neighbors=5).fit(features)
        with open(imputer_path, 'wb') as fh:
            pickle.dump(imputer, fh)
    else:
        print('Imputer does not exist')
        return None

    imputed = pd.DataFrame(imputer.transform(features), columns=features.columns, index=features.index)
    if train:
        imputed['payroll_ind'] = groupDF['payroll_ind']

    depCols = ['rem_txn_gap_diff', 'rem_dd_var_ratio', 'rem_n_ind', 'rem_dd_diff', 'rem_txn_diff']
    return imputed.drop(columns=depCols)

In [6]:
# Build the pair-level (groupDF) and transaction-level (train_data) training tables.
groupDF, train_data = preprocess(
    train,
    date_path='Data/min_date.sav',
    baseEnc_path='Data/baseEnc.sav',
    woeEnc_path='Data/woeEnc.sav',
)

Fraction of rows removed: 0.004035975683491154
Stand alone features
Global features


In [7]:
# Add the extra pair-level features, then impute and prune the training pair table.
groupDF_final = extra_featues_pair_data(
    groupDF,
    train,
    company_enc_path='Data/company_enc.sav',
    baseEnc_pair_path='Data/baseEnc_pair.sav',
    imputer_path='Data/imputer.sav',
    train=True,
)

In [13]:
# Persist the training feature tables as CSV: transaction-level first, then pair-level.
train_data.to_csv('Data/train_data_txn.csv')  # Does not have NaNs or infs
groupDF_final.to_csv('Data/groupDF_clean.csv')

In [16]:
# Run the same two-stage pipeline on the test set, reusing the artifacts saved during training.
groupDF_test, test_data = preprocess(
    test,
    date_path='Data/min_date.sav',
    baseEnc_path='Data/baseEnc.sav',
    woeEnc_path='Data/woeEnc.sav',
    train=False,
)
groupDF_final_test = extra_featues_pair_data(
    groupDF_test,
    test,
    company_enc_path='Data/company_enc.sav',
    baseEnc_pair_path='Data/baseEnc_pair.sav',
    imputer_path='Data/imputer.sav',
    train=False,
)

Stand alone features
Global features


In [17]:
# Persist the test feature tables as CSV: transaction-level first, then pair-level.
test_data.to_csv('Data/test_data_txn.csv')  # Does not have NaNs or infs
groupDF_final_test.to_csv('Data/groupDF_test.csv')

In [19]:
# (rows, columns) of the transaction-level test table returned by preprocess
test_data.shape

(614021, 14)

In [20]:
# (rows, columns) of the transaction-level training table returned by preprocess
train_data.shape

(1424859, 15)

In [21]:
# (rows, columns) of the final pair-level training table
groupDF_final.shape

(381971, 30)

In [22]:
# (rows, columns) of the final pair-level test table
groupDF_final_test.shape

(164056, 29)

In [None]:
# nanCols = ['comp_dd_var_ratio',
#  'rem_dd_var_ratio',
#  'comp_txn_var_ratio',
#  'rem_txn_var_ratio',
#  'comp_txn_gap_diff',
#  'comp_txn_gap_var_ratio',
#  'rem_txn_gap_diff',
#  'rem_txn_gap_var_ratio']

# imputer = KNNImputer(n_neighbors=5).fit(groupDF_final[nanCols])
# temp = imputer.transform(groupDF_final[nanCols])

In [None]:
# groupDF_final[nanCols] = temp

In [None]:
# groupDF_final.to_csv('Data/groupDF.csv')

## inf and NaNs

In [None]:
# # Replacing inf with NaN

# cols = groupDF_final.columns
# print('Columns with infs:\n')
# for c in cols:
#     if True in np.unique(np.isinf(groupDF_final.loc[:, c])):
#         print(c)
#         groupDF_final.loc[groupDF_final[c] == np.inf, c] = np.nan

In [None]:
# col_list

In [21]:
# Per column: print the fraction of rows that are null, and collect in
# col_list the names of the columns that contain at least one null.
wid = max(len(col) for col in groupDF_final.columns)
col_list = []
print('Column', (wid-6)*' ', ':  NaN Rows Count', )
for col in groupDF_final.columns:
    null_rows = groupDF_final[groupDF_final[col].isnull()]
    padding = (wid-len(col))*' '
    print(col, padding, ':  ', len(null_rows)/len(groupDF_final))
    if len(null_rows) > 0:
        col_list.append(col)

Column                  :  NaN Rows Count
n_txn                   :   0.0
txn_amt_var             :   0.0
txn_amt_mean            :   0.0
txn_rate                :   0.0
txn_gap_mean            :   0.0
txn_gap_var             :   0.0
txn_per_day_mean        :   0.0
dd_mean                 :   0.0
dd_var                  :   0.0
comp_n_ind              :   0.0
rem_n_ind               :   0.0
bene_n_ind              :   0.0
comp_dd_diff            :   0.0
comp_dd_var_ratio       :   0.0
rem_dd_diff             :   0.0
rem_dd_var_ratio        :   0.0
bene_dd_diff            :   0.0
comp_txn_diff           :   0.0
comp_txn_var_ratio      :   0.0
rem_txn_diff            :   0.0
rem_txn_var_ratio       :   0.0
bene_txn_diff           :   0.0
comp_txn_per_day_ratio  :   0.0
rem_txn_per_day_ratio   :   0.0
bene_txn_per_day_ratio  :   0.0
comp_txn_gap_diff       :   0.0
comp_txn_gap_var_ratio  :   0.0
rem_txn_gap_diff        :   0.0
rem_txn_gap_var_ratio   :   0.0
txn_type_mean           :   0.