In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import timeit
# Wall-clock start of the notebook run; the last cell subtracts it from "now"
# to report the total duration.
# `pd.datetime` was only an alias of the stdlib class (deprecated in pandas 1.0,
# removed in 2.0), so use the `datetime` module imported above as `dt`.
start_time = dt.datetime.now()

In [2]:
# Load the cleaned transactions; `index_col = 0` uses the first CSV column as the row index.
df = pd.read_csv('cleaned_data.csv', index_col = 0)

In [3]:
df.shape

(96397, 12)

In [4]:
## note the index and recnum are not consecutive since we exclude non-P transaction types

df.head()

Unnamed: 0,Recnum,Cardnum,Date,Merchnum,Merch description,Merch state,Merch zip,Transtype,Amount,Fraud,Dow,Dow_Risk
0,1,5142190439,2010-01-01,5509006296254,FEDEX SHP 12/23/09 AB#,TN,38118.0,P,3.62,0,Friday,0.022419
1,2,5142183973,2010-01-01,61003026333,SERVICE MERCHANDISE #81,MA,1803.0,P,31.42,0,Friday,0.022419
2,3,5142131721,2010-01-01,4503082993600,OFFICE DEPOT #191,MD,20706.0,P,178.49,0,Friday,0.022419
3,4,5142148452,2010-01-01,5509006296254,FEDEX SHP 12/28/09 AB#,TN,38118.0,P,3.62,0,Friday,0.022419
4,5,5142190439,2010-01-01,5509006296254,FEDEX SHP 12/23/09 AB#,TN,38118.0,P,3.62,0,Friday,0.022419


In [5]:
df.dtypes

Recnum                 int64
Cardnum                int64
Date                  object
Merchnum              object
Merch description     object
Merch state           object
Merch zip             object
Transtype             object
Amount               float64
Fraud                  int64
Dow                   object
Dow_Risk             float64
dtype: object

### I. Data types

In [6]:
# Store card numbers as strings so they can be concatenated with other key
# columns when the composite entities are built.
df['Cardnum'] = df['Cardnum'].astype(str)

In [7]:
### Left-pad zips with zeros to five characters.
### Note: some zips are state abbreviations because we imputed them earlier,
### so pandas reads the whole column as strings.

def leading_0(x):
    """Return the zip string ``x`` left-padded with zeros to five characters.

    When ``x`` contains ``'.0'`` its last two characters are removed first
    (e.g. ``'1803.0'`` becomes ``'01803'``). Values that are already five or
    more characters long after that step are returned without padding.
    """
    zip_text = x[:-2] if '.0' in x else x
    return zip_text.rjust(5, '0')

# Normalise every zip value element by element.
df['Merch zip'] = df['Merch zip'].map(leading_0)

In [8]:
### Delete all whitespace from the merchant description.
# `regex=True` must be explicit: from pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, so r'\s' would match nothing and the
# descriptions would be left unchanged.

df['Merch description'] = df['Merch description'].str.replace(r'\s+', '', regex=True)

### II. Create entities

In [9]:
# Each composite entity is the plain string concatenation of two key columns
# (new column name -> the two columns that are joined, in that order).
entity_parts = {
    'card_merch': ('Cardnum', 'Merchnum'),
    'card_zip': ('Cardnum', 'Merch zip'),
    'card_state': ('Cardnum', 'Merch state'),
    'merch_zip': ('Merchnum', 'Merch zip'),
    'merch_state': ('Merchnum', 'Merch state'),
}
for entity_name, (left_key, right_key) in entity_parts.items():
    df[entity_name] = df[left_key] + df[right_key]

In [10]:
df.columns

Index(['Recnum', 'Cardnum', 'Date', 'Merchnum', 'Merch description',
       'Merch state', 'Merch zip', 'Transtype', 'Amount', 'Fraud', 'Dow',
       'Dow_Risk', 'card_merch', 'card_zip', 'card_state', 'merch_zip',
       'merch_state'],
      dtype='object')

In [11]:
# Columns that identify the entities we build features for: the two raw keys
# plus the five composite keys. They are listed by name rather than by column
# position so the list stays correct if the column layout of `df` changes.
entities = [
    'Cardnum',
    'Merchnum',
    'card_merch',
    'card_zip',
    'card_state',
    'merch_zip',
    'merch_state',
]

In [12]:
entities

['Cardnum',
 'Merchnum',
 'card_merch',
 'card_zip',
 'card_state',
 'merch_zip',
 'merch_state']

### III. Variables

In [13]:
# Parse the date strings into datetimes so date arithmetic works below.
df['Date'] = pd.to_datetime(df['Date'])

In [14]:
# `final` is the frame the engineered features are added to.
final = df.copy()
# `df1` is the working copy for the self-joins: Date and Recnum are duplicated
# under the names check_date / check_record so that both sides of a merge of
# df1 with itself keep distinguishable columns.
df1 = df.assign(check_date=df['Date'], check_record=df['Recnum'])

#### Day-since, frequency, and amount

In [15]:
start = timeit.default_timer()

# Fallback for records that have no earlier record for the same entity:
# days elapsed since this fixed reference date. It does not depend on the
# entity, so it is computed once outside the loop.
reference_date = pd.to_datetime('2010-01-01')
days_since_reference = (final.Date - reference_date).dt.days

# (groupby aggregation, label used in the column names). The second list maps
# each aggregate to the label used in the "actual / aggregate" ratio columns.
amount_aggs = [('mean', 'avg'), ('max', 'max'), ('median', 'med'), ('sum', 'total')]
# NOTE(review): 'toal' is a misspelling of 'total'. It is kept because the name
# ends up in the exported CSV and may be referenced downstream -- confirm before renaming.
ratio_labels = [('avg', 'avg'), ('max', 'max'), ('med', 'med'), ('total', 'toal')]

for entity in entities:

    entity_start = timeit.default_timer()

    # Self-join on the entity: one row for every pair (record, check record)
    # that shares the same entity value.
    df_l = df1[['Recnum', 'Date', entity]]
    df_r = df1[['check_record', 'check_date', entity, 'Amount']]
    temp = pd.merge(df_l, df_r, on=entity)

    ## Day-since variables:

    # Keep only pairs whose check record has a smaller Recnum, then take the
    # last such pair per record.
    # NOTE(review): `.last()` yields the most recent earlier record only if the
    # merged rows are ordered by check_record within each Recnum -- confirm.
    earlier = temp.loc[temp.Recnum > temp.check_record, ['Recnum', 'Date', 'check_date']]
    temp1 = earlier.groupby('Recnum')[['Date', 'check_date']].last()
    mapper = (temp1.Date - temp1.check_date).dt.days
    # Assign the filled result directly: `final[col].fillna(..., inplace=True)`
    # is chained assignment, which warns on recent pandas and leaves `final`
    # unchanged under Copy-on-Write.
    final[entity + '_day_since'] = final.Recnum.map(mapper).fillna(days_since_reference)

    print('\n' + entity + '_day_since ---> Done')

    ## Frequency & Amount variables:

    for window in [0, 1, 3, 7, 14, 30]:

        suffix = str(window)

        # Pairs whose check record is not later than the record itself (by
        # Recnum) and whose check date is within `window` days before its date.
        in_window = (temp.check_date >= (temp.Date - dt.timedelta(window))) &\
                    (temp.Recnum >= temp.check_record)
        temp2 = temp.loc[in_window, ['Recnum', entity, 'Amount']]
        # Group once and reuse the grouping for every aggregate.
        by_record = temp2.groupby('Recnum')

        col_name = entity + '_count_' + suffix
        final[col_name] = final.Recnum.map(by_record[entity].count())

        print(col_name + ' ---> Done')

        amount_stats = by_record['Amount'].agg([agg for agg, _ in amount_aggs])
        for agg, label in amount_aggs:
            final[entity + '_' + label + '_' + suffix] = final.Recnum.map(amount_stats[agg])
        for label, ratio_label in ratio_labels:
            final[entity + '_actual/' + ratio_label + '_' + suffix] = \
                final['Amount'] / final[entity + '_' + label + '_' + suffix]

        print(entity + ' amount variables over past ' + suffix + ' ---> Done')

    # Reported at the end of each iteration so the last entity is timed too and
    # no try/except around an undefined timer is needed.
    print('Run time for the last entity ----------------- {}s'.format(timeit.default_timer() - entity_start))

print('Total run time: {}mins'.format((timeit.default_timer() - start)/60))



Cardnum_day_since ---> Done
Cardnum_count_0 ---> Done
Cardnum amount variables over past 0 ---> Done
Cardnum_count_1 ---> Done
Cardnum amount variables over past 1 ---> Done
Cardnum_count_3 ---> Done
Cardnum amount variables over past 3 ---> Done
Cardnum_count_7 ---> Done
Cardnum amount variables over past 7 ---> Done
Cardnum_count_14 ---> Done
Cardnum amount variables over past 14 ---> Done
Cardnum_count_30 ---> Done
Cardnum amount variables over past 30 ---> Done
Run time for the last entity ----------------- 7.041482131000066s

Merchnum_day_since ---> Done
Merchnum_count_0 ---> Done
Merchnum amount variables over past 0 ---> Done
Merchnum_count_1 ---> Done
Merchnum amount variables over past 1 ---> Done
Merchnum_count_3 ---> Done
Merchnum amount variables over past 3 ---> Done
Merchnum_count_7 ---> Done
Merchnum amount variables over past 7 ---> Done
Merchnum_count_14 ---> Done
Merchnum amount variables over past 14 ---> Done
Merchnum_count_30 ---> Done
Merchnum amount variables o

#### Velocity

In [16]:
start = timeit.default_timer()

# Velocity: the count over a short window (0 or 1 days) divided by the average
# daily count over a longer window (7, 14 or 30 days).
for ent in entities:
    for short_win in ['0', '1']:
        recent_count = final[ent + '_count_' + short_win] * 1.0
        for long_win in ['7', '14', '30']:
            daily_avg = final[ent + '_count_' + long_win] * 1.0 / float(long_win)
            final[ent + '_count_' + short_win + '_by_' + long_win] = recent_count / daily_avg

print('Total run time: {}s'.format(timeit.default_timer() - start))

Total run time: 0.48102699499986556s


### IV. Keep essential features

In [17]:
# Use the record number as the row index of the feature table.
final = final.set_index('Recnum')

In [18]:
# Keep the label (Fraud), Dow_Risk and every engineered feature; drop the raw
# input columns. Columns are selected by name instead of by position, so the
# cell gives the same result when re-run and does not break if the column
# layout changes. `errors='ignore'` covers names that are no longer columns
# (e.g. Recnum once it is the index, or a second run of this cell).
kept_raw_cols = ['Fraud', 'Dow_Risk']
raw_cols_to_drop = [col for col in df.columns if col not in kept_raw_cols]
final = final.drop(columns=raw_cols_to_drop, errors='ignore')

In [19]:
final.head()

Unnamed: 0_level_0,Fraud,Dow_Risk,Cardnum_day_since,Cardnum_count_0,Cardnum_avg_0,Cardnum_max_0,Cardnum_med_0,Cardnum_total_0,Cardnum_actual/avg_0,Cardnum_actual/max_0,...,merch_zip_count_0_by_30,merch_zip_count_1_by_7,merch_zip_count_1_by_14,merch_zip_count_1_by_30,merch_state_count_0_by_7,merch_state_count_0_by_14,merch_state_count_0_by_30,merch_state_count_1_by_7,merch_state_count_1_by_14,merch_state_count_1_by_30
Recnum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0.022419,0.0,1,3.62,3.62,3.62,3.62,1.0,1.0,...,30.0,7.0,14.0,30.0,7.0,14.0,30.0,7.0,14.0,30.0
2,0,0.022419,0.0,1,31.42,31.42,31.42,31.42,1.0,1.0,...,30.0,7.0,14.0,30.0,7.0,14.0,30.0,7.0,14.0,30.0
3,0,0.022419,0.0,1,178.49,178.49,178.49,178.49,1.0,1.0,...,30.0,7.0,14.0,30.0,7.0,14.0,30.0,7.0,14.0,30.0
4,0,0.022419,0.0,1,3.62,3.62,3.62,3.62,1.0,1.0,...,30.0,7.0,14.0,30.0,7.0,14.0,30.0,7.0,14.0,30.0
5,0,0.022419,0.0,2,3.62,3.62,3.62,7.24,1.0,1.0,...,30.0,7.0,14.0,30.0,7.0,14.0,30.0,7.0,14.0,30.0


In [20]:
final.shape

(96397, 429)

In [21]:
# Write the feature table to disk; the row index is written as the first CSV column.
final.to_csv('with_variables.csv')

In [22]:
# Total wall-clock time since `start_time` was recorded in the first cell.
# `pd.datetime` was removed in pandas 2.0; use the stdlib class via `dt`.
print('Duration: ', dt.datetime.now() - start_time)

Duration:  0:02:50.407298
