In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
#import data
df = pd.read_csv('app.csv')

In [3]:
df.head()

Unnamed: 0,record,date,ssn,firstname,lastname,address,zip5,dob,homephone,fraud_label
0,1,20160101,379070012,XRRAMMTR,SMJETJMJ,6861 EUTST PL,2765,19070626,1797504115,0
1,2,20160101,387482503,MAMSTUJR,RTTEMRRR,7280 URASA PL,57169,19340615,4164239415,1
2,3,20160101,200332444,SZMMUJEZS,EUSEZRAE,5581 RSREX LN,56721,19070626,216537580,0
3,4,20160101,747451317,SJJZSXRSZ,ETJXTXXS,1387 UJZXJ RD,35286,19440430,132144161,0
4,5,20160101,24065868,SSSXUEJMS,SSUUJXUZ,279 EAASA WY,3173,19980315,6101082272,0


### Fix Frivolous Values and Clean Data

In [4]:
#datatype
df['date'] = pd.to_datetime(df.date.astype(str).apply(lambda x: x[:4] + '-' + x[4:6] + '-' + x[6:]))
df['zip5'].apply(lambda x: '{0:0>5}'.format(x))

#frivolous values

#ssn
df.loc[df.ssn==999999999, 'ssn'] = -df[df.ssn==999999999]['record']
df['ssn'] = df['ssn'].apply(lambda x: '{0:0>9}'.format(x))

#address
df.loc[df.address== '123 MAIN ST', 'address'] = \
    df.loc[df.address== '123 MAIN ST', 'address'].apply(lambda x: str(x) + ' RECORD')

#dob
df.loc[df.dob==19070626, 'dob'] = -df[df.dob==19070626]['record']
df['dob'] = df['dob'].apply(lambda x: '{0:0>8}'.format(x))

#homephone
df.loc[df.homephone==9999999999, 'homephone'] = -df[df.homephone==9999999999]['record']
df['homephone'] = df['homephone'].apply(lambda x: '{0:0>10}'.format(x))

In [5]:
df['zip5'] = df['zip5'].apply(str)

### Create Entities

In [6]:
df['name'] = df.firstname + df.lastname
df['fulladdress'] = df.address + df.zip5
df['name_dob'] = df.name + df.dob
df['name_fulladdress'] = df.name + df.fulladdress
df['name_homephone'] = df.name + df.homephone
df['fulladdress_dob'] = df.fulladdress + df.dob
df['fulladdress_homephone'] = df.fulladdress + df.homephone
df['dob_homephone'] = df.dob + df.homephone
df['homephone_name_dob'] = df.homephone + df.name_dob

In [7]:
#create a bunch more columns for pairings of other columns and ssn
for field in list(df.iloc[:, np.r_[3:9, 12:15]].columns):
    df['ssn_' + field] = df.ssn + df[field]

In [8]:
attributes = list(df.iloc[:, np.r_[2,5,6,7,8,10,11,12:28]].columns)
attributes

['ssn',
 'address',
 'zip5',
 'dob',
 'homephone',
 'name',
 'fulladdress',
 'name_dob',
 'name_fulladdress',
 'name_homephone',
 'fulladdress_dob',
 'fulladdress_homephone',
 'dob_homephone',
 'homephone_name_dob',
 'ssn_firstname',
 'ssn_lastname',
 'ssn_address',
 'ssn_zip5',
 'ssn_dob',
 'ssn_homephone',
 'ssn_name_dob',
 'ssn_name_fulladdress',
 'ssn_name_homephone']

### Risk Table for Day of Week

In [9]:
import calendar
df['dow'] = df.date.apply(lambda x: calendar.day_name[x.weekday()])

In [10]:
train_test = df[df.date < '2016-11-01']

In [11]:
#do statistical smoothing
c = 4; nmid=20; y_avg = train_test['fraud_label'].mean()
y_dow = train_test.groupby('dow')['fraud_label'].mean()
num = train_test.groupby('dow').size()
y_dow_smooth = y_avg + (y_dow - y_avg)/(1+ np.exp(-(num-nmid)/c))
df['dow_risk'] = df.dow.map(y_dow_smooth)

## Create Variables

### Velocity + Day Since

In [12]:
df1 = df.copy()
final = df.copy()
df1['check_date'] = df1.date
df1['check_record'] = df1.record

In [13]:
import datetime as dt
#start = timer()
for entity in attributes:
    #st = timer()
    df_1 = df1[['record', 'date', entity]]
    df_r = df1[['check_record', 'check_date', entity]]
    temp = pd.merge(df_1, df_r, left_on = entity, right_on = entity)
    
    #days since
    day_since_df = temp[temp.record > temp.check_record][['record', 'date', 'check_date']] \
    .groupby('record')[['date', 'check_date']].last()
    mapper = (day_since_df.date - day_since_df.check_date).dt.days
    final[entity + '_day_since'] = final.record.map(mapper)
    final[entity + '_day_since'].fillna(365, inplace = True)
    print(f'\n{entity}_day_since -----> Done')
    
    #velocity
    for offset_t in [0,1,3,7,14,30]:
        count_day_df = temp[(temp.check_date >= (temp.date - dt.timedelta(offset_t)))
                           & (temp.record >= temp.check_record)]
        col_name = f'{entity}_count_{offset_t}'
        mapper2 = count_day_df.groupby('record')[entity].count()
        final[col_name] = final.record.map(mapper2)
        
        print(f'{entity}_count_{str(offset_t)}-----> Done')
        
    #print(f' Run time for entity {entity} ------------ {timer() - st:0.2f}s')
    
#print(f'Total run time: {(timer() - start) / 60:0.2f}min')


ssn_day_since -----> Done
ssn_count_0-----> Done
ssn_count_1-----> Done
ssn_count_3-----> Done
ssn_count_7-----> Done
ssn_count_14-----> Done
ssn_count_30-----> Done

address_day_since -----> Done
address_count_0-----> Done
address_count_1-----> Done
address_count_3-----> Done
address_count_7-----> Done
address_count_14-----> Done
address_count_30-----> Done

zip5_day_since -----> Done
zip5_count_0-----> Done
zip5_count_1-----> Done
zip5_count_3-----> Done
zip5_count_7-----> Done
zip5_count_14-----> Done
zip5_count_30-----> Done

dob_day_since -----> Done
dob_count_0-----> Done
dob_count_1-----> Done
dob_count_3-----> Done
dob_count_7-----> Done
dob_count_14-----> Done
dob_count_30-----> Done

homephone_day_since -----> Done
homephone_count_0-----> Done
homephone_count_1-----> Done
homephone_count_3-----> Done
homephone_count_7-----> Done
homephone_count_14-----> Done
homephone_count_30-----> Done

name_day_since -----> Done
name_count_0-----> Done
name_count_1-----> Done
name_count_3

### Relative Velocity

In [14]:
for att in attributes: 
    for d in ['0','1']:
        for dd in ['3','7','14','30']:
            final[att + '_count_' + d + '_by_' + dd]\
            = final[att + '_count_' + d] /  (final[att + '_count_' + dd] / float(dd))

In [15]:
final.head()

Unnamed: 0,record,date,ssn,firstname,lastname,address,zip5,dob,homephone,fraud_label,...,ssn_name_fulladdress_count_1_by_14,ssn_name_fulladdress_count_1_by_30,ssn_name_homephone_count_0_by_3,ssn_name_homephone_count_0_by_7,ssn_name_homephone_count_0_by_14,ssn_name_homephone_count_0_by_30,ssn_name_homephone_count_1_by_3,ssn_name_homephone_count_1_by_7,ssn_name_homephone_count_1_by_14,ssn_name_homephone_count_1_by_30
0,1,2016-01-01,379070012,XRRAMMTR,SMJETJMJ,6861 EUTST PL,2765,000000-1,1797504115,0,...,14.0,30.0,3.0,7.0,14.0,30.0,3.0,7.0,14.0,30.0
1,2,2016-01-01,387482503,MAMSTUJR,RTTEMRRR,7280 URASA PL,57169,19340615,4164239415,1,...,14.0,30.0,3.0,7.0,14.0,30.0,3.0,7.0,14.0,30.0
2,3,2016-01-01,200332444,SZMMUJEZS,EUSEZRAE,5581 RSREX LN,56721,000000-3,216537580,0,...,14.0,30.0,3.0,7.0,14.0,30.0,3.0,7.0,14.0,30.0
3,4,2016-01-01,747451317,SJJZSXRSZ,ETJXTXXS,1387 UJZXJ RD,35286,19440430,132144161,0,...,14.0,30.0,3.0,7.0,14.0,30.0,3.0,7.0,14.0,30.0
4,5,2016-01-01,24065868,SSSXUEJMS,SSUUJXUZ,279 EAASA WY,3173,19980315,6101082272,0,...,14.0,30.0,3.0,7.0,14.0,30.0,3.0,7.0,14.0,30.0


In [16]:
top30final = ['address_count_0',
 'address_count_0_by_3',
 'address_count_1',
 'address_count_7',
 'fulladdress_count_0',
 'fulladdress_count_0_by_3',
 'fulladdress_count_0_by_7',
 'fulladdress_count_1',
 'fulladdress_count_1_by_7',
 'fulladdress_homephone_count_0_by_30',
 'fulladdress_homephone_count_3',
 'fulladdress_homephone_count_30',
 'fulladdress_homephone_count_7',
 'homephone_count_3',
 'name_dob_count_14',
 'name_dob_count_30',
 'ssn_count_14',
 'ssn_count_30',
 'ssn_dob_count_0_by_14',
 'ssn_dob_count_14',
 'ssn_dob_count_30',
 'ssn_firstname_count_14',
 'ssn_firstname_count_30',
 'ssn_firstname_count_7',
 'ssn_lastname_count_14',
 'ssn_lastname_count_30',
 'ssn_name_dob_count_0_by_30',
 'ssn_name_dob_count_14',
 'ssn_name_dob_count_30',
 'zip5_count_1', 'fraud_label']

In [17]:
train_test30 = final[final['date'] < '2016-11-01']
oot30 = final[final['date'] >= '2016-11-01']

train_test30 = train_test30[top30final]
oot30 = oot30[top30final]

In [18]:
oot30.shape[0] + train_test30.shape[0]

1000000

In [19]:
train_test30.shape

(833507, 31)

In [20]:
#now we have our train/test set (which we'll now split into training and testing for modeling) 
#and we have our oot dataset for the 30 top fields that we've selected. 

from sklearn.model_selection import train_test_split as tts
X = train_test30.drop('fraud_label', axis = 1)
y = train_test30['fraud_label']

X_train, X_test, y_train, y_test = tts(X, y, test_size = .3, stratify = y, random_state = 11)

Let's build some models!

# Modeling Script Guide

1. Start by deciding which and how many models you want to run. 
2. Delete any model types you don't want to run (for example, if you're only running neural nets, delete the boosted tree, random forest, and logistic regression modeling sections. 
3. Replace the existing models with your models, or use the existing models and change the hyperparameters. Make sure to use the same functions though.
4. Make sure to follow the existing format. Create a training set, test set, and oot set for each model. This is important when iterating through models to calculate FDRs. 
5. Replace the model names and the model training, test, and oot set names with the ones you used in your modeling in the modlist object below the modeling section. If you used the same names and simply tweaked the hyperparameters around, then just delete the objects you didn't use from the model and set lists. 
6. Run the rest of the code to get a dataframe where each model is a row, and the model's performance on the training, test, and oot sets occupy the other columns. 

## Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
#lr1

In [23]:
lr1_X_train = X_train.copy()
lr1_X_test = X_test.copy()
lr1_oot = oot30.copy().drop('fraud_label', axis=1)
lr1 = LogisticRegression(penalty = 'none', max_iter = 2000).fit(lr1_X_train, y_train)

In [24]:
#lr2

In [25]:
lr2_X_train = X_train.iloc[:, :5]
lr2_X_test = X_test.iloc[:,:5]
lr2_oot = oot30.iloc[:, :5] 
lr2 = LogisticRegression(penalty = 'none', max_iter = 2000).fit(lr2_X_train, y_train)

In [26]:
#lr3
lr3_X_train = X_train.iloc[:, :10]
lr3_X_test = X_test.iloc[:,:10]
lr3_oot = oot30.iloc[:, :10] 
lr3 = LogisticRegression(penalty = 'none', max_iter = 2000).fit(lr3_X_train, y_train)

In [27]:
#lr4
lr4_X_train = X_train.iloc[:, :15]
lr4_X_test = X_test.iloc[:,:15]
lr4_oot = oot30.iloc[:, :15] 
lr4 = LogisticRegression(penalty = 'none', max_iter = 2000).fit(lr4_X_train, y_train)

In [28]:
#lr5
lr5_X_train = X_train.iloc[:, :25]
lr5_X_test = X_test.iloc[:,:25]
lr5_oot = oot30.iloc[:, :25] 
lr5 = LogisticRegression(penalty = 'none', max_iter = 2000).fit(lr5_X_train, y_train)

In [29]:
#lr6
lr6_X_train = X_train.iloc[:, :20]
lr6_X_test = X_test.iloc[:,:20]
lr6_oot = oot30.iloc[:, :20] 
lr6 = LogisticRegression(penalty = 'none', max_iter = 2000).fit(lr6_X_train, y_train)

### Random Forest

In [30]:
from sklearn.ensemble import RandomForestClassifier as RF

In [31]:
rf1_X_train = X_train.copy()
rf1_X_test = X_test.copy()
rf1_oot = oot30.copy().drop('fraud_label', axis=1)
rf1 = RF(n_estimators = 50, max_depth = 100).fit(rf1_X_train, y_train)

In [32]:
rf2_X_train = X_train.copy()
rf2_X_test = X_test.copy()
rf2_oot = oot30.copy().drop('fraud_label', axis=1)
rf2 = RF(n_estimators = 100, max_depth = 100).fit(rf2_X_train, y_train)

In [33]:
rf3_X_train = X_train.copy()
rf3_X_test = X_test.copy()
rf3_oot = oot30.copy().drop('fraud_label', axis=1)
rf3 = RF(n_estimators = 150, max_depth = 100).fit(rf3_X_train, y_train)

In [34]:
rf4_X_train = X_train.copy()
rf4_X_test = X_test.copy()
rf4_oot = oot30.copy().drop('fraud_label', axis=1)
rf4 = RF(n_estimators = 100, max_depth = 200).fit(rf4_X_train, y_train)

In [35]:
rf5_X_train = X_train.copy()
rf5_X_test = X_test.copy()
rf5_oot = oot30.copy().drop('fraud_label', axis=1)
rf5 = RF(n_estimators = 50, max_depth = 500).fit(rf1_X_train, y_train)

In [36]:
rf6_X_train = X_train.copy()
rf6_X_test = X_test.copy()
rf6_oot = oot30.copy().drop('fraud_label', axis=1)
rf6 = RF(n_estimators = 100, min_samples_split = 100).fit(rf6_X_train, y_train)

In [37]:
rf7_X_train = X_train.copy()
rf7_X_test = X_test.copy()
rf7_oot = oot30.copy().drop('fraud_label', axis=1)
rf7 = RF(n_estimators = 100, max_depth = 200, criterion = 'entropy').fit(rf7_X_train, y_train)

### Neural Net

In [38]:
from sklearn.neural_network import MLPClassifier as NN

In [39]:
nn1_X_train = X_train.copy()
nn1_X_test = X_test.copy()
nn1_oot = oot30.copy().drop('fraud_label', axis=1)
nn1 = NN(hidden_layer_sizes = (10,), max_iter = 20).fit(nn1_X_train, y_train)



In [40]:
nn2_X_train = X_train.copy()
nn2_X_test = X_test.copy()
nn2_oot = oot30.copy().drop('fraud_label', axis=1)
nn2 = NN(hidden_layer_sizes = (5,), max_iter = 30).fit(nn2_X_train, y_train)

In [41]:
nn3_X_train = X_train.copy()
nn3_X_test = X_test.copy()
nn3_oot = oot30.copy().drop('fraud_label', axis=1)
nn3 = NN(hidden_layer_sizes = (15,), max_iter = 40).fit(nn3_X_train, y_train)

In [42]:
nn4_X_train = X_train.copy()
nn4_X_test = X_test.copy()
nn4_oot = oot30.copy().drop('fraud_label', axis=1)
nn4 = NN(hidden_layer_sizes = (5,), max_iter = 40, learning_rate = 'adaptive').fit(nn4_X_train, y_train)

In [43]:
nn5_X_train = X_train.copy()
nn5_X_test = X_test.copy()
nn5_oot = oot30.copy().drop('fraud_label', axis=1)
nn5 = NN(hidden_layer_sizes = (5,), max_iter = 40, activation = 'tanh').fit(nn5_X_train, y_train)

### Boosted Tree

In [44]:
from sklearn.ensemble import GradientBoostingClassifier as GBC

In [45]:
bt1_X_train = X_train.copy()
bt1_X_test = X_test.copy()
bt1_oot = oot30.copy().drop('fraud_label', axis=1)
bt1 = GBC(n_estimators = 500, max_depth = 3).fit(bt1_X_train, y_train)

In [46]:
bt2_X_train = X_train.copy()
bt2_X_test = X_test.copy()
bt2_oot = oot30.copy().drop('fraud_label', axis=1)
bt2 = GBC(n_estimators = 750, max_depth = 4).fit(bt2_X_train, y_train)

In [47]:
bt3_X_train = X_train.copy()
bt3_X_test = X_test.copy()
bt3_oot = oot30.copy().drop('fraud_label', axis=1)
bt3 = GBC(n_estimators = 500, max_depth = 3, loss = 'exponential').fit(bt1_X_train, y_train)

In [48]:
bt4_X_train = X_train.copy()
bt4_X_test = X_test.copy()
bt4_oot = oot30.copy().drop('fraud_label', axis=1)
bt4 = GBC(n_estimators = 500, max_depth = 3, learning_rate = 0.01).fit(bt4_X_train, y_train)

In [49]:
bt5_X_train = X_train.copy()
bt5_X_test = X_test.copy()
bt5_oot = oot30.copy().drop('fraud_label', axis=1)
bt5 = GBC(n_estimators = 1000, max_depth = 3, learning_rate = 0.001).fit(bt5_X_train, y_train)

## Get FDR for all models 

In [50]:
modList = [lr1, lr2, lr3, lr4, lr5, lr6, rf1, rf2, rf3, rf4, rf5, rf6, rf7, nn1, nn2, nn3, nn4, nn5, bt1, bt2, bt3, bt4, bt5]
train_list = [lr1_X_train, lr2_X_train, lr3_X_train, lr4_X_train, lr5_X_train, lr6_X_train, rf1_X_train, \
             rf2_X_train, rf3_X_train, rf4_X_train, rf5_X_train, rf6_X_train, rf7_X_train, nn1_X_train,\
             nn2_X_train, nn3_X_train, nn4_X_train, nn5_X_train, bt1_X_train, bt2_X_train, bt3_X_train, bt4_X_train, bt5_X_train]
test_list = [lr1_X_test, lr2_X_test, lr3_X_test, lr4_X_test, lr5_X_test, lr6_X_test, rf1_X_test,\
            rf2_X_test, rf3_X_test, rf4_X_test, rf5_X_test, rf6_X_test, rf7_X_test, nn1_X_test, nn2_X_test, \
            nn3_X_test, nn4_X_test, nn5_X_test, bt1_X_test, bt2_X_test, bt3_X_test, bt4_X_train, bt5_X_train]
oot_list = [lr1_oot, lr2_oot, lr3_oot, lr4_oot, lr5_oot, lr6_oot, rf1_oot, rf2_oot, rf3_oot, rf4_oot, rf5_oot, rf6_oot,\
           rf7_oot, nn1_oot, nn2_oot, nn3_oot, nn4_oot, nn5_oot, bt1_oot, bt2_oot, bt3_oot, bt4_oot, bt5_oot]

### On the training data...

In [51]:
training_FDRs = []
i = 0
for model in modList:
    train_set = train_list[i]
    i+=1
    temp = train_set.copy()
    temp['scores'] = 0
    temp['scores'] = model.predict_proba(train_set)
    cutpoint = int(len(temp)*(.03))
    top3percent = temp.sort_values(by = 'scores', ascending = True).iloc[:cutpoint,:]
    FDR = train_test30.loc[top3percent.index, 'fraud_label'].sum()/train_test30.loc[train_set.index, 'fraud_label'].sum()
    training_FDRs.append(FDR)

### On the testing data...

In [52]:
testing_FDRs = []
i = 0
for model in modList:
    test_set = test_list[i]
    i+=1
    temp = test_set.copy()
    temp['scores'] = 0
    temp['scores'] = model.predict_proba(test_set)
    cutpoint = int(len(temp)*(.03))
    top3percent = temp.sort_values(by = 'scores', ascending = True).iloc[:cutpoint,:]
    FDR = train_test30.loc[top3percent.index, 'fraud_label'].sum()/train_test30.loc[test_set.index, 'fraud_label'].sum()
    testing_FDRs.append(FDR)

### And on the OOT data...

In [53]:
oot_FDRs = []
i = 0
for model in modList:
    oot_set = oot_list[i]
    i+=1
    temp = oot_set.copy()
    temp['scores'] = 0
    temp['scores'] = model.predict_proba(oot_set)
    cutpoint = int(len(temp)*(.03))
    top3percent = temp.sort_values(by = 'scores', ascending = True).iloc[:cutpoint,:]
    FDR = oot30.loc[top3percent.index, 'fraud_label'].sum()/oot30['fraud_label'].sum()
    oot_FDRs.append(FDR)

In [58]:
FDRdict = {'Model': modList, 'trainingFDR':training_FDRs, 'testingFDR':testing_FDRs, 'ootFDR':oot_FDRs}
modelFDRs = pd.DataFrame(FDRdict)

In [59]:
modelFDRs

Unnamed: 0,Model,trainingFDR,testingFDR,ootFDR
0,"LogisticRegression(max_iter=2000, penalty='none')",0.559429,0.576069,0.538139
1,"LogisticRegression(max_iter=2000, penalty='none')",0.288876,0.30955,0.254401
2,"LogisticRegression(max_iter=2000, penalty='none')",0.346104,0.362299,0.318944
3,"LogisticRegression(max_iter=2000, penalty='none')",0.536823,0.55136,0.521794
4,"LogisticRegression(max_iter=2000, penalty='none')",0.529328,0.54442,0.514669
5,"LogisticRegression(max_iter=2000, penalty='none')",0.527662,0.541921,0.515926
6,"(DecisionTreeClassifier(max_depth=100, max_fea...",0.577513,0.587174,0.551551
7,"(DecisionTreeClassifier(max_depth=100, max_fea...",0.577751,0.586063,0.552389
8,"(DecisionTreeClassifier(max_depth=100, max_fea...",0.57787,0.586341,0.551132
9,"(DecisionTreeClassifier(max_depth=200, max_fea...",0.577751,0.587451,0.551551


In [55]:
#modelFDRs.to_csv('modelFDR.csv')