In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import pickle
from lightgbm import LGBMClassifier
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score, accuracy_score

In [2]:
data = pd.read_csv('Data/traintest_mod.csv')
data.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,first,last,gender,state,zip,lat,...,dob,merch_lat,merch_long,is_fraud,year,month,day,hour,DOW,generation
0,2019-01-01 00:00:18,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,NC,28654,36.0788,...,1988-03-09,36.011293,-82.048315,0.0,2019,1,1,0,Tuesday,80s
1,2019-01-01 00:00:44,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,WA,99160,48.8878,...,1978-06-21,49.159047,-118.186462,0.0,2019,1,1,0,Tuesday,70s
2,2019-01-01 00:00:51,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,ID,83252,42.1808,...,1962-01-19,43.150704,-112.154481,0.0,2019,1,1,0,Tuesday,60s
3,2019-01-01 00:01:16,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,MT,59632,46.2306,...,1967-01-12,47.034331,-112.561071,0.0,2019,1,1,0,Tuesday,60s
4,2019-01-01 00:03:06,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,VA,24433,38.4207,...,1986-03-28,38.674999,-78.632459,0.0,2019,1,1,0,Tuesday,80s


In [3]:
data['full_name'] = data.apply(lambda x: x['first'] + ' ' + x['last'], axis = 1)
data.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,first,last,gender,state,zip,lat,...,merch_lat,merch_long,is_fraud,year,month,day,hour,DOW,generation,full_name
0,2019-01-01 00:00:18,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,NC,28654,36.0788,...,36.011293,-82.048315,0.0,2019,1,1,0,Tuesday,80s,Jennifer Banks
1,2019-01-01 00:00:44,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,WA,99160,48.8878,...,49.159047,-118.186462,0.0,2019,1,1,0,Tuesday,70s,Stephanie Gill
2,2019-01-01 00:00:51,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,ID,83252,42.1808,...,43.150704,-112.154481,0.0,2019,1,1,0,Tuesday,60s,Edward Sanchez
3,2019-01-01 00:01:16,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,MT,59632,46.2306,...,47.034331,-112.561071,0.0,2019,1,1,0,Tuesday,60s,Jeremy White
4,2019-01-01 00:03:06,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,VA,24433,38.4207,...,38.674999,-78.632459,0.0,2019,1,1,0,Tuesday,80s,Tyler Garcia


In [4]:
data.columns

Index(['trans_date_trans_time', 'merchant', 'category', 'amt', 'first', 'last',
       'gender', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob',
       'merch_lat', 'merch_long', 'is_fraud', 'year', 'month', 'day', 'hour',
       'DOW', 'generation', 'full_name'],
      dtype='object')

In [5]:
cols_to_drop = ['trans_date_trans_time', 'first', 'last', 'dob']

data.drop(cols_to_drop, axis = 1, inplace = True)

In [6]:
data.head()

Unnamed: 0,merchant,category,amt,gender,state,zip,lat,long,city_pop,job,merch_lat,merch_long,is_fraud,year,month,day,hour,DOW,generation,full_name
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",36.011293,-82.048315,0.0,2019,1,1,0,Tuesday,80s,Jennifer Banks
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,49.159047,-118.186462,0.0,2019,1,1,0,Tuesday,70s,Stephanie Gill
2,fraud_Lind-Buckridge,entertainment,220.11,M,ID,83252,42.1808,-112.262,4154,Nature conservation officer,43.150704,-112.154481,0.0,2019,1,1,0,Tuesday,60s,Edward Sanchez
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,MT,59632,46.2306,-112.1138,1939,Patent attorney,47.034331,-112.561071,0.0,2019,1,1,0,Tuesday,60s,Jeremy White
4,fraud_Keeling-Crist,misc_pos,41.96,M,VA,24433,38.4207,-79.4629,99,Dance movement psychotherapist,38.674999,-78.632459,0.0,2019,1,1,0,Tuesday,80s,Tyler Garcia


In [7]:
cols_to_encode = ['merchant', 'category', 'gender', 'state', 'job', 'merch_lat', 'merch_long',
                 'DOW', 'generation', 'full_name']

for col in cols_to_encode:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    
data.head()

Unnamed: 0,merchant,category,amt,gender,state,zip,lat,long,city_pop,job,merch_lat,merch_long,is_fraud,year,month,day,hour,DOW,generation,full_name
0,514,8,4.97,0,27,28654,36.0788,-81.1781,3495,372,550600,1223201,0.0,2019,1,1,0,5,7,423
1,241,4,107.23,0,47,99160,48.8878,-118.2105,149,431,1745263,110910,0.0,2019,1,1,0,5,6,884
2,390,0,220.11,1,13,83252,42.1808,-112.262,4154,308,1451077,169563,0.0,2019,1,1,0,5,5,304
3,360,2,45.0,1,26,59632,46.2306,-112.1138,1939,330,1697797,164676,0.0,2019,1,1,0,5,5,442
4,297,9,41.96,1,45,24433,38.4207,-79.4629,99,116,787219,1458121,0.0,2019,1,1,0,5,7,950


In [8]:
data.tail()

Unnamed: 0,merchant,category,amt,gender,state,zip,lat,long,city_pop,job,merch_lat,merch_long,is_fraud,year,month,day,hour,DOW,generation,full_name
1852389,507,5,43.77,1,24,63453,40.4931,-91.8912,519,477,978166,715433,,2020,12,31,23,4,5,699
1852390,264,7,111.84,1,43,77566,29.0393,-95.4401,28739,207,88053,495569,,2020,12,31,23,4,8,494
1852391,496,7,86.88,0,47,99323,46.1966,-118.9017,3684,307,1686654,87693,,2020,12,31,23,4,7,61
1852392,75,13,7.99,1,13,83643,44.6255,-116.4493,129,63,1571421,134541,,2020,12,31,23,4,5,316
1852393,125,0,38.13,1,36,73034,35.6665,-97.4798,116001,289,565192,450378,,2020,12,31,23,4,8,835


In [9]:
data['lat_diff'] = data['lat'] - data['merch_lat']
data['long_diff'] = data['long'] - data['merch_long']

a = (np.square(np.sin(np.divide(data['lat_diff'], 2)*np.pi/360)) + 
    np.multiply(np.multiply(np.cos(data['lat']*np.pi/360), np.cos(data['merch_lat']*np.pi/360)),
               np.square(np.sin(np.divide(data['long_diff'], 2))*np.pi/360)))
     
c = 2* np.arctan2(np.sqrt(a), np.sqrt(1-a))
     
data['km_dist'] = 6371*c
     
data['km_dist'].head()

0    13125.216454
1     3664.300303
2    13054.758619
3      515.553928
4    12263.456275
Name: km_dist, dtype: float64

In [10]:
data['km_dist'].sort_values()

1020428        0.013611
1225432        1.101354
1382785        2.498619
1679408        3.505416
914208         4.205209
               ...     
1781152    20012.134829
975562     20012.447972
1735422    20012.488844
628815     20012.907285
1469655    20014.224438
Name: km_dist, Length: 1852394, dtype: float64

In [11]:
train = data[data['is_fraud'].notnull()]
test = data[data['is_fraud'].isnull()]

X = train.drop(['is_fraud'], axis = 1)
y = train['is_fraud']

test.drop(['is_fraud'], axis = 1, inplace = True)
target = pd.read_csv('Data/fraudTest.csv')['is_fraud']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [12]:
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size = 0.15, random_state = 0, stratify = y)

In [13]:
y_train.value_counts()

0.0    1095793
1.0       6380
Name: is_fraud, dtype: int64

In [14]:
weights = []
for i in y_train:
    if i == 0:
        weights.append(6380)
    elif i == 1:
        weights.append(1095793)

In [15]:
anomaly_model = IsolationForest(n_estimators = 200, n_jobs = -1, random_state = 0, verbose = 1)

anomaly_model.fit(x_train, y_train, sample_weight=weights)
val_pred = anomaly_model.predict(x_val)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:   10.4s remaining:   52.1s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:   11.0s finished


In [16]:
val_pred = pd.Series(val_pred).replace(1, 0)
val_pred = val_pred.replace(-1, 1)

In [17]:
val_pred.value_counts()

0    117631
1     76871
dtype: int64

In [18]:
print("Val F1-score: ", f1_score(y_val, val_pred))

Val F1-score:  0.02212905624575304


Well that experiment didn't work out so well

Guess I'll just oversample and treat this as a classification problem

In [19]:
sm = SMOTE(random_state = 0)
x_train, y_train = sm.fit_resample(x_train, y_train)

y_train.shape

(2191586,)

In [20]:
lgb = LGBMClassifier(n_estimators = 8000, learning_rate = 0.1, max_depth = 14, num_leaves = 1024,
                     random_state = 0, n_jobs = -1, reg_lambda = 0.1)
lgb.fit(x_train, y_train, early_stopping_rounds = 200, eval_set = [(x_train, y_train), (x_val, y_val)],
       verbose = True)

[1]	training's binary_logloss: 0.613084	valid_1's binary_logloss: 0.613186
Training until validation scores don't improve for 200 rounds
[2]	training's binary_logloss: 0.547424	valid_1's binary_logloss: 0.547264
[3]	training's binary_logloss: 0.492345	valid_1's binary_logloss: 0.491748
[4]	training's binary_logloss: 0.446119	valid_1's binary_logloss: 0.445076
[5]	training's binary_logloss: 0.406064	valid_1's binary_logloss: 0.404477
[6]	training's binary_logloss: 0.371064	valid_1's binary_logloss: 0.368918
[7]	training's binary_logloss: 0.340553	valid_1's binary_logloss: 0.33805
[8]	training's binary_logloss: 0.31399	valid_1's binary_logloss: 0.311123
[9]	training's binary_logloss: 0.290474	valid_1's binary_logloss: 0.28731
[10]	training's binary_logloss: 0.269754	valid_1's binary_logloss: 0.266543
[11]	training's binary_logloss: 0.25131	valid_1's binary_logloss: 0.247837
[12]	training's binary_logloss: 0.234771	valid_1's binary_logloss: 0.231157
[13]	training's binary_logloss: 0.21996

[107]	training's binary_logloss: 0.0083042	valid_1's binary_logloss: 0.0141435
[108]	training's binary_logloss: 0.00793671	valid_1's binary_logloss: 0.013715
[109]	training's binary_logloss: 0.00771981	valid_1's binary_logloss: 0.0135087
[110]	training's binary_logloss: 0.00740972	valid_1's binary_logloss: 0.013135
[111]	training's binary_logloss: 0.00720353	valid_1's binary_logloss: 0.0129557
[112]	training's binary_logloss: 0.0070567	valid_1's binary_logloss: 0.0128123
[113]	training's binary_logloss: 0.00683576	valid_1's binary_logloss: 0.0125665
[114]	training's binary_logloss: 0.00659441	valid_1's binary_logloss: 0.0122775
[115]	training's binary_logloss: 0.00634408	valid_1's binary_logloss: 0.0120007
[116]	training's binary_logloss: 0.0062299	valid_1's binary_logloss: 0.0118763
[117]	training's binary_logloss: 0.00603974	valid_1's binary_logloss: 0.0117032
[118]	training's binary_logloss: 0.00592423	valid_1's binary_logloss: 0.011563
[119]	training's binary_logloss: 0.00571012	va

[210]	training's binary_logloss: 0.000568179	valid_1's binary_logloss: 0.00560812
[211]	training's binary_logloss: 0.000553993	valid_1's binary_logloss: 0.00558397
[212]	training's binary_logloss: 0.000539179	valid_1's binary_logloss: 0.00557876
[213]	training's binary_logloss: 0.000527035	valid_1's binary_logloss: 0.00556619
[214]	training's binary_logloss: 0.000515595	valid_1's binary_logloss: 0.00557008
[215]	training's binary_logloss: 0.000500588	valid_1's binary_logloss: 0.00554559
[216]	training's binary_logloss: 0.000490103	valid_1's binary_logloss: 0.00553069
[217]	training's binary_logloss: 0.000478941	valid_1's binary_logloss: 0.00552453
[218]	training's binary_logloss: 0.000469847	valid_1's binary_logloss: 0.00550141
[219]	training's binary_logloss: 0.000461183	valid_1's binary_logloss: 0.00547195
[220]	training's binary_logloss: 0.000450739	valid_1's binary_logloss: 0.00547969
[221]	training's binary_logloss: 0.000441446	valid_1's binary_logloss: 0.00545113
[222]	training's

[311]	training's binary_logloss: 8.21703e-05	valid_1's binary_logloss: 0.00515243
[312]	training's binary_logloss: 8.09694e-05	valid_1's binary_logloss: 0.00515022
[313]	training's binary_logloss: 7.91686e-05	valid_1's binary_logloss: 0.00513903
[314]	training's binary_logloss: 7.80709e-05	valid_1's binary_logloss: 0.00514587
[315]	training's binary_logloss: 7.73231e-05	valid_1's binary_logloss: 0.0051483
[316]	training's binary_logloss: 7.63426e-05	valid_1's binary_logloss: 0.00514729
[317]	training's binary_logloss: 7.55978e-05	valid_1's binary_logloss: 0.00515585
[318]	training's binary_logloss: 7.47557e-05	valid_1's binary_logloss: 0.00515807
[319]	training's binary_logloss: 7.36681e-05	valid_1's binary_logloss: 0.00515846
[320]	training's binary_logloss: 7.17513e-05	valid_1's binary_logloss: 0.00514037
[321]	training's binary_logloss: 7.03745e-05	valid_1's binary_logloss: 0.00515299
[322]	training's binary_logloss: 6.86262e-05	valid_1's binary_logloss: 0.0051398
[323]	training's b

[412]	training's binary_logloss: 2.39292e-05	valid_1's binary_logloss: 0.00522202
[413]	training's binary_logloss: 2.3505e-05	valid_1's binary_logloss: 0.00521347
[414]	training's binary_logloss: 2.331e-05	valid_1's binary_logloss: 0.00521526
[415]	training's binary_logloss: 2.3127e-05	valid_1's binary_logloss: 0.00522157
[416]	training's binary_logloss: 2.29525e-05	valid_1's binary_logloss: 0.00522329
[417]	training's binary_logloss: 2.2703e-05	valid_1's binary_logloss: 0.00522051
[418]	training's binary_logloss: 2.2502e-05	valid_1's binary_logloss: 0.00522253
[419]	training's binary_logloss: 2.2326e-05	valid_1's binary_logloss: 0.00522346
[420]	training's binary_logloss: 2.20772e-05	valid_1's binary_logloss: 0.00522078
[421]	training's binary_logloss: 2.19005e-05	valid_1's binary_logloss: 0.00522071
[422]	training's binary_logloss: 2.17164e-05	valid_1's binary_logloss: 0.00522033
[423]	training's binary_logloss: 2.15377e-05	valid_1's binary_logloss: 0.00522084
[424]	training's binary

LGBMClassifier(max_depth=14, n_estimators=8000, num_leaves=1024, random_state=0,
               reg_lambda=0.1)

In [21]:
val_pred = lgb.predict(x_val)
test_pred = lgb.predict(test)

In [22]:
print("Val F1-score: ", f1_score(y_val, val_pred))
print("Test F1-score: ", f1_score(target, test_pred))

Val F1-score:  0.880107768298159
Test F1-score:  0.7438108484005563


In [23]:
xgb = XGBClassifier(n_estimators = 10, learning_rate = 0.1, random_state = 0, n_jobs = -1)
xgb.fit(x_train, y_train, early_stopping_rounds = 200, eval_set = [(x_train, y_train), (x_val, y_val)],
       verbose = True)



[0]	validation_0-logloss:0.62402	validation_1-logloss:0.62294
[1]	validation_0-logloss:0.56720	validation_1-logloss:0.56562
[2]	validation_0-logloss:0.51934	validation_1-logloss:0.51702
[3]	validation_0-logloss:0.47754	validation_1-logloss:0.47389
[4]	validation_0-logloss:0.44439	validation_1-logloss:0.43928
[5]	validation_0-logloss:0.41470	validation_1-logloss:0.40815
[6]	validation_0-logloss:0.38925	validation_1-logloss:0.38131
[7]	validation_0-logloss:0.36716	validation_1-logloss:0.35759
[8]	validation_0-logloss:0.34791	validation_1-logloss:0.33739
[9]	validation_0-logloss:0.32844	validation_1-logloss:0.31652


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=10, n_jobs=-1, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [24]:
val_pred = xgb.predict(x_val)
test_pred = xgb.predict(test)

In [25]:
print("Val F1-score: ", f1_score(y_val, val_pred))
print("Test F1-score: ", f1_score(target, test_pred))

Val F1-score:  0.24704507078841406
Test F1-score:  0.1936606317620834


#### Building a Stratified K-Fold model

In [26]:
kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 0)

splits = kfold.split(X, y)

val_acc_scores = []
val_weighted_scores = []

test_acc_scores = []
test_weighted_scores = []

In [27]:
for i, (Train, Test) in enumerate(splits):
    X_Train, X_Val, Y_Train, Y_Val = X.iloc[Train], X.iloc[Test], y.iloc[Train], y.iloc[Test]
    
    sm = SMOTE(random_state = 0, n_jobs = -1)
    X_Train, Y_Train = sm.fit_resample(X_Train, Y_Train)
    
    lgb = LGBMClassifier(n_estimators = 8000, learning_rate = 0.1, max_depth = 14, num_leaves = 1024,
                     random_state = 0, n_jobs = -1, reg_lambda = 0.1)
    lgb.fit(X_Train, Y_Train, early_stopping_rounds = 100, eval_set = [(X_Train, Y_Train), (X_Val, Y_Val)],  verbose = False)
    
    with open('Model_Data/model_'+str(i+1)+'.pkl', 'wb') as f:
        pickle.dump(lgb, f)
    print('Model-' + str(i+1)+ ' done')
    
    val_pred = lgb.predict(X_Val)
    
    val_acc_scores.append(accuracy_score(Y_Val, val_pred))
    val_weighted_scores.append(f1_score(Y_Val, val_pred))
    
    test_pred = lgb.predict(test)
    
    test_acc_scores.append(accuracy_score(target, test_pred))
    test_weighted_scores.append(f1_score(target, test_pred))

Model-1 done
Model-2 done
Model-3 done
Model-4 done
Model-5 done


In [28]:
print("Validation Weighted F1:", sum(val_weighted_scores)/len(val_weighted_scores))
print("Validation Accuracy:", sum(val_acc_scores)/len(val_acc_scores))

print("Test Weighted F1:", sum(test_weighted_scores)/len(test_weighted_scores))
print("Test Accuracy:", sum(test_acc_scores)/len(test_acc_scores))

Validation Weighted F1: 0.8814541027194241
Validation Accuracy: 0.9986519366842115
Test Weighted F1: 0.739397502481668
Test Accuracy: 0.9983203741459263
