In [29]:
import warnings
warnings.filterwarnings('ignore')

In [30]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import numpy as np
import datetime as dt
from pathlib import Path
from collections import Counter
import pandas as pd
import tensorflow as tf
import os
import seaborn as sns
import matplotlib.pyplot as plt

#  Import and read the fraudtrain.csv.
import pandas as pd 
ccf_df = pd.read_csv("Resources/fraudtrain.csv",parse_dates=['trans_date_trans_time',])

# Drop any null columns
ccf_df=ccf_df.dropna(axis='columns', how='all')
ccf_df.drop_duplicates(inplace=True)

# Drop all null rows
ccf_df= ccf_df.dropna()

# Preview 
ccf_df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [31]:
# Generate our categorical variable list
ccf_cat = ccf_df.dtypes[ccf_df.dtypes == "object"].index.tolist()

In [32]:
# Check the number of unique values in each column
ccf_df[ccf_cat].nunique()

merchant         693
category          14
first            352
last             481
gender             2
street           983
city             894
state             51
job              494
dob              968
trans_num    1296675
dtype: int64

In [43]:
# Update fields to better reflect more legible items
ccf_df['age'] = dt.date.today().year-pd.to_datetime(ccf_df['dob']).dt.year
ccf_df['hour'] = pd.to_datetime(ccf_df['trans_date_trans_time']).dt.hour
ccf_df['day'] = pd.to_datetime(ccf_df['trans_date_trans_time']).dt.dayofweek
ccf_df['month'] = pd.to_datetime(ccf_df['trans_date_trans_time']).dt.month

ccf_df2 = ccf_df.drop(columns = ['cc_num','Unnamed: 0', 'trans_date_trans_time','merchant','dob', 'first', 'last', 'street', 'trans_num', 'city', 'job'])

ccf_df2.head()

Unnamed: 0,category,amt,gender,state,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud,age,hour,day,month
0,misc_net,4.97,F,NC,28654,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0,34,0,1,1
1,grocery_pos,107.23,F,WA,99160,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0,44,0,1,1
2,entertainment,220.11,M,ID,83252,42.1808,-112.262,4154,1325376051,43.150704,-112.154481,0,60,0,1,1
3,gas_transport,45.0,M,MT,59632,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0,55,0,1,1
4,misc_pos,41.96,M,VA,24433,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0,36,0,1,1


In [44]:
# Create a function to label states by region
def label_states (row):
    if row['state'] in ['VT', 'ME', 'NH', 'MA', 'RI', 'CT','NY','PA','NJ']:
        return 'North-East'
    if row['state'] in ['WI','MI','IL','IN','OH','ND','SD','NE','KS','MN','IA','MO']:
        return 'Mid-West'
    if row['state'] in ['DE','MD','DC','VA','WV','NC','SC','GA','FL','KY','TN','MS','AL','AR','LA']:
        return 'South-East'
    if row['state'] in ['OK','TX','NM','AZ']:
        return 'South-West'
    if row['state'] in ['CA','NV','UT','CO','WY','ID','OR','WA','MT','AK','HI']:
        return 'West'
    return 'etc'

In [45]:
# Add regions to each row of data
ccf_df2['Region']=ccf_df2.apply(lambda row: label_states(row), axis=1)

In [46]:
# Drop the state column and preview
ccf_df2 = ccf_df2.drop(columns = ['state'])
ccf_df2.head()

Unnamed: 0,category,amt,gender,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud,age,hour,day,month,Region
0,misc_net,4.97,F,28654,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0,34,0,1,1,South-East
1,grocery_pos,107.23,F,99160,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0,44,0,1,1,West
2,entertainment,220.11,M,83252,42.1808,-112.262,4154,1325376051,43.150704,-112.154481,0,60,0,1,1,West
3,gas_transport,45.0,M,59632,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0,55,0,1,1,West
4,misc_pos,41.96,M,24433,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0,36,0,1,1,South-East


In [47]:
ccf_df2.dtypes

category       object
amt           float64
gender         object
zip             int64
lat           float64
long          float64
city_pop        int64
unix_time       int64
merch_lat     float64
merch_long    float64
is_fraud        int64
age             int64
hour            int64
day             int64
month           int64
Region         object
dtype: object

In [48]:
#convert category, gender & region to dummy variables
df=pd.get_dummies(ccf_df2, drop_first=True)
df

Unnamed: 0,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud,age,...,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M,Region_North-East,Region_South-East,Region_South-West,Region_West
0,4.97,28654,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0,34,...,0,0,0,0,0,0,0,1,0,0
1,107.23,99160,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0,44,...,0,0,0,0,0,0,0,0,0,1
2,220.11,83252,42.1808,-112.2620,4154,1325376051,43.150704,-112.154481,0,60,...,0,0,0,0,0,1,0,0,0,1
3,45.00,59632,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0,55,...,0,0,0,0,0,1,0,0,0,1
4,41.96,24433,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0,36,...,1,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,15.56,84735,37.7175,-112.4777,258,1371816728,36.841266,-111.690765,0,61,...,0,0,0,0,0,1,0,0,0,1
1296671,51.70,21790,39.2667,-77.5101,100,1371816739,38.906881,-78.246528,0,43,...,0,0,0,0,0,1,0,1,0,0
1296672,105.93,88325,32.9396,-105.8189,899,1371816752,33.619513,-105.130529,0,55,...,0,0,0,0,0,1,0,0,1,0
1296673,74.90,57756,43.3526,-102.5411,1126,1371816816,42.788940,-103.241160,0,42,...,0,0,0,0,0,1,0,0,0,0


In [49]:
# Split our preprocessed data into our features and target arrays
y = df["is_fraud"].values
X = df.drop(["is_fraud"], axis='columns').values

#Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)

In [50]:
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

method= SMOTE()
X_resampled, y_resampled = method.fit_resample(X_train, y_train)
model=LogisticRegression()
model.fit(X_resampled,y_resampled)
y_pred=model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

              precision    recall  f1-score   support

           0       1.00      0.88      0.94    322272
           1       0.04      0.76      0.07      1897

    accuracy                           0.88    324169
   macro avg       0.52      0.82      0.50    324169
weighted avg       0.99      0.88      0.93    324169

[[284500  37772]
 [   462   1435]]


In [52]:
#random forest 
from sklearn.ensemble import RandomForestClassifier
model2 = RandomForestClassifier(random_state=13)
model2.fit(X_train,y_train)
y_pred=model2.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    322272
           1       0.98      0.75      0.85      1897

    accuracy                           1.00    324169
   macro avg       0.99      0.87      0.92    324169
weighted avg       1.00      1.00      1.00    324169

[[322242     30]
 [   481   1416]]


In [53]:
from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=1)
X_resampled1, y_resampled1 = ros.fit_resample(X_train, y_train)
Counter(y_resampled1)

Counter({0: 5609, 1: 5609})

In [54]:
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
model3 = LogisticRegression(solver='lbfgs', random_state=13)
model3.fit(X_resampled1, y_resampled1)
y_pred1=model3.predict(X_test)
balanced_accuracy_score(y_test, y_pred)
report = classification_report_imbalanced(y_test, y_pred1)
print(report)
matrix = confusion_matrix(y_test, y_pred1)
print(matrix)

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.89      0.76      0.94      0.82      0.68    322272
          1       0.04      0.76      0.89      0.07      0.82      0.66      1897

avg / total       0.99      0.89      0.76      0.93      0.82      0.68    324169

[[286119  36153]
 [   462   1435]]
