In [None]:
# THIS NOTEBOOK CLEANS AND PREPS THE TRAINING DATASET FOR USE IN ML MODELS

In [31]:
# Import the required modules
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the fraud transactions from the CSV in the working directory
# and preview the first rows
fraud_data = pd.read_csv(Path('fraud_dataset.csv'))
fraud_data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [3]:
# Drop unnamed columns.
# These 'Unnamed: ...' columns are typically leftover row indexes from a CSV that
# was saved with its index and read back in. Matching on the name prefix (rather
# than hard-coding 'Unnamed: 0.1' and 'Unnamed: 0') handles any number of such
# columns and keeps this cell safe to re-run: an already-cleaned frame simply has
# nothing left to drop instead of raising a KeyError.
unnamed_columns = [col for col in fraud_data.columns if str(col).startswith('Unnamed:')]
fraud_data = fraud_data.drop(columns=unnamed_columns)
fraud_data.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [5]:
# Identify target variable imbalance: count how many rows fall into each
# is_fraud class (0 vs 1)
fraud_data['is_fraud'].value_counts()

is_fraud
0    24776
1      224
Name: count, dtype: int64

In [6]:
# Identify data types to prepare for model: list each column's dtype so the
# non-numeric (object) columns that still need encoding can be spotted
fraud_data.dtypes

trans_date_trans_time     object
cc_num                     int64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

In [7]:
# Remove the columns that are not used as model inputs: the card number,
# the cardholder's first/last name and the per-transaction id
fraud_data = fraud_data.drop(['cc_num', 'first', 'last', 'trans_num'], axis=1)

In [10]:
# Separate the categorical (object-dtype) columns into an independent copy
# so they can be encoded without touching fraud_data
categorical_data = fraud_data.loc[
    :,
    [
        'trans_date_trans_time',
        'merchant',
        'category',
        'gender',
        'street',
        'city',
        'state',
        'job',
        'dob',
    ],
].copy()
categorical_data.head()

Unnamed: 0,trans_date_trans_time,merchant,category,gender,street,city,state,job,dob
0,2019-01-01 00:00:18,"fraud_Rippin, Kub and Mann",misc_net,F,561 Perry Cove,Moravian Falls,NC,"Psychologist, counselling",1988-03-09
1,2019-01-01 00:00:44,"fraud_Heller, Gutmann and Zieme",grocery_pos,F,43039 Riley Greens Suite 393,Orient,WA,Special educational needs teacher,1978-06-21
2,2019-01-01 00:00:51,fraud_Lind-Buckridge,entertainment,M,594 White Dale Suite 530,Malad City,ID,Nature conservation officer,1962-01-19
3,2019-01-01 00:01:16,"fraud_Kutch, Hermiston and Farrell",gas_transport,M,9443 Cynthia Court Apt. 038,Boulder,MT,Patent attorney,1967-01-12
4,2019-01-01 00:03:06,fraud_Keeling-Crist,misc_pos,M,408 Bradley Rest,Doe Hill,VA,Dance movement psychotherapist,1986-03-28


In [11]:
# Frequency-encode the categorical variables: for every source column, add a
# '<column>freq_enc' column holding the number of rows that share that row's value.
#
# Only columns that are not already encoded are used as sources, so re-running
# this cell (including after the original columns have been dropped) never
# produces '...freq_encfreq_enc' columns.
# NOTE(review): the counts are computed over the whole frame, before any
# train/test split visible in this notebook - confirm that is intended.
source_columns = [col for col in categorical_data.columns if not col.endswith('freq_enc')]
for column in source_columns:
    # value_counts() is indexed by the distinct values, so it can be used
    # directly as the lookup table for map()
    value_frequencies = categorical_data[column].value_counts()
    categorical_data[column + 'freq_enc'] = categorical_data[column].map(value_frequencies)

categorical_data.head()

Unnamed: 0,trans_date_trans_time,merchant,category,gender,street,city,state,job,dob,trans_date_trans_timefreq_enc,merchantfreq_enc,categoryfreq_enc,genderfreq_enc,streetfreq_enc,cityfreq_enc,statefreq_enc,jobfreq_enc,dobfreq_enc
0,2019-01-01 00:00:18,"fraud_Rippin, Kub and Mann",misc_net,F,561 Perry Cove,Moravian Falls,NC,"Psychologist, counselling",1988-03-09,1,31,1305,13584,45,45,595,79,45
1,2019-01-01 00:00:44,"fraud_Heller, Gutmann and Zieme",grocery_pos,F,43039 Riley Greens Suite 393,Orient,WA,Special educational needs teacher,1978-06-21,1,45,2448,13584,67,82,347,104,67
2,2019-01-01 00:00:51,fraud_Lind-Buckridge,entertainment,M,594 White Dale Suite 530,Malad City,ID,Nature conservation officer,1962-01-19,1,28,1763,11416,12,12,115,12,12
3,2019-01-01 00:01:16,"fraud_Kutch, Hermiston and Farrell",gas_transport,M,9443 Cynthia Court Apt. 038,Boulder,MT,Patent attorney,1967-01-12,1,43,2619,11416,7,7,243,45,7
4,2019-01-01 00:03:06,fraud_Keeling-Crist,misc_pos,M,408 Bradley Rest,Doe Hill,VA,Dance movement psychotherapist,1986-03-28,1,31,1496,11416,44,44,590,44,44


In [12]:
# Check the dtypes after encoding: the original columns and the new
# *freq_enc columns are listed side by side
categorical_data.dtypes

trans_date_trans_time            object
merchant                         object
category                         object
gender                           object
street                           object
city                             object
state                            object
job                              object
dob                              object
trans_date_trans_timefreq_enc     int64
merchantfreq_enc                  int64
categoryfreq_enc                  int64
genderfreq_enc                    int64
streetfreq_enc                    int64
cityfreq_enc                      int64
statefreq_enc                     int64
jobfreq_enc                       int64
dobfreq_enc                       int64
dtype: object

In [13]:
# Keep only the frequency-encoded versions by removing the original,
# unencoded columns
categorical_data = categorical_data.drop(
    [
        'trans_date_trans_time',
        'merchant',
        'category',
        'gender',
        'street',
        'city',
        'state',
        'job',
        'dob',
    ],
    axis=1,
)
categorical_data.head()

Unnamed: 0,trans_date_trans_timefreq_enc,merchantfreq_enc,categoryfreq_enc,genderfreq_enc,streetfreq_enc,cityfreq_enc,statefreq_enc,jobfreq_enc,dobfreq_enc
0,1,31,1305,13584,45,45,595,79,45
1,1,45,2448,13584,67,82,347,104,67
2,1,28,1763,11416,12,12,115,12,12
3,1,43,2619,11416,7,7,243,45,7
4,1,31,1496,11416,44,44,590,44,44


In [16]:
# Collect the already-numeric columns (floats, ints and the is_fraud target)
# in their own frame, ready to be concatenated with the encoded categorical data
floats = fraud_data.drop(
    columns=[
        'trans_date_trans_time',
        'merchant',
        'category',
        'gender',
        'street',
        'city',
        'state',
        'job',
        'dob',
    ]
)
floats.head()

Unnamed: 0,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,4.97,28654,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0
1,107.23,99160,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0
2,220.11,83252,42.1808,-112.262,4154,1325376051,43.150704,-112.154481,0
3,45.0,59632,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0
4,41.96,24433,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0


In [17]:
# Join the encoded categorical columns and the numeric columns side by side
# (column-wise, aligned on the row index)
data = pd.concat([categorical_data, floats], axis='columns')
data.head()

Unnamed: 0,trans_date_trans_timefreq_enc,merchantfreq_enc,categoryfreq_enc,genderfreq_enc,streetfreq_enc,cityfreq_enc,statefreq_enc,jobfreq_enc,dobfreq_enc,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,1,31,1305,13584,45,45,595,79,45,4.97,28654,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0
1,1,45,2448,13584,67,82,347,104,67,107.23,99160,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0
2,1,28,1763,11416,12,12,115,12,12,220.11,83252,42.1808,-112.262,4154,1325376051,43.150704,-112.154481,0
3,1,43,2619,11416,7,7,243,45,7,45.0,59632,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0
4,1,31,1496,11416,44,44,590,44,44,41.96,24433,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0


In [18]:
# Confirm dtypes are model friendly: every column in the combined frame
# should be numeric (int64 / float64)
data.dtypes

trans_date_trans_timefreq_enc      int64
merchantfreq_enc                   int64
categoryfreq_enc                   int64
genderfreq_enc                     int64
streetfreq_enc                     int64
cityfreq_enc                       int64
statefreq_enc                      int64
jobfreq_enc                        int64
dobfreq_enc                        int64
amt                              float64
zip                                int64
lat                              float64
long                             float64
city_pop                           int64
unix_time                          int64
merch_lat                        float64
merch_long                       float64
is_fraud                           int64
dtype: object

In [None]:
# Write the prepared dataset to CSV for the next parts of the project.
# NOTE(review): the row index is written too (pandas' default, made explicit
# here), so reading this file back yields an extra 'Unnamed: 0' column unless
# the reader passes index_col=0 - consider index=False once downstream
# notebooks are confirmed not to rely on that column.
data.to_csv('prepped_data.csv', index=True)