In [170]:
# Initial imports

import pandas as pd
from pathlib import Path
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split


In [171]:
# Load in the CSV data into a DataFrame
fraud_df = pd.read_csv(
    Path("./_data/card_transaction_data.csv")
)
# Display head and tail of the dataframe
fraud_df


Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,1/1/2019 0:00,3.890000e+13,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.2620,4154,Nature conservation officer,1/19/1962,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
1,1/1/2019 0:00,6.300000e+11,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,6/21/1978,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,1/1/2019 0:00,2.700000e+15,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",3/9/1988,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
3,1/1/2019 0:01,3.530000e+15,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.00,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939,Patent attorney,1/12/1967,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,1/1/2019 0:03,3.760000e+14,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99,Dance movement psychotherapist,3/28/1986,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59555,1/18/2019 23:20,4.590000e+15,fraud_Pouros-Conroy,shopping_pos,1334.07,Michelle,Gregory,F,6983 Carrillo Isle,Edisto Island,...,32.5486,-80.3070,2408,"Sales professional, IT",7/5/1997,f0c086495d403024ce6d30f715713319,1326928816,31.615611,-79.702908,1
59556,1/18/2019 23:31,3.720000e+14,fraud_Ullrich Ltd,kids_pets,6.06,Rick,Martinez,M,062 Poole Hollow Apt. 815,Deadwood,...,44.3566,-103.6999,1979,Multimedia programmer,3/13/1970,4886a67813ff3bfdf38359908d5c0d2b,1326929485,44.371569,-104.435937,1
59557,1/18/2019 23:31,4.640000e+15,fraud_Wilkinson Ltd,entertainment,386.91,Sabrina,Johnson,F,320 Nicholson Orchard,Thompson,...,38.9999,-109.6150,46,"Surveyor, minerals",4/23/1987,48dcc21bdef98405171a831d70b463c8,1326929496,39.262556,-109.656927,1
59558,1/18/2019 23:39,3.720000e+14,"fraud_Pouros, Walker and Spencer",kids_pets,7.57,Rick,Martinez,M,062 Poole Hollow Apt. 815,Deadwood,...,44.3566,-103.6999,1979,Multimedia programmer,3/13/1970,68739f741b5cc017f1ba78f50966f628,1326929995,44.359917,-103.267581,1


In [172]:
# Review the data types associated with the columns
fraud_df.dtypes

trans_date_trans_time     object
cc_num                   float64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

In [173]:
# Drop the "trans_date_trans_time" column from the DataFrame
fraud_df = fraud_df.drop(
    columns=[
        "merchant","trans_date_trans_time","cc_num","first","last","street","lat","long","city_pop","job","trans_num","unix_time","merch_lat","merch_long"
        ]
).copy()

# Review the DataFrame
fraud_df

Unnamed: 0,category,amt,gender,city,state,zip,dob,is_fraud
0,entertainment,220.11,M,Malad City,ID,83252,1/19/1962,0
1,grocery_pos,107.23,F,Orient,WA,99160,6/21/1978,0
2,misc_net,4.97,F,Moravian Falls,NC,28654,3/9/1988,0
3,gas_transport,45.00,M,Boulder,MT,59632,1/12/1967,0
4,misc_pos,41.96,M,Doe Hill,VA,24433,3/28/1986,0
...,...,...,...,...,...,...,...,...
59555,shopping_pos,1334.07,F,Edisto Island,SC,29438,7/5/1997,1
59556,kids_pets,6.06,M,Deadwood,SD,57732,3/13/1970,1
59557,entertainment,386.91,F,Thompson,UT,84540,4/23/1987,1
59558,kids_pets,7.57,M,Deadwood,SD,57732,3/13/1970,1


In [174]:
# Create a list of categorical values
catergorical_variables = list(fraud_df.dtypes[fraud_df.dtypes =="object"].index)

# Encode categorical variables using OneHotEncoder
catergorical_variables

['category', 'gender', 'city', 'state', 'dob']

In [175]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

In [176]:
# Encode the categorcal variables using OneHotEncoder
encoded_data = enc.fit_transform(fraud_df[catergorical_variables])

In [177]:
# Create a dataframe with the encoded variables
encoded_df = pd.DataFrame(
    encoded_data,
    columns=enc.get_feature_names(catergorical_variables)

)
# Display sample data
encoded_df

Unnamed: 0,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,...,dob_9/30/1978,dob_9/6/1940,dob_9/6/1988,dob_9/7/1948,dob_9/8/1935,dob_9/8/1938,dob_9/8/1969,dob_9/8/1976,dob_9/8/1987,dob_9/9/1927
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59555,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59557,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59558,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [178]:
# Create a DataFrame with the columns containing numerical variables from the original dataset
encoded_df = pd.concat([encoded_df,fraud_df.drop(columns=catergorical_variables)], axis=1)

# Review the DataFrame
encoded_df


Unnamed: 0,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,...,dob_9/7/1948,dob_9/8/1935,dob_9/8/1938,dob_9/8/1969,dob_9/8/1976,dob_9/8/1987,dob_9/9/1927,amt,zip,is_fraud
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,220.11,83252,0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,107.23,99160,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.97,28654,0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45.00,59632,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,41.96,24433,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59555,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1334.07,29438,1
59556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.06,57732,1
59557,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,386.91,84540,1
59558,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.57,57732,1


In [179]:
# Define the target set y using the "is_fraud" column
y = encoded_df["is_fraud"]

# Display a sample of y
y[:10]


0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: is_fraud, dtype: int64

In [180]:
# Define the features set X by selecting all columns but "is_fraud"
X = encoded_df.drop(columns=["is_fraud"])

# Review the feature DataFrame
X.head(10)

Unnamed: 0,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,...,dob_9/6/1988,dob_9/7/1948,dob_9/8/1935,dob_9/8/1938,dob_9/8/1969,dob_9/8/1976,dob_9/8/1987,dob_9/9/1927,amt,zip
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,220.11,83252
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,107.23,99160
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.97,28654
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45.0,59632
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,41.96,24433
5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,94.63,18917
6,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.54,67851
7,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.65,22824
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.27,15665
9,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,198.39,37040


In [181]:
# Split the preprocessed data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [182]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [188]:
# Define the number of inputs to the model
number_input_features = len(X_train.iloc[0])

# Review the number of features
number_input_features

# Define the number of neurons in the output layer
number_output_neurons = 1

# Define the number of hidden nodes for the first hidden layer
hidden_nodes_layer1 = (number_input_features + 1) // 2

# Review the number of hidden nodes in the first layer
hidden_nodes_layer1

# Define the number of hidden nodes for the second hidden layer
hidden_nodes_layer2 = (hidden_nodes_layer1 + 1) // 2

# Review the number of hidden nodes in the second layer
hidden_nodes_layer2

450

In [189]:
# Create the Sequential model instance
nn = Sequential()

# Add the first hidden layer
nn.add(Dense(units = hidden_nodes_layer1, input_dim = number_input_features, activation = "relu"))

# Add the second hidden layer
nn.add(Dense(units = hidden_nodes_layer2, activation = "relu"))

# Add the output layer to the model specifying the number of output neurons and activation function
nn.add(Dense(units = number_output_neurons , activation="sigmoid"))

# Display the Sequential model summary
nn.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 900)               1620000   
_________________________________________________________________
dense_1 (Dense)              (None, 450)               405450    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 451       
Total params: 2,025,901
Trainable params: 2,025,901
Non-trainable params: 0
_________________________________________________________________


In [190]:
# Compile the Sequential model
nn.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])


In [191]:
# Fit the model using 50 epochs and the training data
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [192]:
# Evaluate the model loss and accuracy metrics using the evaluate method and the test data
model_loss, model_accuracy =  nn.evaluate(X_test_scaled,y_test,verbose=2)

# Display the model loss and accuracy results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

466/466 - 1s - loss: 0.0206 - accuracy: 0.9979
Loss: 0.020625099539756775, Accuracy: 0.9978508949279785
