### **Model Development**

---

### **Project Setup**

In [None]:
# Import libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import missingno as msno

# Data Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder

# Training and Evaluations
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Hyper-parameter Tuning and Cross Validation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, cross_validate

# Handling Imbalanced Data
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline, make_pipeline 

# Import machine learning models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

In [None]:
# Load the credit-card fraud CSV into a DataFrame and preview its first rows
test_df = pd.read_csv(filepath_or_buffer="credit_card_fraud.csv")
test_df.head(n=5)

Unnamed: 0,transaction_time,credit_card_number,merchant,category,transaction_amount,gender,street,city,state,zip,...,job,transaction_number,is_fraud,full_name,age,transaction_location,merchant_location,transaction_hour,distance_km,is_nighttime
0,2019-01-01 00:00:18,2703186189652095,"Rippin, Kub and Mann",misc_net,4.97,F,561 Perry Cove,Moravian Falls,NC,28654,...,"Psychologist, counselling",0b242abb623afc578575680df30655b9,0,Jennifer Banks,31,"-81.1781, 36.0788","-82.048315, 36.011293",0,78.597568,1
1,2019-01-01 00:00:44,630423337322,"Heller, Gutmann and Zieme",grocery_pos,107.23,F,43039 Riley Greens Suite 393,Orient,WA,99160,...,Special educational needs teacher,1f76529f8574734946361c461b024d99,0,Stephanie Gill,41,"-118.2105, 48.8878","-118.186462, 49.159046999999994",0,30.212176,1
2,2019-01-01 00:00:51,38859492057661,Lind-Buckridge,entertainment,220.11,M,594 White Dale Suite 530,Malad City,ID,83252,...,Nature conservation officer,a1a22d70485983eac12b5b88dad1cf95,0,Edward Sanchez,57,"-112.262, 42.1808","-112.154481, 43.150704",0,108.206083,1
3,2019-01-01 00:01:16,3534093764340240,"Kutch, Hermiston and Farrell",gas_transport,45.0,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,...,Patent attorney,6b849c168bdad6f867558c3793159a81,0,Jeremy White,52,"-112.1138, 46.2306","-112.561071, 47.034331",0,95.673231,1
4,2019-01-01 00:03:06,375534208663984,Keeling-Crist,misc_pos,41.96,M,408 Bradley Rest,Doe Hill,VA,24433,...,Dance movement psychotherapist,a41d7549acf90789359a9aa5346dcb46,0,Tyler Garcia,33,"-79.4629, 38.4207","-78.632459, 38.674999",0,77.556744,1


In [None]:
# Dataset information: row count, per-column dtypes and memory usage
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 21 columns):
 #   Column                Dtype  
---  ------                -----  
 0   transaction_time      object 
 1   credit_card_number    int64  
 2   merchant              object 
 3   category              object 
 4   transaction_amount    float64
 5   gender                object 
 6   street                object 
 7   city                  object 
 8   state                 object 
 9   zip                   int64  
 10  city_population       int64  
 11  job                   object 
 12  transaction_number    object 
 13  is_fraud              int64  
 14  full_name             object 
 15  age                   int64  
 16  transaction_location  object 
 17  merchant_location     object 
 18  transaction_hour      int64  
 19  distance_km           float64
 20  is_nighttime          int64  
dtypes: float64(2), int64(7), object(12)
memory usage: 296.8+ MB


In [None]:
# Cast columns to the types they should be treated as:
#   - transaction_time is parsed into datetimes
#   - credit_card_number, zip, is_fraud and is_nighttime (loaded as int64)
#     are stored as strings
# NOTE(review): is_fraud becomes a string label here -- confirm that the
# downstream encoders / estimators / metrics expect a string target.
test_df = test_df.assign(
    transaction_time=pd.to_datetime(test_df["transaction_time"])
).astype(
    {
        "credit_card_number": str,
        "zip": str,
        "is_fraud": str,
        "is_nighttime": str,
    }
)

In [None]:
# Summary statistics for the numeric and datetime columns, one row per column
test_df.describe().transpose()

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
transaction_time,1852394.0,2020-01-20 21:31:46.801827328,2019-01-01 00:00:18,2019-07-23 04:13:43.750000128,2020-01-02 01:15:31,2020-07-23 12:11:25.249999872,2020-12-31 23:59:34,
transaction_amount,1852394.0,70.063567,1.0,9.64,47.45,83.1,28948.9,159.253975
city_population,1852394.0,88643.674509,23.0,741.0,2443.0,20328.0,2906700.0,301487.618344
age,1852394.0,46.21138,14.0,33.0,44.0,57.0,96.0,17.395446
transaction_hour,1852394.0,12.806119,0.0,7.0,14.0,19.0,23.0,6.815753
distance_km,1852394.0,76.111726,0.022255,55.320087,78.21638,98.509467,152.117173,29.11697


In [None]:
# Summary statistics (count / unique / top / freq) for the object-dtype
# columns, one row per column
test_df.describe(include=["object"]).transpose()

Unnamed: 0,count,unique,top,freq
credit_card_number,1852394,999,6538441737335434,4392
merchant,1852394,693,Kilback LLC,6262
category,1852394,14,gas_transport,188029
gender,1852394,2,F,1014749
street,1852394,999,444 Robert Mews,4392
city,1852394,906,Birmingham,8040
state,1852394,51,TX,135269
zip,1852394,985,82514,5116
job,1852394,497,Film/video editor,13898
transaction_number,1852394,1852394,0b242abb623afc578575680df30655b9,1


**Dataset Description**: This dataset was initially used in the exploratory data analysis notebook to understand and gain more insight into fraudulent transactions. During that analysis the data was cleaned and several new columns were added; here we preprocess that same dataset before fitting it to the models.

---

### **Data Preprocessing**

In [46]:
# Keep only the columns used for modelling (six features plus the `is_fraud` target).
# The selection is taken from `test_df`, the frame loaded and typed above, so this
# cell runs on a fresh kernel and gives the same result every time it is re-run.
# `.copy()` makes `df` an independent frame rather than a view of `test_df`.
features = ["transaction_amount", "is_nighttime", "category", "transaction_location", "job", "state", "is_fraud"]
df = test_df[features].copy()
df.columns

Index(['transaction_amount', 'is_nighttime', 'category',
       'transaction_location', 'job', 'state', 'is_fraud'],
      dtype='object')

In [47]:
# Target-encode `transaction_location`: each distinct location string is replaced
# by a smoothed mean of the target for that location.
high_card_cols = ["transaction_location"]

# The encoder averages the target, so it must be numeric. `is_fraud` is cast to
# str in the "Change data types" cell; convert it back for fitting only. The
# `is_fraud` column inside `df` is passed through to the output unchanged.
fraud_target = pd.to_numeric(df["is_fraud"])

encoder = TargetEncoder(cols=high_card_cols, smoothing=10)

# NOTE(review): the encoder is fit on every row of `df`. If the train/test split
# happens after this cell, the encoded feature leaks target information from the
# held-out rows -- consider fitting on the training split only (e.g. inside the
# modelling pipeline) and calling `transform` on the test split.
# NOTE(review): this rebinds `test_df` from the raw frame to the encoded frame.
test_df = encoder.fit_transform(df, fraud_target)

In [48]:
# Inspect the encoded frame; `transaction_location` now holds numeric encodings
test_df

Unnamed: 0,transaction_amount,is_nighttime,category,transaction_location,job,state,is_fraud
0,4.97,1,misc_net,0.003758,"Psychologist, counselling",NC,0
1,107.23,1,grocery_pos,0.001605,Special educational needs teacher,WA,0
2,220.11,1,entertainment,0.010884,Nature conservation officer,ID,0
3,45.00,1,gas_transport,0.020188,Patent attorney,MT,0
4,41.96,1,misc_pos,0.004449,Dance movement psychotherapist,VA,0
...,...,...,...,...,...,...,...
1852389,43.77,1,health_fitness,0.005011,Town planner,MO,0
1852390,111.84,1,kids_pets,0.004370,Futures trader,TX,0
1852391,86.88,1,kids_pets,0.001917,Musician,WA,0
1852392,7.99,1,travel,0.003766,Cartographer,ID,0


### **Model Training & Evaluation**