In [1]:
# Import the required modules
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [2]:
fraud_data = pd.read_csv(
    Path('fraud_dataset.csv'))
fraud_data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [3]:
fraud_data = fraud_data.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
fraud_data.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [None]:
#fraud_data['trans_date_trans_time'] = pd.to_datetime(fraud_data['trans_date_trans_time'])

In [4]:
fraud_data['is_fraud'].value_counts()

is_fraud
0    24776
1      224
Name: count, dtype: int64

In [5]:
fraud_data.dtypes

trans_date_trans_time     object
cc_num                     int64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

In [6]:
fraud_data = fraud_data.drop(columns=['cc_num', 'first', 'last', 'trans_num'])

In [7]:
fraud_data.dtypes

trans_date_trans_time     object
merchant                  object
category                  object
amt                      float64
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

In [8]:
fraud_data.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,gender,street,city,state,zip,lat,long,city_pop,job,dob,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,43039 Riley Greens Suite 393,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,fraud_Lind-Buckridge,entertainment,220.11,M,594 White Dale Suite 530,Malad City,ID,83252,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,46.2306,-112.1138,1939,Patent attorney,1967-01-12,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,fraud_Keeling-Crist,misc_pos,41.96,M,408 Bradley Rest,Doe Hill,VA,24433,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,1325376186,38.674999,-78.632459,0


In [9]:
categorical_data = fraud_data[['trans_date_trans_time', 'merchant', 'category', 'gender', 
                               'street', 'city', 'state', 'job', 'dob']].copy()
categorical_data.head()

Unnamed: 0,trans_date_trans_time,merchant,category,gender,street,city,state,job,dob
0,2019-01-01 00:00:18,"fraud_Rippin, Kub and Mann",misc_net,F,561 Perry Cove,Moravian Falls,NC,"Psychologist, counselling",1988-03-09
1,2019-01-01 00:00:44,"fraud_Heller, Gutmann and Zieme",grocery_pos,F,43039 Riley Greens Suite 393,Orient,WA,Special educational needs teacher,1978-06-21
2,2019-01-01 00:00:51,fraud_Lind-Buckridge,entertainment,M,594 White Dale Suite 530,Malad City,ID,Nature conservation officer,1962-01-19
3,2019-01-01 00:01:16,"fraud_Kutch, Hermiston and Farrell",gas_transport,M,9443 Cynthia Court Apt. 038,Boulder,MT,Patent attorney,1967-01-12
4,2019-01-01 00:03:06,fraud_Keeling-Crist,misc_pos,M,408 Bradley Rest,Doe Hill,VA,Dance movement psychotherapist,1986-03-28


In [10]:
for column in categorical_data.columns:
    frequency_map = categorical_data[column].value_counts().to_dict()
    categorical_data[column + 'freq_enc'] = categorical_data[column].map(frequency_map)

categorical_data.head()

Unnamed: 0,trans_date_trans_time,merchant,category,gender,street,city,state,job,dob,trans_date_trans_timefreq_enc,merchantfreq_enc,categoryfreq_enc,genderfreq_enc,streetfreq_enc,cityfreq_enc,statefreq_enc,jobfreq_enc,dobfreq_enc
0,2019-01-01 00:00:18,"fraud_Rippin, Kub and Mann",misc_net,F,561 Perry Cove,Moravian Falls,NC,"Psychologist, counselling",1988-03-09,1,31,1305,13584,45,45,595,79,45
1,2019-01-01 00:00:44,"fraud_Heller, Gutmann and Zieme",grocery_pos,F,43039 Riley Greens Suite 393,Orient,WA,Special educational needs teacher,1978-06-21,1,45,2448,13584,67,82,347,104,67
2,2019-01-01 00:00:51,fraud_Lind-Buckridge,entertainment,M,594 White Dale Suite 530,Malad City,ID,Nature conservation officer,1962-01-19,1,28,1763,11416,12,12,115,12,12
3,2019-01-01 00:01:16,"fraud_Kutch, Hermiston and Farrell",gas_transport,M,9443 Cynthia Court Apt. 038,Boulder,MT,Patent attorney,1967-01-12,1,43,2619,11416,7,7,243,45,7
4,2019-01-01 00:03:06,fraud_Keeling-Crist,misc_pos,M,408 Bradley Rest,Doe Hill,VA,Dance movement psychotherapist,1986-03-28,1,31,1496,11416,44,44,590,44,44


In [11]:
categorical_data.dtypes

trans_date_trans_time            object
merchant                         object
category                         object
gender                           object
street                           object
city                             object
state                            object
job                              object
dob                              object
trans_date_trans_timefreq_enc     int64
merchantfreq_enc                  int64
categoryfreq_enc                  int64
genderfreq_enc                    int64
streetfreq_enc                    int64
cityfreq_enc                      int64
statefreq_enc                     int64
jobfreq_enc                       int64
dobfreq_enc                       int64
dtype: object

In [12]:
categorical_data = categorical_data.drop(columns=['trans_date_trans_time', 'merchant', 'category', 'gender', 
                               'street', 'city', 'state', 'job', 'dob'])
categorical_data.head()

Unnamed: 0,trans_date_trans_timefreq_enc,merchantfreq_enc,categoryfreq_enc,genderfreq_enc,streetfreq_enc,cityfreq_enc,statefreq_enc,jobfreq_enc,dobfreq_enc
0,1,31,1305,13584,45,45,595,79,45
1,1,45,2448,13584,67,82,347,104,67
2,1,28,1763,11416,12,12,115,12,12
3,1,43,2619,11416,7,7,243,45,7
4,1,31,1496,11416,44,44,590,44,44


In [13]:
fraud_data.dtypes

trans_date_trans_time     object
merchant                  object
category                  object
amt                      float64
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

In [14]:
fraud_data.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,gender,street,city,state,zip,lat,long,city_pop,job,dob,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,43039 Riley Greens Suite 393,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,fraud_Lind-Buckridge,entertainment,220.11,M,594 White Dale Suite 530,Malad City,ID,83252,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,46.2306,-112.1138,1939,Patent attorney,1967-01-12,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,fraud_Keeling-Crist,misc_pos,41.96,M,408 Bradley Rest,Doe Hill,VA,24433,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,1325376186,38.674999,-78.632459,0


In [48]:
floats = fraud_data.drop(['trans_date_trans_time', 'merchant', 'category', 'gender', 'street', 'city', 'state', 'job', 'dob'], axis=1)
floats.head()

Unnamed: 0,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,4.97,28654,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0
1,107.23,99160,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0
2,220.11,83252,42.1808,-112.262,4154,1325376051,43.150704,-112.154481,0
3,45.0,59632,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0
4,41.96,24433,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0


In [17]:
data = pd.concat([categorical_data, floats], axis=1)
data.head()

Unnamed: 0,trans_date_trans_timefreq_enc,merchantfreq_enc,categoryfreq_enc,genderfreq_enc,streetfreq_enc,cityfreq_enc,statefreq_enc,jobfreq_enc,dobfreq_enc,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,1,31,1305,13584,45,45,595,79,45,4.97,28654,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0
1,1,45,2448,13584,67,82,347,104,67,107.23,99160,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0
2,1,28,1763,11416,12,12,115,12,12,220.11,83252,42.1808,-112.262,4154,1325376051,43.150704,-112.154481,0
3,1,43,2619,11416,7,7,243,45,7,45.0,59632,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0
4,1,31,1496,11416,44,44,590,44,44,41.96,24433,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0


In [18]:
data.dtypes

trans_date_trans_timefreq_enc      int64
merchantfreq_enc                   int64
categoryfreq_enc                   int64
genderfreq_enc                     int64
streetfreq_enc                     int64
cityfreq_enc                       int64
statefreq_enc                      int64
jobfreq_enc                        int64
dobfreq_enc                        int64
amt                              float64
zip                                int64
lat                              float64
long                             float64
city_pop                           int64
unix_time                          int64
merch_lat                        float64
merch_long                       float64
is_fraud                           int64
dtype: object

In [19]:
X = data.drop('is_fraud', axis=1)
y = data['is_fraud']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [21]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [22]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

In [23]:
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_smote)
X_test_pca = pca.transform(X_test_scaled)

In [24]:
logisticreg = LogisticRegression(max_iter=10000)
logisticreg.fit(X_train_pca, y_train_smote)

In [25]:
y_pred = logisticreg.predict(X_test_pca)

In [26]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9395


In [27]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[7001  438]
 [  16   45]]


In [28]:
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.94      0.97      7439
           1       0.09      0.74      0.17        61

    accuracy                           0.94      7500
   macro avg       0.55      0.84      0.57      7500
weighted avg       0.99      0.94      0.96      7500



In [47]:
floats.head()

Unnamed: 0,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long
0,4.97,28654,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315
1,107.23,99160,48.8878,-118.2105,149,1325376044,49.159047,-118.186462
2,220.11,83252,42.1808,-112.262,4154,1325376051,43.150704,-112.154481
3,45.0,59632,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071
4,41.96,24433,38.4207,-79.4629,99,1325376186,38.674999,-78.632459


In [56]:
'''
# Plot the clusters using the "limit_bal" and "age" columns
ccinfo_default_df.hvplot.scatter(
    x="limit_bal",
    y="age",
    by="customer_segments"
)
'''

import hvplot.pandas
floats.hvplot.scatter(
    x="amt",
    y="city_pop",
    by="is_fraud"
)

In [45]:

# Use PCA to reduce number of factors

# floats.drop(columns="is_fraud", inplace=True)

In [57]:
floats.head()

Unnamed: 0,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,4.97,28654,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0
1,107.23,99160,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0
2,220.11,83252,42.1808,-112.262,4154,1325376051,43.150704,-112.154481,0
3,45.0,59632,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0
4,41.96,24433,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0


In [58]:
# Use the StandardScaler module and fit_transform function to 
# scale all columns with numerical values
#ccinfo_default_df.loc[:,'age'] = StandardScaler().fit_transform(ccinfo_default_df[['age']])
# ccinfo_default_lst = StandardScaler().fit_transform(ccinfo_default_df)

fraud_lst = StandardScaler().fit_transform(floats)

In [63]:

'''
# Create a DataFrame called with the scaled data
# The column names should match those referenced in the StandardScaler step
ccinfo_default_df_scaled = pd.DataFrame(
    ccinfo_default_lst,
    columns=["limit_bal", "age", "bill_amt", "pay_amt"])
ccinfo_default_df_scaled.head()
'''

floats_scaled = pd.DataFrame(
    floats,
    columns=["amt", "city_pop"]
)

float_scaled.head()





Unnamed: 0,amt,city_pop
0,4.97,3495
1,107.23,149
2,220.11,4154
3,45.0,1939
4,41.96,99


In [64]:
# Import the PCA module
from sklearn.decomposition import PCA

In [65]:
# Instantiate the PCA instance and declare the number of PCA variables
pca = PCA(n_components=2)

In [69]:
floats_pca = pca.fit_transform(floats)
floats_pca[:10]

array([[645107.90812027, -85464.02588219],
       [645206.16876962, -88322.2199048 ],
       [645162.02369235, -84427.40744145],
       [645104.74509754, -86805.82768635],
       [644942.54782434, -88889.60247905],
       [644865.78982322, -86868.99421387],
       [644910.03102932, -85997.56168954],
       [644801.33929186, -82982.2391885 ],
       [644792.41428148, -87577.6693278 ],
       [644363.00924274,  62878.90841704]])

In [70]:
# Calculate the PCA explained variance ratio
pca.explained_variance_ratio_

array([0.5989704 , 0.39784624])

In [76]:
# Create the PCA DataFrame
floats_pca_df = pd.DataFrame(
    floats_pca,
    columns=["PCA1", "PCA2"]
)

floats_pca_df.head()

Unnamed: 0,PCA1,PCA2
0,645107.90812,-85464.025882
1,645206.16877,-88322.219905
2,645162.023692,-84427.407441
3,645104.745098,-86805.827686
4,644942.547824,-88889.602479


In [80]:
 import numpy as np
# Get the absolute values of the PCA components
pca_components_abs = np.abs(pca.components_)

In [84]:
# pca_weights_df = pd.DataFrame(pca_components_abs, columns=floats_pca_df.columns, index=['PC1', 'PC2'])
pca_weights_df = pd.DataFrame(pca_components_abs, index=['PC1', 'PC2'])

In [85]:
pca_weights_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
PC1,6e-06,0.001629,8.406821e-08,9.264718e-07,0.002802,0.999995,8.707698e-08,9.021812e-07,1.155678e-08
PC2,4e-06,0.006918,2.696165e-06,2.333973e-06,0.999972,0.002791,2.699192e-06,2.341199e-06,4.034405e-09


In [87]:
heatmap = pca_weights_df.hvplot.heatmap(
    xlabel='Features', 
    ylabel='Principal Components',
    title='Weights of Original Features in Principal Components',
    cmap='coolwarm',
    width=700,
    height=500
)
heatmap