In [1]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Load the transaction data; the path is relative to the notebook's working directory.
df = pd.read_csv('fraud_data.csv')

In [3]:
# Preview the first five rows of the raw data
df.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,city,state,lat,long,city_pop,job,dob,trans_num,merch_lat,merch_long,is_fraud
0,04-01-2019 00:58,"""Stokes, Christiansen and Sipes""",grocery_net,14.37,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,a3806e984cec6ac0096d8184c64ad3a1,65.654142,-164.722603,1
1,04-01-2019 15:06,Predovic Inc,shopping_net,966.11,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,a59185fe1b9ccf21323f581d7477573f,65.468863,-165.473127,1
2,04-01-2019 22:37,Wisozk and Sons,misc_pos,49.61,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,86ba3a888b42cd3925881fa34177b4e0,65.347667,-165.914542,1
3,04-01-2019 23:06,Murray-Smitham,grocery_pos,295.26,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,3a068fe1d856f0ecedbed33e4b5f4496,64.445035,-166.080207,1
4,04-01-2019 23:59,Friesen Lt,health_fitness,18.17,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,891cdd1191028759dc20dc224347a0ff,65.447094,-165.446843,1


In [10]:
# Summarise column dtypes and non-null counts
df.info()

## We can see that "is_fraud" has object dtype (the labels are stored as 0 / 1 values)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14446 entries, 0 to 14445
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   trans_date_trans_time  14446 non-null  datetime64[ns]
 1   merchant               14446 non-null  object        
 2   category               14446 non-null  object        
 3   amt                    14446 non-null  float64       
 4   city                   14446 non-null  object        
 5   state                  14446 non-null  object        
 6   lat                    14446 non-null  float64       
 7   long                   14446 non-null  float64       
 8   city_pop               14446 non-null  int64         
 9   job                    14446 non-null  object        
 10  dob                    14446 non-null  object        
 11  trans_num              14446 non-null  object        
 12  merch_lat              14446 non-null  float64       
 13  m

In [7]:
# Parse the day-first timestamp strings (e.g. '04-01-2019 00:58') into datetime values
parsed_timestamps = pd.to_datetime(df['trans_date_trans_time'], format='%d-%m-%Y %H:%M')
df['trans_date_trans_time'] = parsed_timestamps
# Preview the converted column
df['trans_date_trans_time'].head()

0   2019-01-04 00:58:00
1   2019-01-04 15:06:00
2   2019-01-04 22:37:00
3   2019-01-04 23:06:00
4   2019-01-04 23:59:00
Name: trans_date_trans_time, dtype: datetime64[ns]

In [6]:
""" 
Column Name	Column Description
trans_date_trans_time	Transaction DateTime
merchant	Merchant Name
category	Category of Merchant
amt	Amount of Transaction
city	City of Credit Card Holder
state	State of Credit Card Holder
lat	Latitude Location of Purchase
long	Longitude Location of Purchase
city_pop	Credit Card Holder's City Population
job	Job of Credit Card Holder
dob	Date of Birth of Credit Card Holder
trans_num	Transaction Number
merch_lat	Latitude Location of Merchant
merch_long	Longitude Location of Merchant
is_fraud	Whether Transaction is Fraud (1) or Not (0)
""" 

" \nColumn Name\tColumn Description\ntransdatetrans_time\tTransaction DateTime\nmerchant\tMerchant Name\ncategory\tCategory of Merchant\namt\tAmount of Transaction\ncity\tCity of Credit Card Holder\nstate\tState of Credit Card Holder\nlat\tLatitude Location of Purchase\nlong\tLongitude Location of Purchase\ncity_pop\tCredit Card Holder's City Population\njob\tJob of Credit Card Holder\ndob\tDate of Birth of Credit Card Holder\ntrans_num\tTransaction Number\nmerch_lat\tLatitude Location of Merchant\nmerch_long\tLongitude Location of Merchant\nis_fraud\tWhether Transaction is Fraud (1) or Not (0)\n"

In [9]:
# Look at the last few labels and the dtype of the 'is_fraud' column
df['is_fraud'].tail()

14441    0
14442    0
14443    0
14444    0
14445    0
Name: is_fraud, dtype: object

In [11]:
# Display the frame (pandas elides the middle rows in the rendered output)
df

Unnamed: 0,trans_date_trans_time,merchant,category,amt,city,state,lat,long,city_pop,job,dob,trans_num,merch_lat,merch_long,is_fraud
0,2019-01-04 00:58:00,"""Stokes, Christiansen and Sipes""",grocery_net,14.37,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,a3806e984cec6ac0096d8184c64ad3a1,65.654142,-164.722603,1
1,2019-01-04 15:06:00,Predovic Inc,shopping_net,966.11,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,a59185fe1b9ccf21323f581d7477573f,65.468863,-165.473127,1
2,2019-01-04 22:37:00,Wisozk and Sons,misc_pos,49.61,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,86ba3a888b42cd3925881fa34177b4e0,65.347667,-165.914542,1
3,2019-01-04 23:06:00,Murray-Smitham,grocery_pos,295.26,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,3a068fe1d856f0ecedbed33e4b5f4496,64.445035,-166.080207,1
4,2019-01-04 23:59:00,Friesen Lt,health_fitness,18.17,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,891cdd1191028759dc20dc224347a0ff,65.447094,-165.446843,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14441,2019-01-22 00:37:00,Hudson-Grady,shopping_pos,122.00,Athena,OR,45.8289,-118.4971,1302,Dealer,18-10-1976,699a4c06b22711bf3e0d8ef91232d356,46.442439,-118.524214,0
14442,2019-01-22 00:41:00,"""Nienow, Ankunding and Collie""",misc_pos,9.07,Gardiner,OR,43.7857,-124.1437,260,"""Engineer, maintenance""",01-09-1956,080d620d24815c7d6c637cf0b71dde8e,42.901265,-124.995317,0
14443,2019-01-22 00:42:00,Pacocha-O'Reilly,grocery_pos,104.84,Alva,WY,44.6873,-104.4414,110,"""Administrator, local government""",16-05-1973,3c346c8cd627c5fe3ed57430db2e9ae7,45.538062,-104.542117,0
14444,2019-01-22 00:48:00,"""Bins, Balistreri and Beatty""",shopping_pos,268.16,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,e66ffcc95ba7fc490486242af1205d04,64.081462,-165.898698,0


In [8]:
# Clean the 'is_fraud' column so that malformed entries still yield a usable label:
#   1. Convert every value in df['is_fraud'] to a string.
#   2. Extract the first digit character found in the string.
#      expand=False makes str.extract return a Series (the default returns a
#      one-column DataFrame), so the assignment below stays column-to-column.
#   3. If no digit is found, fall back to '0' (i.e. the row is treated as not fraud).
#   4. Cast the result to integer dtype.
df['is_fraud'] = (
    df['is_fraud']
    .astype(str)
    .str.extract(r'(\d)', expand=False)
    .fillna('0')  # string fill keeps the column homogeneous until the final cast
    .astype(int)
)

# Print the unique values in 'is_fraud' to verify the conversion
print("Unique values in 'is_fraud' column:", df['is_fraud'].unique())


Unique values in 'is_fraud' column: [1 0]


In [20]:
# Count missing labels BEFORE the integer cast: once the cast has succeeded the
# column cannot contain NaN, so a check made afterwards could only ever print 0.
nan_count = df['is_fraud'].isna().sum()
print(nan_count)

# Make sure the label column has integer dtype
df['is_fraud'] = df['is_fraud'].astype(int)

0


In [23]:
# Display the frame again after the 'is_fraud' conversion (middle rows are elided in the output)
df

Unnamed: 0,trans_date_trans_time,merchant,category,amt,city,state,lat,long,city_pop,job,dob,trans_num,merch_lat,merch_long,is_fraud
0,2019-01-04 00:58:00,"""Stokes, Christiansen and Sipes""",grocery_net,14.37,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,a3806e984cec6ac0096d8184c64ad3a1,65.654142,-164.722603,1
1,2019-01-04 15:06:00,Predovic Inc,shopping_net,966.11,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,a59185fe1b9ccf21323f581d7477573f,65.468863,-165.473127,1
2,2019-01-04 22:37:00,Wisozk and Sons,misc_pos,49.61,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,86ba3a888b42cd3925881fa34177b4e0,65.347667,-165.914542,1
3,2019-01-04 23:06:00,Murray-Smitham,grocery_pos,295.26,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,3a068fe1d856f0ecedbed33e4b5f4496,64.445035,-166.080207,1
4,2019-01-04 23:59:00,Friesen Lt,health_fitness,18.17,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,891cdd1191028759dc20dc224347a0ff,65.447094,-165.446843,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14441,2019-01-22 00:37:00,Hudson-Grady,shopping_pos,122.00,Athena,OR,45.8289,-118.4971,1302,Dealer,18-10-1976,699a4c06b22711bf3e0d8ef91232d356,46.442439,-118.524214,0
14442,2019-01-22 00:41:00,"""Nienow, Ankunding and Collie""",misc_pos,9.07,Gardiner,OR,43.7857,-124.1437,260,"""Engineer, maintenance""",01-09-1956,080d620d24815c7d6c637cf0b71dde8e,42.901265,-124.995317,0
14443,2019-01-22 00:42:00,Pacocha-O'Reilly,grocery_pos,104.84,Alva,WY,44.6873,-104.4414,110,"""Administrator, local government""",16-05-1973,3c346c8cd627c5fe3ed57430db2e9ae7,45.538062,-104.542117,0
14444,2019-01-22 00:48:00,"""Bins, Balistreri and Beatty""",shopping_pos,268.16,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,e66ffcc95ba7fc490486242af1205d04,64.081462,-165.898698,0


In [26]:
# Count the fraudulent transactions (is_fraud == 1) recorded for each state
fraud_counts_by_state = (
    df.loc[df['is_fraud'] == 1]
    .groupby('state')
    .size()
    .reset_index(name='fraud_count')
)
# Show the first five states
fraud_counts_by_state.head()

Unnamed: 0,state,fraud_count
0,AK,65
1,AZ,64
2,CA,411
3,CO,115
4,HI,16


In [27]:
# Count the fraudulent transactions (is_fraud == 1) recorded for each merchant category
fraud_counts_by_cat = (
    df.loc[df['is_fraud'] == 1]
    .groupby('category')
    .size()
    .reset_index(name='fraud_count')
)
fraud_counts_by_cat


Unnamed: 0,category,fraud_count
0,entertainment,59
1,food_dining,39
2,gas_transport,159
3,grocery_net,32
4,grocery_pos,444
5,health_fitness,37
6,home,51
7,kids_pets,56
8,misc_net,223
9,misc_pos,64


In [30]:
# Bar plot of the fraud count per state, one colour per state.
# Only 'fraud_count' needs a display label; the former 'category' entry referred
# to a column that this frame does not have and was therefore never applied.
fig = px.bar(
    fraud_counts_by_state, x='state', y='fraud_count', color='state',
    title='Fraud Count by state', labels={'fraud_count': 'Fraud Count'},
    height=650, width=600, color_discrete_sequence=px.colors.sequential.Plasma)
# Rotate the x tick labels, set the axis titles and order the bars from the
# smallest to the largest fraud count (single layout update instead of two)
fig.update_layout(
    xaxis_tickangle=-90, xaxis_title='state', yaxis_title='Fraud Count',
    xaxis_categoryorder='total ascending')
# Display the plot
fig.show()

## Visualisation of state-wise total transactions and how they vary with fraud count



In [11]:
# Number of fraudulent transactions (is_fraud == 1) per state
frauds_by_state = (
    df[df['is_fraud'] == 1]
    .groupby('state')
    .size()
    .reset_index(name='fraud_count')
)
# Number of all transactions per state
total_by_state = df.groupby('state').size().reset_index(name='total_transactions')
# Left merge keeps every state, including those without any fraud
merged_df = total_by_state.merge(frauds_by_state, on='state', how='left')
# A state with no fraudulent rows has no match in frauds_by_state and would be left
# as NaN; record it as 0 so that counts stay integers and later ratios stay defined.
merged_df['fraud_count'] = merged_df['fraud_count'].fillna(0).astype('int64')
# Display the merged DataFrame
merged_df

Unnamed: 0,state,total_transactions,fraud_count
0,AK,173,65
1,AZ,673,64
2,CA,3375,411
3,CO,856,115
4,HI,172,16
5,ID,347,33
6,MO,2329,267
7,NE,1460,238
8,NM,1003,121
9,OR,1211,197


In [12]:
# Count the missing values in every column of the merged frame
null_values = merged_df.isna().sum()
# Show the per-column totals
null_values

state                 0
total_transactions    0
fraud_count           0
dtype: int64

In [13]:
# Order the states from the highest to the lowest fraud count
sorted_df = merged_df.sort_values('fraud_count', ascending=False)

In [14]:
# Grouped bar chart: total transactions next to fraud count for every state
fig = go.Figure()
# (column to plot, legend name, bar colour) for each trace, in drawing order
bar_specs = [
    ('total_transactions', 'Total Transactions', 'blue'),
    ('fraud_count', 'Fraud Count', 'red'),
]
for column, trace_name, colour in bar_specs:
    fig.add_trace(
        go.Bar(x=sorted_df['state'], y=sorted_df[column], name=trace_name, marker_color=colour)
    )
# Side-by-side bars, vertical tick labels, titles and figure size
fig.update_layout(
    title='Total Transactions and Fraud Count by State',
    barmode='group',
    xaxis_tickangle=-90,
    xaxis_title='State',
    yaxis_title='Number of Transactions and frauds',
    legend_title='Legend',
    height=600,
    width=700,
)
# Render the figure
fig.show()

In [15]:
# Bar plot of fraud count per state (states already sorted by fraud count),
# with the total number of transactions available on hover
display_labels = {
    'fraud_count': 'Fraud Count',
    'state': 'State',
    'total_transactions': 'Total Transactions',
}
fig = px.bar(
    sorted_df,
    x='state',
    y='fraud_count',
    color='state',
    hover_data=['state', 'total_transactions'],
    labels=display_labels,
    title='Fraud Count by State',
    height=600,
    width=600,
)
# Render the figure
fig.show()

In [16]:
# Fraud rate = share of a state's transactions that are fraudulent, in percent
fraud_share = sorted_df['fraud_count'] / sorted_df['total_transactions']
sorted_df['fraud_rate'] = fraud_share * 100
# Choropleth of the fraud rate over US states (two-letter state codes as locations)
fig = px.choropleth(
    sorted_df,
    locations='state',
    locationmode='USA-states',
    scope='usa',
    color='fraud_rate',
    color_continuous_scale=px.colors.sequential.Inferno,
    labels={'fraud_rate': 'Fraud Rate (%)'},
    title='Fraud Rate by State',
    height=800,  # plot size
    width=1000,
)
# Render the figure
fig.show()

In [17]:
# Histogram of transaction amounts (nbins=30) with a violin plot in the margin
fig = px.histogram(
    df,
    x='amt',
    nbins=30,
    marginal='violin',
    color_discrete_sequence=['indigo'],  # bar colour
    labels={'amt': 'Transaction Amount'},
    title='Distribution of Transaction Amounts',
    height=600,  # plot size
    width=500,
)
# Axis titles and a small gap between neighbouring bars
fig.update_layout(bargap=0.1, xaxis_title='Transaction Amount', yaxis_title='Frequency')
# Render the figure
fig.show()


In [18]:
# Number of distinct transaction ids ('trans_num') in each merchant category
unique_transactions = df.groupby('category')['trans_num'].nunique()
category_counts = unique_transactions.reset_index(name='transaction_count')

# Bar plot of those counts, one colour per category
fig = px.bar(
    category_counts,
    x='category',
    y='transaction_count',
    color='category',
    hover_data=['category', 'transaction_count'],
    labels={'transaction_count': 'Number of Transactions', 'category': 'Category'},
    title='Number of Transactions by Category',
    height=600,
    width=600,
)
# Order the bars from the largest to the smallest count, then render
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()

In [19]:
# Fraudulent transactions only
fraud_rows = df[df['is_fraud'] == 1]

# Per-city fraud totals, keyed by city name alone (kept so that `count` remains defined)
count = fraud_rows['city'].value_counts().rename_axis('city').reset_index(name='count')

# One row per distinct (city, lat, long) location with the number of frauds at that
# exact location. Counting on the full location key, rather than merging name-only
# totals back onto the coordinates, prevents two different places that share a city
# name from each being given their combined total.
maps = (
    fraud_rows
    .groupby(['city', 'lat', 'long'])
    .size()
    .reset_index(name='count')
    .sort_values('count')
)

In [20]:
# Preview the locations with the fewest frauds (maps is sorted by 'count' ascending)
maps.head()

Unnamed: 0,city,lat,long,count
85,Littleton,39.5994,-105.0044,2
84,Laramie,41.4247,-105.4781,2
115,Odessa,38.9829,-93.9757,3
111,Newhall,34.3795,-118.523,3
169,Vancouver,45.6892,-122.6616,3


In [21]:
# Density map of fraud counts per city location on an OpenStreetMap base layer

map_centre = {"lat": 39.8283, "lon": -98.5795}  # initial view centre
fig = px.density_mapbox(
    maps,
    lat='lat',
    lon='long',
    z='count',
    radius=10,
    hover_name='city',
    color_continuous_scale='Plasma',
    center=map_centre,
    zoom=3,
    mapbox_style="open-street-map",
    title='Citywise fraud count Density map',
    height=500,
    width=1000,
)
# Render the figure
fig.show()

In [22]:
# Parse date of birth ('dob') and the transaction timestamp as datetimes
df['dob'] = pd.to_datetime(df['dob'], format='%d-%m-%Y')
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], format='%d-%m-%Y %H:%M')

# Split the timestamp into separate date and time columns
df['trans_date'] = df['trans_date_trans_time'].dt.date
df['trans_time'] = df['trans_date_trans_time'].dt.time

# Approximate age: transaction year minus birth year, taken with the vectorised
# .dt accessor instead of a row-wise apply over Python date objects.
# NOTE: this ignores whether the birthday has already occurred in the transaction
# year, so the value can be one year higher than the exact age.
df['age'] = df['trans_date_trans_time'].dt.year - df['dob'].dt.year

# Keep only the fraudulent transactions for the age plots
fraud_df = df[df['is_fraud'] == 1]

In [23]:
# Scatter of cardholder age against transaction date for fraudulent transactions,
# coloured by age
fig = px.scatter(
    fraud_df,
    x='trans_date',
    y='age',
    color='age',
    color_continuous_scale='Viridis',
    labels={'trans_date': 'Transaction Date', 'age': 'Age'},
    title='Age of Fraud Victims by Transaction Date',
)
# Title size, axis titles and ISO-style date ticks on the x-axis
fig.update_layout(
    xaxis_tickformat='%Y-%m-%d',
    xaxis_title='Transaction Date',
    yaxis_title='Age',
    title_font_size=20,
)
# Render the figure
fig.show()

In [24]:
# Histogram of fraudulent transactions by cardholder age (nbins=10), coloured by age
fig = px.histogram(
    fraud_df,
    x='age',
    nbins=10,
    color='age',
    color_discrete_sequence=px.colors.sequential.Reds,
    labels={'age': 'Age'},
    title='Distribution of Fraud Counts by Age',
)
# Title size, axis titles and figure size
fig.update_layout(
    height=500,
    width=800,
    xaxis_title='Age',
    yaxis_title='Count',
    title_font_size=20,
)
# Render the figure
fig.show()

In [25]:
""" 
Observation: the 50-60 age group shows the highest fraud count.
""" 
# Preview the frame, which now also carries the derived trans_date, trans_time and age columns
df.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,city,state,lat,long,city_pop,job,dob,trans_num,merch_lat,merch_long,is_fraud,trans_date,trans_time,age
0,2019-01-04 00:58:00,"""Stokes, Christiansen and Sipes""",grocery_net,14.37,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",1939-11-09,a3806e984cec6ac0096d8184c64ad3a1,65.654142,-164.722603,1,2019-01-04,00:58:00,80
1,2019-01-04 15:06:00,Predovic Inc,shopping_net,966.11,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",1939-11-09,a59185fe1b9ccf21323f581d7477573f,65.468863,-165.473127,1,2019-01-04,15:06:00,80
2,2019-01-04 22:37:00,Wisozk and Sons,misc_pos,49.61,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",1939-11-09,86ba3a888b42cd3925881fa34177b4e0,65.347667,-165.914542,1,2019-01-04,22:37:00,80
3,2019-01-04 23:06:00,Murray-Smitham,grocery_pos,295.26,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",1939-11-09,3a068fe1d856f0ecedbed33e4b5f4496,64.445035,-166.080207,1,2019-01-04,23:06:00,80
4,2019-01-04 23:59:00,Friesen Lt,health_fitness,18.17,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",1939-11-09,891cdd1191028759dc20dc224347a0ff,65.447094,-165.446843,1,2019-01-04,23:59:00,80


## Model training 

In [26]:
#Extracting hour of transaction
df['transaction_hour'] = df['trans_date_trans_time'].dt.hour

#Dropping irrelevant columns
df = df.drop(columns=['trans_date_trans_time', 'trans_date', 'dob', 'trans_num','trans_time', 'merchant','state','city'])

In [27]:
# Convert the categorical variables to numeric codes with LabelEncoder

In [28]:
#Choosing categorical columns to fit into model training
categorical_columns = ['category', 'job']
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le