### Import Dependencies
Standard Python libraries used for -------------

In [1]:
# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score
import numpy as np

### Data Ingestion
Loading the data from `credit_card_transactions.csv` and checking the initial few rows.

In [2]:
# Read the CSV file into a dataframe
data = pd.read_csv('credit_card_transactions.csv')

# Display the first few records
data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,merch_zipcode
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0,28705.0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0,
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0,83236.0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0,
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0,22844.0


### Data Exploration
#### Explore the dataframe to assess for necessary pre-processing steps like null value handling and feature engineering 

In [3]:
# Explore data columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 24 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [4]:
# function to get number of unique values for each column ..
# to determine what kind of encoding and imputation needs to be done!

def display_unique_values(data):
    """
    Display unique values and the number of unique values for each column in the dataframe.

    Parameters:
    data (pd.DataFrame): The dataframe to analyze.

    Returns:
    None
    """
    for column in data.columns:
        unique_values = data[column].unique()
        print(f"Column: {column}")
        print(f"Data Type: {data[column].dtype}")
        print(f"Number of Unique Values: {len(unique_values)}")
        print(f"Unique Values: {unique_values[:10]}...")  # Show the first 10 unique values for brevity
        print("-" * 50)

# Display details of each column:
display_unique_values(data)

Column: Unnamed: 0
Data Type: int64
Number of Unique Values: 1296675
Unique Values: [0 1 2 3 4 5 6 7 8 9]...
--------------------------------------------------
Column: trans_date_trans_time
Data Type: object
Number of Unique Values: 1274791
Unique Values: ['2019-01-01 00:00:18' '2019-01-01 00:00:44' '2019-01-01 00:00:51'
 '2019-01-01 00:01:16' '2019-01-01 00:03:06' '2019-01-01 00:04:08'
 '2019-01-01 00:04:42' '2019-01-01 00:05:08' '2019-01-01 00:05:18'
 '2019-01-01 00:06:01']...
--------------------------------------------------
Column: cc_num
Data Type: int64
Number of Unique Values: 983
Unique Values: [2703186189652095     630423337322   38859492057661 3534093764340240
  375534208663984 4767265376804500   30074693890476 6011360759745864
 4922710831011201 2720830304681674]...
--------------------------------------------------
Column: merchant
Data Type: object
Number of Unique Values: 693
Unique Values: ['fraud_Rippin, Kub and Mann' 'fraud_Heller, Gutmann and Zieme'
 'fraud_Lind-Buckr

### Data Cleaning
Cleaning and preprocessing the dataset by removing unnecessary columns and converting date columns.

In [5]:
# Make a copy for transformation
ak_df = data.copy()

# Convert date columns to datetime format
ak_df['trans_date_trans_time'] = pd.to_datetime(ak_df['trans_date_trans_time'])
ak_df['dob'] = pd.to_datetime(ak_df['dob'])

#### Feature Engineering - Age Patterns

In [6]:
# AGE - Calculate age based on DOB and transaction year
ak_df['ft_age'] = ak_df['trans_date_trans_time'].dt.year - ak_df['dob'].dt.year

# AGE SEGMENT - Define age segments
def age_segment(age):
    if age < 18:
        return 'Under 18'
    elif 19 <= age <= 28:
        return '19-28'
    elif 29 <= age <= 38:
        return '29-38'
    elif 39 <= age <= 48:
        return '39-48'
    elif 49 <= age <= 58:
        return '49-58'
    elif 59 <= age <= 68:
        return '59-68'
    elif 69 <= age <= 78:
        return '69-78'
    else:
        return '79+'

# Apply age segmentation
ak_df['ft_age_group'] = ak_df['ft_age'].apply(age_segment)

ak_df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,merch_zipcode,ft_age,ft_age_group
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0,28705.0,31,29-38
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0,,41,39-48
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0,83236.0,57,49-58
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0,,52,49-58
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0,22844.0,33,29-38


#### Feature Engineering - Temporal Patterns

In [18]:
# TRANSACTION HOUR
ak_df['ft_trans_hour'] = ak_df['trans_date_trans_time'].dt.hour

# TIME OF DAY - Categorize time of day based on transaction hour
ak_df['ft_time_of_day'] = ak_df['trans_date_trans_time'].dt.hour.apply(
    lambda hour: 'Morning' if 6 <= hour < 12 else
    ('Afternoon' if 12 <= hour < 18 else
    ('Evening' if 18 <= hour < 24 else 'Night'))
)

# TRANSACTION DAY
ak_df['ft_trans_day'] = ak_df['trans_date_trans_time'].dt.day

# TRANSACTION DAY OF YEAR
ak_df['ft_trans_day_of_year'] = ak_df['trans_date_trans_time'].dt.dayofyear

# TRANSACTION MONTH
ak_df['ft_trans_month'] = ak_df['trans_date_trans_time'].dt.month

# DAY OF WEEK - Add a new column 'day_of_week' to represent the day of the week (0=Monday, 6=Sunday)
ak_df['ft_day_of_week'] = ak_df['trans_date_trans_time'].dt.day_name()


ak_df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,ft_trans_month,ft_day_of_week,ft_distance_user_merchant,ft_merchant_distance_from_user_mean,ft_merchant_popularity,ft_mean_amt_per_user,ft_amt_deviation,ft_transaction_count_per_user,ft_state_fraud_rate,ft_transaction_is_recurring
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,1,Tuesday,78.597568,0.637376,1267,87.393215,-82.423215,2028,0.004923,1
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,1,Tuesday,30.212176,-41.58037,2503,53.94932,53.28068,3030,0.005073,1
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,1,Tuesday,108.206083,34.130756,1895,65.87004,154.23996,503,0.001984,1
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,1,Tuesday,95.673231,23.97106,2613,72.776673,-27.776673,493,0.002722,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,1,Tuesday,77.556744,2.322402,1592,95.178091,-53.218091,2017,0.006769,1


#### Feature Engineering - Distance Patterns
* Distance between user and merchant for the transaction
* Large deviations in the transaction’s location compared to the user’s regular pattern could indicate fraud

In [19]:
# DISTANCE BETWEEN USER AND MERCHANT

import numpy as np

# Define the Haversine Formula function that calculates the distance given two latitude/longitude points
def haversine(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    radius = 6371  # Radius of Earth in kilometers
    return radius * c

# Create a new column for distance
ak_df['ft_distance_user_merchant'] = haversine(
    ak_df['lat'], ak_df['long'],
    ak_df['merch_lat'], ak_df['merch_long']
)

# Calculate the average distance of previous transactions for a user and compare the current transaction distance
user_avg_distance = ak_df.groupby('cc_num')['ft_distance_user_merchant'].transform('mean')
ak_df['ft_merchant_distance_from_user_mean'] = ak_df['ft_distance_user_merchant'] - user_avg_distance


ak_df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,ft_trans_month,ft_day_of_week,ft_distance_user_merchant,ft_merchant_distance_from_user_mean,ft_merchant_popularity,ft_mean_amt_per_user,ft_amt_deviation,ft_transaction_count_per_user,ft_state_fraud_rate,ft_transaction_is_recurring
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,1,Tuesday,78.597568,0.637376,1267,87.393215,-82.423215,2028,0.004923,1
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,1,Tuesday,30.212176,-41.58037,2503,53.94932,53.28068,3030,0.005073,1
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,1,Tuesday,108.206083,34.130756,1895,65.87004,154.23996,503,0.001984,1
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,1,Tuesday,95.673231,23.97106,2613,72.776673,-27.776673,493,0.002722,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,1,Tuesday,77.556744,2.322402,1592,95.178091,-53.218091,2017,0.006769,1


#### Feature Engineering - Merchant Popularity
* Uncommon merchants with a low transaction count might be associated with fraud

In [20]:
# Count the number of transactions for each merchant in the dataset
merchant_transaction_counts = ak_df['merchant'].value_counts()
ak_df['ft_merchant_popularity'] = ak_df['merchant'].map(merchant_transaction_counts)

ak_df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,ft_trans_month,ft_day_of_week,ft_distance_user_merchant,ft_merchant_distance_from_user_mean,ft_merchant_popularity,ft_mean_amt_per_user,ft_amt_deviation,ft_transaction_count_per_user,ft_state_fraud_rate,ft_transaction_is_recurring
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,1,Tuesday,78.597568,0.637376,1267,87.393215,-82.423215,2028,0.004923,1
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,1,Tuesday,30.212176,-41.58037,2503,53.94932,53.28068,3030,0.005073,1
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,1,Tuesday,108.206083,34.130756,1895,65.87004,154.23996,503,0.001984,1
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,1,Tuesday,95.673231,23.97106,2613,72.776673,-27.776673,493,0.002722,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,1,Tuesday,77.556744,2.322402,1592,95.178091,-53.218091,2017,0.006769,1


#### Feature Engineering - User Spending Behavior: Mean Transaction Amount (mean_amt_per_user)
* Transactions significantly above or below the user’s average spending pattern may indicate fraud

In [21]:
# Calculate the mean transaction amount for each user (cc_num)
user_mean_amt = ak_df.groupby('cc_num')['amt'].transform('mean')
ak_df['ft_mean_amt_per_user'] = user_mean_amt

ak_df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,ft_trans_month,ft_day_of_week,ft_distance_user_merchant,ft_merchant_distance_from_user_mean,ft_merchant_popularity,ft_mean_amt_per_user,ft_amt_deviation,ft_transaction_count_per_user,ft_state_fraud_rate,ft_transaction_is_recurring
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,1,Tuesday,78.597568,0.637376,1267,87.393215,-82.423215,2028,0.004923,1
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,1,Tuesday,30.212176,-41.58037,2503,53.94932,53.28068,3030,0.005073,1
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,1,Tuesday,108.206083,34.130756,1895,65.87004,154.23996,503,0.001984,1
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,1,Tuesday,95.673231,23.97106,2613,72.776673,-27.776673,493,0.002722,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,1,Tuesday,77.556744,2.322402,1592,95.178091,-53.218091,2017,0.006769,1


#### Feature Engineering - Transaction Amount Deviation (amt_deviation)
* Sudden changes in spending behavior (either very high or very low) could be a red flag

In [22]:
# Calculate the deviation of the transaction amount from the user’s mean transaction amount
ak_df['ft_amt_deviation'] = ak_df['amt'] - ak_df['ft_mean_amt_per_user']

ak_df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,ft_trans_month,ft_day_of_week,ft_distance_user_merchant,ft_merchant_distance_from_user_mean,ft_merchant_popularity,ft_mean_amt_per_user,ft_amt_deviation,ft_transaction_count_per_user,ft_state_fraud_rate,ft_transaction_is_recurring
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,1,Tuesday,78.597568,0.637376,1267,87.393215,-82.423215,2028,0.004923,1
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,1,Tuesday,30.212176,-41.58037,2503,53.94932,53.28068,3030,0.005073,1
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,1,Tuesday,108.206083,34.130756,1895,65.87004,154.23996,503,0.001984,1
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,1,Tuesday,95.673231,23.97106,2613,72.776673,-27.776673,493,0.002722,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,1,Tuesday,77.556744,2.322402,1592,95.178091,-53.218091,2017,0.006769,1


#### Feature Engineering - Transaction Frequency (transaction_count_per_user)
* Users with abnormally high transaction counts might exhibit fraudulent behavior

In [23]:
# Count the number of transactions for each user (cc_num)
user_transaction_count = ak_df['cc_num'].value_counts()
ak_df['ft_transaction_count_per_user'] = ak_df['cc_num'].map(user_transaction_count)

ak_df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,ft_trans_month,ft_day_of_week,ft_distance_user_merchant,ft_merchant_distance_from_user_mean,ft_merchant_popularity,ft_mean_amt_per_user,ft_amt_deviation,ft_transaction_count_per_user,ft_state_fraud_rate,ft_transaction_is_recurring
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,1,Tuesday,78.597568,0.637376,1267,87.393215,-82.423215,2028,0.004923,1
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,1,Tuesday,30.212176,-41.58037,2503,53.94932,53.28068,3030,0.005073,1
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,1,Tuesday,108.206083,34.130756,1895,65.87004,154.23996,503,0.001984,1
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,1,Tuesday,95.673231,23.97106,2613,72.776673,-27.776673,493,0.002722,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,1,Tuesday,77.556744,2.322402,1592,95.178091,-53.218091,2017,0.006769,1


#### Feature Engineering - Fraud Rate by State (state_fraud_rate)
* States with unusually high fraud rates could signal suspicious activity

In [24]:
# Calculate the fraud rate for each state by dividing the number of fraudulent transactions by the total number of transactions
fraud_rate_by_state = ak_df.groupby('state')['is_fraud'].mean()
ak_df['ft_state_fraud_rate'] = ak_df['state'].map(fraud_rate_by_state)

ak_df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,ft_trans_month,ft_day_of_week,ft_distance_user_merchant,ft_merchant_distance_from_user_mean,ft_merchant_popularity,ft_mean_amt_per_user,ft_amt_deviation,ft_transaction_count_per_user,ft_state_fraud_rate,ft_transaction_is_recurring
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,1,Tuesday,78.597568,0.637376,1267,87.393215,-82.423215,2028,0.004923,1
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,1,Tuesday,30.212176,-41.58037,2503,53.94932,53.28068,3030,0.005073,1
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,1,Tuesday,108.206083,34.130756,1895,65.87004,154.23996,503,0.001984,1
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,1,Tuesday,95.673231,23.97106,2613,72.776673,-27.776673,493,0.002722,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,1,Tuesday,77.556744,2.322402,1592,95.178091,-53.218091,2017,0.006769,1


#### Feature Engineering - Recurring Transaction Flag (is_recurring)
* Recurring transactions are less likely to be fraudulent

In [25]:
# Flag transactions as recurring if the same merchant and cc_num combination appears multiple times within a short period
ak_df['ft_transaction_is_recurring'] = ak_df.duplicated(subset=['cc_num', 'merchant'], keep=False).astype(int)

ak_df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,ft_trans_month,ft_day_of_week,ft_distance_user_merchant,ft_merchant_distance_from_user_mean,ft_merchant_popularity,ft_mean_amt_per_user,ft_amt_deviation,ft_transaction_count_per_user,ft_state_fraud_rate,ft_transaction_is_recurring
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,1,Tuesday,78.597568,0.637376,1267,87.393215,-82.423215,2028,0.004923,1
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,1,Tuesday,30.212176,-41.58037,2503,53.94932,53.28068,3030,0.005073,1
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,1,Tuesday,108.206083,34.130756,1895,65.87004,154.23996,503,0.001984,1
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,1,Tuesday,95.673231,23.97106,2613,72.776673,-27.776673,493,0.002722,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,1,Tuesday,77.556744,2.322402,1592,95.178091,-53.218091,2017,0.006769,1


In [26]:
# lets review the current columns of the updated dataframe
ak_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 40 columns):
 #   Column                               Non-Null Count    Dtype         
---  ------                               --------------    -----         
 0   Unnamed: 0                           1296675 non-null  int64         
 1   trans_date_trans_time                1296675 non-null  datetime64[ns]
 2   cc_num                               1296675 non-null  int64         
 3   merchant                             1296675 non-null  object        
 4   category                             1296675 non-null  object        
 5   amt                                  1296675 non-null  float64       
 6   first                                1296675 non-null  object        
 7   last                                 1296675 non-null  object        
 8   gender                               1296675 non-null  object        
 9   street                               1296675 non-null  ob

In [27]:
# Drop unnecessary columns
columns_to_drop = [
    'Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'first', 'last', 'street', 'trans_num', 'lat', 
    'long', 'merch_zipcode', 'ft_age', 'merch_lat', 'merch_long', 'trans_num', 'unix_time', 'zip', 'dob'
]
ak_df_cleaned = ak_df.drop(columns=columns_to_drop)

ak_df_cleaned.head()

# Check for null values
# print(ak_df_cleaned.isnull().sum())

Unnamed: 0,merchant,category,amt,gender,city,state,city_pop,job,is_fraud,ft_age_group,...,ft_trans_month,ft_day_of_week,ft_distance_user_merchant,ft_merchant_distance_from_user_mean,ft_merchant_popularity,ft_mean_amt_per_user,ft_amt_deviation,ft_transaction_count_per_user,ft_state_fraud_rate,ft_transaction_is_recurring
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,Moravian Falls,NC,3495,"Psychologist, counselling",0,29-38,...,1,Tuesday,78.597568,0.637376,1267,87.393215,-82.423215,2028,0.004923,1
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,Orient,WA,149,Special educational needs teacher,0,39-48,...,1,Tuesday,30.212176,-41.58037,2503,53.94932,53.28068,3030,0.005073,1
2,fraud_Lind-Buckridge,entertainment,220.11,M,Malad City,ID,4154,Nature conservation officer,0,49-58,...,1,Tuesday,108.206083,34.130756,1895,65.87004,154.23996,503,0.001984,1
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,Boulder,MT,1939,Patent attorney,0,49-58,...,1,Tuesday,95.673231,23.97106,2613,72.776673,-27.776673,493,0.002722,0
4,fraud_Keeling-Crist,misc_pos,41.96,M,Doe Hill,VA,99,Dance movement psychotherapist,0,29-38,...,1,Tuesday,77.556744,2.322402,1592,95.178091,-53.218091,2017,0.006769,1


In [28]:
# Explore data columns
ak_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 24 columns):
 #   Column                               Non-Null Count    Dtype  
---  ------                               --------------    -----  
 0   merchant                             1296675 non-null  object 
 1   category                             1296675 non-null  object 
 2   amt                                  1296675 non-null  float64
 3   gender                               1296675 non-null  object 
 4   city                                 1296675 non-null  object 
 5   state                                1296675 non-null  object 
 6   city_pop                             1296675 non-null  int64  
 7   job                                  1296675 non-null  object 
 8   is_fraud                             1296675 non-null  int64  
 9   ft_age_group                         1296675 non-null  object 
 10  ft_trans_hour                        1296675 non-null  int32  
 11

## SPlit the Data for training
* To DO: For final code, we need to split the data in train, validation and test

In [29]:
# Setup X and y variables
X = ak_df_cleaned.drop(columns='is_fraud')
y = ak_df_cleaned['is_fraud'].values.reshape(-1,1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train.describe()

Unnamed: 0,amt,city_pop,ft_trans_hour,ft_trans_day,ft_trans_day_of_year,ft_trans_month,ft_distance_user_merchant,ft_merchant_distance_from_user_mean,ft_merchant_popularity,ft_mean_amt_per_user,ft_amt_deviation,ft_transaction_count_per_user,ft_state_fraud_rate,ft_transaction_is_recurring
count,972506.0,972506.0,972506.0,972506.0,972506.0,972506.0,972506.0,972506.0,972506.0,972506.0,972506.0,972506.0,972506.0,972506.0
mean,70.331104,89066.86,12.803069,15.581916,171.292392,6.14163,76.106539,-0.006737,2047.612108,70.362675,-0.031571,1818.943181,0.00579,0.880264
std,156.429843,302135.4,6.819203,8.83003,104.403847,3.418533,29.105581,29.016692,529.873171,19.501144,155.224097,742.364302,0.002923,0.324653
min,1.0,23.0,0.0,1.0,1.0,1.0,0.022255,-80.831303,727.0,42.951671,-906.125556,7.0,0.001984,0.0
25%,9.65,743.0,7.0,8.0,87.0,3.0,55.352026,-20.657938,1783.0,59.800213,-52.488369,1466.0,0.005149,1.0
50%,47.56,2456.0,14.0,15.0,155.0,6.0,78.213079,2.182059,1985.0,65.09374,-24.082432,2000.0,0.005693,1.0
75%,83.21,20328.0,19.0,23.0,255.0,9.0,98.479446,22.250824,2444.0,83.283737,12.565647,2524.0,0.006585,1.0
max,27390.12,2906700.0,23.0,31.0,365.0,12.0,152.117173,70.815341,4403.0,948.818182,27293.342232,3123.0,1.0,1.0


## Encoding
#### Used Target Encoding to genrealise the encoding across multiple columns

In [30]:
!pip install category-encoders



In [31]:
import category_encoders as ce

# Define categorical columns to encode
categorical_columns = ['merchant', 'category', 'gender', 'city', 'state', 
                       'job', 'ft_time_of_day', 'ft_age_group', 'ft_day_of_week']

# Initialize the target encoder
encoder = ce.TargetEncoder(cols=categorical_columns)

# Fit the encoder on X_train using y_train
X_train_encoded = encoder.fit_transform(X_train, y_train)
X_train_encoded.head()

Unnamed: 0,merchant,category,amt,gender,city,state,city_pop,job,ft_age_group,ft_trans_hour,...,ft_trans_month,ft_day_of_week,ft_distance_user_merchant,ft_merchant_distance_from_user_mean,ft_merchant_popularity,ft_mean_amt_per_user,ft_amt_deviation,ft_transaction_count_per_user,ft_state_fraud_rate,ft_transaction_is_recurring
458681,0.000567,0.001517,152.2,0.005277,0.005102,0.005227,5908,0.00432,0.007298,22,...,7,0.004817,93.025633,14.185436,2391,65.359526,86.840474,1560,0.005049,1
282534,0.001103,0.001517,73.74,0.005277,0.007175,0.007016,509,0.016461,0.004339,14,...,5,0.004817,89.99475,15.395644,2445,60.857433,12.882567,2030,0.007448,1
836287,0.004748,0.002241,81.83,0.006392,0.00646,0.00263,370,0.005198,0.007298,15,...,12,0.006926,26.213452,-47.83014,2230,69.655766,12.174234,1032,0.002077,1
350322,0.002187,0.001708,106.98,0.005277,0.001742,0.005227,24536,0.004544,0.00449,18,...,6,0.006926,79.520118,0.661116,1799,88.345212,18.634788,1533,0.005049,1
554505,0.014555,0.014083,116.07,0.005277,0.0,0.004296,149,0.001574,0.004339,3,...,8,0.004817,69.970289,-1.822257,2476,53.94932,62.12068,3030,0.005073,1


In [32]:
# Transform X_test using the already fitted encoder
X_test_encoded = encoder.transform(X_test)

# Check the transformed X_test
X_test_encoded.head()

Unnamed: 0,merchant,category,amt,gender,city,state,city_pop,job,ft_age_group,ft_trans_hour,...,ft_trans_month,ft_day_of_week,ft_distance_user_merchant,ft_merchant_distance_from_user_mean,ft_merchant_popularity,ft_mean_amt_per_user,ft_amt_deviation,ft_transaction_count_per_user,ft_state_fraud_rate,ft_transaction_is_recurring
1045211,0.001764,0.003157,194.51,0.006392,0.0,0.005732,972,0.004698,0.006088,15,...,3,0.004662,54.336119,-22.09335,1524,69.821821,124.688179,1494,0.005736,1
547406,0.001509,0.001598,52.32,0.005277,0.011152,0.008582,217,0.006555,0.009219,15,...,8,0.006926,66.060865,-6.639384,1751,64.054238,-11.734238,1043,0.008012,1
110142,0.008944,0.007118,6.53,0.005277,0.0,0.005732,184,0.001555,0.007166,1,...,3,0.004662,94.386045,17.820994,2362,63.39064,-56.86064,2062,0.005736,1
1285953,0.001091,0.001517,7.33,0.006392,0.0,0.006613,10717,0.001039,0.006578,20,...,6,0.005818,109.25129,35.316763,2456,69.869291,-62.539291,1496,0.006647,1
271705,0.004545,0.004664,64.29,0.005277,0.006623,0.005388,635,0.003891,0.004339,5,...,5,0.005818,67.501516,-6.773677,2676,50.852975,13.437025,995,0.005693,1


## Scaling
#### Logistic Regression will require scaling, other forest models do not require scaling

In [33]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on X_train_encoded and transform it
X_train_scaled = scaler.fit_transform(X_train_encoded)
print("X_train scaled sample:")
print(X_train_scaled[:5])  # Display the first 5 rows of the scaled training data

# Transform X_test_encoded using the fitted scaler
X_test_scaled = scaler.transform(X_test_encoded)
print("X_test scaled sample:")
print(X_test_scaled[:5])  # Display the first 5 rows of the scaled testing data

X_train scaled sample:
[[-0.9260134  -0.7990119   0.52335882 -0.90858171 -0.04703003 -0.39000917
  -0.27523718 -0.24209729  1.02969467  1.34868192  0.99604914  0.61359778
   0.29412349  0.25109323 -1.05050493  0.581301    0.48910398  0.6480571
  -0.25655685  0.55965595 -0.34880895 -0.25347943  0.36881277]
 [-0.83083228 -0.7990119   0.02179186 -0.90858171  0.24055955  0.88279896
  -0.29310666  1.95764371 -0.97858935  0.17552374 -0.99690486  0.38709786
  -0.30930286 -0.33395336 -1.05050493  0.47716686  0.53081132  0.74996833
  -0.48741995  0.08319677  0.28430371  0.56719337  0.36881277]
 [-0.18347881 -0.66334717  0.07350836  1.10061648  0.14131472 -2.23825224
  -0.29356672 -0.08315363  1.02969467  0.32216851 -0.99690486 -0.40565187
   1.67338372  1.71370972  1.24792515 -1.7142112  -1.64813505  0.34421064
  -0.03624966  0.07863348 -1.06005041 -1.26986245  0.36881277]
 [-0.63831662 -0.76318293  0.23428339 -0.90858171 -0.51305283 -0.39000917
  -0.21358268 -0.20153154 -0.87602746  0.76210283

## Modeling

In [34]:
# CatBoost
from catboost import CatBoostClassifier
from sklearn.metrics import balanced_accuracy_score

In [53]:
def cost_of_error_by_chebyshevs(x, y_real, y_prediction, cost=0.25 ):
    
    false_negative_cost = []
    false_positive_cost = []
    correct = []
    
    size = len(y_real)
    
    for i in range(0, size-1):
        if ( y_real[i][0] != y_prediction[i])  :
            if (y_real[i][0] == np.int64(1)):
                false_negative_cost.append(x[i])
            else:
                false_positive_cost.append(x[i])
        else:
            if ( y_real[i][0] == np.int64(1)):
                correct.append(x[i])
    
    cost_correct_pred = (sum(correct)*len(correct))
    cost_fp_pred = (cost * sum(false_positive_cost) * (len(false_positive_cost)))
    cost_fn_pred = ( sum(false_negative_cost) * (len(false_negative_cost)) )
    
    return  { 
                'accuracy' : len(correct)/(len(correct) + len(false_positive_cost) + len(false_negative_cost) ),
                'error_cost' : cost_fn_pred/(cost_correct_pred + cost_fn_pred), 
                'error' : len(false_negative_cost)/(len(correct) + len(false_positive_cost) + len(false_negative_cost) ),
                'customer_experience_cost' : 1 - (cost_correct_pred/(cost_correct_pred + cost_fp_pred)),  
                'customer_experience_rating' : 1 - (len(false_positive_cost)/(len(correct) + len(false_positive_cost) + len(false_negative_cost) ))
        }

In [36]:
# Initialize CatBoostClassifier
model = CatBoostClassifier(depth=7, iterations=100, random_seed=13, verbose=0)

In [38]:
# Fit the model
model.fit(X_train_encoded, y_train)

<catboost.core.CatBoostClassifier at 0x1870a40eff0>

In [39]:
# Make predictions on the training set
y_train_pred = model.predict(X_train_encoded)
print("Balanced Accuracy Score on Training Set:", balanced_accuracy_score(y_train, y_train_pred))

# Make predictions on the test set
y_test_pred = model.predict(X_test_encoded)
print("Balanced Accuracy Score on Test Set:", balanced_accuracy_score(y_test, y_test_pred))

Balanced Accuracy Score on Training Set: 0.9475808249483308
Balanced Accuracy Score on Test Set: 0.9122683428215463


In [54]:

train_model_rating = cost_of_error_by_chebyshevs(X_train_encoded['amt'].to_list(), y_train, y_train_pred)

print(f'''Training Model Information:\n customer experience rating: {train_model_rating['customer_experience_rating']} \n 
      error rating: {train_model_rating['error']} \n
      customer experience rating cost: {train_model_rating['customer_experience_cost']} \n
      error cost: {train_model_rating['error_cost']} \n
      ''' )

Training Model Information:
 customer experience rating: 0.9878755930416447 
 
      error rating: 0.10349674925320682 

      customer experience rating cost: 3.428086005485831e-05 

      error cost: 0.0025205891672246085 

      


In [56]:

test_model_rating = cost_of_error_by_chebyshevs(X_test_encoded['amt'].to_list(),y_test, y_test_pred)

print(f'''Testing Model Information:\n customer experience rating: {test_model_rating['customer_experience_rating']} \n 
      error rating: {test_model_rating['error']} \n
      customer experience rating cost: {test_model_rating['customer_experience_cost']} \n
      error cost: {test_model_rating['error_cost']} \n
      ''' )

Testing Model Information:
 customer experience rating: 0.9505549949545913 
 
      error rating: 0.16649848637739656 

      customer experience rating cost: 0.0016012014317758894 

      error cost: 0.01596437656361233 

      


In [55]:
confusion_matrix(y_train, y_train_pred)

array([[966815,     69],
       [   589,   5033]], dtype=int64)

In [58]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    966884
           1       0.99      0.90      0.94      5622

    accuracy                           1.00    972506
   macro avg       0.99      0.95      0.97    972506
weighted avg       1.00      1.00      1.00    972506



In [59]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    322285
           1       0.94      0.82      0.88      1884

    accuracy                           1.00    324169
   macro avg       0.97      0.91      0.94    324169
weighted avg       1.00      1.00      1.00    324169

