In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Set the pandas display options below to None so wide frames are shown in full
for display_option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [2]:
# Load the training transactions. The two merchant coordinate columns are read
# as raw strings; they are cast with float() where the distance is computed.
merchant_coord_dtypes = dict.fromkeys(['merch_lat', 'merch_long'], str)
data = pd.read_csv('./data/fraudTrain.csv', dtype=merchant_coord_dtypes)

**Data Dictionary**

- Transaction event data: 'trans_date_trans_time', 'amt', 'unix_time', 'trans_num', 'is_fraud'
- Customer: 'cc_num', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'job', 'dob'
- Merchant: 'merchant', 'category', 'merch_lat', 'merch_long', 'city_pop'

In [None]:
# EDA
## Keep only necessary columns
## - Transaction event data: 'trans_date_trans_time', 'amt', 'unix_time', 'is_fraud'
## - Customer: 'gender', 'lat', 'long', 'job', 'dob' --keep only lat, long to calculate distance
## - Merchant: 'category', 'merch_lat', 'merch_long', 'city_pop'

# Split features / label.
# Selecting the wanted columns already leaves out 'Unnamed: 0' and 'is_fraud', so no
# separate drop is needed. .copy() makes `features` an independent frame, so adding
# columns to it later does not raise SettingWithCopyWarning.
feature_columns = [
    'trans_date_trans_time', 'amt', 'unix_time',
    'gender', 'lat', 'long', 'job', 'dob',
    'category', 'merch_lat', 'merch_long', 'city_pop',
]
features = data[feature_columns].copy()
label = data['is_fraud']

# DataFrame.info() prints its summary and returns None, so it is called directly:
# wrapping it in display() only adds a stray "None" to the cell output.
features.info()
display(features.describe(include='all').T)

## EDA

- Transaction event data: 'trans_date_trans_time', 'amt', 'unix_time', 'is_fraud'
- Customer: 'gender', 'lat', 'long', 'job', 'dob' --keep only lat, long to calculate distance
- Merchant: 'category', 'merch_lat', 'merch_long', 'city_pop'

In [4]:
# Setup environment
# sns.set_style(style="whitegrid")

### is_fraud

In [None]:
# Class balance of the target: draw the counts as bars and print the raw numbers.
fraud_ax = sns.countplot(data=data, x="is_fraud")
fraud_ax.set_title("Distribution of Fraud")
print(data['is_fraud'].value_counts())
plt.show()

### amt

- amt is severely right-skewed (long tail of large amounts).
- The amt of fraudulent transactions is concentrated below ~1400.
- Fraudulent transactions are most frequent between 800 and 1100 (use amt as a feature!).

In [None]:
# Summary statistics of the transaction amount, split by class.
legit_amt = data.query('is_fraud == 0')['amt']
fraud_rows = data.query('is_fraud == 1')
print("amt of non-fraud\n", legit_amt.describe())
print("\namt of fraud\n", fraud_rows['amt'].describe())

# Box plot of the amounts of the fraudulent transactions only.
sns.boxplot(data=fraud_rows, x="amt")
plt.show()

In [None]:
# Create bins for the amt column: 100-wide intervals from 0 to 1300 plus one
# open-ended bin above 1300. pd.cut closes intervals on the right by default,
# so the first bin is (0, 100].
amt_bin_edges = [*range(0, 1400, 100), float('inf')]
data['amt_bins'] = pd.cut(data['amt'], bins=amt_bin_edges)

# Per-bin aggregates of the label. observed=False is passed explicitly: it keeps
# every bin in the result (even empty ones) and avoids the FutureWarning that
# recent pandas versions emit when grouping on a categorical without it.
is_fraud_by_bin = data.groupby('amt_bins', observed=False)['is_fraud']

# Number of fraudulent transactions per bin. The notebook treats is_fraud as a
# 0/1 flag, so this is the sum; count() would return the number of *all*
# transactions in the bin instead.
fraud_count = is_fraud_by_bin.sum().reset_index(name="fraud_count")
# Share of fraudulent transactions per bin (mean of the 0/1 flag).
fraud_rate = is_fraud_by_bin.mean().reset_index(name="fraud_rate")

# Plot the fraud rate
plt.figure(figsize=(12, 6))
sns.barplot(x='amt_bins', y='fraud_rate', data=fraud_rate)
plt.title('Fraud Rate by Transaction Amount')
plt.xlabel('Transaction Amount Bins')
plt.ylabel('Fraud Rate')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

### trans_date_trans_time

Note: fraud occurs throughout the year. Some weeks have more fraudulent transactions than others, but the variation is not large enough to be critical.

In [None]:
# Extract weeks
# errors='coerce' turns unparseable timestamps into NaT instead of raising;
# groupby drops missing keys by default, so such rows are left out of the counts below.
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'], errors='coerce')
data['date_week'] = data['trans_date_trans_time'].dt.to_period('W')

# Count the transactions per week and per class.
# (The variable keeps its original name `daily_count`, but each row is one week.)
daily_count = data.groupby(['date_week', 'is_fraud']).size().reset_index(name="count")

# Convert the weekly periods to timestamps for proper plotting
daily_count['date_week'] = daily_count['date_week'].dt.to_timestamp()

# Set up the plot
plt.figure(figsize=(15, 6))

# One line per class
sns.lineplot(data=daily_count, x='date_week', y='count', hue='is_fraud')

# Customize the plot. The y-axis is logarithmic so both classes are readable on
# one chart; the values are raw weekly counts, not averages, and the label says so.
plt.yscale('log')
plt.title('Transaction Count by Week')
plt.xlabel('Date (Week)')
plt.ylabel('Transaction Count (log scale)')

# Rotate x-axis labels for better readability
plt.xticks(rotation=45)

# Show the plot
plt.tight_layout()
plt.show()

### Other features

In [None]:
# gender: transaction counts per gender, split by the fraud flag
gender_ax = sns.countplot(data=data, x="gender", hue="is_fraud")
gender_ax.set_title("Distribution of Gender")
plt.show()

In [None]:
# 'lat', 'long'
# Longitude goes on the x-axis and latitude on the y-axis so the scatter reads
# like a map; with the axes the other way round the picture is transposed.
sns.scatterplot(data=data, x="long", y="lat", hue="is_fraud")
plt.title("Distribution of Latitude and Longitude")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.show()

# Feature selection

In [12]:
import math

def haversine(lat1, lon1, lat2, lon2, radius=6371):
    """Return the great-circle distance between two points on a sphere.

    Args:
        lat1: Latitude of the first point, in decimal degrees.
        lon1: Longitude of the first point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.
        radius: Radius of the sphere. The result is expressed in the same
            unit. Defaults to 6371, the radius of Earth in kilometers.

    Returns:
        The distance along the surface of the sphere, as a float.
    """
    # Convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    # Mathematically 0 <= a <= 1, but floating-point rounding can push it just
    # outside that range for (near-)antipodal points, which would make the
    # square roots below raise ValueError. Clamp it back into range.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return radius * c

In [32]:
model_features_names = ['trans_week', 'amt', 'gender_male', 'distance_cust_merchant', 'fraud_rate_job', 'yob', 'city_pop']

# The merchant coordinates were loaded as strings, so all four coordinate
# columns are cast to float once, up front.
coords = features[['lat', 'long', 'merch_lat', 'merch_long']].astype(float)

# Fraud rate per job, used as a numeric encoding of the 'job' column.
# NOTE(review): this rate is computed on the full dataset, before the train/test
# split, so test-set labels leak into the 'fraud_rate_job' feature. Consider
# computing it on the training rows only.
fraud_rate_job = data.groupby('job')['is_fraud'].mean().reset_index(name="fraud_rate_job")

# assign() returns a new frame, so re-running this cell is safe and no column is
# written into a slice of another DataFrame.
features = features.assign(
    # Week number of the transaction timestamp.
    trans_week=pd.to_datetime(features['trans_date_trans_time']).apply(lambda ts: ts.week),
    # Values other than 'F' / 'M' become NaN.
    gender_male=features['gender'].map({'F': 0, 'M': 1}),
    # Customer-to-merchant distance. Iterating over plain tuples avoids the
    # per-row Series that DataFrame.apply(axis=1) would build.
    distance_cust_merchant=[
        haversine(*point_pair) for point_pair in coords.itertuples(index=False, name=None)
    ],
    fraud_rate_job=features['job'].map(fraud_rate_job.set_index('job')['fraud_rate_job']),
    # Year of birth. .dt.year yields the number directly; unparseable dates
    # (coerced to NaT) become NaN instead of raising.
    yob=pd.to_datetime(features['dob'], errors='coerce').dt.year,
)

# 'yob' is already numeric here, so no second conversion is applied (calling
# .strftime on it again would raise AttributeError). .copy() gives the model
# matrix its own data, independent of `features`.
model_features = features[model_features_names].copy()

# Model Development

In [47]:
# split and scale

import sklearn
from sklearn.model_selection import train_test_split

# Stratify on the label so the train and test parts keep the same fraud ratio.
# NOTE(review): fraud is presumably the rare class (SMOTE is applied further
# down); without stratification the ratio can drift between the two parts.
X_train, X_test, y_train, y_test = train_test_split(
    model_features, label, test_size=0.1, random_state=14, stratify=label
)

# scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# The scaler is fitted on the training part only and then applied unchanged to
# the test part, so test-set statistics do not influence the transformation.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Training and Evaluation

In [53]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    """Fit `model` on the training data and print its metrics on the test data.

    Args:
        model: Estimator exposing `fit(X, y)` and `predict(X)`. It is fitted
            in place.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None. The metrics are printed, and the fitted state stays on `model`.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{model.__class__.__name__} Accuracy: {accuracy:.4f}')
    print(classification_report(y_test, y_pred))
    # Accuracy alone says little when one class dominates, so the raw error
    # counts are printed too (rows: true class, columns: predicted class).
    print(confusion_matrix(y_test, y_pred))
    return None

In [None]:
## Logistic Regression
from sklearn.linear_model import LogisticRegression

# Linear baseline with the library's default settings.
lrg_model = LogisticRegression()
train_and_evaluate(
    model=lrg_model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
## XGBoost
from xgboost import XGBClassifier

# Gradient-boosted trees with a fixed seed, trained on the same split as above.
xgb_model = XGBClassifier(eta=0.9, random_state=14)
train_and_evaluate(
    model=xgb_model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# SMOTE
from imblearn.over_sampling import SMOTE

# Only the training split is resampled; the model is still evaluated on the
# original, untouched test split.
smote = SMOTE(random_state=14)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# NOTE(review): train_and_evaluate calls fit() on the same `xgb_model` object
# that was used for the run on the non-resampled training data.
train_and_evaluate(
    model=xgb_model,
    X_train=X_resampled,
    X_test=X_test,
    y_train=y_resampled,
    y_test=y_test,
)