In [2]:
import kagglehub
import numpy as np
import pandas as pd

import data_prep as dp

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score

## Import Data

In [3]:
path = kagglehub.dataset_download("kartik2112/fraud-detection")

print("Path to dataset files:", path)

file = "/fraudTest.csv"
dat = pd.read_csv(path + file)

dat_tst = dat.drop('Unnamed: 0', axis = 1)

Path to dataset files: /Users/anabellafalk/.cache/kagglehub/datasets/kartik2112/fraud-detection/versions/1


In [4]:
dat_tr = dp.load_train()

Path to dataset files: /Users/anabellafalk/.cache/kagglehub/datasets/kartik2112/fraud-detection/versions/1


## Prepare Data

### Extract Age

In [5]:
# Calculate and create age column
dp.extract_age(dat_tr)
dp.extract_age(dat_tst)

### Extract Distance

In [6]:
# Calculate and create distance column
dp.extract_distance(dat_tr)
dp.extract_distance(dat_tst)

### Encode Category

In [7]:
category_encoder = dp.fit_col_encoder(dat_tr, 'category')
category_enc_tr = dp.encode_col(dat_tr, 'category', category_encoder)
category_enc = dp.encode_col(dat_tst, 'category', category_encoder)

### Encode Year

In [8]:
# Create year column
dp.extract_year(dat_tr)
dp.extract_year(dat_tst)

In [9]:
year_encoder = dp.fit_col_encoder(dat_tr, 'trans_year')
year_enc_tr = dp.encode_col(dat_tr, 'trans_year', year_encoder)
year_enc = dp.encode_col(dat_tst, 'trans_year', year_encoder)

### Encode Month

In [10]:
# Create month column
dp.extract_month(dat_tr)
dp.extract_month(dat_tst)

In [11]:
dat_tr['trans_month'] = dat_tr['trans_month'].astype(str)
dat_tst['trans_month'] = dat_tst['trans_month'].astype(str)

In [12]:
month_encoder = dp.fit_col_encoder(dat_tr, 'trans_month')
month_enc_tr = dp.encode_col(dat_tr, 'trans_month', month_encoder)
month_enc = dp.encode_col(dat_tst, 'trans_month', month_encoder)

### Encode and Bin Jobs

In [13]:
# Fit binning on training data
job_binner = dp.fit_job_bins(dat_tr)

In [14]:
# Apply bins to training data
dp.bin_jobs(dat_tr, job_binner)
dp.bin_jobs(dat_tst, job_binner)

In [15]:
dat_tr['job_bin'] = dat_tr['job_bin'].astype(str)
dat_tst['job_bin'] = dat_tst['job_bin'].astype(str)

In [16]:
job_bin_encoder = dp.fit_col_encoder(dat_tr, 'job_bin')
job_bin_enc_tr = dp.encode_col(dat_tr, 'job_bin', job_bin_encoder)
job_bin_enc = dp.encode_col(dat_tst, 'job_bin', job_bin_encoder)

### Create Model Dataframe

In [17]:
model_dat = pd.concat([
    dat_tr[['is_fraud'] + ['amt'] + ['age_at_trans', 'city_pop', 'distance']].reset_index(drop=True),
    category_enc_tr.reset_index(drop=True),
    year_enc_tr.reset_index(drop=True),
    month_enc_tr.reset_index(drop=True),
    job_bin_enc_tr.reset_index(drop=True)
], axis=1)

In [24]:
test_dat = pd.concat([
    dat_tst[['is_fraud'] + ['amt'] + ['age_at_trans', 'city_pop', 'distance']].reset_index(drop=True),
    category_enc.reset_index(drop=True),
    year_enc.reset_index(drop=True),
    month_enc.reset_index(drop=True),
    job_bin_enc.reset_index(drop=True)
], axis=1)

## Fit Model

In [31]:
rfc = RandomForestClassifier(class_weight='balanced', min_samples_leaf=9, min_samples_split=2, n_estimators=500)

In [32]:
rfc.fit(model_dat.drop('is_fraud', axis=1), model_dat['is_fraud'])

RandomForestClassifier(class_weight='balanced', min_samples_leaf=9,
                       n_estimators=500)

## Evaluate Model

In [33]:
rfc_true = test_dat['is_fraud']
rfc_pred = rfc.predict(test_dat.drop('is_fraud', axis=1))

In [34]:
roc_auc_score(test_dat['is_fraud'], rfc.predict_proba(test_dat.drop('is_fraud', axis=1))[:,1])

0.9931030355716125

In [35]:
precision_score(rfc_true, rfc_pred, average=None)

array([0.99925208, 0.49218528])

In [36]:
recall_score(rfc_true, rfc_pred, average=None)

array([0.99677189, 0.80745921])