### Import Libraries

In [2]:
import pandas as pd

In [3]:
import data_prep as dp

### Import Data

In [4]:
# Load the validation and training splits through the project's data_prep helper.
# NOTE(review): the unpack order here is (validation, training) — confirm it
# matches the return order of dp.load_train_split().
dat_val, dat_tr = dp.load_train_split()

Path to dataset files: /Users/anabellafalk/.cache/kagglehub/datasets/kartik2112/fraud-detection/versions/1


### Prepare Data

In [5]:
# Column-name groups used when selecting features
model_cols = [
    "category",
    "amt",
    "trans_month",
    "trans_year",
]
identifier_cols = [
    "age_at_trans",
    "city_pop",
    "job_bin",
]
other_cols = ["distance"]

#### Extract Date Info

In [6]:
# Add the month column to the training and validation splits
for split_df in (dat_tr, dat_val):
    dp.extract_month(split_df)

In [7]:
# Add the year column to the training and validation splits
for split_df in (dat_tr, dat_val):
    dp.extract_year(split_df)

In [8]:
# Add the combined month-and-year column to the training and validation splits
for split_df in (dat_tr, dat_val):
    dp.extract_month_year(split_df)

#### Extract Age

In [9]:
# Compute the age column on the training and validation splits
for split_df in (dat_tr, dat_val):
    dp.extract_age(split_df)

#### Bin Jobs

In [10]:
# Fit the job binning on the training data only; the resulting binner is the
# one applied to both the training and validation splits.
job_binner = dp.fit_job_bins(dat_tr)

In [11]:
# Apply the training-fitted bins to both the training and validation data
for split_df in (dat_tr, dat_val):
    dp.bin_jobs(split_df, job_binner)

#### Extract Distance

In [12]:
# Compute the distance column on the training and validation splits
for split_df in (dat_tr, dat_val):
    dp.extract_distance(split_df)

#### Scale Numeric Data

##### Scale Age

In [None]:
# Fit the age scaler on the training split, then apply it to both splits
age_scaler = dp.fit_col_scaler(dat_tr, 'age_at_trans')
for split_df in (dat_tr, dat_val):
    dp.scale_col(split_df, 'age_at_trans', age_scaler)

##### Scale Distance

In [None]:
# Fit the distance scaler on the training split, then apply it to both splits
dist_scaler = dp.fit_col_scaler(dat_tr, 'distance')
for split_df in (dat_tr, dat_val):
    dp.scale_col(split_df, 'distance', dist_scaler)

##### Scale City Population

In [None]:
# Fit the city-population scaler on the training split, then apply it to both splits
city_pop_scaler = dp.fit_col_scaler(dat_tr, 'city_pop')
for split_df in (dat_tr, dat_val):
    dp.scale_col(split_df, 'city_pop', city_pop_scaler)

##### Scale Transaction Amount

In [None]:
# Fit the transaction-amount scaler on the training split, then apply it to both splits
amt_scaler = dp.fit_col_scaler(dat_tr, 'amt')
for split_df in (dat_tr, dat_val):
    dp.scale_col(split_df, 'amt', amt_scaler)

#### Encode Categorical Data

##### Encode Category

In [13]:
# Fit the category encoder on the training split and encode both splits with it
category_encoder = dp.fit_col_encoder(dat_tr, 'category')
category_enc_tr, category_enc_val = (
    dp.encode_col(split_df, 'category', category_encoder)
    for split_df in (dat_tr, dat_val)
)
category_enc_tr

Unnamed: 0,category.entertainment,category.food_dining,category.gas_transport,category.grocery_net,category.grocery_pos,category.health_fitness,category.home,category.kids_pets,category.misc_net,category.misc_pos,category.personal_care,category.shopping_net,category.shopping_pos,category.travel
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319617,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
319618,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
319619,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
319620,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


##### Encode Year

In [14]:
# Fit the year encoder on the training split and encode both splits with it
year_encoder = dp.fit_col_encoder(dat_tr, 'trans_year')
year_enc_tr, year_enc_val = (
    dp.encode_col(split_df, 'trans_year', year_encoder)
    for split_df in (dat_tr, dat_val)
)
year_enc_tr

Unnamed: 0,trans_year.2019,trans_year.2020
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
...,...,...
319617,0.0,1.0
319618,0.0,1.0
319619,0.0,1.0
319620,0.0,1.0


##### Encode State

We decided not to use `state` as a feature because of the imbalance in the data and the risk of overfitting.

In [15]:
# Kept for reference only — state is not used as a model feature.
# state_encoder = dp.fit_col_encoder(dat_tr, 'state')
# state_enc_tr = dp.encode_col(dat_tr, 'state', state_encoder)
# NOTE(review): reuse the training-fitted encoder for the validation split (as
# the category and year encodings do) instead of fitting a new one on dat_val.
# state_enc_val = dp.encode_col(dat_val, 'state', state_encoder)
# state_enc_tr

### Fit Models

#### Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

##### Full Model

In [17]:
# Fix the seed so the fitted forest, and every score derived from it, is
# reproducible across Restart & Run All.
rfc = RandomForestClassifier(random_state=42)

In [18]:
# Training frame: target and raw feature columns, followed by the encoded
# category and year columns, each re-indexed from zero before concatenating.
rfc_model_dat = pd.concat(
    [
        part.reset_index(drop=True)
        for part in (
            dat_tr[['is_fraud', 'amt', *identifier_cols, *other_cols]],
            category_enc_tr,
            year_enc_tr,
        )
    ],
    axis=1,
)

In [19]:
# Train on every column except the target
rfc.fit(rfc_model_dat.drop(columns='is_fraud'), rfc_model_dat['is_fraud'])

RandomForestClassifier()

In [20]:
# Mean accuracy on the training data
rfc.score(rfc_model_dat.drop(columns='is_fraud'), rfc_model_dat['is_fraud'])

1.0

In [21]:
# Inspect which identifier columns are part of the feature set
identifier_cols

['age_at_trans', 'city_pop', 'job_bin']

In [22]:
# Validation frame, built with the same column layout as the training frame
rfc_val_dat = pd.concat(
    [
        part.reset_index(drop=True)
        for part in (
            dat_val[['is_fraud', 'amt', *identifier_cols, *other_cols]],
            category_enc_val,
            year_enc_val,
        )
    ],
    axis=1,
)

In [23]:
# Mean accuracy on the validation data
# NOTE(review): accuracy alone may say little if is_fraud is rare — consider
# also reporting precision/recall.
rfc.score(rfc_val_dat.drop(columns='is_fraud'), rfc_val_dat['is_fraud'])

0.997122981046064

In [None]:
import shap

# Explain the model on exactly the feature columns it was fitted on: the
# target column 'is_fraud' must not be passed to the explainer, otherwise the
# feature columns no longer line up with what the forest was trained on.
rfc_features = rfc_model_dat.drop(columns='is_fraud')
explainer = shap.Explainer(rfc, rfc_features)
shap_values = explainer(rfc_features)




IndexError: list index out of range

In [38]:
# Display the computed SHAP explanation object
shap_values

.values =
array([[[ 0.00000000e+00,  0.00000000e+00],
        [ 7.70166663e-03, -7.70166643e-03],
        [-2.64499994e-03,  2.64499995e-03],
        ...,
        [ 8.33333470e-05, -8.33333354e-05],
        [ 3.33333260e-04, -3.33333306e-04],
        [ 0.00000000e+00,  0.00000000e+00]],

       [[ 0.00000000e+00,  0.00000000e+00],
        [-3.13833326e-03,  3.13833316e-03],
        [-3.13666659e-03,  3.13666660e-03],
        ...,
        [ 1.33333327e-04, -1.33333332e-04],
        [ 2.40333326e-03, -2.40333329e-03],
        [ 0.00000000e+00,  0.00000000e+00]],

       [[ 0.00000000e+00,  0.00000000e+00],
        [ 5.71666657e-03, -5.71666651e-03],
        [ 6.11833323e-03, -6.11833318e-03],
        ...,
        [-2.33333185e-05,  2.33333278e-05],
        [-2.00000691e-05,  2.00000018e-05],
        [ 0.00000000e+00,  0.00000000e+00]],

       ...,

       [[ 0.00000000e+00,  0.00000000e+00],
        [ 1.62049999e-02, -1.62049997e-02],
        [-2.74833328e-03,  2.74833329e-03],
        

##### No Job

In [25]:
# Fix the seed so the no-job forest and its scores are reproducible across
# Restart & Run All.
rfc_no_job = RandomForestClassifier(random_state=42)

In [26]:
# Inspect the identifier columns before excluding the job feature
identifier_cols

['age_at_trans', 'city_pop', 'job_bin']

In [27]:
# Training frame without the job feature. 'job_bin' is excluded by name rather
# than by list position, so reordering or extending identifier_cols cannot
# silently change which column is left out.
rfc_no_job_model_dat = pd.concat([
    dat_tr[
        ['is_fraud', 'amt']
        + [col for col in identifier_cols if col != 'job_bin']
        + other_cols
    ].reset_index(drop=True),
    category_enc_tr.reset_index(drop=True),
    year_enc_tr.reset_index(drop=True),
], axis=1)

In [28]:
# Train the no-job model on every column except the target, then report its
# mean accuracy on that same training data
rfc_no_job.fit(rfc_no_job_model_dat.drop(columns='is_fraud'), rfc_no_job_model_dat['is_fraud'])
rfc_no_job.score(rfc_no_job_model_dat.drop(columns='is_fraud'), rfc_no_job_model_dat['is_fraud'])

0.9999968713042281

In [None]:
# Validation frame without the job feature. 'job_bin' is excluded by name
# rather than by list position, so reordering or extending identifier_cols
# cannot silently change which column is left out.
rfc_no_job_val_dat = pd.concat([
    dat_val[
        ['is_fraud', 'amt']
        + [col for col in identifier_cols if col != 'job_bin']
        + other_cols
    ].reset_index(drop=True),
    category_enc_val.reset_index(drop=True),
    year_enc_val.reset_index(drop=True),
], axis=1)

In [None]:
# Mean accuracy of the no-job model on the validation data
rfc_no_job.score(rfc_no_job_val_dat.drop(columns='is_fraud'), rfc_no_job_val_dat['is_fraud'])

#### SVM

**TODO (SVM plan):** scale features; keep < 100 (presumably the feature count — confirm); use a kernel; balance classes.

#### XGBoost

#### Logistic Regression