## Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve

## Load dataset

In [3]:
df = pd.read_csv('../raw_data/fraudTrain_cleaned2.csv', index_col=0)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1295379 entries, 0 to 1295378
Data columns (total 42 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   trans_date_trans_time    1295379 non-null  object 
 1   cc_num                   1295379 non-null  int64  
 2   merchant                 1295379 non-null  object 
 3   category                 1295379 non-null  object 
 4   amt                      1295379 non-null  float64
 5   first                    1295379 non-null  object 
 6   last                     1295379 non-null  object 
 7   state                    1295379 non-null  object 
 8   lat                      1295379 non-null  float64
 9   long                     1295379 non-null  float64
 10  city_pop                 1295379 non-null  int64  
 11  job                      1295379 non-null  object 
 12  dob                      1295379 non-null  object 
 13  merch_lat                1295379 non-null 

## One hot encoding categorical variables

In [5]:
# get dummies for 'state'
state_dummies = pd.get_dummies(df['state'],prefix='state')

# Concatenate one-hot encoded dataframe to main dataframe.
df = pd.concat([df, state_dummies], axis=1)

In [6]:
# get dummies for 'day_of_week'
day_dummies = pd.get_dummies(df['day_of_week'],prefix='day')

# Concatenate one-hot encoded dataframe to main dataframe.
df = pd.concat([df, day_dummies], axis=1)

In [7]:
# get dummies for 'month'
month_dummies = pd.get_dummies(df['month'],prefix='month')

# Concatenate one-hot encoded dataframe to main dataframe.
df = pd.concat([df, month_dummies], axis=1)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1295379 entries, 0 to 1295378
Columns: 112 entries, trans_date_trans_time to month_12
dtypes: float64(8), int64(25), object(9), uint8(70)
memory usage: 511.4+ MB


## Drop unnecessary columns

In [9]:
drop_columns = ['val_mean_amt', 'state', 'job', 'dob', 'day_of_week', 'month', 'hour', 'distance', 'distance2', 'lat', 'long', 'merch_lat', 'merch_long']
df.drop(drop_columns, axis = 1, inplace=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1295379 entries, 0 to 1295378
Data columns (total 99 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   trans_date_trans_time    1295379 non-null  object 
 1   cc_num                   1295379 non-null  int64  
 2   merchant                 1295379 non-null  object 
 3   category                 1295379 non-null  object 
 4   amt                      1295379 non-null  float64
 5   first                    1295379 non-null  object 
 6   last                     1295379 non-null  object 
 7   city_pop                 1295379 non-null  int64  
 8   is_fraud                 1295379 non-null  int64  
 9   val_mean_amt_dist        1295379 non-null  float64
 10  category_entertainment   1295379 non-null  int64  
 11  category_food_dining     1295379 non-null  int64  
 12  category_gas_transport   1295379 non-null  int64  
 13  category_grocery_net     1295379 non-null 

## Split data into training and testing sets.

In [11]:
#Check partition sizes with 70/30 train/test split
len(df) * .7, len(df) * .3

(906765.2999999999, 388613.7)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='is_fraud'), 
                                                    df.is_fraud, test_size=0.3, 
                                                    random_state=47)

In [13]:
X_train.shape, X_test.shape

((906765, 98), (388614, 98))

In [14]:
y_train.shape, y_test.shape

((906765,), (388614,))

In [15]:
#Save the 'trans_date_trans_time', 'cc_num', 'merchant', 'category', 'first', 'last' columns 
#from the train/test data into labels_train and labels_test
#Then drop those columns from `X_train` and `X_test`.

labels_list = ['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'first', 'last']
labels_train = X_train[['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'first', 'last']]
labels_test = X_test[['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'first', 'last']]
X_train.drop(columns=labels_list, inplace=True)
X_test.drop(columns=labels_list, inplace=True)
X_train.shape, X_test.shape

((906765, 92), (388614, 92))

In [16]:
#Check the `info` method of `X_train` to verify all features are numeric
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 906765 entries, 825668 to 889991
Data columns (total 92 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   amt                      906765 non-null  float64
 1   city_pop                 906765 non-null  int64  
 2   val_mean_amt_dist        906765 non-null  float64
 3   category_entertainment   906765 non-null  int64  
 4   category_food_dining     906765 non-null  int64  
 5   category_gas_transport   906765 non-null  int64  
 6   category_grocery_net     906765 non-null  int64  
 7   category_grocery_pos     906765 non-null  int64  
 8   category_health_fitness  906765 non-null  int64  
 9   category_home            906765 non-null  int64  
 10  category_kids_pets       906765 non-null  int64  
 11  category_misc_net        906765 non-null  int64  
 12  category_misc_pos        906765 non-null  int64  
 13  category_personal_care   906765 non-null  int64  
 14 

In [17]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 388614 entries, 1133158 to 157046
Data columns (total 92 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   amt                      388614 non-null  float64
 1   city_pop                 388614 non-null  int64  
 2   val_mean_amt_dist        388614 non-null  float64
 3   category_entertainment   388614 non-null  int64  
 4   category_food_dining     388614 non-null  int64  
 5   category_gas_transport   388614 non-null  int64  
 6   category_grocery_net     388614 non-null  int64  
 7   category_grocery_pos     388614 non-null  int64  
 8   category_health_fitness  388614 non-null  int64  
 9   category_home            388614 non-null  int64  
 10  category_kids_pets       388614 non-null  int64  
 11  category_misc_net        388614 non-null  int64  
 12  category_misc_pos        388614 non-null  int64  
 13  category_personal_care   388614 non-null  int64  
 14

## Initial baseline modeling

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

logreg = LogisticRegression()
# Fit the model on the trainng data.
logreg.fit(X_train, y_train)

y_pred_train = logreg.predict(X_train)
y_pred_test = logreg.predict(X_test)

# Print the accuracy from the testing data.
print(accuracy_score(y_pred_test, y_test))

0.9942668045927321


This doesn't tell us very much because of the imbalanced nature of the data. It's possible that the model has accurately predicted EVERY transaction as non-fraud and still have a high accuracy score.

In [19]:
from sklearn.metrics import classification_report

print("[Training Classification Report]")
print(classification_report(y_train, y_pred_train))

print("[Test Classification Report]")
print(classification_report(y_test, y_pred_test))

[Training Classification Report]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    901486
           1       0.48      0.19      0.27      5279

    accuracy                           0.99    906765
   macro avg       0.74      0.59      0.63    906765
weighted avg       0.99      0.99      0.99    906765

[Test Classification Report]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    386387
           1       0.50      0.22      0.31      2227

    accuracy                           0.99    388614
   macro avg       0.75      0.61      0.65    388614
weighted avg       0.99      0.99      0.99    388614



Based on the precision values, this tells us that ~50% of our current "out of the box" model's fraud predictions are correctly classifications. Based on our recall values, our model was only able to correctly classify ~20% of our positive cases.

Because the objective of this problem is to catch fraudulent transactions, we want to aim for high recall.

## Scaling features

Is it better to scale only nonbinary features, or to scale ALL features?  We'll create two different versions of this.

In [20]:
#Scale all features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## GridSearchCV to tune parameters

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score

In [None]:
logreg2 = LogisticRegression(solver='saga')
params = {"C":np.logspace(-3,3,5,7), "penalty":["l1", "l2", 'none']}

clf2 = GridSearchCV(logreg2, params, cv=5, scoring = 'recall')

clf2.fit(X_train_scaled, y_train)

print(clf2.best_params_, clf2.best_score_)



In [None]:
y_pred_train2 = clf2.predict(X_train_scaled)
y_pred_test2 = clf2.predict(X_test_scaled)

In [None]:
print("[Training Classification Report]")
print(classification_report(y_train, y_pred_train2))

print("[Test Classification Report]")
print(classification_report(y_test, y_pred_test2))