In [1]:
import sklearn as sk
import numpy as np 
import pandas as pd 

In [2]:
# Load the training set; the first CSV column is used as the row index.
fraud_train = pd.read_csv('fraudTrain.csv', index_col=0)

# Separate the target label (y) from the remaining feature columns (X).
y = fraud_train['is_fraud']
X = fraud_train.drop('is_fraud', axis=1)


In [3]:
# Overview of the feature frame: dtypes and non-null counts per column,
# followed by a reproducible random sample of three rows.
X.info()
X.sample(n=3, random_state=1)

<class 'pandas.core.frame.DataFrame'>
Index: 1296675 entries, 0 to 1296674
Data columns (total 21 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1296675 non-null  object 
 1   cc_num                 1296675 non-null  int64  
 2   merchant               1296675 non-null  object 
 3   category               1296675 non-null  object 
 4   amt                    1296675 non-null  float64
 5   first                  1296675 non-null  object 
 6   last                   1296675 non-null  object 
 7   gender                 1296675 non-null  object 
 8   street                 1296675 non-null  object 
 9   city                   1296675 non-null  object 
 10  state                  1296675 non-null  object 
 11  zip                    1296675 non-null  int64  
 12  lat                    1296675 non-null  float64
 13  long                   1296675 non-null  float64
 14  city_pop               

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long
94102,2019-02-25 08:24:40,374497717543058,fraud_Funk Group,grocery_net,20.0,Linda,Hurst,F,31701 Tucker Square Suite 893,Wilton,...,58579,47.1709,-100.7944,1190,"Designer, ceramics/pottery",1948-06-30,1595dec12f6f19ceaae9b7df0f8af5c0,1330158280,46.398331,-99.813959
198791,2019-04-12 19:50:15,4428154703770710,"fraud_Prosacco, Kreiger and Kovacek",home,284.88,Brittany,Guerra,F,79209 Gary Dale,Colton,...,99113,46.5901,-117.1692,761,Chief Marketing Officer,1943-06-30,0ed26b649ed0fce94d8e632b7208dea0,1334260215,45.687331,-117.488135
1238587,2020-05-31 21:50:53,213148039875802,"fraud_Langworth, Boehm and Gulgowski",shopping_net,5.07,Jill,Jacobs,F,034 Kimberly Mountains,Brandon,...,33510,27.9551,-82.2966,79613,Environmental consultant,1978-11-30,7096316ec1a4b261e8613013827abae7,1370037053,27.254081,-81.974799


# Preprocessing

## 1. Finding and Handling Missing Values

In [4]:
# Total number of null entries across every feature column plus the target.
# A result of 0 means nothing is missing anywhere.
X.isnull().sum().sum() + y.isnull().sum()

0

In [5]:
# Total number of missing/NA entries across the features plus the target.
# In pandas `isna` and `isnull` are the same check, so this is expected to
# agree with the null count computed in the previous cell.
X.isna().sum().sum() + y.isna().sum()

0

No handling of missing values is required, as there are no missing values in our training dataset.

## 2. Finding and Removing Outliers

Handling outliers is an important part of preprocessing, as outliers can:
- distort data analysis
- reduce machine learning model accuracy and generalization
- skew the scale of visualizations, making plots harder to read
- **TODO:** list any further effects

For numeric columns (those with an integer or float data type), I chose to use the [Interquartile Range Method](https://online.stat.psu.edu/stat200/lesson/3/3.2) of finding and removing outliers. 

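Categorical, or non-numeric, columns are not covered by the IQR method.

**FIXME:** describe how outliers in these columns are identified and handled.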

In [6]:
# Names of the feature columns stored as float64 or int64.
num_cols = (
    X
    .select_dtypes(include=['float64', 'int64'])
    .columns
)
# Not the last expression of the cell, so this line is not displayed by default.
num_cols

# calculate the IQR lower and upper bounds for a numerical column
def iqr_bounds(col, k=1.5):
    """Return the (lower, upper) outlier fences of a numeric column.

    The fences follow the interquartile-range rule: values below
    ``Q1 - k * IQR`` or above ``Q3 + k * IQR`` are treated as outliers,
    where ``IQR = Q3 - Q1``.

    Parameters
    ----------
    col : pandas.Series
        Numeric column to analyse (anything exposing ``.quantile``).
    k : float, default 1.5
        Multiplier applied to the IQR. The default reproduces the
        conventional 1.5 * IQR rule; a larger value widens the fences.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)``.

    Raises
    ------
    ValueError
        If ``k`` is negative, which would place the fences inside the
        interquartile range.
    """
    if k < 0:
        raise ValueError("k must be non-negative")

    q1 = col.quantile(0.25)
    q3 = col.quantile(0.75)
    # Same evaluation order as `Q1 - 1.5 * IQR`, so the default k gives
    # bit-identical bounds to the hard-coded version.
    margin = k * (q3 - q1)
    return q1 - margin, q3 + margin

In [7]:
# Print the IQR fences of every numeric column, one line per column.
for col, values in X[num_cols].items():
    lower, upper = iqr_bounds(values)
    print(f"{col}: lower={lower}, upper={upper}")

cc_num: lower=-6513275846701038.0, upper=1.133557426847813e+16
amt: lower=-100.58499999999998, upper=193.375
zip: lower=-42470.5, upper=140749.5
lat: lower=23.640650000000004, upper=52.920249999999996
long: lower=-121.75800000000001, upper=-55.198
city_pop: lower=-28634.5, upper=49705.5
unix_time: lower=1307798793.0, upper=1390337325.0
merch_lat: lower=23.898184000000008, upper=52.79255199999999
merch_long: lower=-121.88799400000002, upper=-55.24607799999998


In [8]:
# How many values fall outside the IQR fences in each numerical column?
for col in num_cols:
    values = X[col]  # same frame the fences are computed from
    lower, upper = iqr_bounds(values)
    # Count with a boolean mask instead of building a filtered copy of the
    # whole frame just to take its length.
    outlier_mask = (values < lower) | (values > upper)
    n_outliers = int(outlier_mask.sum())
    print(f"{col}: {n_outliers} outliers")

cc_num: 118789 outliers
amt: 67290 outliers
zip: 0 outliers
lat: 4679 outliers
long: 49922 outliers
city_pop: 242674 outliers
unix_time: 0 outliers
merch_lat: 4967 outliers
merch_long: 41994 outliers


I won't be dropping the rows with outlier values before training a model on the dataset, as I'm interested in seeing how the model performs with and without the outlier values.

## 3. Correlation Analysis & Feature Engineering

### Other Preprocessing Steps

The raw `trans_date_trans_time` column may not help us detect fraud very well on its own, but it contains information worth extracting. Features such as the month, the day of the week, or the hour in which a transaction took place could help us identify fraudulent transactions more accurately.

In [9]:
# Parse the transaction timestamp, then derive month, weekday and hour from it.
X['trans_date_trans_time'] = pd.to_datetime(X['trans_date_trans_time'])
timestamps = X['trans_date_trans_time']

X['trans_month'] = timestamps.dt.month_name()
X['trans_day_of_week'] = timestamps.dt.day_name()
X['trans_hour_of_day'] = timestamps.dt.hour

# NOTE: the raw 'trans_date_trans_time' column is not dropped here.

# Show the source column next to the three derived features.
X.loc[:, ['trans_date_trans_time', 'trans_month', 'trans_day_of_week', 'trans_hour_of_day']]

Unnamed: 0,trans_date_trans_time,trans_month,trans_day_of_week,trans_hour_of_day
0,2019-01-01 00:00:18,January,Tuesday,0
1,2019-01-01 00:00:44,January,Tuesday,0
2,2019-01-01 00:00:51,January,Tuesday,0
3,2019-01-01 00:01:16,January,Tuesday,0
4,2019-01-01 00:03:06,January,Tuesday,0
...,...,...,...,...
1296670,2020-06-21 12:12:08,June,Sunday,12
1296671,2020-06-21 12:12:19,June,Sunday,12
1296672,2020-06-21 12:12:32,June,Sunday,12
1296673,2020-06-21 12:13:36,June,Sunday,12
