In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [53]:
# Read the pre-split train/test CSVs (first CSV column becomes the index) and
# tag every row with the split it came from so the two can be separated later.
train = pd.read_csv('../data/fraudTrain.csv', index_col=0).assign(dataset='train')
test = pd.read_csv('../data/fraudTest.csv', index_col=0).assign(dataset='test')

# Stack both splits into one frame with a fresh 0..n-1 index so that the
# preprocessing steps are applied to train and test together.
data = pd.concat([train, test], axis=0, ignore_index=True)

# Parse the timestamp string into a datetime column (appended as a new column)
# and coerce `unix_time` to integers in place.
data = data.assign(
    transaction_time=lambda df: pd.to_datetime(df['trans_date_trans_time']),
    unix_time=lambda df: df['unix_time'].astype(int),
)

In [54]:
# Calendar and demographic features, all derived from `transaction_time`.
# The keyword order matters: later entries read columns created by earlier ones.
data = data.assign(
    trans_hour=lambda df: df['transaction_time'].dt.hour,
    # pandas encodes Monday as 0, so 5 and 6 below are Saturday and Sunday.
    trans_dayofweek=lambda df: df['transaction_time'].dt.dayofweek,
    # NOTE(review): only hours 21-23 are flagged; the hours after midnight are
    # not treated as "night" -- confirm this is the intended definition.
    is_night=lambda df: (
        (df['trans_hour'] >= 21) & (df['trans_hour'] <= 23)
    ).astype(int),
    is_weekend=lambda df: df['trans_dayofweek'].isin([5, 6]).astype(int),
    # Age in approximate whole years at transaction time (365-day years, so
    # leap days are ignored).
    age=lambda df: (
        df['transaction_time'] - pd.to_datetime(df['dob'])
    ).dt.days // 365,
).drop(columns='dob')  # the raw date of birth is not needed once age exists

# Transaction velocity: time elapsed since the same card's previous transaction.
# Rows are ordered per card by time so that shift(1) yields the previous one.
data = data.sort_values(by=['cc_num', 'unix_time'])

# A card's first transaction has no predecessor; it is given a synthetic
# previous timestamp 86400 units (one day, if unix_time is in seconds) earlier.
data['unix_time_prev_trans'] = (
    data.groupby('cc_num')['unix_time']
    .shift(1)
    .fillna(data['unix_time'] - 86400)
)

# Gap to the previous transaction, floor-divided by 60 (whole minutes when
# unix_time is in seconds; the first transaction per card therefore gets 1440).
data['timedelta_last_trans'] = (
    data['unix_time'].sub(data['unix_time_prev_trans']).floordiv(60)
)


In [55]:
# Inspect the dtype of every column after the feature-engineering cell.
data.dtypes

trans_date_trans_time            object
cc_num                            int64
merchant                         object
category                         object
amt                             float64
first                            object
last                             object
gender                           object
street                           object
city                             object
state                            object
zip                               int64
lat                             float64
long                            float64
city_pop                          int64
job                              object
trans_num                        object
unix_time                         int64
merch_lat                       float64
merch_long                      float64
is_fraud                          int64
dataset                          object
transaction_time         datetime64[ns]
trans_hour                        int32
trans_dayofweek                   int32


In [56]:
# Peek at the first row (data is now ordered by cc_num, then unix_time).
data.head(1) 

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,is_fraud,dataset,transaction_time,trans_hour,trans_dayofweek,is_night,is_weekend,age,unix_time_prev_trans,timedelta_last_trans
1017,2019-01-01 12:47:15,60416207185,"fraud_Jones, Sawayn and Romaguera",misc_net,7.27,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,...,0,train,2019-01-01 12:47:15,12,1,0,0,32,1325336000.0,1440.0


In [57]:
# Every object-dtype column is treated as text here.
cat_cols = data.select_dtypes(include='object').columns

# Normalise the text columns: lower-case first, then trim surrounding whitespace.
data = data.assign(
    **{name: data[name].str.lower().str.strip() for name in cat_cols}
)

# Number of distinct values per text column, smallest first.
data[cat_cols].nunique().sort_values()

gender                         2
dataset                        2
category                      14
state                         51
first                        355
last                         486
job                          497
merchant                     693
city                         906
street                       999
trans_date_trans_time    1819551
trans_num                1852394
dtype: int64

In [58]:
# List all columns available before narrowing down to the modelling features.
data.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud', 'dataset', 'transaction_time', 'trans_hour',
       'trans_dayofweek', 'is_night', 'is_weekend', 'age',
       'unix_time_prev_trans', 'timedelta_last_trans'],
      dtype='object')

In [None]:
# Columns carried forward: features, the target (`is_fraud`) and the
# `dataset` tag that is needed to split train from test again.
cols_to_keep = [
    'category',
    'amt',
    'gender',
    'age',
    'state',
    'city_pop',
    'is_fraud',
    'dataset',
    'trans_hour',
    'trans_dayofweek',
    'is_night',
    'is_weekend',
    'timedelta_last_trans',
]

# Keep the requested columns in the order listed above. Names that are not
# present in `data` are skipped silently rather than raising a KeyError.
present_cols = [name for name in cols_to_keep if name in data.columns]
data = data.loc[:, present_cols]



In [60]:
# Confirm which columns survived the selection step.
# NOTE(review): the recorded output below has no `trans_dayofweek` even though
# `cols_to_keep` lists it, and the selection cell has no execution count --
# the outputs look stale; restart the kernel and run all cells to refresh them.
data.columns

Index(['category', 'amt', 'gender', 'age', 'state', 'city_pop', 'is_fraud',
       'dataset', 'trans_hour', 'is_night', 'is_weekend',
       'timedelta_last_trans'],
      dtype='object')

In [61]:
# Undo the earlier concatenation: recover each split from its `dataset` tag,
# then discard the tag because it is not a feature.
train = data.loc[data['dataset'].eq('train')].drop(columns=['dataset'])
test = data.loc[data['dataset'].eq('test')].drop(columns=['dataset'])

In [62]:
# Display the training split (pandas truncates the rendered rows).
train

Unnamed: 0,category,amt,gender,age,state,city_pop,is_fraud,trans_hour,is_night,is_weekend,timedelta_last_trans
1017,misc_net,7.27,f,32,wy,1645,0,12,0,0,1440.0
2724,gas_transport,52.94,f,32,wy,1645,0,8,0,0,1197.0
2726,gas_transport,82.08,f,32,wy,1645,0,8,0,0,2.0
2882,kids_pets,34.79,f,32,wy,1645,0,12,0,0,230.0
2907,home,27.18,f,32,wy,1645,0,13,0,0,32.0
...,...,...,...,...,...,...,...,...,...,...,...
1294934,personal_care,60.47,m,64,il,532,0,21,1,1,512.0
1295369,gas_transport,74.29,m,64,il,532,0,0,0,1,216.0
1295587,shopping_net,246.56,m,64,il,532,0,2,0,1,126.0
1296206,shopping_pos,2.62,m,64,il,532,0,8,0,1,316.0


In [63]:
# Separate the target (`is_fraud`) from the predictors for each split.
y_train = train.loc[:, 'is_fraud']
X_train = train.drop('is_fraud', axis=1)

y_test = test.loc[:, 'is_fraud']
X_test = test.drop('is_fraud', axis=1)

In [64]:
# Check the predictor frame: `is_fraud` should no longer be among the columns.
X_train.head()

Unnamed: 0,category,amt,gender,age,state,city_pop,trans_hour,is_night,is_weekend,timedelta_last_trans
1017,misc_net,7.27,f,32,wy,1645,12,0,0,1440.0
2724,gas_transport,52.94,f,32,wy,1645,8,0,0,1197.0
2726,gas_transport,82.08,f,32,wy,1645,8,0,0,2.0
2882,kids_pets,34.79,f,32,wy,1645,12,0,0,230.0
2907,home,27.18,f,32,wy,1645,13,0,0,32.0


In [65]:
# Subset of the numeric predictors (age, trans_hour and the 0/1 flags are not listed).
# NOTE(review): presumably the columns to be scaled/transformed downstream --
# confirm against the cells that consume `num_cols`.
num_cols = ['amt', 'city_pop', 'timedelta_last_trans']

In [66]:
# Treat every non-numeric predictor as a categorical variable.
cat_cols = X_train.select_dtypes(exclude=[np.number]).columns

# Cardinality (number of distinct values) of each categorical variable, highest first.
X_train.loc[:, cat_cols].nunique().sort_values(ascending=False)

state       51
category    14
gender       2
dtype: int64

In [67]:
# One-hot encode the low-cardinality categoricals. `state` is not encoded by
# this cell and stays a string column.
dummy_cols = ["category", "gender"]

# Fix the set of levels from the TRAINING split only. Encoding each split
# independently (as before) lets a level that is absent from one split produce
# different columns -- and, with drop_first=True, a different dropped baseline
# level -- in train and test. With a shared CategoricalDtype, get_dummies emits
# one column per known level for both frames, in the same (sorted) order, and
# always drops the same first level. Test values never seen in training become
# NaN and are therefore encoded as all-False.
level_dtypes = {
    col: pd.CategoricalDtype(categories=sorted(X_train[col].dropna().unique()))
    for col in dummy_cols
}

X_train = pd.get_dummies(
    X_train.astype(level_dtypes), columns=dummy_cols, drop_first=True
)
X_test = pd.get_dummies(
    X_test.astype(level_dtypes), columns=dummy_cols, drop_first=True
)

# Guard the invariant every downstream model relies on.
assert X_train.columns.equals(X_test.columns), "train/test feature columns differ"

X_train.head()


Unnamed: 0,amt,age,state,city_pop,trans_hour,is_night,is_weekend,timedelta_last_trans,category_food_dining,category_gas_transport,...,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_m
1017,7.27,32,wy,1645,12,0,0,1440.0,False,False,...,False,False,False,True,False,False,False,False,False,False
2724,52.94,32,wy,1645,8,0,0,1197.0,False,True,...,False,False,False,False,False,False,False,False,False,False
2726,82.08,32,wy,1645,8,0,0,2.0,False,True,...,False,False,False,False,False,False,False,False,False,False
2882,34.79,32,wy,1645,12,0,0,230.0,False,False,...,False,False,True,False,False,False,False,False,False,False
2907,27.18,32,wy,1645,13,0,0,32.0,False,False,...,False,True,False,False,False,False,False,False,False,False
