In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn import preprocessing
# Apply seaborn's default theme to the plots drawn later in the notebook.
sb.set()

In [2]:
# Load the training transactions from a CSV in the current working directory.
train1 = pd.read_csv('fraudTrain.csv')
# Show (rows, columns); the extra '\n' argument only adds a blank line after it.
print(train1.shape,'\n')

(1296675, 23) 



In [3]:
# Preview the first rows to see what each column holds.
train1.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [4]:
# List the column names so they can be sorted into groups in the notes below.
# train_heads itself stays a NumPy array; tolist() is only used for a readable display.
train_heads = train1.columns.to_numpy()
train_heads.tolist()

['Unnamed: 0',
 'trans_date_trans_time',
 'cc_num',
 'merchant',
 'category',
 'amt',
 'first',
 'last',
 'gender',
 'street',
 'city',
 'state',
 'zip',
 'lat',
 'long',
 'city_pop',
 'job',
 'dob',
 'trans_num',
 'unix_time',
 'merch_lat',
 'merch_long',
 'is_fraud']

# Column Grouping


## Identification Details
##### Not used as Features but rather as Identifiers
* Encode Cardholder Identifiers: `job`, `dob`, `first`, `last`, `gender` & `cc_num` using LabelEncoding
* Encode Cardholder Vicinity : Encode `city`, `state`, `street` & `zip` using LabelEncoding
* Encode `trans_num` from alphanumeric to numeric

## Transaction Details
##### These are what we use as important features

### Single Values
* `city_pop` Used as a comparison tool
* Spending patterns - `amt`
### Multiple Values
* Transaction Timing Details related - `trans_date_trans_time` split into 
    * 1. `day` of the week as a Categorical Variable 
    * 2. `time` as a continuous series
* Transaction Type - 
    * Encode `category`
    * Encode `merchant`
* Calculate distance between owner's residence and merchant locations using `lat`, `long`, `merch_lat`, `merch_long`

# Fraud Values
* `is_fraud` is the value to be predicted so it'll serve as the `y` value for the modelling

###### One-hot encoding (OHE) is not used because tree-based models are being used

# `Fraud`

In [5]:
# Keep the prediction target as a one-column DataFrame (not a Series).
fraud = train1['is_fraud'].to_frame()

# Identifier Encoding

### $LabelEncoder$

In [6]:
# Columns that say *who* transacted and *where they live*, as opposed to
# describing the transaction itself.
cardholder_cols = ['cc_num','job','dob','first','last','gender']
location_cols = ['city','state','street','zip']

# cardholder details
cardholder = train1[cardholder_cols].copy()

# residential details
location = train1[location_cols].copy()

# transaction number (kept as a one-column frame so it concatenates like the others)
trans_num = train1[['trans_num']].copy()

# Side-by-side join of the three identifier groups, row-aligned with train1.
identifiers = pd.concat([cardholder, location, trans_num], axis = 1)

In [7]:
# Fit a separate label encoder for every identifier column and keep each fitted
# encoder in `id_encoders`, so the same value -> code mapping can be re-applied to
# other data (e.g. the test set) later. Re-fitting a single shared encoder column
# after column would retain only the mapping of the last column it saw.
id_encoders = {col: preprocessing.LabelEncoder() for col in identifiers.columns}
id_var = pd.DataFrame(
    {col: enc.fit_transform(identifiers[col]) for col, enc in id_encoders.items()},
    index = identifiers.index,
)

# Backward compatibility: the name `LabelEncoder` previously ended up bound to an
# encoder fitted on the last identifier column, so keep it pointing at that one.
# NOTE: this name shadows the sklearn class of the same name inside the notebook.
LabelEncoder = id_encoders[identifiers.columns[-1]]
id_var.shape

(1296675, 11)

# Feature Engineering

## Single Variables

In [8]:
# Features that are taken straight from the raw data.
single_var = train1[['amt', 'city_pop']].copy()
# Inspect the dtypes: numeric columns can be used without any encoding.
print(single_var.dtypes)
single_var.head()

amt         float64
city_pop      int64
dtype: object


Unnamed: 0,amt,city_pop
0,4.97,3495
1,107.23,149
2,220.11,4154
3,45.0,1939
4,41.96,99


## Multi Variables

# `Distance`

In [9]:
import geopy.distance

# Pull the owner (lat/long) and merchant (merch_lat/merch_long) coordinates into
# their own frame; a 'distance' column is added to this frame further down.
distance_values = pd.DataFrame(train1[['lat','long','merch_lat','merch_long']])

#create a function to group coordinates so that they can be used by the geopy function

def dist_calc(row):
    """Return the geodesic distance, in kilometres, between owner and merchant.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys 'lat', 'long', 'merch_lat' and
        'merch_long' (for example one row of ``distance_values``).

    Returns
    -------
    The ``.km`` attribute of ``geopy.distance.geodesic`` evaluated on the two
    (latitude, longitude) pairs.
    """
    # geopy expects each point as a (latitude, longitude) pair.
    home = (row['lat'], row['long'])
    shop = (row['merch_lat'], row['merch_long'])
    separation = geopy.distance.geodesic(home, shop)
    return separation.km

# axis=1 makes apply hand dist_calc one row (one transaction) at a time.
distance_values['distance'] = distance_values.apply(dist_calc, axis = 1)

# Keep only the engineered distance, as a one-column frame.
owner_to_merch = distance_values['distance'].to_frame()
owner_to_merch

Unnamed: 0,distance
0,78.773821
1,30.216618
2,108.102912
3,95.685115
4,77.702395
...,...
1296670,119.696415
1296671,75.202184
1296672,98.987927
1296673,84.688356


# `Transaction Type`

### $LeaveOneOut$ $Encoder$

In [10]:
# Transaction-type columns, plus a handle on each one individually.
merch_info = train1[['merchant','category']].copy()
merchs, cats = merch_info['merchant'], merch_info['category']

In [11]:
# Count the unique values in each transaction-type column.
from collections import Counter

# Each frame has one row per distinct value, holding how often that value occurs,
# so the first element of .shape is the number of distinct merchants / categories.
# (The previous bare `Counter(...).keys()` statements were discarded no-ops that
# each re-scanned the whole column, so they have been dropped.)
val_counts1 = pd.DataFrame(Counter(merchs).values())
print(val_counts1.shape)

val_counts2 = pd.DataFrame(Counter(cats).values())
print(val_counts2.shape)

(693, 1)
(14, 1)


In [12]:
import category_encoders as ce

# Encode merchant and category against the fraud label with a leave-one-out encoder;
# return_df=True asks for a DataFrame back.
# NOTE(review): the encoder is fitted here on the training frame; the same fitted
# `encoder` should presumably be reused (transform only) on the test data — confirm.
encoder = ce.LeaveOneOutEncoder(return_df = True)
merch_info_loo = encoder.fit_transform(merch_info, fraud)
# Check the encoded columns came back numeric.
merch_info_loo.dtypes

merchant    float64
category    float64
dtype: object

# `Day and Time`

In [13]:
# Isolate the raw timestamp column and parse it into pandas datetimes.
train_time = pd.DataFrame(train1[['trans_date_trans_time']])
train_time['trans_date_trans_time'] = pd.to_datetime(train_time['trans_date_trans_time'])
# NOTE(review): trans_time is built from train_time without an explicit copy, so the
# two similarly named frames may share underlying data — TODO confirm for the pandas
# version in use before mutating either one.
trans_time = pd.DataFrame(train_time)
trans_time.dtypes

trans_date_trans_time    datetime64[ns]
dtype: object

In [14]:
# Integer day of the week from the parsed timestamp (Monday is 0, Sunday is 6).
trans_time['day'] = trans_time['trans_date_trans_time'].dt.dayofweek

In [15]:
def daycalc(a):
    """Map a day-of-week number to its English weekday name.

    Parameters
    ----------
    a : int
        Day index as produced by pandas ``.dt.dayofweek``: 0 is Monday, 6 is Sunday.

    Returns
    -------
    str
        The weekday name for ``a``.

    Raises
    ------
    IndexError
        If ``a`` is not in the range 0..6. The lookup table used to carry a stray
        eighth 'Monday', which let 7 (and negative indices) silently return a
        misleading day instead of failing.
    """
    day_names = ('Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday')
    # Reject out-of-range values explicitly; plain indexing would accept negatives.
    if not 0 <= a < len(day_names):
        raise IndexError(f'day of week must be between 0 and 6, got {a!r}')
    return day_names[a]

In [16]:
# Translate the numeric day into its name and store it as a categorical column.
# astype() returns a new Series rather than converting in place, so the result has
# to be assigned back — otherwise the column silently stays object dtype.
trans_time['day_of_week'] = trans_time['day'].apply(daycalc).astype('category')
trans_time['day_of_week']

0          Tuesday
1          Tuesday
2          Tuesday
3          Tuesday
4          Tuesday
            ...   
1296670     Sunday
1296671     Sunday
1296672     Sunday
1296673     Sunday
1296674     Sunday
Name: day_of_week, Length: 1296675, dtype: category
Categories (7, object): ['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday']

In [17]:
# Check that the numeric day and the day name line up with the timestamps.
trans_time.head()

Unnamed: 0,trans_date_trans_time,day,day_of_week
0,2019-01-01 00:00:18,1,Tuesday
1,2019-01-01 00:00:44,1,Tuesday
2,2019-01-01 00:00:51,1,Tuesday
3,2019-01-01 00:01:16,1,Tuesday
4,2019-01-01 00:03:06,1,Tuesday


In [18]:
# Time-of-day part of each timestamp (the date is dropped).
# NOTE: the 'time' column set here is overwritten in the next cell with an integer
# timestamp value, so this cell only serves as a preview.
trans_time['time'] = trans_time['trans_date_trans_time'].dt.time
trans_time['time']

0          00:00:18
1          00:00:44
2          00:00:51
3          00:01:16
4          00:03:06
             ...   
1296670    12:12:08
1296671    12:12:19
1296672    12:12:32
1296673    12:13:36
1296674    12:13:37
Name: time, Length: 1296675, dtype: object

In [19]:
# Replace 'time' with each timestamp's integer `.value` (nanoseconds since the epoch).
trans_time['time'] = trans_time['trans_date_trans_time'].map(lambda stamp: stamp.value)
# Scale down by 1e9 to get the same instant in seconds, as a float column.
trans_time['time2'] = trans_time['time'] / 1000000000

In [20]:
# Inspect the two new columns: raw integer 'time' and the scaled 'time2'.
trans_time.head()

Unnamed: 0,trans_date_trans_time,day,day_of_week,time,time2
0,2019-01-01 00:00:18,1,Tuesday,1546300818000000000,1546301000.0
1,2019-01-01 00:00:44,1,Tuesday,1546300844000000000,1546301000.0
2,2019-01-01 00:00:51,1,Tuesday,1546300851000000000,1546301000.0
3,2019-01-01 00:01:16,1,Tuesday,1546300876000000000,1546301000.0
4,2019-01-01 00:03:06,1,Tuesday,1546300986000000000,1546301000.0


In [21]:
# Only the numeric day and the scaled timestamp go forward as features.
date_time_final = trans_time[['day','time2']].copy()
date_time_final.head()

Unnamed: 0,day,time2
0,1,1546301000.0
1,1,1546301000.0
2,1,1546301000.0
3,1,1546301000.0
4,1,1546301000.0


# **Compiling the Data**
#### Into `X` and `y`

In [22]:
# Assemble the feature matrix column-wise from the engineered pieces.
# NOTE(review): id_var (the encoded identifiers) is part of X even though the
# "Column Grouping" notes describe identifiers as not used as features — confirm.
feature_parts = [id_var, date_time_final, single_var, merch_info_loo, owner_to_merch]
X = pd.concat(feature_parts, axis = 1)
print(X.shape)
X.head()

(1296675, 18)


Unnamed: 0,cc_num,job,dob,first,last,gender,city,state,street,zip,trans_num,day,time2,amt,city_pop,merchant,category,distance
0,444,370,779,162,18,0,526,27,568,265,56438,1,1546301000.0,4.97,3495,0.014218,0.014458,78.773821
1,42,428,607,309,157,0,612,47,435,965,159395,1,1546301000.0,107.23,149,0.010791,0.014098,30.216618
2,237,307,302,115,381,1,468,13,602,858,818703,1,1546301000.0,220.11,4154,0.002112,0.002478,108.102912
3,509,328,397,163,463,1,84,26,930,614,544575,1,1546301000.0,45.0,1939,0.003446,0.004694,95.685115
4,368,116,734,336,149,1,216,45,418,231,831111,1,1546301000.0,41.96,99,0.003771,0.003139,77.702395


In [23]:
# The target is the is_fraud frame built earlier; y is just another name for it.
y = fraud
y.head()

Unnamed: 0,is_fraud
0,0
1,0
2,0
3,0
4,0


# `Balancing` and `Sampling` the Data

In [24]:
import imblearn

In [25]:
from imblearn.over_sampling import SMOTE

# Balance the classes with SMOTE; the fixed random_state keeps the resampling repeatable.
# NOTE(review): X contains label-encoded identifier columns and the integer `day`;
# SMOTE presumably interpolates them as if they were continuous — confirm this is intended.
oversample = SMOTE(random_state = 1015)

# X1 / y1 are the resampled features and target.
X1,y1 = oversample.fit_resample(X,y)

In [26]:
# Display the resampled features (pandas truncates the middle rows).
X1

Unnamed: 0,cc_num,job,dob,first,last,gender,city,state,street,zip,trans_num,day,time2,amt,city_pop,merchant,category,distance
0,444,370,779,162,18,0,526,27,568,265,56438,1,1.546301e+09,4.970000,3495,0.014218,0.014458,78.773821
1,42,428,607,309,157,0,612,47,435,965,159395,1,1.546301e+09,107.230000,149,0.010791,0.014098,30.216618
2,237,307,302,115,381,1,468,13,602,858,818703,1,1.546301e+09,220.110000,4154,0.002112,0.002478,108.102912
3,509,328,397,163,463,1,84,26,930,614,544575,1,1.546301e+09,45.000000,1939,0.003446,0.004694,95.685115
4,368,116,734,336,149,1,216,45,418,231,831111,1,1.546301e+09,41.960000,99,0.003771,0.003139,77.702395
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2578333,232,298,517,170,331,1,179,29,588,750,118402,4,1.582523e+09,256.365550,647,0.006348,0.005895,116.530864
2578334,463,276,799,330,83,0,129,15,637,588,1157521,5,1.561261e+09,672.865617,3475,0.003735,0.003126,58.575576
2578335,60,44,354,142,407,0,683,21,462,518,85373,4,1.557007e+09,732.035556,1015,0.007217,0.007108,123.122100
2578336,757,122,713,49,27,0,275,4,475,610,851113,0,1.558329e+09,920.495997,5107,0.014524,0.017022,81.576862


In [27]:
# Display the resampled target (pandas truncates the middle rows).
y1

Unnamed: 0,is_fraud
0,0
1,0
2,0
3,0
4,0
...,...
2578333,1
2578334,1
2578335,1
2578336,1


### Concatenate the data for Sampling

In [28]:
# Put features and target back into one frame so they are sampled together.
df = pd.concat([X1,y1], axis = 1)

In [29]:
# Keep a random 7.5% of the balanced rows; random_state makes the draw repeatable.
df_reduced = df.sample(frac = 0.075, random_state = 1015)

In [30]:
# Confirm the reduced size and look at a few sampled rows.
print(df_reduced.shape)
df_reduced.head()

(193375, 19)


Unnamed: 0,cc_num,job,dob,first,last,gender,city,state,street,zip,trans_num,day,time2,amt,city_pop,merchant,category,distance,is_fraud
1093876,773,387,850,214,426,0,297,43,27,785,1194960,0,1585597000.0,68.2,5875,0.001159,0.001549,77.100845,0
2451745,697,258,190,171,219,0,599,45,589,650,1088432,0,1590421000.0,697.697179,1244,0.016755,0.016128,61.332518,1
595294,145,309,207,3,329,1,412,5,50,842,665672,0,1568034000.0,100.09,207,0.00119,0.001608,72.233664,0
1670245,383,262,291,189,287,0,128,40,203,268,188609,5,1567821000.0,109.203696,4424,0.006895,0.007806,70.480688,1
819109,640,462,131,311,382,1,617,16,255,682,1046976,6,1575822000.0,78.26,2661,0.003156,0.002114,75.892072,0


In [31]:
# Write the reduced training set to a CSV in the current working directory.
# NOTE(review): no index=False is passed, so the sampled (non-contiguous) row index
# is presumably written as an extra unnamed first column — confirm readers expect it.
df_reduced.to_csv('final-train.csv')

In [33]:
# Load the held-out test transactions from a CSV in the current working directory.
test1 = pd.read_csv('fraudTest.csv')

In [35]:
# Check the test set's size and preview its first rows.
print(test1.shape)
test1.head()

(555719, 23)


Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0
