In [1]:
import numpy as np
import pandas as pd

import seaborn as sns

#datetime library
import datetime

In [2]:
#read trainset
train = pd.read_csv('sample_train.csv')

#read testset
test = pd.read_csv('sample_test.csv')

# Feature Engineering

In [3]:
#separate train row
train_rows = train.shape[0]

#separate test row
test_rows = test.shape[0]

#merge both train & test rows together
all_data = pd.concat((train, test)).reset_index(drop=True)

In [4]:
#all data records(both train & test)
all_data.shape

(150000, 23)

In [5]:
#all data columns
all_data.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [6]:
#count the number of times each credit card was used for transaction
all_data['cc_frequency'] =all_data['cc_num'].map(all_data['cc_num'].value_counts())

#convert trans_date_trans_time to datetime datatype
all_data['trans_date_trans_time'] = pd.to_datetime(all_data['trans_date_trans_time'])

#extract hour of transaction from datetime
all_data['hour_of_tranx'] = all_data['trans_date_trans_time'].dt.hour

#convert dob to datetime datatype
all_data['dob'] =  pd.to_datetime(all_data['dob'])

#calculate user age from date of birth column
now = pd.to_datetime('now')
all_data['age'] = now.year - all_data['dob'].dt.year

  now = pd.to_datetime('now')


In [7]:
#print all columns
all_data.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud', 'cc_frequency', 'hour_of_tranx',
       'age'],
      dtype='object')

In [8]:
#a variable name to_drop stores all columns to drop
to_drop =['Unnamed: 0','cc_num', 'first', 'last', 'gender', 'trans_date_trans_time', 'merchant', 'street', 'city','state', 'zip','city_pop',
          'job','trans_num', 'unix_time' ,'dob']

#drop all columns stored in to_drop variable
all_data =all_data.drop(to_drop, axis=1)

In [9]:
#check data types
all_data.dtypes

category          object
amt              float64
lat              float64
long             float64
merch_lat        float64
merch_long       float64
is_fraud           int64
cc_frequency       int64
hour_of_tranx      int64
age                int64
dtype: object

In [10]:
all_data.head()

Unnamed: 0,category,amt,lat,long,merch_lat,merch_long,is_fraud,cc_frequency,hour_of_tranx,age
0,grocery_pos,81.07,42.0158,-73.2913,41.452457,-72.918018,0,374,1,36
1,home,27.73,38.138,-89.2231,38.778411,-88.854934,0,125,21,62
2,kids_pets,190.34,39.9148,-80.731,39.342257,-81.601052,0,303,22,46
3,kids_pets,134.74,38.832,-77.12,39.194023,-76.55831,0,125,14,32
4,shopping_net,7.66,39.9148,-80.731,40.907353,-80.207589,0,303,17,46


In [11]:
#encode categorical variable to numerical variable
all_data =pd.get_dummies(data = all_data, columns=['category'], drop_first=True)

In [12]:
#extract the train set record back
train = all_data[:train_rows]

#extract the test set record back
test = all_data[train_rows:]

In [13]:
train.head()

Unnamed: 0,amt,lat,long,merch_lat,merch_long,is_fraud,cc_frequency,hour_of_tranx,age,category_food_dining,...,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
0,81.07,42.0158,-73.2913,41.452457,-72.918018,0,374,1,36,0,...,1,0,0,0,0,0,0,0,0,0
1,27.73,38.138,-89.2231,38.778411,-88.854934,0,125,21,62,0,...,0,0,1,0,0,0,0,0,0,0
2,190.34,39.9148,-80.731,39.342257,-81.601052,0,303,22,46,0,...,0,0,0,1,0,0,0,0,0,0
3,134.74,38.832,-77.12,39.194023,-76.55831,0,125,14,32,0,...,0,0,0,1,0,0,0,0,0,0
4,7.66,39.9148,-80.731,40.907353,-80.207589,0,303,17,46,0,...,0,0,0,0,0,0,0,1,0,0


In [14]:
test.head()

Unnamed: 0,amt,lat,long,merch_lat,merch_long,is_fraud,cc_frequency,hour_of_tranx,age,category_food_dining,...,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
100000,6.2,41.6964,-96.9858,42.416783,-96.439836,0,195,1,95,0,...,0,0,0,0,0,0,0,0,1,0
100001,19.37,42.6853,-73.8253,41.881164,-73.041856,0,128,12,84,0,...,0,0,0,0,0,0,1,0,0,0
100002,58.7,36.1183,-79.5685,35.374583,-79.337287,0,112,3,89,0,...,0,0,0,0,0,0,0,0,0,0
100003,9.45,34.4596,-93.6743,33.88787,-93.040867,0,272,5,57,0,...,0,0,0,0,0,0,0,1,0,0
100004,1.72,41.6964,-96.9858,41.125792,-97.967163,0,195,8,95,0,...,0,0,0,0,0,1,0,0,0,0


In [15]:
#store actual test result in target
target = test['is_fraud']

#drop actual test result in target 
test = test.drop('is_fraud', axis=1)

# Train the Machine to Learn

In [16]:
#test real values
target.value_counts()

0    49809
1      191
Name: is_fraud, dtype: int64

In [17]:
#separate the input from the output and store as x
input_data = train.drop(['is_fraud'], axis =1)

#separate the output from the input and store as y
output_data = train.is_fraud

In [18]:
#algorithm to learn from the input and predict the output
from sklearn.ensemble import RandomForestClassifier

In [19]:
#instantiate the model using Random Forest Classifier
model =RandomForestClassifier(random_state = 20)

In [20]:
#machine is learning
model = model.fit(input_data, output_data)

# Predict on the Test Set

In [21]:
test.head()

Unnamed: 0,amt,lat,long,merch_lat,merch_long,cc_frequency,hour_of_tranx,age,category_food_dining,category_gas_transport,...,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
100000,6.2,41.6964,-96.9858,42.416783,-96.439836,195,1,95,0,0,...,0,0,0,0,0,0,0,0,1,0
100001,19.37,42.6853,-73.8253,41.881164,-73.041856,128,12,84,0,0,...,0,0,0,0,0,0,1,0,0,0
100002,58.7,36.1183,-79.5685,35.374583,-79.337287,112,3,89,0,1,...,0,0,0,0,0,0,0,0,0,0
100003,9.45,34.4596,-93.6743,33.88787,-93.040867,272,5,57,0,0,...,0,0,0,0,0,0,0,1,0,0
100004,1.72,41.6964,-96.9858,41.125792,-97.967163,195,8,95,0,0,...,0,0,0,0,0,1,0,0,0,0


In [22]:
target

100000    0
100001    0
100002    0
100003    0
100004    0
         ..
149995    0
149996    0
149997    0
149998    0
149999    0
Name: is_fraud, Length: 50000, dtype: int64

In [23]:
#predict on test
prediction = model.predict(test)

In [24]:
#create a table for predicted value & actual value
df = pd.DataFrame({'Predicted Value': prediction, 'Actual Value': target})

In [25]:
df

Unnamed: 0,Predicted Value,Actual Value
100000,0,0
100001,0,0
100002,0,0
100003,0,0
100004,0,0
...,...,...
149995,0,0
149996,0,0
149997,0,0
149998,0,0
