In [21]:
import numpy as np
import pandas as pd
import mlflow
import warnings
import pickle
import sys, os
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib import ticker
from statsmodels.tsa.stattools import adfuller, acf, pacf
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM


In [22]:
# sns.set()
%matplotlib inline
warnings.filterwarnings("ignore")

In [23]:
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from file_handler import FileHandler
from df_selector import *
from df_cleaner import *
from df_visualizer import *

# Reading Data

In [24]:
file_handler = FileHandler()

In [25]:
# reading the csv file
credit_df = file_handler.read_csv("../data/creditcard.csv")
credit_df.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [26]:
# reading the csv file
geo_df =file_handler.read_csv("../data/geolocation.csv")
geo_df.head(5)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,hour_of_day,day_of_week,transaction_frequency,velocity,lower_bound_ip_address,upper_bound_ip_address,country
0,22058.0,2015-02-24 22:55:49,2015-04-18 02:47:11,34.0,QVPSPJUOCKZAR,1.0,1.0,1.0,39.0,732758368,0.0,22.0,24.0,0.041667,0.001736,0.0,0.0,
1,333320.0,2015-06-07 20:39:50,2015-06-08 01:38:54,16.0,EOGFQPIZPYXFZ,2.0,1.0,0.0,53.0,350311387,0.0,20.0,7.0,0.142857,0.020408,0.0,0.0,
2,1359.0,2015-01-01 18:52:44,2015-01-01 18:52:45,15.0,YSSKYOSJHPPLJ,1.0,5.0,1.0,53.0,2621473820,1.0,18.0,1.0,1.0,1.0,0.0,0.0,
3,150084.0,2015-04-28 21:13:25,2015-05-04 13:54:50,44.0,ATGTXKYKUDUQN,1.0,3.0,1.0,41.0,3840542443,0.0,21.0,28.0,0.035714,0.001276,0.0,0.0,
4,221365.0,2015-07-21 07:09:52,2015-09-09 18:40:53,39.0,NAUITBZFJKHWW,2.0,3.0,1.0,45.0,415583117,0.0,7.0,21.0,0.047619,0.002268,0.0,0.0,


# Data preparation

## Data separation as x and y

### credit_df

In [28]:
y = credit_df['Class']

In [29]:
y

0         0
1         0
2         0
3         0
4         0
         ..
283721    0
283722    0
283723    0
283724    0
283725    0
Name: Class, Length: 283726, dtype: int64

In [30]:
x = credit_df.drop('Class', axis=1)
x

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.524980,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.208038,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283721,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,1.475829,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77
283722,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.059616,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79
283723,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.001396,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88
283724,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.127434,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00


## Data splitting

In [31]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 100)

In [32]:
X_train

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
49360,44122.0,-1.550810,0.731824,2.549028,2.733753,0.250049,2.870749,0.354236,0.007231,-0.037368,...,-0.312468,0.005203,0.831394,0.117041,-0.935059,-0.665162,0.188614,-0.910417,-0.273559,90.91
55795,47246.0,-1.305339,1.059107,1.586936,1.650134,1.669107,-0.680811,0.717706,0.072364,-1.664982,...,0.117144,-0.101308,-0.557447,-0.121542,-0.058211,0.390341,-0.219061,0.017214,0.108434,7.60
271349,165050.0,-1.498534,1.268183,-1.360645,-1.310035,2.744320,3.666680,0.234403,0.502133,0.313536,...,-0.053535,-0.067233,-0.546292,0.212806,0.703104,-0.805702,0.195482,-1.110255,0.304176,3.27
13864,24688.0,1.198304,0.378397,0.541355,0.835317,-0.386506,-0.829408,-0.137542,-0.121206,1.122380,...,-0.149218,-0.377945,-0.909802,0.158471,0.420259,0.117492,0.027304,-0.057095,0.015486,3.57
132159,80068.0,-0.833194,0.769305,0.865990,-2.250385,1.018626,0.090372,0.751702,0.149752,-0.268840,...,-0.038624,-0.020569,-0.044707,-0.211529,-1.094250,-0.174526,0.678921,0.117100,0.158518,0.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212912,139344.0,0.034805,-0.057191,0.470673,-1.418306,0.528228,-0.193095,0.306220,-0.425387,-0.964041,...,0.248454,0.198328,0.839489,-0.345201,-0.994439,-0.447083,-0.038197,-0.059010,-0.061356,20.00
65615,51782.0,1.101096,-0.947563,1.500003,0.447732,-1.472739,1.153264,-1.558795,0.759305,2.128664,...,-0.324216,-0.104567,0.062195,-0.012839,-0.263227,0.104007,1.169966,-0.000594,-0.007131,1.00
253799,156887.0,2.178124,-1.621452,-1.095136,-1.615900,-1.079726,-0.177137,-1.135500,-0.125167,-1.104774,...,-0.245827,-0.143274,-0.066847,0.174442,0.254004,-0.209453,-0.188502,0.006059,-0.031692,88.00
210755,138441.0,0.949311,-1.937607,-3.434253,0.316148,0.442569,-0.785091,1.628958,-0.545787,-0.047268,...,0.989959,0.664053,0.609193,-0.761294,0.452681,0.532385,0.367599,-0.235352,0.008169,623.00


In [33]:
X_test

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
173581,121834.0,1.933240,-1.316851,0.425610,-0.354184,-1.854467,-0.021007,-1.602457,0.219034,0.271361,...,-0.484497,-0.069772,0.311095,0.293697,0.082676,-0.745557,0.597068,0.016508,-0.031391,53.98
276452,167683.0,-1.592290,1.511862,-1.410847,-1.175748,2.867225,3.524474,0.495902,1.614884,-1.271709,...,-0.379965,0.100349,-0.083137,-0.474007,0.695183,1.356110,-0.486276,-0.350054,-0.118218,42.91
104778,69404.0,-0.976564,0.585510,-0.003142,-0.945268,3.539277,2.916666,0.460135,0.831828,-1.095137,...,0.101061,-0.297875,-1.264529,-0.179137,0.936423,0.677656,-0.705164,0.000772,0.064120,2.00
226942,145219.0,1.780293,-1.063671,0.680026,0.841410,-1.653529,0.392116,-1.514345,0.467485,2.093653,...,-0.227392,0.203172,0.798490,0.178572,0.094661,-0.515228,0.371839,0.026977,-0.036803,47.95
115617,74135.0,-1.153910,1.533617,0.434139,-0.495930,0.378126,-0.067809,0.530507,0.229062,0.300117,...,0.497949,-0.456433,-0.912641,-0.160596,-0.944142,0.048795,0.090333,0.100562,-0.249673,9.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194688,131031.0,1.882015,0.456126,0.091758,3.660667,0.211950,0.702324,-0.360926,0.137440,-0.650546,...,-0.173118,-0.193657,-0.516363,0.450914,0.543983,-0.503771,-0.408203,0.016273,-0.016981,4.54
44315,41974.0,-0.993896,-0.525569,1.364635,-1.480272,-1.204705,-0.904076,1.387298,-0.309046,-1.657614,...,0.736064,0.301026,0.165281,0.573837,0.674976,0.146982,-0.523752,-0.036425,0.128248,298.99
119549,75683.0,-2.432158,-7.132518,-3.756704,0.010674,-0.360764,3.921517,1.873421,0.177570,-1.175219,...,3.730666,0.719113,-2.051016,-1.817407,1.155222,-0.145271,0.892078,-0.446909,0.342211,2022.16
139085,83251.0,-0.701949,1.153401,1.050703,0.961192,-0.136477,-0.821117,0.457482,0.256512,-0.910366,...,0.024547,0.028705,-0.107470,0.084042,0.379000,-0.435913,-0.572103,0.053031,0.084548,30.46


# Model Building

## Random Forest

### Training the model

In [34]:
rf = RandomForestRegressor(max_depth=2, random_state=100)
rf.fit(X_train, y_train)

### Applying the model to make a prediction

In [36]:
y_rf_train_pred = rf.predict(X_train)
y_rf_test_pred = rf.predict(X_test)

### Evaluate model performance

In [37]:
rf_train_mse = mean_squared_error(y_train, y_rf_train_pred)
rf_train_r2 = r2_score(y_train, y_rf_train_pred)

rf_test_mse = mean_squared_error(y_test, y_rf_test_pred)
rf_test_r2 = r2_score(y_test, y_rf_test_pred)

In [38]:
rf_results = pd.DataFrame(['Random forest', rf_train_mse, rf_train_r2, rf_test_mse, rf_test_r2]).transpose()
rf_results.columns = ['Method', 'Training MSE', 'Training R2', 'Test MSE', 'Test R2']
rf_results

Unnamed: 0,Method,Training MSE,Training R2,Test MSE,Test R2
0,Random forest,0.0006,0.620787,0.000736,0.629747
