# Importing the Dependecies 

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.linear_model import LogisticRegression 

In [5]:
from sklearn.metrics import accuracy_score

In [6]:
from sklearn.utils import resample

In [7]:
fraud_detection_data = pd.read_csv('Fraud.csv')

In [8]:
fraud_detection_data.head() 

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [9]:
fraud_detection_data['newbalanceDest'].nunique() 

3555499

In [10]:
fraud_detection_data['type'].nunique() 

5

last 5 rows of the dataset 

In [11]:
fraud_detection_data.tail()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0
6362619,743,CASH_OUT,850002.52,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,1,0


Some more informations about the dataset 

In [12]:
fraud_detection_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [13]:
fraud_detection_data.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


Data Cleaning : Checking the number of missing values in each column

In [14]:
fraud_detection_data.isnull().sum() 

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

# Distribution of legit transactions & Fraudulent transaction

In [15]:
fraud_detection_data['isFraud'].value_counts()

0    6354407
1       8213
Name: isFraud, dtype: int64

In [16]:
fraud_detection_data['isFlaggedFraud'].value_counts()

0    6362604
1         16
Name: isFlaggedFraud, dtype: int64

This dataset is highly unbalanced. 

0 --> Normal Transaction 

1--> Fraudulent Transaction

# Data seperation for analysis

In [17]:
legit = fraud_detection_data [fraud_detection_data.isFraud == 0]
fraud = fraud_detection_data [fraud_detection_data.isFraud == 1]

In [18]:
legit.shape

(6354407, 11)

In [19]:
fraud.shape

(8213, 11)

In [20]:
legit = fraud_detection_data [fraud_detection_data.isFlaggedFraud == 0]
fraud = fraud_detection_data [fraud_detection_data.isFlaggedFraud == 1]

In [21]:
legit.shape

(6362604, 11)

In [22]:
fraud.shape 

(16, 11)

# Statistical measures of the data 

In [23]:
legit.amount.describe()

count    6.362604e+06
mean     1.798501e+05
std      6.037884e+05
min      0.000000e+00
25%      1.338955e+04
50%      7.487127e+04
75%      2.087205e+05
max      9.244552e+07
Name: amount, dtype: float64

In [24]:
fraud.amount.describe()

count    1.600000e+01
mean     4.861598e+06
std      3.572499e+06
min      3.538742e+05
25%      2.242749e+06
50%      4.234245e+06
75%      7.883451e+06
max      1.000000e+07
Name: amount, dtype: float64

# Comparing the values for both transactions

In [25]:
fraud_detection_data.groupby('isFraud').mean()

Unnamed: 0_level_0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,243.235663,178197.0,832828.7,855970.228109,1101421.0,1224926.0,0.0
1,368.413856,1467967.0,1649668.0,192392.631836,544249.6,1279708.0,0.001948


In [26]:
fraud_detection_data.groupby('isFlaggedFraud').mean()

Unnamed: 0_level_0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
isFlaggedFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,243.396506,179850.1,833865.5,855096.2,1100704.0,1224999.0,0.001288
1,537.5625,4861598.0,7817869.0,7817869.0,0.0,0.0,1.0


# Resampling

Build a sample dataset containing similar distribution of normal & fraudulent transactions for Isfraud & IsFlaggedfraud data

No of Fraudulent transactions for Isfraud data -- 8213

In [27]:
legit_downsampled = resample(legit, replace=False, n_samples=8213)

In [28]:
fraud_upsampled = resample(fraud, replace=True, n_samples=8213)

In [29]:
new_dataset = pd.concat([legit_downsampled, fraud_upsampled])

In [30]:
new_dataset.isFraud.value_counts()

1    8229
0    8197
Name: isFraud, dtype: int64

No of Fraudulent transactions for IsFlaggedfraud data -- 16

In [31]:
legit_downsampled = resample(legit, replace=False, n_samples=16)

In [32]:
fraud_upsampled = resample(fraud, replace=True, n_samples=16)

In [33]:
New_dataset = pd.concat([legit_downsampled, fraud_upsampled])

In [34]:
New_dataset.isFraud.value_counts()

0    16
1    16
Name: isFraud, dtype: int64

In [35]:
new_dataset.groupby('isFraud').mean()

Unnamed: 0_level_0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,244.128828,175663.3,824051.4,846512.6,1165062.0,1284855.0,0.0
1,538.165877,4837657.0,7777038.0,7773865.0,252.2587,2467.418,0.998056


In [36]:
New_dataset.groupby('isFlaggedFraud').mean()

Unnamed: 0_level_0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
isFlaggedFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,260.8125,172713.7,1450185.0,1536971.0,2864875.155,2857330.0,0.0
1,534.625,5463001.0,10384200.0,10384200.0,0.0,0.0,1.0


Splitting the data into features and targets

In [37]:
X = new_dataset.drop(columns='isFraud', axis=1)
Y = new_dataset['isFraud']

In [38]:
X

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFlaggedFraud
2184128,185,PAYMENT,21331.09,C1387298987,10640.00,0.00,M684942945,0.00,0.00,0
3559186,260,PAYMENT,7383.33,C791716236,29989.00,22605.67,M290866801,0.00,0.00,0
3029043,233,CASH_OUT,40230.03,C1353049374,0.00,0.00,C1829922019,657372.08,697602.12,0
2659326,210,PAYMENT,13657.72,C1307523333,14773.00,1115.28,M852634124,0.00,0.00,0
4232948,306,CASH_OUT,200903.56,C1795087254,1596.00,0.00,C1900471130,545394.99,746298.55,0
...,...,...,...,...,...,...,...,...,...,...
3247297,250,TRANSFER,1343002.08,C1100582606,1343002.08,1343002.08,C1147517658,0.00,0.00,1
6281482,646,TRANSFER,10000000.00,C19004745,10399045.08,10399045.08,C1806199534,0.00,0.00,1
6281482,646,TRANSFER,10000000.00,C19004745,10399045.08,10399045.08,C1806199534,0.00,0.00,1
3247297,250,TRANSFER,1343002.08,C1100582606,1343002.08,1343002.08,C1147517658,0.00,0.00,1


In [39]:
Y

2184128    0
3559186    0
3029043    0
2659326    0
4232948    0
          ..
3247297    1
6281482    1
6281482    1
3247297    1
5996407    1
Name: isFraud, Length: 16426, dtype: int64

In [40]:
x = New_dataset.drop(columns='isFlaggedFraud', axis=1)
y = New_dataset['isFlaggedFraud']

In [41]:
x

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
5408100,378,CASH_IN,153653.13,C421642280,357596.0,511249.13,C304711147,3822700.62,3669047.49,0
2492481,204,PAYMENT,1839.47,C1972896873,227548.0,225708.53,M1484413557,0.0,0.0,0
2689006,210,TRANSFER,267844.09,C423586216,0.0,0.0,C136810328,1622268.73,1890112.83,0
3216770,250,CASH_IN,263042.86,C954178156,1741961.51,2005004.37,C477191283,4705961.78,4442918.93,0
1533576,154,CASH_IN,46828.13,C586659438,7377213.38,7424041.51,C1035613847,294795.72,247967.59,0
6049626,492,CASH_OUT,169806.81,C1996070170,210.0,0.0,C1941873991,4414724.26,4584531.07,0
4528444,326,CASH_IN,306286.2,C2008942698,407190.1,713476.31,C199767531,687685.12,381398.92,0
1113122,130,CASH_OUT,478597.63,C1369824570,49595.0,0.0,C424058203,0.0,478597.63,0
4768910,334,CASH_OUT,136674.26,C1321733090,0.0,0.0,C833816331,1189974.88,1326649.14,0
3322848,253,PAYMENT,1848.75,C1711324490,0.0,0.0,M1378213751,0.0,0.0,0


In [42]:
x['nameOrig'].nunique()

27

In [43]:
x['nameDest'].nunique()

27

In [44]:
x.shape

(32, 10)

In [45]:
y

5408100    0
2492481    0
2689006    0
3216770    0
1533576    0
6049626    0
4528444    0
1113122    0
4768910    0
3322848    0
300226     0
4858655    0
5801063    0
5517605    0
1783267    0
1298171    0
6281482    1
5996407    1
6362462    1
6266413    1
5996409    1
5996407    1
5996407    1
5563713    1
6205439    1
6281484    1
6281484    1
6205439    1
6296014    1
6351225    1
2736446    1
5996407    1
Name: isFlaggedFraud, dtype: int64

split the data into training data & testing data

In [46]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [47]:
X.shape, X_train.shape, X_test.shape

((16426, 10), (13140, 10), (3286, 10))

In [48]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=2)

In [49]:
x.shape, x_train.shape, x_test.shape

((32, 10), (25, 10), (7, 10))

# One Hot Encoding 

In [50]:
X_train_with_dummies = pd.get_dummies(X_train, columns = ['type', 'nameOrig', 'nameDest' ])

In [51]:
X_train_with_dummies

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,...,nameDest_M98789484,nameDest_M987968246,nameDest_M988347446,nameDest_M989971220,nameDest_M994624307,nameDest_M996473630,nameDest_M996967840,nameDest_M997568405,nameDest_M998615190,nameDest_M999166343
4779166,335,26080.81,0.00,0.00,0.00,0.00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1535782,154,197999.18,36329.00,0.00,181363.64,379362.82,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
835555,41,6775.63,6849221.14,6855996.77,567013.15,560237.52,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3698713,277,492605.11,0.00,0.00,16220589.51,16713194.62,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6205439,586,353874.22,353874.22,353874.22,0.00,0.00,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871352,42,189007.02,8067099.93,8256106.95,4564755.63,4375748.61,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3760288,279,536624.41,536624.41,536624.41,0.00,0.00,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2736446,212,4953893.08,4953893.08,4953893.08,0.00,0.00,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5088209,355,79.78,18776.43,18696.65,0.00,0.00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
X_train_with_dummies.shape

(13140, 13137)

In [53]:
X_test_with_dummies = pd.get_dummies(X_test, columns = ['type', 'nameOrig', 'nameDest' ]) 

In [54]:
X_test_with_dummies

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,...,nameDest_M968786168,nameDest_M973613356,nameDest_M975084546,nameDest_M975159909,nameDest_M982412195,nameDest_M98374365,nameDest_M986257226,nameDest_M98823786,nameDest_M993121487,nameDest_M998348241
3219205,250,12374.98,71586.13,59211.15,0.00,0.00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1048244,95,263342.38,381300.48,644642.86,1047807.71,784465.33,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3760288,279,536624.41,536624.41,536624.41,0.00,0.00,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2736446,212,4953893.08,4953893.08,4953893.08,0.00,0.00,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4413110,322,114685.99,5048.00,0.00,0.00,114685.99,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303264,15,14122.77,10528.00,0.00,17088.03,31210.80,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4218237,305,6008.26,199947.00,193938.74,0.00,0.00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6351225,702,3171085.59,3171085.59,3171085.59,0.00,0.00,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6281484,646,399045.08,10399045.08,10399045.08,0.00,0.00,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
x_train_with_dummies = pd.get_dummies(x_train, columns = ['type', 'nameOrig', 'nameDest' ])

In [56]:
x_train_with_dummies

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,type_CASH_IN,type_CASH_OUT,type_PAYMENT,...,nameDest_C1909486199,nameDest_C1941873991,nameDest_C290598253,nameDest_C304711147,nameDest_C424058203,nameDest_C477191283,nameDest_C639921569,nameDest_C790030594,nameDest_M1378213751,nameDest_M1484413557
3216770,250,263042.86,1741961.51,2005004.37,4705961.78,4442918.93,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
6351225,702,3171085.59,3171085.59,3171085.59,0.0,0.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5408100,378,153653.13,357596.0,511249.13,3822700.62,3669047.49,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
6281484,646,399045.08,10399045.08,10399045.08,0.0,0.0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4858655,348,183640.52,7074138.65,7257779.17,24998293.64,24814653.12,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5801063,401,37324.13,5223708.28,5261032.41,118196.03,80871.9,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6362462,730,7316255.05,17316255.05,17316255.05,0.0,0.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6281484,646,399045.08,10399045.08,10399045.08,0.0,0.0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5996407,425,10000000.0,19585040.37,19585040.37,0.0,0.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1783267,162,39409.02,0.0,0.0,72610.51,112019.53,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [57]:
x_test_with_dummies = pd.get_dummies(x_test, columns = ['type', 'nameOrig', 'nameDest' ]) 

In [58]:
x_test_with_dummies

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,type_CASH_IN,type_CASH_OUT,type_TRANSFER,...,nameOrig_C689608084,nameOrig_C786455622,nameOrig_C908544136,nameDest_C1392803603,nameDest_C1449732927,nameDest_C199767531,nameDest_C661958277,nameDest_C669125918,nameDest_C833816331,nameDest_C891140444
1298171,136,79930.14,0.0,0.0,512554.45,592484.58,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
5996407,425,10000000.0,19585040.37,19585040.37,0.0,0.0,1,0,0,1,...,1,0,0,1,0,0,0,0,0,0
4528444,326,306286.2,407190.1,713476.31,687685.12,381398.92,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
5563713,387,4892193.09,4892193.09,4892193.09,0.0,0.0,1,0,0,1,...,0,0,1,0,0,0,0,0,0,1
5517605,380,147242.77,0.0,0.0,2518267.54,2665510.31,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
6266413,617,2542664.27,2542664.27,2542664.27,0.0,0.0,1,0,0,1,...,0,1,0,0,0,0,1,0,0,0
4768910,334,136674.26,0.0,0.0,1189974.88,1326649.14,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


# Model Training

Logistic Regression

In [59]:
model = LogisticRegression()

Training the logistic Regression model with training data

In [60]:
model.fit(X_train_with_dummies, Y_train) 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

# Model Evaluation

Accuracy Score

Accuracy on training data

In [61]:
X_train_prediction = model.predict(X_train_with_dummies)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [62]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9833333333333333
