Importing the Dependencies

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [9]:
# loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv('card_transdata.csv')

In [10]:
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


In [11]:
credit_card_data.tail()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
999995,2.207101,0.112651,1.626798,1.0,1.0,0.0,0.0,0.0
999996,19.872726,2.683904,2.778303,1.0,1.0,0.0,0.0,0.0
999997,2.914857,1.472687,0.218075,1.0,1.0,0.0,1.0,0.0
999998,4.258729,0.242023,0.475822,1.0,0.0,0.0,1.0,0.0
999999,58.108125,0.31811,0.38692,1.0,1.0,0.0,1.0,0.0


In [12]:
# dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   distance_from_home              1000000 non-null  float64
 1   distance_from_last_transaction  1000000 non-null  float64
 2   ratio_to_median_purchase_price  1000000 non-null  float64
 3   repeat_retailer                 1000000 non-null  float64
 4   used_chip                       1000000 non-null  float64
 5   used_pin_number                 1000000 non-null  float64
 6   online_order                    1000000 non-null  float64
 7   fraud                           1000000 non-null  float64
dtypes: float64(8)
memory usage: 61.0 MB


In [13]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

distance_from_home                0
distance_from_last_transaction    0
ratio_to_median_purchase_price    0
repeat_retailer                   0
used_chip                         0
used_pin_number                   0
online_order                      0
fraud                             0
dtype: int64

In [14]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['fraud'].value_counts()

fraud
0.0    912597
1.0     87403
Name: count, dtype: int64

This Dataset is highly unblanced

0 --> Normal Transaction

1 --> fraudulent transaction

In [16]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.fraud == 0]
fraud = credit_card_data[credit_card_data.fraud == 1]

In [17]:
print(legit.shape)
print(fraud.shape)

(912597, 8)
(87403, 8)


In [21]:
fraud.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,87403.0,87403.0,87403.0,87403.0,87403.0,87403.0,87403.0,87403.0
mean,66.261876,12.712185,6.006323,0.880119,0.256399,0.003123,0.946318,1.0
std,134.391608,47.997697,5.56432,0.324825,0.436647,0.055801,0.225391,0.0
min,0.025847,0.000407,0.011966,0.0,0.0,0.0,0.0,1.0
25%,4.585729,0.328199,3.50027,1.0,0.0,0.0,1.0,1.0
50%,15.454219,1.157631,5.071294,1.0,0.0,0.0,1.0,1.0
75%,101.110104,4.598504,7.331222,1.0,1.0,0.0,1.0,1.0
max,10632.723672,2160.499922,266.689692,1.0,1.0,1.0,1.0,1.0


In [20]:
# compare the values for both transactions
credit_card_data.groupby('fraud').mean()

Unnamed: 0_level_0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order
fraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,22.832976,4.301391,1.423642,0.881672,0.359402,0.109944,0.622225
1.0,66.261876,12.712185,6.006323,0.880119,0.256399,0.003123,0.946318


Under-Sampling

Build a sample dataset containing similar distribution of normal transactions and Fraudulent Transactions

Number of Fraudulent Transactions --> 2000

In [49]:
legit_sample = legit.sample(n=2000)

Concatenating two DataFrames

In [50]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [51]:
new_dataset.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
601248,12.53043,10.604938,0.333443,1.0,1.0,0.0,1.0,0.0
383317,31.105298,5.667197,0.385191,1.0,1.0,0.0,0.0,0.0
837265,6.28746,0.510941,2.236062,1.0,0.0,0.0,0.0,0.0
414962,32.570967,0.256476,0.937282,1.0,0.0,0.0,1.0,0.0
818376,2.624242,7.041097,1.126885,1.0,1.0,0.0,1.0,0.0


In [52]:
new_dataset.tail()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
999908,45.296658,0.882736,8.856861,1.0,0.0,0.0,1.0,1.0
999916,167.139756,0.282253,0.308468,1.0,0.0,0.0,1.0,1.0
999919,124.640118,0.004416,0.434885,1.0,0.0,0.0,1.0,1.0
999939,51.4129,3.42933,29.914254,1.0,0.0,0.0,1.0,1.0
999949,15.724799,1.875906,11.009366,1.0,1.0,0.0,1.0,1.0


In [53]:
new_dataset['fraud'].value_counts()

fraud
1.0    87403
0.0     2000
Name: count, dtype: int64

In [54]:
new_dataset.groupby('fraud').mean()

Unnamed: 0_level_0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order
fraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,20.27919,4.296692,1.372096,0.8685,0.3695,0.1105,0.6215
1.0,66.261876,12.712185,6.006323,0.880119,0.256399,0.003123,0.946318


Splitting the data into Features & Targets

In [68]:
X = new_dataset.drop(['fraud','distance_from_home','distance_from_last_transaction'],axis=1)
Y = new_dataset['fraud']

In [69]:
print(X)

        ratio_to_median_purchase_price  repeat_retailer  used_chip  \
601248                        0.333443              1.0        1.0   
383317                        0.385191              1.0        1.0   
837265                        2.236062              1.0        0.0   
414962                        0.937282              1.0        0.0   
818376                        1.126885              1.0        1.0   
...                                ...              ...        ...   
999908                        8.856861              1.0        0.0   
999916                        0.308468              1.0        0.0   
999919                        0.434885              1.0        0.0   
999939                       29.914254              1.0        0.0   
999949                       11.009366              1.0        1.0   

        used_pin_number  online_order  
601248              0.0           1.0  
383317              0.0           0.0  
837265              0.0           0.0  

In [70]:
print(Y)

601248    0.0
383317    0.0
837265    0.0
414962    0.0
818376    0.0
         ... 
999908    1.0
999916    1.0
999919    1.0
999939    1.0
999949    1.0
Name: fraud, Length: 89403, dtype: float64


Split the data into Training data & Testing Data

In [71]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [72]:
print(X.shape, X_train.shape, X_test.shape)

(89403, 5) (71522, 5) (17881, 5)


Model Training

Logistic Regression

In [73]:
model = LogisticRegression()

In [74]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

Model Evaluation

Accuracy Score

In [75]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [76]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9832359274069518


In [77]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [78]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9837816676919635
