In [1]:
# import numpy and pandas
import numpy as np
import pandas as pd
# scoring helpers used by the rule-evaluation cell
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Read the card-transaction CSV into `df`, then preview its first five rows
df = pd.read_csv(filepath_or_buffer="card_transdata(1).csv")
df.head(n=5)

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1,1,0,0,0
1,10.829943,0.175592,1.294219,1,0,0,0,0
2,5.091079,0.805153,0.427715,1,0,0,1,0
3,2.247564,5.600044,0.362663,1,1,0,1,0
4,44.190936,0.566486,2.222767,1,1,0,1,0


In [3]:
# explore the features available in the dataframe
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare: wrapping it in print() appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   distance_from_home              1000000 non-null  float64
 1   distance_from_last_transaction  1000000 non-null  float64
 2   ratio_to_median_purchase_price  1000000 non-null  float64
 3   repeat_retailer                 1000000 non-null  int64  
 4   used_chip                       1000000 non-null  int64  
 5   used_pin_number                 1000000 non-null  int64  
 6   online_order                    1000000 non-null  int64  
 7   fraud                           1000000 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 61.0 MB
None


In [4]:
# Descriptive statistics per column: count, mean, std, min, quartiles, max
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,26.628792,5.036519,1.824182,0.881536,0.350399,0.100608,0.650552,0.087403
std,65.390784,25.843093,2.799589,0.323157,0.477095,0.300809,0.476796,0.282425
min,0.004874,0.000118,0.004399,0.0,0.0,0.0,0.0,0.0
25%,3.878008,0.296671,0.475673,1.0,0.0,0.0,0.0,0.0
50%,9.96776,0.99865,0.997717,1.0,0.0,0.0,1.0,0.0
75%,25.743985,3.355748,2.09637,1.0,1.0,0.0,1.0,0.0
max,10632.72367,11851.10456,267.802942,1.0,1.0,1.0,1.0,1.0


In [5]:
# Number of missing (NaN) entries in each column
df.isna().sum()

distance_from_home                0
distance_from_last_transaction    0
ratio_to_median_purchase_price    0
repeat_retailer                   0
used_chip                         0
used_pin_number                   0
online_order                      0
fraud                             0
dtype: int64

In [6]:
# Absolute number of rows per value of the `fraud` label
df.fraud.value_counts(normalize=False)

0    912597
1     87403
Name: fraud, dtype: int64

In [7]:
# Share of rows per value of the `fraud` label (fractions summing to 1)
df.fraud.value_counts(normalize=True)

0    0.912597
1    0.087403
Name: fraud, dtype: float64

In [8]:
# Per-column averages, computed separately for each value of `fraud`
df.groupby(by="fraud").mean()

Unnamed: 0_level_0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order
fraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,22.832976,4.301391,1.423642,0.881672,0.359402,0.109944,0.622225
1,66.261876,12.712185,6.006323,0.880119,0.256399,0.003123,0.946318


In [9]:
# Rule-based fraud flag: a transaction is flagged (1) only when BOTH of the
# conditions below hold; every other row gets 0.
# NOTE(review): the cut-offs sit close to the 75th percentiles reported by
# df.describe() (about 25.7 and 2.1) -- presumably chosen from there; confirm.
DISTANCE_FROM_HOME_THRESHOLD = 25
PRICE_RATIO_THRESHOLD = 2

far_from_home = df["distance_from_home"] > DISTANCE_FROM_HOME_THRESHOLD
high_price_ratio = df["ratio_to_median_purchase_price"] > PRICE_RATIO_THRESHOLD

# Stores the flag as a new column on df; the crosstab and metric cells read it.
df["flag_as_fraud"] = np.where(far_from_home & high_price_ratio, 1, 0)
df["flag_as_fraud"].head(30)

0     0
1     0
2     0
3     0
4     1
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
Name: flag_as_fraud, dtype: int32

In [10]:
# Cross-tabulate true labels (rows) against the rule's flags (columns)
print(
    pd.crosstab(
        index=df["fraud"],
        columns=df["flag_as_fraud"],
        rownames=["Actual Fraud"],
        colnames=["Flagged Fraud"],
    )
)

Flagged Fraud       0      1
Actual Fraud                
0              865351  47246
1               66724  20679


In [11]:
# Score the rule-based flag against the true fraud labels.
# The metric functions are imported in the notebook's top import cell.
# Accuracy alone is a weak signal here because the classes are heavily
# imbalanced (see the value_counts output above); precision, recall and F1
# describe how well the fraud class itself is identified.
actual = df["fraud"]
flagged = df["flag_as_fraud"]

for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 score:", f1_score),
):
    # Each metric takes (y_true, y_pred) in that order.
    print(label, metric(actual, flagged))

Accuracy: 0.88603
Precision: 0.30443871917556126
Recall: 0.23659370959806872
F1 score: 0.26626236093943134
