## import libraries 

In [2]:
import pandas as pd
import plotly.express as px
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression ,Lasso ,Ridge ,LogisticRegression
from category_encoders import BinaryEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OrdinalEncoder ,PolynomialFeatures
from sklearn.metrics import mean_squared_error

## Read data

In [3]:
df = pd.read_csv("card_transdata.csv")

In [4]:
df.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


## Explore Data

In [5]:
df.describe()


Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,26.628792,5.036519,1.824182,0.881536,0.350399,0.100608,0.650552,0.087403
std,65.390784,25.843093,2.799589,0.323157,0.477095,0.300809,0.476796,0.282425
min,0.004874,0.000118,0.004399,0.0,0.0,0.0,0.0,0.0
25%,3.878008,0.296671,0.475673,1.0,0.0,0.0,0.0,0.0
50%,9.96776,0.99865,0.997717,1.0,0.0,0.0,1.0,0.0
75%,25.743985,3.355748,2.09637,1.0,1.0,0.0,1.0,0.0
max,10632.723672,11851.104565,267.802942,1.0,1.0,1.0,1.0,1.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   distance_from_home              1000000 non-null  float64
 1   distance_from_last_transaction  1000000 non-null  float64
 2   ratio_to_median_purchase_price  1000000 non-null  float64
 3   repeat_retailer                 1000000 non-null  float64
 4   used_chip                       1000000 non-null  float64
 5   used_pin_number                 1000000 non-null  float64
 6   online_order                    1000000 non-null  float64
 7   fraud                           1000000 non-null  float64
dtypes: float64(8)
memory usage: 61.0 MB


In [7]:
df.isnull().sum()

distance_from_home                0
distance_from_last_transaction    0
ratio_to_median_purchase_price    0
repeat_retailer                   0
used_chip                         0
used_pin_number                   0
online_order                      0
fraud                             0
dtype: int64

In [8]:
## percentage
df.isnull().mean()*100

distance_from_home                0.0
distance_from_last_transaction    0.0
ratio_to_median_purchase_price    0.0
repeat_retailer                   0.0
used_chip                         0.0
used_pin_number                   0.0
online_order                      0.0
fraud                             0.0
dtype: float64

In [9]:
df[df['distance_from_home']>1000]

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
4038,2033.498174,0.783480,9.534812,1.0,0.0,0.0,0.0,1.0
9703,1071.554384,10.143118,1.219291,1.0,0.0,0.0,1.0,1.0
9931,1207.134491,0.318389,0.276731,1.0,0.0,0.0,0.0,0.0
11584,1058.229818,0.175199,0.340195,1.0,1.0,0.0,1.0,0.0
12032,1002.252888,7.029377,2.886316,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...
989182,1979.896482,0.376109,0.838010,1.0,0.0,0.0,0.0,0.0
991991,1018.764185,0.095766,7.486603,1.0,0.0,0.0,0.0,1.0
994109,2120.336604,12.243327,0.363797,1.0,0.0,0.0,1.0,1.0
994176,1182.070070,25.658925,1.379989,1.0,0.0,0.0,1.0,1.0


In [10]:
df.drop(df[df['distance_from_home']>1000].index , axis = 0 , inplace =True)

In [11]:
df[df['distance_from_last_transaction']> 500]

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
155,3.357481,990.070315,3.157890,1.0,0.0,0.0,0.0,0.0
1849,16.327769,747.949061,0.071035,1.0,0.0,0.0,1.0,1.0
1891,10.056701,556.384989,0.468573,1.0,1.0,0.0,1.0,0.0
2745,4.608208,718.712538,0.126567,1.0,0.0,0.0,1.0,1.0
5767,1.643394,609.534108,0.451352,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...
989944,2.122975,922.315545,0.834157,1.0,1.0,1.0,1.0,0.0
994434,85.171833,760.736129,2.667453,1.0,1.0,1.0,1.0,0.0
995529,2.517822,2222.757993,0.653967,1.0,1.0,0.0,1.0,0.0
995566,7.074682,550.295626,0.155660,1.0,0.0,0.0,1.0,1.0


In [12]:
df.drop(df[df['distance_from_last_transaction']>500].index , axis = 0 , inplace =True)

In [13]:
df[df['ratio_to_median_purchase_price']> 35]

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
389,5.534137,2.160615,36.074366,1.0,0.0,0.0,0.0,0.0
1330,15.815937,1.840256,41.023449,1.0,1.0,0.0,1.0,1.0
2848,1.800455,0.548887,65.150879,0.0,1.0,1.0,1.0,0.0
5113,4.804354,0.011745,39.617621,1.0,1.0,0.0,1.0,1.0
6565,28.762820,0.946693,61.032357,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...
997603,2.091284,0.295557,49.961114,1.0,0.0,0.0,1.0,1.0
997803,40.188094,2.153282,126.713701,1.0,1.0,0.0,1.0,1.0
999235,34.547197,1.059284,61.990614,1.0,0.0,0.0,1.0,1.0
999392,4.375680,2.257465,38.246884,1.0,1.0,0.0,1.0,1.0


In [14]:
df.drop(df[df['ratio_to_median_purchase_price']>35].index , axis = 0 , inplace =True)

In [15]:
df.repeat_retailer.value_counts()

1.0    880230
0.0    118374
Name: repeat_retailer, dtype: int64

In [16]:
df.corr()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
distance_from_home,1.0,0.001111,-0.001531,0.171272,-0.000995,-0.001625,-0.001581,0.214683
distance_from_last_transaction,0.001111,1.0,0.001028,-0.000584,0.000709,-7.6e-05,0.000697,0.127945
ratio_to_median_purchase_price,-0.001531,0.001028,1.0,0.000235,0.00054,0.001108,-0.000523,0.499193
repeat_retailer,0.171272,-0.000584,0.000235,1.0,-0.00136,-0.000399,-0.00054,-0.001535
used_chip,-0.000995,0.000709,0.00054,-0.00136,1.0,-0.001409,-0.000184,-0.060521
used_pin_number,-0.001625,-7.6e-05,0.001108,-0.000399,-0.001409,1.0,-0.00034,-0.099948
online_order,-0.001581,0.000697,-0.000523,-0.00054,-0.000184,-0.00034,1.0,0.191382
fraud,0.214683,0.127945,0.499193,-0.001535,-0.060521,-0.099948,0.191382,1.0


In [17]:
px.imshow(df.corr(), width=800, height=800, title="Heatmap of Car Price Prediction Dataset")

## split the data

In [18]:
x= df.drop('fraud' , axis = 1 )

In [19]:
y= df.fraud

In [20]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

## scale data 

In [27]:
scaler = MinMaxScaler()

In [28]:
numerical_cols = list(x_train.select_dtypes(include=['int64', 'float64','int32']).columns)

In [29]:
x_train[numerical_cols] = scaler.fit_transform(x_train[numerical_cols])

In [30]:
x_train

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order
740433,0.020722,0.012665,0.022245,1.0,0.0,0.0,0.0
552531,0.007162,0.000989,0.048571,1.0,1.0,0.0,1.0
623516,0.046266,0.007099,0.033409,1.0,0.0,0.0,1.0
296767,0.032378,0.000605,0.021156,1.0,0.0,0.0,1.0
878254,0.000332,0.002844,0.006545,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
259534,0.029813,0.000053,0.006651,1.0,1.0,0.0,1.0
366346,0.004843,0.000945,0.112273,1.0,0.0,0.0,1.0
132126,0.004299,0.002942,0.036507,1.0,0.0,0.0,0.0
672088,0.004956,0.000307,0.037257,1.0,1.0,0.0,0.0


## model

In [31]:
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
Y_pred = logreg.predict(x_test)
acc_log = round(logreg.score(x_train, y_train) * 100, 2)
acc_log

96.08

In [32]:
from sklearn.ensemble import RandomForestClassifier

In [33]:
r = RandomForestClassifier()

In [34]:
r.fit(x_train, y_train)

In [35]:
Y_pred = r.predict(x_test)

In [36]:
acc_log = round(logreg.score(x_train, y_train) * 100, 2)
acc_log

96.08