In [13]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

# load in data
df = pd.read_csv("card_transdata.csv")


In [14]:
df.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   distance_from_home              1000000 non-null  float64
 1   distance_from_last_transaction  1000000 non-null  float64
 2   ratio_to_median_purchase_price  1000000 non-null  float64
 3   repeat_retailer                 1000000 non-null  float64
 4   used_chip                       1000000 non-null  float64
 5   used_pin_number                 1000000 non-null  float64
 6   online_order                    1000000 non-null  float64
 7   fraud                           1000000 non-null  float64
dtypes: float64(8)
memory usage: 61.0 MB


In [16]:
# check for null values in data
print("distance_from_home nulls :",df["distance_from_home"].isnull().sum())
print("distance_from_last_transaction nulls :",df["distance_from_last_transaction"].isnull().sum())
print("ratio_to_median_purchase_price nulls :",df["ratio_to_median_purchase_price"].isnull().sum())
print("repeat_retailer nulls :",df["repeat_retailer"].isnull().sum())
print("used_chip nulls :",df["used_chip"].isnull().sum())
print("used_pin_number nulls :",df["used_pin_number"].isnull().sum())
print("online_order nulls :",df["online_order"].isnull().sum())
print("fraud nulls :",df["fraud"].isnull().sum())

distance_from_home nulls : 0
distance_from_last_transaction nulls : 0
ratio_to_median_purchase_price nulls : 0
repeat_retailer nulls : 0
used_chip nulls : 0
used_pin_number nulls : 0
online_order nulls : 0
fraud nulls : 0


In [17]:
# check for duplicates
print("duplicates :",df.duplicated().sum())

duplicates : 0


In [18]:
# standardize some features

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

numerical = ['distance_from_home',
             'distance_from_last_transaction',
             'ratio_to_median_purchase_price',
             ]

preprocessor = ColumnTransformer(
    [
        ('numerical', StandardScaler(), numerical),
    ],
    remainder='passthrough',
)

In [19]:
# getting amount of fraudulent transactions
fraud = df['fraud'].value_counts()
print(fraud)

fraud
0.0    912597
1.0     87403
Name: count, dtype: int64


In [35]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [36]:
# splitting data. x = variables / features, y = targets

x = df.drop('fraud', axis=1)
y = df['fraud']

# training the data
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.7, random_state = 42)

In [37]:

print(f"original data set: x{x.shape}, y{y.shape}")
print(f"train data set: x_train{x_train.shape}, y_train{y_train.shape}")
print(f"test data set: x_test{x_test.shape}, y_test{y_test.shape}")


original data set: x(1000000, 7), y(1000000,)
train data set: x_train(700000, 7), y_train(700000,)
test data set: x_test(300000, 7), y_test(300000,)


In [38]:
from sklearn.linear_model import LogisticRegression

In [39]:
# building logistical regression model


model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

In [40]:
# Predicting on the test set
y_pred = model.predict(x_test_scaled)




In [42]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [43]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9132166666666667
Precision: 0.501737266208324
Recall: 0.5194994067893911
F1 Score: 0.5104638700336573
ROC AUC Score: 0.7351395767292217
Confusion Matrix:
 [[260391  13480]
 [ 12555  13574]]
