In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

# load in data
df = pd.read_csv("card_transdata.csv")


In [2]:
df.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


In [3]:
# check for null values in data
print("distance_from_home nulls :",df["distance_from_home"].isnull().sum())
print("distance_from_last_transaction nulls :",df["distance_from_last_transaction"].isnull().sum())
print("ratio_to_median_purchase_price nulls :",df["ratio_to_median_purchase_price"].isnull().sum())
print("repeat_retailer nulls :",df["repeat_retailer"].isnull().sum())
print("used_chip nulls :",df["used_chip"].isnull().sum())
print("used_pin_number nulls :",df["used_pin_number"].isnull().sum())
print("online_order nulls :",df["online_order"].isnull().sum())
print("fraud nulls :",df["fraud"].isnull().sum())

distance_from_home nulls : 0
distance_from_last_transaction nulls : 0
ratio_to_median_purchase_price nulls : 0
repeat_retailer nulls : 0
used_chip nulls : 0
used_pin_number nulls : 0
online_order nulls : 0
fraud nulls : 0


In [4]:
# check for duplicates
print("duplicates :", df.duplicated().sum())

duplicates : 0


In [5]:
# standardize some features

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

numerical = ['distance_from_home',
             'distance_from_last_transaction',
             'ratio_to_median_purchase_price',
             ]

preprocessor = ColumnTransformer(
    [
        ('numerical', StandardScaler(), numerical),
    ],
    remainder='passthrough',
)

In [6]:
# splitting data. x = variables / features, y = targets

x = df.drop('fraud', axis=1)
y = df['fraud']

# training the data
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.7, random_state = 42)

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.metrics import precision_recall_curve, auc

for i in range(2, 8, 1):
    pca = PCA(n_components=i) 

    # Fit the PCA model to your data
    pca.fit(x_train)

    # Transform the data to its principal components
    x_pca = pca.transform(x_train)

    # Access the explained variance ratio
    explained_variance_ratio = pca.explained_variance_ratio_


    # Print the transformed data and explained variance ratio
    #print("Transformed data:\n", x_pca)
    print("Explained variance ratio:\n", explained_variance_ratio)

    model = LogisticRegression(max_iter=1000)
    model.fit(x_pca, y_train)
    scores = cross_val_score(model, x_pca, y_train, cv=5)
    #print(scores)
    

    print(np.mean(scores)) # all features are important

    y_pred = model.predict(x_pca)
    precision, recall, _ = precision_recall_curve(y_train, y_pred)
    pr_auc = auc(recall, precision)

    print("PR AUC:", pr_auc)

# variable with low vairence - 5 and 6. So pca doesnt hold
    
    


Explained variance ratio:
 [0.85195829 0.14632901]
0.9119242857142856
PR AUC: 0.28896065879471805
Explained variance ratio:
 [0.85195829 0.14632901 0.00158199]
0.9217414285714286
PR AUC: 0.4795606000018196
Explained variance ratio:
 [8.51958287e-01 1.46329005e-01 1.58198629e-03 4.59351049e-05]
0.9295171428571429
PR AUC: 0.5494145588076352
Explained variance ratio:
 [8.51958287e-01 1.46329005e-01 1.58198629e-03 4.59351049e-05
 4.58598850e-05]
0.9459799999999999
PR AUC: 0.6776486013681535
Explained variance ratio:
 [8.51958287e-01 1.46329005e-01 1.58198629e-03 4.59351049e-05
 4.58598850e-05 2.06508754e-05]
0.9460571428571429
PR AUC: 0.6783805724263183
Explained variance ratio:
 [8.51958287e-01 1.46329005e-01 1.58198629e-03 4.59351049e-05
 4.58598850e-05 2.06508754e-05 1.82752856e-05]
0.9589642857142857
PR AUC: 0.7659903990403628


In [38]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [42]:
# Predicting on the test set
y_pred = model.predict(x_test_scaled)




In [43]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [None]:
from sklearn.metrics import precision_recall_curve, auc

precision, recall, _ = precision_recall_curve(y_true, y_scores)
pr_auc = auc(recall, precision)

print("PR AUC:", pr_auc)

In [44]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9132166666666667
Precision: 0.501737266208324
Recall: 0.5194994067893911
F1 Score: 0.5104638700336573
Confusion Matrix:
 [[260391  13480]
 [ 12555  13574]]
