In [216]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, f1_score, recall_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE, ADASYN, SMOTENC

In [231]:
data = pd.read_csv('../../data/card_transdata.csv')

continuous_features = ['distance_from_home', 'distance_from_last_transaction', 'ratio_to_median_purchase_price']
data[continuous_features] = StandardScaler().fit_transform(data[continuous_features])


X = data.drop(columns=['fraud'])
# imputer = SimpleImputer(strategy='mean')
# X_im = imputer.fit_transform(X)
# X = pd.DataFrame(X_im, columns=X.columns, index= X.index)

y = data['fraud']

X, y = ADASYN().fit_resample(X, y)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

In [150]:
logit_model = LogisticRegression()
logit_model.fit(X_train, y_train)
y_preds = logit_model.predict(X_test)

Original performance:

In [26]:
print(f'Accuracy: {logit_model.score(X_test, y_test)}')
print(f'Precision: {precision_score(y_test, y_preds)}')
print(f'F1 score: {f1_score(y_test, y_preds)}')
print(f'Recall: {recall_score(y_test, y_preds)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_preds)}')

Accuracy: 0.9345130269637734
Precision: 0.9204298902407896
F1 score: 0.9354277345122223
Recall: 0.9509224362425694
Confusion Matrix: 
[[167916  14963]
 [  8933 173085]]


In [151]:
dict(zip(logit_model.coef_.flatten(), X.columns))

{1.901227487048369: 'distance_from_home',
 1.7876063023327267: 'distance_from_last_transaction',
 2.994879018465987: 'ratio_to_median_purchase_price',
 -0.9750665133395101: 'repeat_retailer',
 -0.7353613121647347: 'used_chip',
 -4.384090822556376: 'used_pin_number',
 3.498359837712411: 'online_order'}

Now, let's engineer new features

In [232]:
X['price_online'] = X['ratio_to_median_purchase_price'] * X['online_order']
X['chip_and_pin'] = X['used_chip'] * X['used_pin_number']
X['total_distance'] = np.log(X['distance_from_home']+10)  + np.log(X['distance_from_last_transaction']+10)

X = X.drop(columns=['ratio_to_median_purchase_price', 'online_order', 'used_chip', 'used_pin_number', 'distance_from_home', 'distance_from_last_transaction'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

In [233]:
logit_model.fit(X_train, y_train)
y_preds = logit_model.predict(X_test)

New performance:

In [234]:
print(f'Accuracy: {logit_model.score(X_test, y_test)}')
print(f'Precision: {precision_score(y_test, y_preds)}')
print(f'F1 score: {f1_score(y_test, y_preds)}')
print(f'Recall: {recall_score(y_test, y_preds)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_preds)}')

Accuracy: 0.9173410578875683
Precision: 0.9141602574592266
F1 score: 0.9174430953501866
Recall: 0.920749596193783
Confusion Matrix: 
[[167142  15737]
 [ 14425 167593]]


In [108]:
dict(zip(logit_model.coef_.flatten(), X.columns))

{-0.8133154837481662: 'repeat_retailer',
 3.7798945024499564: 'price_online',
 -6.388928860126977: 'chip_and_pin',
 1.8215524185310086: 'total_distance'}

Experimenting  new features with XGBoost

In [235]:
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)
y_preds = xgb_clf.predict(X_test)

In [236]:
print(f"XGB accuracy: {xgb_clf.score(X_test, y_test)}")
print(f"XGB F1 score: {f1_score(y_test, y_preds)}")
print(f"Conufsion matrix:\n {confusion_matrix(y_test, y_preds)}")

XGB accuracy: 0.9654505243945551
XGB F1 score: 0.9661510703478866
Conufsion matrix:
 [[172369  10510]
 [  2097 179921]]
