In [1]:
# Download the dataset archive with the Kaggle CLI.
# The CLI authenticates on start-up: if it cannot find kaggle.json in the
# user's .kaggle directory (and no credentials are supplied through the
# environment) it aborts with an OSError.
!kaggle datasets download -d nelgiriyewithana/credit-card-fraud-detection-dataset-2023

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python311\Scripts\kaggle.exe\__main__.py", line 4, in <module>
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python311\Lib\site-packages\kaggle\__init__.py", line 23, in <module>
    api.authenticate()
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python311\Lib\site-packages\kaggle\api\kaggle_api_extended.py", line 403, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in C:\Users\Admin\.kaggle. Or use the environment method.


In [None]:
import zipfile

# Extract with the standard library instead of shelling out to `unzip`:
# that utility is not guaranteed to exist (notably on Windows) and it prompts
# interactively when a target file already exists, which stalls a notebook run.
# extractall() writes into the current working directory and overwrites.
with zipfile.ZipFile('credit-card-fraud-detection-dataset-2023.zip') as archive:
    archive.extractall()

: 

In [None]:
import shutil
from pathlib import Path

# Pure-Python clean-up so the cell behaves the same on every OS
# (`rm` / `mv` are not available in a Windows command shell).

# Remove the downloaded archive; a missing file is not an error, matching `rm -f`.
Path('credit-card-fraud-detection-dataset-2023.zip').unlink(missing_ok=True)

# Make sure the destination directory exists before moving the CSV into it.
data_dir = Path('../../data')
data_dir.mkdir(parents=True, exist_ok=True)
shutil.move('creditcard_2023.csv', str(data_dir / 'creditcard_2023.csv'))

: 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

: 

In [None]:
# Load the raw transactions and list the available columns.
data = pd.read_csv('../../data/creditcard_2023.csv')
print(data.columns)

# One histogram per numeric column. The default canvas is far too small for
# this many subplots, so enlarge it and let matplotlib space the panels.
# plt.show() also keeps the array-of-Axes repr out of the cell output.
data.hist(figsize=(20, 15))
plt.tight_layout()
plt.show()

: 

In [None]:
from imblearn.over_sampling import SMOTE, ADASYN, SMOTENC

# Feature matrix: drop the row identifier AND the label. Leaving 'Class' in X
# leaks the target into the features and makes every model look perfect.
X = data.drop(columns=['id', 'Class'])
y = data['Class']

# Summary statistics of the features. Printed explicitly because a bare
# expression in the middle of a cell is never displayed.
print(X.describe())

# Rebalance the classes with ADASYN; the seed keeps the synthetic samples
# reproducible across kernel restarts.
# Alternative (takes a lot longer):
#   SMOTENC(categorical_features=[3, 4, 5, 6]).fit_resample(X, y)
# NOTE(review): resampling happens before the train/test split, so synthetic
# points derived from test rows can end up in the training set. Consider
# resampling the training fold only.
X, y = ADASYN(random_state=100).fit_resample(X, y)
# y.hist() can be used here to inspect the class balance after resampling.

: 

# Models

### Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score

# Hold out 10% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=100)

# Baseline: logistic regression with library defaults.
logit_model = LogisticRegression()
logit_model.fit(X_train, y_train)
y_preds = logit_model.predict(X_test)

# Spot-check the first 20 test rows, printed as "actual : predicted".
# zip yields (y_test value, prediction), so unpack in that order.
for actual, pred in zip(y_test[:20], y_preds[:20]):
    print(f'{actual} : {pred}')

: 

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

# Predict with the logistic model inside this cell instead of relying on the
# shared `y_preds` variable: later cells overwrite it with other models'
# predictions, so re-running this cell out of order would silently mix the
# logistic accuracy with another model's precision/recall.
logit_preds = logit_model.predict(X_test)

print(f'Accuracy: {logit_model.score(X_test, y_test)}')
print(f'Precision: {precision_score(y_test, logit_preds)}')
print(f'F1 score: {f1_score(y_test, logit_preds)}')
print(f'Recall: {recall_score(y_test, logit_preds)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, logit_preds)}')

: 

In [None]:
# Logistic-regression coefficients, labelled by feature name.
# Index by feature (not by coefficient value): using the floats as dict keys
# inverts the mapping and silently drops features whose coefficients are equal.
logit_coefs = pd.Series(
    logit_model.coef_.flatten(),
    index=X_train.columns,
    name='coefficient',
)

# Largest-magnitude effects first.
logit_coefs.sort_values(key=abs, ascending=False)

: 

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so a kernel restart reproduces the same fitted model
# (same seed value as the train/test split).
decision_tree_model = DecisionTreeClassifier(random_state=100)
decision_tree_model.fit(X_train, y_train)
y_preds = decision_tree_model.predict(X_test)

# Spot-check the first 20 test rows, printed as "actual : predicted".
# zip yields (y_test value, prediction), so unpack in that order.
for actual, pred in zip(y_test[:20], y_preds[:20]):
    print(f'{actual} : {pred}')

: 

In [None]:
# Predict with the decision tree inside this cell instead of relying on the
# shared `y_preds` variable, which other model cells overwrite; this keeps the
# accuracy and the remaining metrics tied to the same model on any re-run.
dt_preds = decision_tree_model.predict(X_test)

print(f'Accuracy: {decision_tree_model.score(X_test, y_test)}')
print(f'Precision: {precision_score(y_test, dt_preds)}')
print(f'F1 score: {f1_score(y_test, dt_preds)}')
print(f'Recall: {recall_score(y_test, dt_preds)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, dt_preds)}')

: 

### XGBoost

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees with library defaults; fit() returns the estimator,
# so construction and training are chained.
xgb_model = XGBClassifier().fit(X_train, y_train)

# Hard class predictions on the held-out rows.
y_preds = xgb_model.predict(X_test)

: 

In [None]:
# Predict with the XGBoost model inside this cell instead of relying on the
# shared `y_preds` variable, which other model cells overwrite; this keeps the
# accuracy and the remaining metrics tied to the same model on any re-run.
xgb_preds = xgb_model.predict(X_test)

print(f'Accuracy: {xgb_model.score(X_test, y_test)}')
print(f'Precision: {precision_score(y_test, xgb_preds)}')
print(f'F1 score: {f1_score(y_test, xgb_preds)}')
print(f'Recall: {recall_score(y_test, xgb_preds)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, xgb_preds)}')

# Per-feature importances, in the column order of the training frame.
xgb_model.feature_importances_


: 

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Fitted classifiers to compare, listed in plotting order.
fitted_models = [
    ('Logistic Regression', logit_model),
    ('Decision Tree', decision_tree_model),
    ('XGBoost', xgb_model),
]

for display_name, clf in fitted_models:
    # Second column of predict_proba is used as the score for the ROC analysis.
    scores = clf.predict_proba(X_test)[:, 1]

    # Curve coordinates (thresholds are not needed) and the area under the curve.
    fpr, tpr, _ = roc_curve(y_test, scores)
    auc = roc_auc_score(y_test, scores)

    plt.plot(fpr, tpr, label=f'{display_name} (AUC = {auc:.2f})')

# Chance-level reference: the dashed diagonal.
plt.plot([0, 1], [0, 1], 'k--')

# Axis labels, title and legend.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves')
plt.legend()

# Render the figure.
plt.show()

: 

In [None]:
import statsmodels.api as sm

# statsmodels fits exactly the columns it is given and does not add an
# intercept on its own. Add the constant explicitly so this model is
# comparable with sklearn's LogisticRegression, which fits one by default.
X_train_const = sm.add_constant(X_train)

# Fit the logistic regression by maximum likelihood.
model = sm.Logit(y_train, X_train_const)
result = model.fit()

# Coefficient table with standard errors, z-scores and p-values.
print(result.summary())

: 

: 