In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [2]:
train_data_path = "\\card fraud\\archive\\fraudTrain.csv"
test_data_path = "\\card fraud\\archive\\fraudTest.csv"
X_train = pd.read_csv(train_data_path)
X_test = pd.read_csv(test_data_path)
target_column = 'is_fraud'
if target_column not in X_train.columns:
    target_column = X_train.columns[-1]

In [3]:
Y_train = X_train[target_column]
Y_test = X_test[target_column]
X_train = X_train.drop(columns=[target_column, 'trans_date_trans_time'])
X_test = X_test.drop(columns=[target_column, 'trans_date_trans_time'])

In [4]:
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train.select_dtypes(include=[object]).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


In [5]:
oversampler = SMOTE()
X_train_os, Y_train_os = oversampler.fit_resample(X_train, Y_train)

In [6]:
model = LogisticRegression()
model.fit(X_train_os, Y_train_os)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
Y_train_pred = model.predict(X_train_os)
Y_test_pred = model.predict(X_test)

In [8]:
train_accuracy = accuracy_score(Y_train_os, Y_train_pred)
test_accuracy = accuracy_score(Y_test, Y_test_pred)
print('Accuracy on training data:', train_accuracy)
print('Accuracy on testing data:', test_accuracy)

Accuracy on training data: 0.974958287082609
Accuracy on testing data: 0.9957928377471348


In [9]:
print('Classification Report on testing data:')
print(classification_report(Y_test, Y_test_pred))

Classification Report on testing data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.00      0.00      0.00      2145

    accuracy                           1.00    555719
   macro avg       0.50      0.50      0.50    555719
weighted avg       0.99      1.00      0.99    555719

