In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
import numpy as np

In [10]:
# CSV locations: "part1" is used as the training split, "part2" as the test split.
file_path_train = r"C:\Users\Baku\Desktop\bitirme_projesi\card_transdata_part1.csv"
file_path_test = r"C:\Users\Baku\Desktop\bitirme_projesi\card_transdata_part2.csv"

# Read both files (train first, then test) into their own DataFrames.
df_train, df_test = pd.read_csv(file_path_train), pd.read_csv(file_path_test)

# Show a heading followed by the first five rows of each frame.
for heading, frame in (("Train Data:", df_train), ("Test Data:", df_test)):
    print(heading)
    display(frame.head())

Train Data:


Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,6.896019,0.873476,0.986387,1.0,0.0,0.0,0.0,0.0
1,17.473171,0.81237,1.122472,1.0,0.0,0.0,1.0,0.0
2,13.731686,0.334923,2.363945,1.0,0.0,0.0,1.0,0.0
3,5.537437,7.37454,0.34129,1.0,0.0,0.0,1.0,0.0
4,4.058414,1.531849,4.589428,1.0,1.0,0.0,1.0,1.0


Test Data:


Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,24.200532,0.004257,0.212702,1.0,0.0,0.0,1.0,0.0
1,13.33149,0.781121,4.102186,1.0,0.0,0.0,0.0,0.0
2,13.348346,0.995948,0.689753,1.0,1.0,0.0,1.0,0.0
3,48.638892,0.956951,0.701403,1.0,0.0,0.0,1.0,0.0
4,32.037206,0.234327,0.499447,1.0,0.0,0.0,0.0,0.0


In [3]:
# Columns that must not reach the model:
#   * "fraud"      - the label being predicted.
#   * "Unnamed: 0" - appears in both CSV previews with values 0, 1, 2, ...;
#                    it looks like a row index that was saved with the file,
#                    i.e. a row number rather than a property of the transaction.
#                    Leaving it in would make the model fit on row position.
TARGET_COLUMN = "fraud"
NON_FEATURE_COLUMNS = [TARGET_COLUMN, "Unnamed: 0"]

# Separate features and target.
# errors="ignore" keeps this cell working when the index column is already
# absent (for example if the CSVs are later loaded with index_col=0). A missing
# target column still fails loudly on the df[TARGET_COLUMN] lookups below.
X_train = df_train.drop(columns=NON_FEATURE_COLUMNS, errors="ignore")
y_train = df_train[TARGET_COLUMN]
X_test = df_test.drop(columns=NON_FEATURE_COLUMNS, errors="ignore")
y_test = df_test[TARGET_COLUMN]

# The scaler and the model both rely on column position, so train and test must
# expose the same feature columns in the same order.
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ"

# Standardize feature values. The scaler is fitted on the training split only
# and then reused, unchanged, to transform the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [4]:
# Report the label counts of the original training split.
print("Class distribution before SMOTE:", y_train.value_counts(), sep="\n")

# Oversample the scaled training features with SMOTE (fixed seed for
# repeatability). Only the training data is passed in here.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Report the label counts again after resampling.
print("\nClass distribution after SMOTE:", y_train_resampled.value_counts(), sep="\n")

Class distribution before SMOTE:
fraud
0.0    456298
1.0     43702
Name: count, dtype: int64

Class distribution after SMOTE:
fraud
0.0    456298
1.0    456298
Name: count, dtype: int64


In [5]:
# Fit a logistic-regression classifier on the resampled training data.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LogisticRegression(
    solver="liblinear",
    max_iter=1000,
    random_state=42,
).fit(X_train_resampled, y_train_resampled)

# Score the scaled test set: hard class labels, plus the probability taken from
# column 1 of predict_proba (the second class in model.classes_).
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

# Evaluation metrics. The report is kept as a dict so it can be turned into a
# DataFrame later; ROC AUC is computed from the probabilities, not the labels.
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
class_report = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [6]:
# Print the evaluation results: each heading is followed by its body on the
# next line (sep="\n" reproduces two consecutive print calls).
print("\nConfusion Matrix:", conf_matrix, sep="\n")

# The text form of the report is regenerated here; the dict form computed
# earlier is kept for saving to CSV.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

print("ROC AUC Score:", roc_auc)


Confusion Matrix:
[[425648  30651]
 [  2268  41433]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.99      0.93      0.96    456299
         1.0       0.57      0.95      0.72     43701

    accuracy                           0.93    500000
   macro avg       0.78      0.94      0.84    500000
weighted avg       0.96      0.93      0.94    500000

ROC AUC Score: 0.9793374219788473


In [7]:
# Write the confusion matrix to CSV with labelled rows (actual class) and
# columns (predicted class). to_csv writes the index by default.
conf_matrix_df = pd.DataFrame(
    conf_matrix,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
conf_matrix_df.to_csv("confusion_matrix.csv")

# Write the classification report to CSV: one row per report entry, plus an
# extra "roc_auc" column that is blank everywhere except the row set below.
report_df = pd.DataFrame(class_report).T.assign(roc_auc="")
# NOTE(review): "avg / total" is not one of the rows in the printed report
# (0.0, 1.0, accuracy, macro avg, weighted avg), so this assignment presumably
# appends a new row whose metric columns are empty - confirm this CSV layout is
# what is wanted.
report_df.loc["avg / total", "roc_auc"] = roc_auc
report_df.to_csv("classification_report.csv")