Load Data and Prepare Target

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import joblib

In [10]:
# Load data
data = pd.read_csv("../src/data/engineered_transactions.csv")



In [11]:
# Use thresholds to define "bad" customers
threshold_recency = data["Recency"].quantile(0.8)  # High recency = risky
threshold_monetary = data["Monetary"].quantile(0.2)  # Low monetary = risky

# Assign labels
data["is_bad"] = np.where(
    (data["Recency"] > threshold_recency) | 
    (data["Monetary"] < threshold_monetary), 
    1,  # High risk
    0   # Low risk
)

In [12]:
print("Distribution of Target Variable:")
print(data["is_bad"].value_counts(normalize=True))

Distribution of Target Variable:
is_bad
0    0.685382
1    0.314618
Name: proportion, dtype: float64


In [13]:
# Drop non-feature columns
data = data.drop([
    "TransactionId", "BatchId", "AccountId", "SubscriptionId", 
    "CustomerId", "TransactionStartTime", "CurrencyCode", "CountryCode", "ProductId"
], axis=1)

In [14]:
from sklearn.model_selection import train_test_split

X = data.drop("is_bad", axis=1)
y = data["is_bad"]

# Split data (stratify to handle class imbalance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report

# Train the model
model = LogisticRegression(class_weight="balanced", max_iter=1000)
model.fit(X_train, y_train)

# Evaluate
y_pred_prob = model.predict_proba(X_test)[:, 1]
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_prob))
print(classification_report(y_test, model.predict(X_test)))

ROC-AUC Score: 0.9272960600376925
              precision    recall  f1-score   support

           0       0.91      0.90      0.91     19670
           1       0.79      0.81      0.80      9029

    accuracy                           0.87     28699
   macro avg       0.85      0.86      0.85     28699
weighted avg       0.87      0.87      0.87     28699



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
