In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib

In [5]:
# Load data
data = pd.read_csv("engineered_transactions.csv")


  data = pd.read_csv("engineered_transactions.csv")


In [6]:
# Create RFM-based target variable
rfm_thresholds = {
    'Recency': data["Recency"].quantile(0.8),
    'Frequency': data["Frequency"].quantile(0.2),
    'Monetary': data["Monetary"].quantile(0.2)
}

data["is_bad"] = np.where(
    (data["Recency"] > rfm_thresholds['Recency']) |
    (data["Frequency"] < rfm_thresholds['Frequency']) |
    (data["Monetary"] < rfm_thresholds['Monetary']),
    1, 0
)

print("Target Distribution:")
print(data["is_bad"].value_counts(normalize=True))


Target Distribution:
is_bad
0    0.615906
1    0.384094
Name: proportion, dtype: float64


In [7]:
# Feature engineering
def feature_engineering(df):
    # Extract temporal features
    if 'TransactionStartTime' in df.columns:
        df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])
        df['TransactionHour'] = df['TransactionStartTime'].dt.hour
        df['TransactionDay'] = df['TransactionStartTime'].dt.day
        df['TransactionMonth'] = df['TransactionStartTime'].dt.month
        df = df.drop('TransactionStartTime', axis=1)
    return df

data = feature_engineering(data)

# Drop non-predictive columns
data = data.drop([
    "TransactionId", "BatchId", "AccountId",
    "SubscriptionId", "CustomerId"
], axis=1)


In [8]:
# Split data
X = data.drop("is_bad", axis=1)
y = data["is_bad"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [9]:
# Preprocessing pipeline
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [10]:
# Model pipelines
log_reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(class_weight='balanced', max_iter=1000))
])

rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(class_weight='balanced'))
])

In [11]:
# Hyperparameter tuning
param_grid_lr = {
    'classifier__C': [0.01, 0.1, 1, 10],
    'classifier__solver': ['lbfgs', 'saga']
}

param_grid_rf = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5]
}

In [15]:
# Convert categorical features to string
X_train[categorical_features] = X_train[categorical_features].astype(str)
X_test[categorical_features] = X_test[categorical_features].astype(str)

# Fill NaN values in categorical columns
X_train[categorical_features] = X_train[categorical_features].fillna("Unknown")
X_test[categorical_features] = X_test[categorical_features].fillna("Unknown")

In [16]:
# Train and evaluate models
def train_evaluate(model, param_grid, model_name):
    grid_search = GridSearchCV(model, param_grid, cv=3,
                              scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]

    print(f"\n{model_name} Best Parameters:", grid_search.best_params_)
    print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")
    print("Classification Report:")
    print(classification_report(y_test, best_model.predict(X_test)))

    return best_model

lr_best = train_evaluate(log_reg, param_grid_lr, "Logistic Regression")
rf_best = train_evaluate(rf, param_grid_rf, "Random Forest")


Logistic Regression Best Parameters: {'classifier__C': 10, 'classifier__solver': 'lbfgs'}
ROC-AUC Score: 0.9862
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      3434
           1       0.90      0.89      0.90      2142

    accuracy                           0.92      5576
   macro avg       0.92      0.92      0.92      5576
weighted avg       0.92      0.92      0.92      5576


Random Forest Best Parameters: {'classifier__max_depth': 20, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
ROC-AUC Score: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3434
           1       1.00      1.00      1.00      2142

    accuracy                           1.00      5576
   macro avg       1.00      1.00      1.00      5576
weighted avg       1.00      1.00      1.00      5576



In [17]:


# Save best model and preprocessing pipeline
joblib.dump(lr_best, 'credit_scoring_lr.pkl')
joblib.dump(rf_best, 'credit_scoring_rf.pkl')
print("\nModels saved successfully!")




Models saved successfully!


In [18]:
# Feature importance analysis (for Random Forest)
try:
    feature_names = numeric_features.tolist() + \
        lr_best.named_steps['preprocessor']\
        .named_transformers_['cat']\
        .named_steps['onehot']\
        .get_feature_names_out(categorical_features).tolist()

    importances = rf_best.named_steps['classifier'].feature_importances_
    feat_imp = pd.DataFrame({'feature': feature_names, 'importance': importances})
    print("\nTop 10 Important Features:")
    print(feat_imp.sort_values(by='importance', ascending=False).head(10))
except Exception as e:
    print("\nFeature importance analysis skipped:", str(e))


Top 10 Important Features:
                     feature  importance
5                    Recency    0.312954
12          TransactionCount    0.164556
6                  Frequency    0.143768
7                   Monetary    0.112825
10    TotalTransactionAmount    0.109422
11  AverageTransactionAmount    0.057920
8                  Stability    0.032727
2                      Value    0.020269
1                     Amount    0.016944
3            PricingStrategy    0.014801
