In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (accuracy_score, confusion_matrix, 
                            classification_report, roc_auc_score)
from sklearn.calibration import CalibratedClassifierCV
import joblib


In [26]:
column_names = [
    "checking_account_status", "duration_month", "credit_history", "purpose",
    "credit_amount", "savings_account", "present_employment_since", "installment_rate",
    "personal_status_sex", "other_debtors", "present_residence_since", "property",
    "age", "other_installment_plans", "housing", "existing_credits", "job",
    "liable_people", "telephone", "foreign_worker", "target"
]
df = pd.read_csv("german.data", delim_whitespace=True, header=None, names=column_names )
df.head()

Unnamed: 0,checking_account_status,duration_month,credit_history,purpose,credit_amount,savings_account,present_employment_since,installment_rate,personal_status_sex,other_debtors,...,property,age,other_installment_plans,housing,existing_credits,job,liable_people,telephone,foreign_worker,target
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [27]:
# SINCE "targrt" feature has 1 and 2 as good and bad we need to change it to 0 and 1
df["target"] = df["target"].map({1:0,2:1})
df.head(10)

Unnamed: 0,checking_account_status,duration_month,credit_history,purpose,credit_amount,savings_account,present_employment_since,installment_rate,personal_status_sex,other_debtors,...,property,age,other_installment_plans,housing,existing_credits,job,liable_people,telephone,foreign_worker,target
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,0
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,1
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,0
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,0
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,1
5,A14,36,A32,A46,9055,A65,A73,2,A93,A101,...,A124,35,A143,A153,1,A172,2,A192,A201,0
6,A14,24,A32,A42,2835,A63,A75,3,A93,A101,...,A122,53,A143,A152,1,A173,1,A191,A201,0
7,A12,36,A32,A41,6948,A61,A73,2,A93,A101,...,A123,35,A143,A151,1,A174,1,A192,A201,0
8,A14,12,A32,A43,3059,A64,A74,2,A91,A101,...,A121,61,A143,A152,1,A172,1,A191,A201,0
9,A12,30,A34,A40,5234,A61,A71,4,A94,A101,...,A123,28,A143,A152,2,A174,1,A191,A201,1


In [28]:
X = df.drop('target', axis=1)
y = df['target']

In [29]:
categorical_cols = X.select_dtypes(exclude='number').columns.tolist()
numeric_cols = X.select_dtypes(include='number').columns.tolist()

In [30]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols)
    ])


In [31]:
preprocessor

In [32]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

In [33]:
X_train

Unnamed: 0,checking_account_status,duration_month,credit_history,purpose,credit_amount,savings_account,present_employment_since,installment_rate,personal_status_sex,other_debtors,present_residence_since,property,age,other_installment_plans,housing,existing_credits,job,liable_people,telephone,foreign_worker
10,A12,12,A32,A40,1295,A61,A72,3,A92,A101,1,A123,25,A143,A151,1,A173,1,A191,A201
82,A14,18,A32,A49,1568,A62,A73,3,A92,A101,4,A122,24,A143,A151,1,A172,1,A191,A201
827,A14,18,A30,A49,4165,A61,A73,2,A93,A101,2,A123,36,A142,A152,2,A173,2,A191,A201
410,A12,24,A32,A43,1967,A61,A75,4,A92,A101,4,A123,20,A143,A152,1,A173,1,A192,A201
48,A14,11,A34,A40,7228,A61,A73,1,A93,A101,4,A122,39,A143,A152,2,A172,1,A191,A201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603,A14,36,A32,A42,3349,A61,A73,4,A92,A101,2,A123,28,A143,A152,1,A174,1,A192,A201
759,A11,12,A34,A40,691,A61,A75,4,A93,A101,3,A122,35,A143,A152,2,A173,1,A191,A201
298,A14,18,A32,A42,2515,A61,A73,3,A93,A101,4,A121,43,A143,A152,1,A173,1,A192,A201
417,A11,18,A33,A46,8471,A65,A73,1,A92,A101,2,A123,23,A143,A151,2,A173,1,A192,A201


In [34]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [35]:
# Define cost matrix

def calculate_cost(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return fp * 1 + fn * 5  # FP cost=1, FN cost=5

In [36]:
from sklearn.metrics import make_scorer
cost_scorer = make_scorer(calculate_cost, greater_is_better=False)

In [37]:
base_rf = RandomForestClassifier(
    # class_weight={0: 1, 1: 5},  # Penalize misclassifying Bad as Good 5x more
    random_state=42)

In [38]:
param_grid = {
    'class_weight' :['balanced',{0: 1, 1: 5}, {0: 1, 1: 10}, {0: 1, 1: 1}],
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(
    estimator=base_rf,
    param_grid=param_grid,
    scoring=cost_scorer,  # Optimize for lowest cost
    cv=5,
    n_jobs=-1
)

In [39]:
grid_search.fit(X_train_processed, y_train)

In [40]:
# Best model from grid search
best_rf = grid_search.best_estimator_
best_rf

In [41]:
calibrated_rf = CalibratedClassifierCV(best_rf, cv=5)
calibrated_rf.fit(X_train_processed, y_train)

In [42]:
y_probs = calibrated_rf.predict_proba(X_test_processed)[:, 1]
thresholds = np.linspace(0, 1, 100)
costs = []

for thresh in thresholds:
    y_pred_thresh = (y_probs >= thresh).astype(int)
    costs.append(calculate_cost(y_test, y_pred_thresh))

optimal_threshold = thresholds[np.argmin(costs)]
print(f"Optimal threshold: {optimal_threshold:.3f}")

# Evaluate at optimal threshold
y_pred_optimal = (y_probs >= optimal_threshold).astype(int)

Optimal threshold: 0.172


In [43]:
def evaluate_model(y_true, y_pred, y_probs=None):
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))
    
    cost = calculate_cost(y_true, y_pred)
    print(f"Total Cost: {cost}")
    
    if y_probs is not None:
        print(f"ROC AUC: {roc_auc_score(y_true, y_probs):.4f}")

print("=== Default Threshold (0.5) ===")
y_pred_default = calibrated_rf.predict(X_test_processed)
evaluate_model(y_test, y_pred_default, y_probs)

print("\n=== Optimal Threshold ===")
evaluate_model(y_test, y_pred_optimal, y_probs)

=== Default Threshold (0.5) ===
Confusion Matrix:
[[188  22]
 [ 50  40]]

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.90      0.84       210
           1       0.65      0.44      0.53        90

    accuracy                           0.76       300
   macro avg       0.72      0.67      0.68       300
weighted avg       0.75      0.76      0.75       300

Total Cost: 272
ROC AUC: 0.7860

=== Optimal Threshold ===
Confusion Matrix:
[[ 97 113]
 [  7  83]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.46      0.62       210
           1       0.42      0.92      0.58        90

    accuracy                           0.60       300
   macro avg       0.68      0.69      0.60       300
weighted avg       0.78      0.60      0.61       300

Total Cost: 148
ROC AUC: 0.7860


In [44]:
from sklearn.pipeline import Pipeline

In [45]:
# # Optimal way to save
# final_pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('model', calibrated_rf)
# ])

# deployment_bundle = {
#     'pipeline': final_pipeline,
#     'threshold': optimal_threshold
# }

# joblib.dump(deployment_bundle, 'credit_risk_pipeline.joblib')

In [46]:
# Save all necessary components
joblib.dump(preprocessor, 'credit_preprocessor.joblib')
joblib.dump(calibrated_rf, 'credit_model.joblib')
joblib.dump(optimal_threshold, 'optimal_threshold.joblib')

['optimal_threshold.joblib']