In [1]:
import pandas as pd

# Read the processed credit-risk table from disk into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is on a machine with this exact directory layout.
df_clean = pd.read_csv(
    filepath_or_buffer=r"E:\Projects Data Scientist\credit-risk-project\data\processed\credit_risk_clean.csv"
)

In [2]:
# Bare expression: the notebook renders the loaded frame as this cell's output
# (pandas elides the middle rows of a long frame with "...").
df_clean

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,IncomeMissingFlag,TotalDelinquencyCount,HighUtilizationFlag,AgeBucket
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0,0,2,0,35-50
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0,0,0,1,35-50
2,3,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0,0,2,0,35-50
3,4,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0,0,0,0,25-35
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0,0,1,1,35-50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,149996,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0,0,0,0,65+
149996,149997,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0,0,0,0,35-50
149997,149998,0,0.246044,58,0,3870.000000,5400.0,18,0,1,0,0.0,1,0,0,50-65
149998,149999,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0,0,0,0,25-35


In [3]:
# Define Features & Target

# Predictor columns selected from df_clean. The order of this list is the
# column order of X, and therefore the order in which the fitted model's
# coefficients are paired with feature names further down.
FEATURES = [
    "RevolvingUtilizationOfUnsecuredLines",
    "DebtRatio",
    "age",
    "MonthlyIncome",
    "NumberOfOpenCreditLinesAndLoans",
    "NumberRealEstateLoansOrLines",
    "TotalDelinquencyCount",
    "HighUtilizationFlag",
    "IncomeMissingFlag",
]

# Label column the classifier is trained to predict.
TARGET = "SeriousDlqin2yrs"

In [4]:
from sklearn.model_selection import train_test_split

# Separate the label from the predictor columns.
y = df_clean[TARGET]
X = df_clean[FEATURES]

# Hold out 30% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.3,
)

In [5]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Learn the per-column scaling statistics from the training split only, so
# nothing from the test split influences the transformation.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# Train Logistic Regression (Class-Weighted)
from sklearn.linear_model import LogisticRegression

# class_weight="balanced" asks scikit-learn to reweight the classes during
# fitting; max_iter is raised to 1000 for the lbfgs solver.
log_model = LogisticRegression(
    solver="lbfgs",
    class_weight="balanced",
    max_iter=1000,
)

# Kept as the final expression so the fitted estimator is rendered as the
# cell output.
log_model.fit(X_train_scaled, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [7]:
# Generate Probability of Default Scores
# Column 1 of predict_proba is taken as the PD score for each row
# (presumably the probability of the label value 1 — verify via log_model.classes_).
y_train_pd, y_test_pd = (
    log_model.predict_proba(scaled_features)[:, 1]
    for scaled_features in (X_train_scaled, X_test_scaled)
)

In [8]:
# Interpret Coefficients
# Pair each feature name with its fitted coefficient; FEATURES and
# log_model.coef_[0] line up because X was built with the columns in
# FEATURES order.
coef_df = pd.DataFrame(
    dict(Feature=FEATURES, Coefficient=log_model.coef_[0])
)

# Largest coefficient first, most negative last.
coef_df = coef_df.sort_values("Coefficient", ascending=False)

coef_df

Unnamed: 0,Feature,Coefficient
6,TotalDelinquencyCount,2.892577
0,RevolvingUtilizationOfUnsecuredLines,0.897507
4,NumberOfOpenCreditLinesAndLoans,0.158899
5,NumberRealEstateLoansOrLines,0.126711
8,IncomeMissingFlag,0.008326
7,HighUtilizationFlag,-0.049767
1,DebtRatio,-0.101324
3,MonthlyIncome,-0.207864
2,age,-0.266755


In [10]:
# Save the trained model
import joblib
from pathlib import Path

# Destination files for the fitted classifier and the scaler fitted alongside
# it. Both are needed later to score new data the same way.
MODEL_PATH = r"E:\Projects Data Scientist\credit-risk-project\models\logistic_pd_model.pkl"
SCALER_PATH = r"E:\Projects Data Scientist\credit-risk-project\models\standard_scaler.pkl"

# joblib.dump does not create missing directories; make sure the target folder
# exists so the cell also works when `models` has not been created yet.
for artifact_path in (MODEL_PATH, SCALER_PATH):
    Path(artifact_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(log_model, MODEL_PATH)
# Last expression: its return value (the list of written file names) is the
# cell output.
joblib.dump(scaler, SCALER_PATH)

['E:\\Projects Data Scientist\\credit-risk-project\\models\\standard_scaler.pkl']

In [11]:
# Build the scored hold-out table: the unscaled test features plus the true
# label and the model's predicted probability. `assign` returns a new frame,
# so X_test itself is left untouched.
test_results = X_test.assign(
    **{
        "actual_default": y_test.values,
        "pd": y_test_pd,
    }
)

# Write the table without the DataFrame index column.
test_results.to_csv(
    r"E:\Projects Data Scientist\credit-risk-project\data\processed\test_scored.csv",
    index=False,
)