In [1]:
## Executive Summary
# Baseline default rate: 21.81%
# Default risk increases sharply with 'interest rate' and 'worsening loan grade'.
# 'Income' is a strong protective factor: higher income = lower default.
# 'Loan percent of income' is the strongest numeric risk driver (a higher repayment burden leads to a higher default rate).
# Employment length shows a stability effect: 5–20 years has the lowest default rates; risk increases again at 20+ years.
# Logistic Regression baseline model was conservative (high non-default recall, low default recall).
# Using `class_weight='balanced'` increased default recall substantially at the cost of lower precision (expected trade-off).


SyntaxError: invalid character '→' (U+2192) (3853793217.py, line 4)

In [None]:
import pandas as pd

In [None]:
import os
# Display the current working directory of the running process.
os.getcwd()

In [None]:
# Read the credit-risk CSV into the working DataFrame `df`.
data_path = r"C:\Users\cbayr\OneDrive\Belgeler\banking-analysis\data\credit_risk_dataset.csv"
df = pd.read_csv(data_path)

In [None]:
# INSPECTING THE DATA

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

In [None]:
import pandas as pd
import os

In [None]:
# Print the working directory, then (re)load the dataset and preview it.
working_dir = os.getcwd()
print(working_dir)

csv_path = r"C:\Users\cbayr\OneDrive\Belgeler\banking-analysis\data\credit_risk_dataset.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
whos

In [None]:
# List the variables currently defined in the interactive namespace.
%whos

In [None]:
# Inspect the loan_intent column.
df.loc[:, 'loan_intent']

In [None]:
# Summary statistics of the loaded data, before any filtering.
df.describe()

In [None]:
# Keep only borrowers aged 18 to 100 (inclusive).
# `.copy()` makes the result an independent DataFrame: later cells add and
# overwrite columns on `df`, and doing that on a filtered slice triggers
# pandas' SettingWithCopyWarning.
df = df[(df['person_age'] >= 18) & (df['person_age'] <= 100)].copy()


In [None]:
# Drop rows with unrealistic ages (above 100); this removes 4 rows.

In [None]:
# Re-check the summary statistics after the age filter.
df.describe()

In [None]:
# Number of non-null values per column.
df.count()

In [None]:
# (rows, columns) of the current DataFrame.
df.shape

In [None]:
# Column names available in the dataset.
df.columns

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Count of missing values in every column.
df.isnull().sum()

In [None]:
# Percentage of missing values in every column.
df.isna().sum().div(len(df)).mul(100)

In [None]:
# Class balance of the target column.
# A notebook cell only renders its last expression, so the raw counts on the
# first line were previously computed and then discarded. Show the counts and
# the shares side by side in one table instead.
status_counts = df['loan_status'].value_counts()
status_share = df['loan_status'].value_counts(normalize=True)
pd.DataFrame({'count': status_counts, 'share': status_share})

In [None]:
# Mean of loan_status (i.e. the default rate) within each loan grade.
df.groupby('loan_grade')['loan_status'].agg('mean')

In [None]:
# Default rate per loan grade, ordered by grade label.
grade_default_rate = df.groupby('loan_grade')['loan_status'].mean()
grade_default_rate.sort_index()

In [None]:
# Number of loans in each grade.
df['loan_grade'].value_counts()

In [None]:
### Handling Missing Interest Rates

# Approximately 9.6% of loan interest rates are missing. Since interest rates are determined by borrower risk and loan grade, missing values were imputed using the median interest rate within each loan grade to preserve the risk-pricing structure. A missing indicator was retained as a potential risk signal.


In [None]:
# Median interest rate within each loan grade (the values used for imputation).
df.groupby('loan_grade')['loan_int_rate'].agg('median')


In [None]:
# Flag rows whose interest rate was originally missing. The notebook's
# narrative states that a missing indicator is retained for this column, but
# no such column was being created; it must be computed before imputation.
df['loan_int_rate_missing'] = df['loan_int_rate'].isna().astype(int)

# Fill missing interest rates with the median rate of the row's loan grade.
# transform('median') returns, for every row, the median of its grade, so a
# plain fillna against it replaces only the missing entries.
grade_median_rate = df.groupby('loan_grade')['loan_int_rate'].transform('median')
df['loan_int_rate'] = df['loan_int_rate'].fillna(grade_median_rate)


In [None]:
#Create a missing-value indicator for employment length, as missing employment history itself may carry information about credit risk.


In [None]:
# 1 where employment length is missing, 0 otherwise (computed before imputation).
emp_length_is_missing = df['person_emp_length'].isnull()
df['emp_length_missing'] = emp_length_is_missing.astype(int)


In [None]:
# Replace missing employment lengths with the column median.
emp_length_median = df['person_emp_length'].median()
df['person_emp_length'] = df['person_emp_length'].fillna(emp_length_median)


In [None]:
# Preview the data after imputation and the added indicator column.
df.head()

In [None]:
# Percentage of missing values per column after imputation.
remaining_missing = df.isna().sum()
remaining_missing / len(df) * 100


In [None]:
## ANALYSIS

In [None]:
# Share of each loan_status value in the cleaned data.
status_col = df['loan_status']
status_col.value_counts(normalize=True)


# Observed split: 78.18% of the credits are good and 21.81% of the credits are bad.

In [None]:
# Default rate (mean of loan_status) per loan grade.
df.groupby('loan_grade')['loan_status'].mean()


In [None]:
# Default risk increases sharply from grade C to D and from F to G

In [None]:
# Split interest rates into five quantile bins and compute the default rate
# within each bin.
df['int_rate_bin'] = pd.qcut(df['loan_int_rate'], q=5)

# The bin column is categorical. `observed` is passed explicitly so the
# grouping keeps its current behaviour without relying on pandas' deprecated
# implicit default (which raises a FutureWarning).
df.groupby('int_rate_bin', observed=False)['loan_status'].mean()


In [None]:
# Hand-picked interest-rate bin edges (in percentage points); each bin is
# right-closed, e.g. (8, 10].
bins = [0, 8, 10, 12, 14, 16, 18, 30]
df['int_rate_custom_bin'] = pd.cut(df['loan_int_rate'], bins=bins)

# Default rate per custom bin. `observed=False` is stated explicitly so the
# categorical grouping does not rely on pandas' deprecated implicit default.
df.groupby('int_rate_custom_bin', observed=False)['loan_status'].mean()


In [None]:
# Interest rate bins were created using quantiles to ensure comparable sample sizes. The highest interest rate bin spans a wider range, reflecting the scarcity of very high-rate loans. Despite this, the default rate in this segment exceeds 50%, more than double the baseline, indicating a strong and economically meaningful relationship between pricing and credit risk.

In [None]:
# Number of loans falling in each quantile-based interest-rate bin.
df['int_rate_bin'].value_counts()


In [None]:
#Full Exploratory Data Analysis (EDA)

In [None]:
# Default rate per loan grade, ranked from lowest to highest.
rate_per_grade = df.groupby('loan_grade')['loan_status'].mean()
rate_per_grade.sort_values(ascending=True)


In [None]:
# Bar chart of the default rate per loan grade. A title and axis labels are
# set so the figure can be read on its own.
df.groupby('loan_grade')['loan_status'].mean().plot(
    kind='bar',
    title='Default rate by loan grade',
    xlabel='Loan grade',
    ylabel='Default rate',
)


In [None]:
# Default rate increases as the loan grade worsens. Loan grade is a strong risk indicator.

In [None]:
# Split loan amounts into five quantile bins and compute the default rate per
# bin. `observed=False` is passed explicitly because the bin column is
# categorical and pandas' implicit default for it is deprecated.
df['loan_amt_bin'] = pd.qcut(df['loan_amnt'], q=5)
df.groupby('loan_amt_bin', observed=False)['loan_status'].mean()


In [None]:
# Bar chart of the default rate per loan-amount bin, with a title and axis
# labels. `observed=False` is explicit because loan_amt_bin is categorical
# and pandas' implicit default for it is deprecated.
df.groupby('loan_amt_bin', observed=False)['loan_status'].mean().plot(
    kind='bar',
    title='Default rate by loan amount bin',
    xlabel='Loan amount bin',
    ylabel='Default rate',
)


In [None]:
# Default rate for every distinct employment length value.
df.groupby('person_emp_length')['loan_status'].agg('mean')


In [None]:
# Keep only rows with an employment length below 60 years.
# `.copy()` makes the result an independent DataFrame, so the columns added
# in later cells are not written to a filtered slice (SettingWithCopyWarning).
df = df[df['person_emp_length'] < 60].copy()


In [None]:
#Default rates are lowest for mid-sized loans (₺4,400–10,000), suggesting these borrowers are financially stable.

# The highest default rates occur in the largest loan bin (₺14,500–35,000), likely due to higher monthly payment burden.

#Interestingly, the smallest loan bin also shows above-average default, which may indicate lower-income or subprime borrowers who only qualify for small loan amounts.


In [None]:
# Bucket employment length into labelled ranges and compute the default rate
# per bucket.
# Fixes relative to the previous version:
#   * include_lowest=True keeps borrowers with exactly 0 years of employment
#     in the '0–1 yrs' bucket. With the default right-closed intervals the
#     first bin was (0, 1], so those rows were assigned NaN and dropped from
#     the grouped statistics.
#   * the last edge is open-ended so the '20+' bucket matches its label
#     instead of stopping at 50.
df['emp_bin'] = pd.cut(
    df['person_emp_length'],
    bins=[0, 1, 3, 5, 10, 20, float('inf')],
    labels=['0–1 yrs','1–3','3–5','5–10','10–20','20+'],
    include_lowest=True
)

# emp_bin is categorical: pass `observed` explicitly rather than relying on
# pandas' deprecated implicit default.
df.groupby('emp_bin', observed=False)['loan_status'].mean()


In [None]:
# Median income per employment-length bucket. `observed=False` is explicit
# because emp_bin is categorical and pandas' implicit default is deprecated.
df.groupby('emp_bin', observed=False)['person_income'].median()


In [None]:
# The relationship between employment length and default follows the expected credit-risk behavior. Borrowers with short job length (0–3 years) show the 
# highest default rates, likely due to unstable income and limited work history. 
# Default risk decreases steadily for borrowers with 5–20 years of experience, reflecting higher income stability. Interestingly, default rises again in the 20+ years group, which may correspond to older borrowers approaching 
# retirement, experiencing income decline, or having higher financial pressure.


In [None]:
# Split income into five quantile bins and compute the default rate per bin.
# `observed=False` is passed explicitly because the bin column is categorical
# and pandas' implicit default for it is deprecated.
df['income_bin'] = pd.qcut(df['person_income'], q=5)
df.groupby('income_bin', observed=False)['loan_status'].mean()


In [None]:
# Default rates decrease consistently as income increases. Borrowers earning less than 35k show extremely high default rates (43%), while borrowers 
# earning above 86k have very low default rates (9%). This confirms income  is one of the strongest predictors of credit risk.

#This pattern also explains the behavior observed in the employment-length analysis: as income rises during mid-career (5–20 years of experience), default rates fall. After 20+ years of employment, income declines slightly, 
# which corresponds to the rise in default in that group.

In [None]:
# Select every numeric column for the correlation analysis.
# 'number' is used instead of listing 'float64'/'int64': columns created with
# astype(int) can be int32 on some platforms and would otherwise be dropped
# from the selection without notice.
numeric_df = df.select_dtypes(include='number')
numeric_df.head()


In [None]:
# Pairwise Pearson correlation between the numeric columns.
corr = numeric_df.corr(method='pearson')
corr


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw the correlation matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Predictors for the first model (includes person_income) and the target.
baseline_features = [
    'person_age',
    'person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
    'emp_length_missing',
]
X = df[baseline_features]
y = df['loan_status']

# Hold out 25% of the rows for testing; stratify to keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.25, random_state=42
)

# Standardise the features using statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the scaled training data; the final line
# displays the fitted estimator, as before.
logreg = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
logreg


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict on the held-out set and report the standard classification metrics.
y_pred = logreg.predict(X_test_scaled)

test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)


In [None]:
# Pair each feature with its fitted coefficient, largest coefficient first.
feature_names = X.columns
weights = logreg.coef_[0]
coefficients = pd.DataFrame({'Feature': feature_names, 'Coefficient': weights})
coefficients = coefficients.sort_values('Coefficient', ascending=False)

coefficients


In [None]:
# The model above, which includes person_income, is kept for reference. person_income is removed from the models below due to multicollinearity.

In [None]:
# Copy of the data without the person_income column.
df_model = df.drop('person_income', axis=1)


In [None]:
import pandas as pd

# Predictors for the revised model: person_income is not included.
features = [
    'person_age', 'person_emp_length',
    'loan_amnt', 'loan_int_rate', 'loan_percent_income',
    'cb_person_cred_hist_length', 'emp_length_missing',
]

# Target: 0 = non-default, 1 = default.
y = df['loan_status']
X = df.loc[:, features]


In [None]:
from sklearn.model_selection import train_test_split

# 75/25 stratified split with a fixed seed so the result is repeatable.
split = train_test_split(X, y, stratify=y, random_state=42, test_size=0.25)
X_train, X_test, y_train, y_test = split


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters on the training split, then apply them to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
from sklearn.linear_model import LogisticRegression

# Fit the logistic regression on the revised feature set; the final line
# displays the fitted estimator, as before.
logreg = LogisticRegression(max_iter=2000).fit(X_train_scaled, y_train)
logreg


In [None]:
# Class probabilities for the test set; column 1 is taken as the probability of default.
test_proba = logreg.predict_proba(X_test_scaled)
y_proba = test_proba[:, 1]

# Hard class predictions for the same rows.
y_pred = logreg.predict(X_test_scaled)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Report accuracy, confusion matrix and per-class metrics for the revised model.
model_accuracy = accuracy_score(y_test, y_pred)
model_confusion = confusion_matrix(y_test, y_pred)
model_report = classification_report(y_test, y_pred)

print("Accuracy:", model_accuracy)
print("\nConfusion Matrix:\n", model_confusion)
print("\nClassification Report:\n", model_report)


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

# ROC curve points and area under the curve for the predicted default probabilities.
auc = roc_auc_score(y_test, y_proba)
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# Plot the model's curve against the diagonal of a random classifier.
fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(fpr, tpr, label=f"LogReg (AUC = {auc:.3f})")
ax.plot([0, 1], [0, 1], linestyle="--", label="Random")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve — Logistic Regression")
ax.legend()
plt.show()


In [None]:
# Coefficient of each feature in the revised model, largest first.
fitted_weights = logreg.coef_[0]
coef_df = pd.DataFrame({"feature": features, "coef": fitted_weights})
coef_df = coef_df.sort_values(by="coef", ascending=False)

coef_df


In [None]:
from sklearn.linear_model import LogisticRegression

# Same model, but with class weights set to 'balanced'.
logreg_bal = LogisticRegression(class_weight='balanced', max_iter=2000)

# The final line displays the fitted estimator, as before.
logreg_bal.fit(X_train_scaled, y_train)


In [None]:
# Test-set probabilities and hard predictions from the balanced model;
# column 1 is taken as the probability of default.
balanced_proba = logreg_bal.predict_proba(X_test_scaled)
y_proba_bal = balanced_proba[:, 1]
y_pred_bal = logreg_bal.predict(X_test_scaled)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Confusion matrix, per-class metrics and ROC-AUC for the balanced model.
balanced_confusion = confusion_matrix(y_test, y_pred_bal)
balanced_report = classification_report(y_test, y_pred_bal)
auc_bal = roc_auc_score(y_test, y_proba_bal)

print("Confusion Matrix (Balanced):\n", balanced_confusion)
print("\nClassification Report (Balanced):\n", balanced_report)

print("\nROC-AUC (Balanced):", auc_bal)


In [None]:
### Model Comparison and Interpretation

After applying class weighting to address class imbalance, the model behavior changed significantly.

- **Defaulters (class = 1):**
  - Recall increased from **0.39 to 0.74**, indicating a substantial improvement in catching risky borrowers.
  - Precision decreased from **0.69 to 0.48**, reflecting an increase in false positives.

- **Non-defaulters (class = 0):**
  - Recall decreased from **0.95 to 0.77**, meaning fewer safe borrowers were correctly identified.
  - Precision increased from **0.85 to 0.91**, indicating more confidence when predicting non-default.

These results highlight a clear trade-off between loss prevention and customer approval.

The **baseline logistic regression** model is conservative, favoring safe borrowers and minimizing false rejections, but it misses a large portion of defaulters.

The **balanced logistic regression** model is risk-focused, prioritizing the detection of defaulters at the cost of increased false alarms.

The choice between these models depends on the bank’s current risk appetite, regulatory environment, and strategic objectives.
