In [1]:
## Basic Libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Models
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import ConfusionMatrixDisplay

# Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

# Fairness & Bias Analysis (optional)
!pip install fairlearn
from fairlearn.metrics import MetricFrame, selection_rate, demographic_parity_difference


Collecting fairlearn
  Downloading fairlearn-0.13.0-py3-none-any.whl.metadata (7.3 kB)
Collecting scipy<1.16.0,>=1.9.3 (from fairlearn)
  Downloading scipy-1.15.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m62.0/62.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Downloading fairlearn-0.13.0-py3-none-any.whl (251 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m251.6/251.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.15.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.3 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m37.3/37.3 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25hI

In [None]:
from google.colab import files
uploaded = files.upload()

# Load the CSV file with semicolon delimiter
df = pd.read_csv("bank.csv", sep=';')
# A bare `df.head()` in the middle of a cell is evaluated and thrown away;
# display() is what actually renders the preview.
display(df.head())

# Convert target 'y' from 'yes'/'no' to binary (1 = default, 0 = no default).
# Any label other than 'yes'/'no' becomes NaN here and is removed by the
# dropna() below.
# NOTE(review): the feature names used later in this notebook (call duration,
# previous campaign outcome, contact type) suggest marketing-campaign data --
# confirm that 'y' really encodes loan default.
df['y'] = df['y'].map({'yes': 1, 'no': 0})

# Drop rows with missing values, and say how many were removed so the loss of
# data is visible instead of silent.
rows_before = len(df)
df = df.dropna()
rows_dropped = rows_before - len(df)
if rows_dropped:
    print(f"Dropped {rows_dropped} of {rows_before} rows containing missing values")

# Once the NaN rows are gone the target can be held as a real integer column
# (map() yields floats whenever at least one label failed to map).
df = df.assign(y=df['y'].astype(int))

# One-hot encode categorical variables; drop_first avoids a redundant dummy
# column per categorical feature.
df_encoded = pd.get_dummies(df, drop_first=True)

# Define features and target
X = df_encoded.drop('y', axis=1)
y = df_encoded['y']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Logistic regression baseline, allowed up to 1000 solver iterations.
# fit() returns the estimator itself, so construction and training can be chained.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
lr


In [None]:
# Depth-limited decision tree. The seed matches the one used for the
# train/test split so that the fitted tree (and every plot or metric derived
# from it) is identical on each Restart & Run All.
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
dt.fit(X_train, y_train)


In [None]:
# Gradient boosting with library-default hyper-parameters. The seed matches the
# one used for the train/test split so that the ensemble, its metrics and its
# feature importances are reproducible across runs.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)


In [None]:
# Fitted estimators keyed by the heading printed above their metrics
models = {
    "Logistic Regression": lr,
    "Decision Tree": dt,
    "Gradient Boosting": gb,
}

# (printed label, scoring function) pairs, in the order they are reported
scalar_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
)

for name, model in models.items():
    # Score every model on the same held-out rows
    y_pred = model.predict(X_test)
    print(f"\n--- {name} ---")
    for metric_label, metric_fn in scalar_metrics:
        print(metric_label, metric_fn(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


In [None]:
# Per-row class probabilities from the logistic regression; the second column
# (index 1) is kept as the score that gets bucketed into risk categories.
test_class_proba = lr.predict_proba(X_test)
probs = test_class_proba[:, 1]

def classify_risk(p, low_cutoff=0.3, high_cutoff=0.7):
    """Bucket a predicted probability into a coarse risk label.

    Parameters
    ----------
    p : float
        Predicted probability to classify.
    low_cutoff : float, default 0.3
        Values strictly below this cut-off are labelled "Low".
    high_cutoff : float, default 0.7
        Values at or above this cut-off are labelled "High".

    Returns
    -------
    str
        "Low" when ``p < low_cutoff``, "Medium" when
        ``low_cutoff <= p < high_cutoff``, otherwise "High".

    Notes
    -----
    NaN compares false against both cut-offs and therefore falls through to
    "High"; filter missing values beforehand if that is not wanted.
    """
    if p < low_cutoff:
        return "Low"
    if p < high_cutoff:
        return "Medium"
    return "High"

# Fixed display order and colour for every bucket. value_counts() sorts by
# frequency and leaves out empty buckets, so colours passed positionally could
# end up on the wrong bar; reindexing pins each bar to its own category.
risk_order = ["Low", "Medium", "High"]
risk_palette = {"Low": "green", "Medium": "orange", "High": "red"}

risk_categories = pd.Series(probs).apply(classify_risk)
risk_counts = risk_categories.value_counts().reindex(risk_order, fill_value=0)

# Visualize
risk_counts.plot(kind='bar', color=[risk_palette[level] for level in risk_order])
plt.title("Borrower Risk Categories")
plt.xlabel("Risk Category")
plt.ylabel("Number of Borrowers")
plt.show()


In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Readable replacements for selected column names; columns that are not listed
# here keep their original name in the plot.
friendly_feature_names = dict(
    duration='Call Duration (seconds)',
    poutcome_success='Previous Campaign: Success',
    age='Age',
    balance='Account Balance',
    day='Day of Month Contacted',
    month_may='Contacted in May',
    month_oct='Contacted in October',
    contact_unknown='Contact Type: Unknown',
    previous='Previous Campaign Contacts',
    pdays='Days Since Last Contact',
)
features_friendly = list(
    map(lambda col: friendly_feature_names.get(col, col), X.columns)
)

# Drawing options gathered in one place; only the top two levels of the fitted
# tree are rendered so that the node text stays legible.
tree_plot_options = dict(
    feature_names=features_friendly,
    class_names=['No Default', 'Default'],
    filled=True,
    rounded=True,
    max_depth=2,  # Limit depth for readability
    fontsize=10,
)

plt.figure(figsize=(14, 6))
plot_tree(dt, **tree_plot_options)
plt.title("Decision Tree Visualization (Max Depth = 2)")
plt.show()


In [None]:
# Evaluate the gradient boosting model on the held-out test rows
y_pred_gb = gb.predict(X_test)

# (printed caption, scoring function) pairs, reported in this order
gb_scores = (
    ("üîç Accuracy (Overall Correct Predictions):", accuracy_score),
    ("üéØ Precision (Correct Defaults Among Predicted Defaults):", precision_score),
    ("üì¢ Recall (Correctly Caught Actual Defaults):", recall_score),
)

print("üìå Model: Gradient Boosting Classifier")
for caption, scorer in gb_scores:
    print(caption, scorer(y_test, y_pred_gb))

# Confusion matrix rendered as a heat map with readable class labels
cm_display_options = dict(
    display_labels=["No Default", "Default"],
    cmap="Purples",
)
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_gb, **cm_display_options)
plt.title("üìä Confusion Matrix - Gradient Boosting")
plt.show()


In [None]:
# Display names for selected columns; anything not listed keeps its raw name
friendly_labels = dict(
    duration='Call Duration (seconds)',
    poutcome_success='Previous Campaign: Success',
    age='Age',
    day='Day of the Month Contacted',
    month_oct='Contact Month: October',
    balance='Account Balance',
    contact_unknown='Contact Type: Unknown',
    previous='Previous Contacts',
    pdays='Days Since Last Contact',
    month_jun='Contact Month: June',
)

# argsort is ascending, so the last ten positions hold the ten largest
# importances, ordered from smallest to largest
gb_importances = gb.feature_importances_
indices = np.argsort(gb_importances)[-10:]
features = X.columns[indices].tolist()
labels = [friendly_labels[f] if f in friendly_labels else f for f in features]  # Replace if in dict

bar_positions = range(len(indices))

plt.figure(figsize=(10, 6))
plt.barh(bar_positions, gb_importances[indices], color='purple')
plt.yticks(bar_positions, labels)
plt.xlabel("Importance Score")
plt.ylabel("Feature")
plt.title("üåü Top 10 Features Influencing Default Risk (Gradient Boosting)")
plt.grid(True, linestyle='--', alpha=0.5)
plt.show()

In [None]:
# Step 1: Copy original DataFrame and add predictions
# Reload the original CSV into df (if not still in memory)
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Rebuild the modelling frame so that this cell also works on a fresh kernel.
# The CSV is read once; the mapped frame is reused for both training and the
# per-group report below.
df = pd.read_csv("bank.csv", sep=";")
df['y'] = df['y'].map({'yes': 1, 'no': 0})
# Same missing-value handling as the first preprocessing cell, so the model
# trained here sees the same rows; the target is then a clean 0/1 integer.
df = df.dropna().astype({'y': int})

df_encoded = pd.get_dummies(df, drop_first=True)
X = df_encoded.drop('y', axis=1)
y = df_encoded['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Original (un-encoded) columns plus the model's prediction for every row.
# NOTE(review): predictions cover the whole frame, i.e. they include the 80% of
# rows the model was trained on -- restrict to X_test.index if an out-of-sample
# audit is wanted.
df_bias = df.copy()
df_bias['y_pred'] = lr.predict(X)  # or gb.predict(...)

# Step 2: Check outcome rates by group
print("üîç Predicted Default Rate by Marital Status:")
print(df_bias.groupby('marital')['y_pred'].mean())

# Step 3: Confusion Matrix by group
from sklearn.metrics import confusion_matrix

# sort=False keeps the groups in order of first appearance. labels=[0, 1]
# forces a 2x2 matrix even when a group contains only one class, so the
# matrices stay comparable across groups.
for group, sub in df_bias.groupby('marital', sort=False):
    cm = confusion_matrix(sub['y'], sub['y_pred'], labels=[0, 1])
    print(f"\nüìä Confusion Matrix for: {group}")
    print(cm)

# Step 4: Disparate Impact - Approval Rate
# Share of rows per group predicted as 0. Taking the mean of a boolean mask
# yields 0.0 (not NaN) for a group without a single approval, which dividing
# two separate groupby().size() results does not.
approval_rate = (
    df_bias['y_pred'].eq(0)
    .groupby(df_bias['marital'])
    .mean()
    .rename("approval_rate")
)

print("\nüìà Loan Approval Rate (Model-predicted No Default) by Marital Status:")
print(approval_rate)

# Optional Step 5: Bar plot
approval_rate.plot(kind='bar', color='skyblue')
plt.title("Model Approval Rates by Marital Status")
plt.ylabel("Approval Rate")
plt.xlabel("Group")
plt.ylim(0, 1)
plt.grid(True, linestyle='--', alpha=0.4)
plt.show()
