# Bank Credit Analysis Project - Report Visualizations

This notebook generates visualizations and metrics for the project report. It includes:
1. Model Performance Analysis
2. Clustering Visualizations
3. Deep Learning Training Curves
4. Data Distribution Analysis

In [None]:
# Import required libraries
import sys

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import sklearn
import tensorflow as tf
from sklearn.metrics import silhouette_score
from tensorflow import keras

# Set style for plots.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*" and
# 3.8 removed the old "seaborn" name, so use whichever one this install provides.
for style_name in ('seaborn-v0_8', 'seaborn'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
sns.set_palette("husl")

# Display versions for reproducibility.
# The interpreter version comes from `sys`; pandas gets its own line (the previous
# "Python version" line printed the pandas version by mistake).
print(f"Python version: {sys.version.split()[0]}")
print(f"pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")
print(f"TensorFlow version: {tf.__version__}")
# Requires the top-level `import sklearn`; `from sklearn.metrics import ...` alone
# does not bind the name `sklearn`.
print(f"Scikit-learn version: {sklearn.__version__}")

## 1. Load and Prepare Data

First, we'll load the dataset and prepare it for analysis.

In [None]:
# Read the cleaned bank-credit table; `df` is the frame every later cell works on
df = pd.read_csv('cleaned_bank_credit_data.csv')

# Quick orientation: dimensions, column names, then the first few rows
dataset_shape = df.shape
column_names = df.columns.tolist()
sample_rows = df.head()

print("Dataset Shape:", dataset_shape)
print("\nColumns:", column_names)
print("\nSample Data:")
display(sample_rows)

## 2. Model Performance Analysis

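We compare the R² scores of the saved linear-regression and random-forest models for both prediction targets: amount outstanding and number of accounts (each scored on the `log1p` scale).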

In [None]:
# Deserialize the four saved regressors, keyed as "<estimator> (<target>)"
model_files = [
    ('Linear Regression (Amount)', 'bank_credit_model.joblib'),
    ('Random Forest (Amount)', 'rf_amount_model.joblib'),
    ('Linear Regression (Accounts)', 'account_prediction_model.joblib'),
    ('Random Forest (Accounts)', 'rf_accounts_model.joblib'),
]
models = {label: joblib.load(path) for label, path in model_files}

# Feature sets: five shared columns plus one target-specific column each
shared_features = ['region', 'population_group', 'bank_group', 'occupation_group', 'year']
amount_features = shared_features + ['no_of_accounts']
account_features = shared_features + ['credit_limit']

# Both targets are scored after a log1p transform.
# NOTE(review): presumably the models were fit on log1p targets as well - confirm
# against the training code.
X_amount = df[amount_features]
X_accounts = df[account_features]
log_amount = np.log1p(df['amount_outstanding'])
log_accounts = np.log1p(df['no_of_accounts'])

# R² per estimator, one dict per target
amount_scores = {
    'Linear Regression': models['Linear Regression (Amount)'].score(X_amount, log_amount),
    'Random Forest': models['Random Forest (Amount)'].score(X_amount, log_amount),
}
account_scores = {
    'Linear Regression': models['Linear Regression (Accounts)'].score(X_accounts, log_accounts),
    'Random Forest': models['Random Forest (Accounts)'].score(X_accounts, log_accounts),
}

# Grouped bar chart: one bar series per target, one group per estimator
fig = go.Figure()
fig.add_trace(go.Bar(
    name='Amount Prediction',
    x=list(amount_scores.keys()),
    y=list(amount_scores.values()),
))
fig.add_trace(go.Bar(
    name='Account Prediction',
    x=list(account_scores.keys()),
    y=list(account_scores.values()),
))
fig.update_layout(
    title='Model Performance Comparison (R² Score)',
    xaxis_title='Model Type',
    yaxis_title='R² Score',
    barmode='group',
)
fig.show()

## 3. Clustering Analysis

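We run KMeans, DBSCAN, and Agglomerative clustering on the same three credit features and compare the resulting clusters in PCA space, along with their silhouette scores and cluster sizes.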

In [None]:
# Import clustering module (project-local helper)
import clustering

# Features for clustering
features = ['credit_limit', 'amount_outstanding', 'no_of_accounts']

# Algorithms to compare; each entry is forwarded to clustering.run_clustering
algorithms = {
    'KMeans': {'algorithm': 'KMeans', 'params': {'n_clusters': 4}},
    'DBSCAN': {'algorithm': 'DBSCAN', 'params': {'eps': 0.5, 'min_samples': 5}},
    'Agglomerative': {'algorithm': 'Agglomerative', 'params': {'n_clusters': 4}}
}

# Run every configured algorithm on the same frame and feature list.
# Each result is indexed below by 'X_pca', 'labels' and 'metrics'.
results = {}
for name, config in algorithms.items():
    results[name] = clustering.run_clustering(
        df,
        features,
        algorithm=config['algorithm'],
        params=config['params'],
    )

# One scatter panel per algorithm. The grid width follows len(results) instead of
# a hard-coded 3, so adding or removing an entry in `algorithms` keeps the figure valid.
n_panels = len(results)
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 5), squeeze=False)

for ax, (name, result) in zip(axes[0], results.items()):
    # The first two columns of X_pca are plotted and labelled PC1 / PC2
    ax.scatter(result['X_pca'][:, 0], result['X_pca'][:, 1], c=result['labels'], cmap='tab10')
    ax.set_title(f'{name}\nSilhouette Score: {result["metrics"]["silhouette"]:.3f}')
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')

fig.tight_layout()
plt.show()

# Print cluster sizes
# NOTE(review): if run_clustering wraps scikit-learn's DBSCAN, label -1 denotes
# noise points rather than a real cluster - confirm.
for name, result in results.items():
    print(f"\n{name} Cluster Sizes:")
    unique, counts = np.unique(result['labels'], return_counts=True)
    for cluster, count in zip(unique, counts):
        print(f"Cluster {cluster}: {count} samples")

## 4. Deep Learning Analysis

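We train an autoencoder with a 2-dimensional bottleneck on the credit features, then inspect its training curves and the resulting 2D encoding coloured by bank group.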

In [None]:
# Load and train autoencoder (project-local helper module)
import deep_learning

# Seed NumPy and TensorFlow so the stochastic parts of training give the same
# curves on every Restart & Run All.
# NOTE(review): if DeepLearningModel sets its own seeds internally, those take
# precedence - confirm.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Select features for autoencoder
features = ['credit_limit', 'amount_outstanding', 'no_of_accounts']
X = df[features].values

# Create and train autoencoder; encoding_dim=2 so the embedding can be plotted directly
dl_model = deep_learning.DeepLearningModel()
dl_model.create_autoencoder(input_dim=len(features), encoding_dim=2)
dl_model.train(X, epochs=50)

# Get the training history (per-epoch metrics keyed by name)
history = dl_model.history.history

# Plot training curves
fig = go.Figure()
fig.add_trace(go.Scatter(y=history['loss'], name='Training Loss'))
# Whether a validation split is used is decided inside train(), which is not
# visible here, so only plot the validation curve when it was recorded.
if 'val_loss' in history:
    fig.add_trace(go.Scatter(y=history['val_loss'], name='Validation Loss'))
fig.update_layout(
    title='Autoencoder Training Progress',
    xaxis_title='Epoch',
    yaxis_title='Loss',
    showlegend=True
)
fig.show()

# Get encoded representation (two columns are read below)
encoded_data = dl_model.get_embeddings(X)

# Plot encoded data, coloured by the bank_group column of the source frame
fig = px.scatter(
    x=encoded_data[:, 0],
    y=encoded_data[:, 1],
    color=df['bank_group'],
    title='2D Encoded Representation by Bank Group'
)
fig.show()

## 5. Save Visualizations

Save the visualizations for the report.

In [None]:
# Create report figures directory
import os

FIG_DIR = 'report_figures'
os.makedirs(FIG_DIR, exist_ok=True)

# --- Model performance (interactive HTML) ---
# Rebuilt from the score dicts with the same layout shown in the notebook, so the
# saved file carries its title, axis labels and grouped bars.
fig_performance = go.Figure(data=[
    go.Bar(name='Amount Prediction', x=list(amount_scores.keys()), y=list(amount_scores.values())),
    go.Bar(name='Account Prediction', x=list(account_scores.keys()), y=list(account_scores.values()))
])
fig_performance.update_layout(
    title='Model Performance Comparison (R² Score)',
    xaxis_title='Model Type',
    yaxis_title='R² Score',
    barmode='group'
)
fig_performance.write_html(os.path.join(FIG_DIR, 'model_performance.html'))

# --- Clustering comparison (static PNG) ---
# Grid width follows the number of clustering results instead of a hard-coded 3.
n_panels = len(results)
fig_clusters, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 5), squeeze=False)
for ax, (name, result) in zip(axes[0], results.items()):
    ax.scatter(result['X_pca'][:, 0], result['X_pca'][:, 1], c=result['labels'], cmap='tab10')
    ax.set_title(f'{name}\nSilhouette Score: {result["metrics"]["silhouette"]:.3f}')
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
fig_clusters.tight_layout()
# Higher DPI and a tight bounding box for print; this changes the PNG's pixel size.
fig_clusters.savefig(os.path.join(FIG_DIR, 'clustering_comparison.png'), dpi=300, bbox_inches='tight')
# Close the figure so it is not rendered a second time inline and its memory is freed
plt.close(fig_clusters)

# --- Autoencoder training curves (interactive HTML) ---
fig_training = go.Figure()
fig_training.add_trace(go.Scatter(y=history['loss'], name='Training Loss'))
# The validation curve is only present when training recorded it
if 'val_loss' in history:
    fig_training.add_trace(go.Scatter(y=history['val_loss'], name='Validation Loss'))
fig_training.update_layout(
    title='Autoencoder Training Progress',
    xaxis_title='Epoch',
    yaxis_title='Loss',
    showlegend=True
)
fig_training.write_html(os.path.join(FIG_DIR, 'autoencoder_training.html'))

# --- Autoencoder 2D embedding (interactive HTML) ---
# This plot is shown in the notebook but was never written out, although the
# message below reports that all visualizations are saved.
fig_embedding = px.scatter(
    x=encoded_data[:, 0],
    y=encoded_data[:, 1],
    color=df['bank_group'],
    title='2D Encoded Representation by Bank Group'
)
fig_embedding.write_html(os.path.join(FIG_DIR, 'autoencoder_embedding.html'))

print(f"All visualizations have been saved to the '{FIG_DIR}' directory.")