<a href="https://colab.research.google.com/github/aswathi603/Bank_Prediction_Model/blob/main/Bank_Prediction_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install required packages
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install pyspark
!pip install findspark

# Step 2: Set up environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files

# For dashboard
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [None]:
# Step 2: Upload dataset
print("Please upload your banking.csv file:")
uploaded = files.upload()

# Fail with a clear message when nothing was uploaded (e.g. the dialog was
# cancelled) instead of an opaque IndexError on the empty mapping.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select banking.csv.")

# Only the first uploaded file is used.
filename = next(iter(uploaded))
print(f"Uploaded file: {filename}")

# Step 3: Initialize Spark Session
spark = (
    SparkSession.builder
    .appName("BankingPredictionNoPipeline")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

print("Spark session created successfully!")

Please upload your banking.csv file:


Saving banking.csv to banking.csv
Uploaded file: banking.csv
Spark session created successfully!


In [None]:
# Step 4: Load data without pipeline
df = spark.read.option("header", "true").option("inferSchema", "true").csv(filename)

print("=== DATASET OVERVIEW ===")
print(f" Rows: {df.count()}, Columns: {len(df.columns)}")
print("\nFirst 5 rows:")
df.show(5)

print("\nSchema:")
df.printSchema()

# Check for missing values.
# All per-column null counts are computed in ONE aggregation instead of one
# Spark job per column. The loop variable is deliberately not called `col`,
# which would shadow the star-imported pyspark.sql.functions.col.
print("\n=== MISSING VALUES ===")
null_counts = df.select(
    [F.count(F.when(df[name].isNull(), 1)).alias(name) for name in df.columns]
).first().asDict()

for name in df.columns:
    null_count = null_counts[name]
    if null_count > 0:
        print(f" {name}: {null_count} missing values")
    else:
        print(f" {name}: No missing values")

# Check target distribution
print("\n=== TARGET VARIABLE DISTRIBUTION ===")
df.groupBy("y").count().show()

=== DATASET OVERVIEW ===
 Rows: 41188, Columns: 21

First 5 rows:
+---+-----------+-------+-----------------+-------+-------+----+--------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+---------+-----------+---+
|age|        job|marital|        education|default|housing|loan| contact|month|day_of_week|duration|campaign|pdays|previous|   poutcome|emp_var_rate|cons_price_idx|cons_conf_idx|euribor3m|nr_employed|  y|
+---+-----------+-------+-----------------+-------+-------+----+--------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+---------+-----------+---+
| 44|blue-collar|married|         basic.4y|unknown|    yes|  no|cellular|  aug|        thu|     210|       1|  999|       0|nonexistent|         1.4|        93.444|        -36.1|    4.963|     5228.1|  0|
| 53| technician|married|          unknown|     no|     no|  no|cellular|  nov|        fri|     138|       1|  999

In [None]:
# Step 5: Handle missing values manually
print("=== CLEANING DATA ===")

# fillna returns a NEW DataFrame, so `df` itself is left untouched and this
# cell can be re-run safely. One call with a column -> replacement mapping
# replaces the four separate fillna calls.
df_clean = df.fillna({
    'job': 'unknown',
    'education': 'unknown',
    'contact': 'unknown',
    'poutcome': 'nonexistent',
})

print(" Missing values handled successfully!")

# Verify no missing values remain (single aggregation over all columns
# instead of one Spark job per column).
print("\n=== VERIFYING CLEAN DATA ===")
remaining_nulls = df_clean.select(
    [F.count(F.when(df_clean[name].isNull(), 1)).alias(name) for name in df_clean.columns]
).first().asDict()

# List of (column name, null count) for columns that still contain nulls.
missing_after_clean = [
    (name, remaining_nulls[name])
    for name in df_clean.columns
    if remaining_nulls[name] > 0
]

if len(missing_after_clean) == 0:
    print(" All missing values have been handled!")
else:
    print(" Some missing values remain:")
    for name, null_count in missing_after_clean:
        print(f"   {name}: {null_count}")

=== CLEANING DATA ===
 Missing values handled successfully!

=== VERIFYING CLEAN DATA ===
 All missing values have been handled!


In [None]:
# Step 6: Manual feature engineering - StringIndexer for categorical variables
print("=== STEP 6: STRING INDEXING ===")

categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']

# Make the cell re-runnable: StringIndexer refuses to write to an output
# column that already exists, so drop any "*_index" columns left over from a
# previous execution first. drop() silently ignores names that are absent.
df_clean = df_clean.drop(*[name + "_index" for name in categorical_columns])

# Apply StringIndexer to each categorical column one by one.
# handleInvalid="keep" reserves an extra index for unseen/invalid values.
for name in categorical_columns:
    print(f"Indexing column: {name}")
    indexer = StringIndexer(inputCol=name, outputCol=name + "_index", handleInvalid="keep")
    df_clean = indexer.fit(df_clean).transform(df_clean)

print(" All categorical columns indexed!")

# Show what the indexed columns look like
print("\nSample of indexed data:")
df_clean.select('job', 'job_index', 'education', 'education_index').show(10)

=== STEP 6: STRING INDEXING ===
Indexing column: job
Indexing column: marital
Indexing column: education
Indexing column: default
Indexing column: housing
Indexing column: loan
Indexing column: contact
Indexing column: month
Indexing column: day_of_week
Indexing column: poutcome
 All categorical columns indexed!

Sample of indexed data:
+-----------+---------+-----------------+---------------+
|        job|job_index|        education|education_index|
+-----------+---------+-----------------+---------------+
|blue-collar|      1.0|         basic.4y|            4.0|
| technician|      2.0|          unknown|            6.0|
| management|      4.0|university.degree|            0.0|
|   services|      3.0|      high.school|            1.0|
|    retired|      5.0|         basic.4y|            4.0|
| management|      4.0|         basic.4y|            4.0|
|blue-collar|      1.0|         basic.4y|            4.0|
|blue-collar|      1.0|         basic.9y|            2.0|
|     admin.|      0.0|

In [None]:
# Step 7: Manual OneHot Encoding
print("=== STEP 7: ONE-HOT ENCODING ===")

# First, check what columns the DataFrame currently has
print("Current columns in DataFrame:")
print(df_clean.columns)

# Names of the one-hot output columns, in the same order as
# categorical_columns. Defined here (not only in commented-out code) because
# the feature-list step reads `encoded_columns`.
encoded_columns = [name + "_encoded" for name in categorical_columns]

# Apply OneHotEncoder to each indexed column, skipping columns that already
# exist so the cell can be re-run without an "output column exists" error.
for name, encoded_col_name in zip(categorical_columns, encoded_columns):
    if encoded_col_name in df_clean.columns:
        print(f"  Column {encoded_col_name} already exists. Skipping...")
        continue

    print(f"One-hot encoding: {name}")
    # dropLast=False keeps one slot per index value (no reference category dropped).
    encoder = OneHotEncoder(inputCol=name + "_index", outputCol=encoded_col_name, dropLast=False)
    df_clean = encoder.fit(df_clean).transform(df_clean)

print(" One-hot encoding completed!")

=== STEP 7: ONE-HOT ENCODING ===
Current columns in DataFrame:
['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp_var_rate', 'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed', 'y', 'job_index', 'marital_index', 'education_index', 'default_index', 'housing_index', 'loan_index', 'contact_index', 'month_index', 'day_of_week_index', 'poutcome_index', 'job_encoded', 'marital_encoded', 'education_encoded', 'default_encoded', 'housing_encoded', 'loan_encoded', 'contact_encoded', 'month_encoded', 'day_of_week_encoded', 'poutcome_encoded']
  Column job_encoded already exists. Skipping...
  Column marital_encoded already exists. Skipping...
  Column education_encoded already exists. Skipping...
  Column default_encoded already exists. Skipping...
  Column housing_encoded already exists. Skipping...
  Column loan_encoded already exists. Skipping...
  Column contact_encode

In [None]:
# Step 8: Prepare numerical features (continue from here)
print("=== STEP 8: PREPARING NUMERICAL FEATURES ===")

numerical_columns = ['age', 'duration', 'campaign', 'pdays', 'previous',
                    'emp_var_rate', 'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed']

print("Numerical columns to use:")
for i, name in enumerate(numerical_columns, 1):
    print(f"  {i}. {name}")

# `encoded_columns` was previously only assigned inside commented-out code,
# so this cell raised NameError on a fresh kernel. Derive it here from
# categorical_columns using the "<column>_encoded" naming of the one-hot step.
encoded_columns = [name + "_encoded" for name in categorical_columns]

# Combine all feature columns (one-hot vectors first, then numeric scalars).
# NOTE: this is a list of INPUT COLUMNS, not of vector slots - each one-hot
# column expands to several slots once assembled.
feature_columns = encoded_columns + numerical_columns
print(f"\nTotal features: {len(feature_columns)}")
print("First 5 feature columns:", feature_columns[:5])

=== STEP 8: PREPARING NUMERICAL FEATURES ===
Numerical columns to use:
  1. age
  2. duration
  3. campaign
  4. pdays
  5. previous
  6. emp_var_rate
  7. cons_price_idx
  8. cons_conf_idx
  9. euribor3m
  10. nr_employed

Total features: 20
First 5 feature columns: ['job_encoded', 'marital_encoded', 'education_encoded', 'default_encoded', 'housing_encoded']


In [None]:
# Step 9: Encode target variable manually
print("=== STEP 9: TARGET VARIABLE ENCODING ===")

# StringIndexer defaults to frequency ordering (most frequent value -> 0.0),
# which would silently flip the label if the positive class were the
# majority. Alphabetical ordering pins "0" -> 0.0 and "1" -> 1.0 regardless
# of class balance (and "no" -> 0.0, "yes" -> 1.0 for string targets).
label_indexer = StringIndexer(inputCol="y", outputCol="label", stringOrderType="alphabetAsc")
df_final = label_indexer.fit(df_clean).transform(df_clean)

print("Target variable distribution:")
df_final.groupBy("y", "label").count().show()

# Step 10: Manual feature vector assembly
print("=== STEP 10: FEATURE VECTOR ASSEMBLY ===")

# Concatenates the input columns, in order, into a single "features" vector.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df_assembled = assembler.transform(df_final)

print(" Feature vector created successfully!")
print("Final dataframe schema:")
df_assembled.printSchema()

# Show sample of features
print("\nSample of features and labels:")
df_assembled.select("features", "label", "y").show(5, truncate=False)

=== STEP 9: TARGET VARIABLE ENCODING ===
Target variable distribution:
+---+-----+-----+
|  y|label|count|
+---+-----+-----+
|  0|  0.0|36548|
|  1|  1.0| 4640|
+---+-----+-----+

=== STEP 5: FEATURE VECTOR ASSEMBLY ===
 Feature vector created successfully!
Final dataframe schema:
root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = false)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = false)
 |-- default: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = false)
 |-- month: string (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = false)
 |-- emp_var_rate: double (nullable = true)
 |-- cons_price_idx: double (nullable = true)
 |-- cons_conf_idx: double (nullab

In [None]:
# Step 10: Manual train-test split
print("=== STEP 10: TRAIN-TEST SPLIT ===")

train_data, test_data = df_assembled.randomSplit([0.7, 0.3], seed=42)

# Both splits are reused by several later actions (counts, training,
# prediction, evaluation, toPandas). Caching avoids re-reading the CSV and
# re-running every indexer/encoder for each of those actions.
train_data = train_data.cache()
test_data = test_data.cache()

print(f"Training data: {train_data.count()} rows")
print(f"Test data: {test_data.count()} rows")

# Step 11: Train Random Forest without Pipeline
print("=== STEP 11: TRAINING RANDOM FOREST ===")

# Initialize Random Forest with parameters
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=100,           # Number of trees in the forest
    maxDepth=10,            # Maximum depth of each tree
    seed=42,               # For reproducible results
    featureSubsetStrategy="auto",  # How many features to consider for splits
    impurity="gini"        # Splitting criterion
)

print("Starting model training...")
model = rf.fit(train_data)
print(" Model training completed!")
print(" Model training completed!")

=== STEP 10: TRAIN-TEST SPLIT ===
Training data: 28928 rows
Test data: 12260 rows
=== STEP 11: TRAINING RANDOM FOREST ===
Starting model training...
 Model training completed!


In [None]:
# Step 12: Score the held-out split with the fitted model
print("=== STEP 12: MAKING PREDICTIONS ===")

predictions = model.transform(test_data)

print("Predictions sample:")
predictions.select("y", "label", "prediction", "probability").show(10)

# Step 13: Evaluate the predictions with Spark's built-in evaluators
print("=== STEP 13: MODEL EVALUATION ===")

# Both multiclass evaluators compare the same label/prediction columns and
# differ only in the metric they report.
multiclass_options = {"labelCol": "label", "predictionCol": "prediction"}
evaluator_accuracy = MulticlassClassificationEvaluator(metricName="accuracy", **multiclass_options)
evaluator_f1 = MulticlassClassificationEvaluator(metricName="f1", **multiclass_options)

# AUC is computed from the raw (pre-threshold) scores rather than hard predictions.
evaluator_auc = BinaryClassificationEvaluator(
    labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC"
)

# Evaluate in the order accuracy -> F1 -> AUC
accuracy, f1_score, auc_score = (
    evaluator.evaluate(predictions)
    for evaluator in (evaluator_accuracy, evaluator_f1, evaluator_auc)
)

print(" MODEL PERFORMANCE METRICS:")
print(f" Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f" F1-Score:  {f1_score:.4f}")
print(f" AUC-ROC:   {auc_score:.4f}")

# Header for the manual confusion-matrix metrics
print("\n=== DETAILED MANUAL CALCULATIONS ===")

=== STEP 12: MAKING PREDICTIONS ===
Predictions sample:
+---+-----+----------+--------------------+
|  y|label|prediction|         probability|
+---+-----+----------+--------------------+
|  0|  0.0|       1.0|[0.24101303451513...|
|  0|  0.0|       0.0|[0.83371123702462...|
|  0|  0.0|       0.0|[0.50020004005393...|
|  1|  1.0|       0.0|[0.53169505151636...|
|  1|  1.0|       0.0|[0.92891478115240...|
|  1|  1.0|       1.0|[0.30701060673678...|
|  0|  0.0|       0.0|[0.93689003011112...|
|  0|  0.0|       0.0|[0.80648095547968...|
|  0|  0.0|       0.0|[0.76673491575338...|
|  0|  0.0|       1.0|[0.20101269028585...|
+---+-----+----------+--------------------+
only showing top 10 rows

=== STEP 13: MODEL EVALUATION ===
 MODEL PERFORMANCE METRICS:
 Accuracy:  0.9038 (90.38%)
 F1-Score:  0.8831
 AUC-ROC:   0.9387

=== DETAILED MANUAL CALCULATIONS ===


In [None]:
# Step 14: Confusion-matrix counts and derived metrics, computed by hand
def calculate_manual_metrics(predictions_df):
    """Print and return the binary confusion-matrix counts for a predictions frame.

    Args:
        predictions_df: object exposing ``select("label", "prediction")`` and
            ``toPandas()`` (a Spark DataFrame in this notebook).

    Returns:
        Tuple ``(tp, fp, tn, fn)`` of integer counts, where 1 is treated as the
        positive class and 0 as the negative class. Rows whose label or
        prediction is neither 0 nor 1 are not counted.
    """
    frame = predictions_df.select("label", "prediction").toPandas()

    # Tally (label, prediction) pairs in one pass; float values 0.0 / 1.0
    # compare and hash equal to the integer keys used here.
    tallies = {(1, 1): 0, (0, 1): 0, (0, 0): 0, (1, 0): 0}
    for pair in zip(frame['label'], frame['prediction']):
        if pair in tallies:
            tallies[pair] += 1

    tp = tallies[(1, 1)]
    fp = tallies[(0, 1)]
    tn = tallies[(0, 0)]
    fn = tallies[(1, 0)]

    # Each ratio falls back to 0 when its denominator is empty.
    predicted_positive = tp + fp
    actual_positive = tp + fn
    if predicted_positive > 0:
        precision = tp / predicted_positive
    else:
        precision = 0
    if actual_positive > 0:
        recall = tp / actual_positive
    else:
        recall = 0
    if (precision + recall) > 0:
        manual_f1 = 2 * (precision * recall) / (precision + recall)
    else:
        manual_f1 = 0

    report_lines = [
        " MANUALLY CALCULATED METRICS:",
        f"True Positives (TP): {tp}",
        f"False Positives (FP): {fp}",
        f"True Negatives (TN): {tn}",
        f"False Negatives (FN): {fn}",
        f"Manual Precision: {precision:.4f}",
        f"Manual Recall: {recall:.4f}",
        f"Manual F1-Score: {manual_f1:.4f}",
    ]
    for line in report_lines:
        print(line)

    return tp, fp, tn, fn

# Compute the confusion-matrix counts for the test-set predictions. The four
# counts are kept as module-level names because the results-saving step
# derives precision and recall from tp, fp and fn.
tp, fp, tn, fn = calculate_manual_metrics(predictions)

 MANUALLY CALCULATED METRICS:
True Positives (TP): 393
False Positives (FP): 128
True Negatives (TN): 10687
False Negatives (FN): 1052
Manual Precision: 0.7543
Manual Recall: 0.2720
Manual F1-Score: 0.3998


In [None]:
# Step 15: Manual feature importance analysis
print("=== STEP 15: FEATURE IMPORTANCE ANALYSIS ===")

# featureImportances holds one value per SLOT of the assembled "features"
# vector, not one per input column: every one-hot column contributes several
# slots. Zipping it directly with feature_columns therefore pairs column
# names with the wrong values, so sum the slots of each input column instead.
feature_importance = model.featureImportances
importance_values = feature_importance.toArray()

# One row is enough to read how many slots each input column occupies.
sample_row = predictions.select(*feature_columns).first()
if sample_row is None:
    raise ValueError("predictions is empty; cannot determine feature vector layout.")

# Create feature importance DataFrame manually
feature_importance_list = []
slot_offset = 0
for feature in feature_columns:
    value = sample_row[feature]
    # Vector-valued columns (one-hot output) span len(vector) slots; scalars span one.
    width = len(value) if hasattr(value, "toArray") else 1
    feature_importance_list.append({
        'feature': feature,
        # numpy's .sum() is used on purpose: the builtin `sum` is shadowed by
        # `from pyspark.sql.functions import *`.
        'importance': float(importance_values[slot_offset:slot_offset + width].sum())
    })
    slot_offset += width

# Guard against a layout mismatch rather than reporting misaligned numbers.
if slot_offset != len(importance_values):
    raise ValueError(
        f"Feature layout mismatch: columns cover {slot_offset} slots but the model "
        f"reports {len(importance_values)} importances."
    )

feature_importance_df = spark.createDataFrame(feature_importance_list)
feature_importance_pd = feature_importance_df.toPandas().sort_values('importance', ascending=False)

print(" TOP 10 MOST IMPORTANT FEATURES:")
print(feature_importance_pd.head(10))

# Save feature importance
feature_importance_pd.to_csv('feature_importance_manual.csv', index=False)
print(" Feature importance saved to 'feature_importance_manual.csv'")

=== STEP 15: FEATURE IMPORTANCE ANALYSIS ===
 TOP 10 MOST IMPORTANT FEATURES:
              feature  importance
18          euribor3m    0.004836
14           previous    0.004328
1     marital_encoded    0.003984
19        nr_employed    0.003436
2   education_encoded    0.003390
13              pdays    0.003287
5        loan_encoded    0.003283
0         job_encoded    0.003203
10                age    0.002656
15       emp_var_rate    0.002449
 Feature importance saved to 'feature_importance_manual.csv'


In [None]:
# Step 16: Create comprehensive dashboard without Pipeline dependencies
print("=== 16. CREATING INTERACTIVE DASHBOARD ===")

class ManualBankingDashboard:
    """Builds Plotly figures describing a fitted classifier and its predictions.

    The constructor converts the Spark inputs to pandas once; every
    ``create_*`` method then returns a Plotly figure without displaying it.

    Attributes set by ``prepare_dashboard_data``:
        original_pd: pandas copy of the first 1000 rows of ``original_df``.
        pred_pd: pandas copy of the prediction columns, plus
            ``probability_yes`` / ``probability_no`` float columns.
    """

    # Label values treated as the negative / positive class, in display order.
    CLASS_VALUES = [0.0, 1.0]

    def __init__(self, original_df, predictions_df, model, feature_columns):
        """Store the inputs and materialise the pandas frames used by the plots.

        Args:
            original_df: Spark DataFrame with the (cleaned) source rows.
            predictions_df: Spark DataFrame with "y", "label", "prediction",
                "probability" and "rawPrediction" columns; it must also still
                carry the columns named in ``feature_columns``.
            model: fitted model exposing ``featureImportances``.
            feature_columns: input column names, in the order they were
                assembled into the feature vector.
        """
        self.original_df = original_df
        self.predictions_df = predictions_df
        self.model = model
        self.feature_columns = feature_columns
        self.prepare_dashboard_data()

    def prepare_dashboard_data(self):
        """Convert the Spark inputs to pandas and split out class probabilities."""
        # Only a 1000-row head of the source data is pulled to the driver.
        self.original_pd = self.original_df.limit(1000).toPandas()
        self.pred_pd = self.predictions_df.select(
            "y", "label", "prediction", "probability", "rawPrediction"
        ).toPandas()

        # The probability column holds a 2-element vector: index 0 is the
        # probability of label 0 ("no"), index 1 of label 1 ("yes").
        self.pred_pd['probability_yes'] = self.pred_pd['probability'].apply(
            lambda x: float(x[1]) if x is not None else 0.0
        )
        self.pred_pd['probability_no'] = self.pred_pd['probability'].apply(
            lambda x: float(x[0]) if x is not None else 0.0
        )

        print(" Dashboard data prepared!")

    def create_performance_metrics(self, accuracy, f1, auc):
        """Create performance metrics visualization.

        Args:
            accuracy, f1, auc: metric values shown on three gauges whose axes
                run from 0 to 1. Note that the ``auc`` parameter shadows the
                imported ``auc`` function inside this method only.

        Returns:
            A Plotly figure with one indicator gauge per metric.
        """
        fig = make_subplots(
            rows=1, cols=3,
            specs=[[{"type": "indicator"}, {"type": "indicator"}, {"type": "indicator"}]],
            subplot_titles=['Accuracy', 'F1-Score', 'AUC-ROC']
        )

        # Accuracy gauge (the only one with shaded bands)
        fig.add_trace(go.Indicator(
            mode="gauge+number+delta",
            value=accuracy,
            title={'text': "Accuracy"},
            domain={'row': 0, 'column': 0},
            gauge={'axis': {'range': [0, 1]},
                   'bar': {'color': "darkblue"},
                   'steps': [{'range': [0, 0.6], 'color': "lightgray"},
                            {'range': [0.6, 0.8], 'color': "gray"},
                            {'range': [0.8, 1], 'color': "darkgray"}]}
        ), row=1, col=1)

        # F1-Score gauge
        fig.add_trace(go.Indicator(
            mode="gauge+number+delta",
            value=f1,
            title={'text': "F1-Score"},
            domain={'row': 0, 'column': 1},
            gauge={'axis': {'range': [0, 1]},
                   'bar': {'color': "darkgreen"}}
        ), row=1, col=2)

        # AUC-ROC gauge
        fig.add_trace(go.Indicator(
            mode="gauge+number+delta",
            value=auc,
            title={'text': "AUC-ROC"},
            domain={'row': 0, 'column': 2},
            gauge={'axis': {'range': [0, 1]},
                   'bar': {'color': "darkred"}}
        ), row=1, col=3)

        fig.update_layout(height=300, margin=dict(t=50, b=10))
        return fig

    def create_confusion_matrix_plot(self):
        """Create confusion matrix visualization.

        Returns:
            An annotated 2x2 heatmap of actual vs. predicted labels.
        """
        y_true = self.pred_pd['label']
        y_pred = self.pred_pd['prediction']
        # Fix the label set so the matrix is always 2x2 and matches the axis
        # captions, even if one class never appears in the predictions.
        cm = confusion_matrix(y_true, y_pred, labels=self.CLASS_VALUES)

        fig = ff.create_annotated_heatmap(
            z=cm,
            x=['Predicted No', 'Predicted Yes'],
            y=['Actual No', 'Actual Yes'],
            colorscale='Blues',
            annotation_text=cm.astype(str),
            showscale=True
        )
        # Force readable annotation text on every cell.
        for annotation in fig.layout.annotations:
            annotation.font.color = 'black'

        fig.update_layout(
            title='Confusion Matrix',
            xaxis_title='Predicted Label',
            yaxis_title='True Label'
        )

        return fig

    def create_feature_importance_plot(self):
        """Create feature importance visualization.

        The model reports one importance per slot of the assembled feature
        vector, while ``feature_columns`` names input columns (a one-hot
        column spans several slots). Importances are therefore summed over
        the slots of each input column before ranking.

        Returns:
            A horizontal bar chart of the ten most important input columns.

        Raises:
            ValueError: if ``predictions_df`` is empty or the column widths do
                not add up to the number of importances.
        """
        importance_values = self.model.featureImportances.toArray()

        # One row is enough to learn how many slots each column occupies.
        sample_row = self.predictions_df.select(*self.feature_columns).first()
        if sample_row is None:
            raise ValueError("predictions_df is empty; cannot determine feature vector layout.")

        feature_imp_list = []
        slot_offset = 0
        for feature in self.feature_columns:
            value = sample_row[feature]
            # Vector-valued columns span len(vector) slots; scalar columns span one.
            width = len(value) if hasattr(value, "toArray") else 1
            # numpy's .sum() is used because the builtin `sum` is shadowed by
            # the notebook's star import of pyspark.sql.functions.
            imp = importance_values[slot_offset:slot_offset + width].sum()
            feature_imp_list.append({'feature': feature, 'importance': float(imp)})
            slot_offset += width

        if slot_offset != len(importance_values):
            raise ValueError(
                f"Feature layout mismatch: columns cover {slot_offset} slots but the "
                f"model reports {len(importance_values)} importances."
            )

        feature_imp_df = pd.DataFrame(feature_imp_list)
        top_10_features = feature_imp_df.nlargest(10, 'importance')

        fig = px.bar(
            top_10_features,
            x='importance',
            y='feature',
            orientation='h',
            color='importance',
            color_continuous_scale='Viridis',
            title='Top 10 Feature Importance'
        )

        fig.update_layout(yaxis={'categoryorder': 'total ascending'})
        return fig

    def create_actual_vs_predicted(self):
        """Create actual vs predicted comparison.

        Returns:
            A grouped bar chart of class counts for labels and predictions.
        """
        # Reindex on the fixed class list so each series always has exactly
        # one count per bar, even when a class is never predicted.
        actual_counts = self.pred_pd['label'].value_counts().reindex(
            self.CLASS_VALUES, fill_value=0
        )
        predicted_counts = self.pred_pd['prediction'].value_counts().reindex(
            self.CLASS_VALUES, fill_value=0
        )

        fig = go.Figure()

        fig.add_trace(go.Bar(
            x=['No', 'Yes'],
            y=actual_counts.values,
            name='Actual',
            marker_color='blue',
            opacity=0.7
        ))

        fig.add_trace(go.Bar(
            x=['No', 'Yes'],
            y=predicted_counts.values,
            name='Predicted',
            marker_color='orange',
            opacity=0.7
        ))

        fig.update_layout(
            title='Actual vs Predicted Distribution',
            xaxis_title='Subscription',
            yaxis_title='Count',
            barmode='group'
        )

        return fig

    def create_roc_curve(self):
        """Create ROC curve manually.

        Returns:
            A line chart of the ROC curve (with its AUC in the legend) plus
            the diagonal of a random classifier.
        """
        y_true = self.pred_pd['label']
        y_prob = self.pred_pd['probability_yes']

        fpr, tpr, thresholds = roc_curve(y_true, y_prob)
        roc_auc = auc(fpr, tpr)

        fig = go.Figure()

        fig.add_trace(go.Scatter(
            x=fpr, y=tpr,
            mode='lines',
            line=dict(color='darkorange', width=2),
            name=f'ROC curve (AUC = {roc_auc:.3f})'
        ))

        fig.add_trace(go.Scatter(
            x=[0, 1], y=[0, 1],
            mode='lines',
            line=dict(color='navy', width=2, dash='dash'),
            name='Random Classifier'
        ))

        fig.update_layout(
            title='Receiver Operating Characteristic (ROC) Curve',
            xaxis_title='False Positive Rate',
            yaxis_title='True Positive Rate',
            showlegend=True
        )

        return fig

# Create dashboard instance
dashboard = ManualBankingDashboard(df_clean, predictions, model, feature_columns)

=== 16. CREATING INTERACTIVE DASHBOARD ===
 Dashboard data prepared!


In [None]:
# Step 17: Render every dashboard figure, one titled section at a time
print(" DISPLAYING INTERACTIVE DASHBOARD COMPONENTS...")


def _render_section(heading, build_figure):
    """Print the section heading, build the figure, show it, and return it."""
    print(heading)
    figure = build_figure()
    figure.show()
    return figure


# Sections 1-5 come straight from the dashboard object.
metrics_fig = _render_section(
    "\n1.  PERFORMANCE METRICS",
    lambda: dashboard.create_performance_metrics(accuracy, f1_score, auc_score),
)
confusion_fig = _render_section("\n2.  CONFUSION MATRIX", dashboard.create_confusion_matrix_plot)
feature_fig = _render_section("\n3.  FEATURE IMPORTANCE", dashboard.create_feature_importance_plot)
actual_pred_fig = _render_section("\n4.  ACTUAL VS PREDICTED", dashboard.create_actual_vs_predicted)
roc_fig = _render_section("\n5.  ROC CURVE", dashboard.create_roc_curve)

# Section 6: age spread per target value, from the dashboard's pandas sample.
age_fig = _render_section(
    "\n6.  AGE DISTRIBUTION BY SUBSCRIPTION",
    lambda: px.box(
        dashboard.original_pd,
        x='y',
        y='age',
        color='y',
        title='Age Distribution by Subscription Status',
    ),
)

# Section 7: row counts per (job, target) pair as grouped bars.
print("\n7.  JOB TYPE ANALYSIS")
job_counts = (
    dashboard.original_pd
    .groupby(['job', 'y'])
    .size()
    .reset_index(name='count')
)
job_fig = px.bar(
    job_counts,
    x='job',
    y='count',
    color='y',
    barmode='group',
    title='Subscription by Job Type',
)
job_fig.update_layout(xaxis_tickangle=45)
job_fig.show()

 DISPLAYING INTERACTIVE DASHBOARD COMPONENTS...

1.  PERFORMANCE METRICS



2.  CONFUSION MATRIX



3.  FEATURE IMPORTANCE



4.  ACTUAL VS PREDICTED



5.  ROC CURVE



6.  AGE DISTRIBUTION BY SUBSCRIPTION



7.  JOB TYPE ANALYSIS


In [None]:
# Step 18: Save results of dashboard
print("=== SAVING RESULTS OF DASHBOARD ===")

# Export rows straight from `predictions`, so every exported row carries the
# prediction that was actually made FOR THAT ROW. The previous version took
# the first 2000 rows of the full dataset and attached test-set predictions
# by position, which paired unrelated rows and predictions.
excel_columns = [
    'age', 'job', 'marital', 'education', 'duration', 'campaign',
    'previous', 'poutcome', 'y'
]
excel_data = (
    predictions
    .select(*excel_columns, 'prediction', 'probability')
    .limit(2000)
    .toPandas()
)

# The probability vector's index 1 is the probability of label 1 ("yes");
# replace the vector column with that single float.
excel_data['probability_yes'] = excel_data.pop('probability').apply(
    lambda vector: float(vector[1])
)
excel_data['prediction_label'] = excel_data['prediction'].apply(
    lambda x: 'yes' if x == 1.0 else 'no'
)

# Save files
excel_data.to_csv('banking_predictions_manual.csv', index=False)

# Save metrics.
# NOTE: 'F1-Score' is the value from MulticlassClassificationEvaluator's "f1"
# metric, whereas Precision and Recall below are for the positive class only,
# so the three are not directly comparable.
metrics_data = {
    'Metric': ['Accuracy', 'F1-Score', 'AUC-ROC', 'Precision', 'Recall'],
    'Value': [accuracy, f1_score, auc_score, tp/(tp+fp) if (tp+fp)>0 else 0, tp/(tp+fn) if (tp+fn)>0 else 0]
}
pd.DataFrame(metrics_data).to_csv('model_metrics_manual.csv', index=False)

print(" Files saved successfully!")
print("   - banking_predictions_manual.csv")
print("   - model_metrics_manual.csv")
print("   - feature_importance_manual.csv")

# Download files
files.download('banking_predictions_manual.csv')
files.download('model_metrics_manual.csv')
files.download('feature_importance_manual.csv')

=== SAVING RESULTS OF DASHBOARD ===
 Files saved successfully!
   - banking_predictions_manual.csv
   - model_metrics_manual.csv
   - feature_importance_manual.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>