# <Center> Week 12 Model Selection and Model Building/Dashboard

####  Group Name: Data Visionaries
#### Name: Abdukhakimov Asatilla
#### Email: tremendous54@gmail.com
#### Country: Uzbekistan
#### Company: Data Glacier
#### Specialization: Data Science

### Problem Description

ABC Bank wants to sell its term deposit product to customers. Before launching the product, the bank aims to develop a model to understand whether a particular customer will buy their product or not, based on the customer's past interaction with the bank or other financial institutions.

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('bank-full.csv', sep=';')

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

# Make a copy for processing
data = df.copy()

# Data Cleansing
data.columns = data.columns.str.strip()
data.replace("unknown", np.nan, inplace=True)
missing = data.isnull().sum()

# Impute missing categorical values with mode
for col in data.select_dtypes(include='object').columns:
    data[col].fillna(data[col].mode()[0], inplace=True)

# Encode categorical variables
categorical_cols = data.select_dtypes(include='object').columns.tolist()
categorical_cols.remove('y')  # target variable
le = LabelEncoder()
for col in categorical_cols:
    data[col] = le.fit_transform(data[col])

# Encode target
data['y'] = data['y'].map({'yes': 1, 'no': 0})

# Standardize numerical features
num_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
num_cols.remove('y')
scaler = StandardScaler()
data[num_cols] = scaler.fit_transform(data[num_cols])

# Prepare data with and without duration
X_all = data.drop('y', axis=1)
y = data['y']
X_with_duration = X_all.copy()
X_without_duration = X_all.drop('duration', axis=1)

# Train/test split
X_train_wd, X_test_wd, y_train, y_test = train_test_split(X_with_duration, y, test_size=0.2, random_state=42)
X_train_wod, X_test_wod, _, _ = train_test_split(X_without_duration, y, test_size=0.2, random_state=42)

# Model training
lr_wd = LogisticRegression(max_iter=1000).fit(X_train_wd, y_train)
rf_wd = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_wd, y_train)

lr_wod = LogisticRegression(max_iter=1000).fit(X_train_wod, y_train)
rf_wod = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_wod, y_train)

# Model evaluation
def evaluate_model(model, X_test, y_test, label):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    print(f"=== {label} ===")
    print(classification_report(y_test, y_pred))
    return roc_auc_score(y_test, y_proba), roc_curve(y_test, y_proba)

# Evaluate all models
scores = {}
roc_curves = {}

for name, model, X in [
    ("LR with duration", lr_wd, X_test_wd),
    ("RF with duration", rf_wd, X_test_wd),
    ("LR w/o duration", lr_wod, X_test_wod),
    ("RF w/o duration", rf_wod, X_test_wod),
]:
    auc, roc = evaluate_model(model, X, y_test, name)
    scores[name] = auc
    roc_curves[name] = roc

scores

=== LR with duration ===
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      7952
           1       0.64      0.30      0.41      1091

    accuracy                           0.90      9043
   macro avg       0.78      0.64      0.68      9043
weighted avg       0.88      0.90      0.88      9043

=== RF with duration ===
              precision    recall  f1-score   support

           0       0.92      0.97      0.94      7952
           1       0.63      0.41      0.49      1091

    accuracy                           0.90      9043
   macro avg       0.78      0.69      0.72      9043
weighted avg       0.89      0.90      0.89      9043

=== LR w/o duration ===
              precision    recall  f1-score   support

           0       0.90      0.99      0.94      7952
           1       0.70      0.16      0.26      1091

    accuracy                           0.89      9043
   macro avg       0.80      0.57      0.60      9043


{'LR with duration': np.float64(0.8781105514848947),
 'RF with duration': np.float64(0.918170341941659),
 'LR w/o duration': np.float64(0.7064076715102716),
 'RF w/o duration': np.float64(0.7677658526779375)}

#### We trained models, and found Random Forest with duration to be the best

##  Final Model & Predictions

In [4]:
final_model = RandomForestClassifier(n_estimators=100, random_state=42)
final_model.fit(X_train_wd, y_train)
y_pred = final_model.predict(X_test_wd)
y_proba = final_model.predict_proba(X_test_wd)[:, 1]

##  Dashboard

In [5]:
from jupyter_dash import JupyterDash
from dash import dcc, html, Input, Output

In [6]:
from jupyter_dash import JupyterDash
from dash import Dash, dcc, html, Input, Output
import plotly.graph_objects as go

# Feature importances (Random Forest with duration)
feature_importance = rf_wd.feature_importances_
features = X_with_duration.columns

# AUC scores sorted
sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)

# App layout
app = JupyterDash(__name__)
app.layout = html.Div([
    html.H1("ABC Bank Term Deposit Prediction Dashboard"),
    
    html.H3("Model AUC Scores"),
    dcc.Graph(
        figure=go.Figure(
            data=[go.Bar(
                x=[s[0] for s in sorted_scores],
                y=[s[1] for s in sorted_scores],
                marker_color='indianred'
            )],
            layout_title_text="AUC Comparison Across Models"
        )
    ),

    html.H3("Random Forest (with Duration) - Feature Importances"),
    dcc.Graph(
        figure=go.Figure(
            data=[go.Bar(
                x=features,
                y=feature_importance,
                marker_color='teal'
            )],
            layout_title_text="Top Predictive Features"
        )
    ),

    html.H3("Select Model for ROC Curve"),
    dcc.Dropdown(
        id='model-dropdown',
        options=[{'label': k, 'value': k} for k in roc_curves.keys()],
        value='RF with duration'
    ),

    dcc.Graph(id='roc-curve')
])

# Callback for ROC Curve
@app.callback(
    Output('roc-curve', 'figure'),
    Input('model-dropdown', 'value')
)
def update_roc(selected_model):
    fpr, tpr, _ = roc_curves[selected_model]
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC Curve'))
    fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random', line=dict(dash='dash')))
    fig.update_layout(title=f'ROC Curve: {selected_model}', xaxis_title='False Positive Rate', yaxis_title='True Positive Rate')
    return fig


app.run_server(mode='inline')
