
# 📈 Bank Marketing Campaign Analysis (Auto-Loading Data)

This notebook is **self-sufficient** — no file upload is required.

### ✅ Instructions
- Run the notebook in **Google Colab** or Jupyter
- The dataset loads automatically from GitHub (an internet connection is required)


In [None]:

import pandas as pd

# Raw GitHub location of the dataset; the file is semicolon-delimited.
url = (
    "https://raw.githubusercontent.com/bedrock510/bank_marketing_colab_ready/"
    "main/bank-additional-full.csv"
)

# Read the remote CSV straight into a DataFrame and report its dimensions.
df = pd.read_csv(url, sep=';')
print("✅ Data loaded from GitHub. Shape:", df.shape)


In [None]:

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


In [None]:

# Drop 'duration' due to leakage.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop() would raise KeyError.
df = df.drop(columns=['duration'], errors='ignore')

# Prepare target and features
y = df['y'].map({'no': 0, 'yes': 1})
# .map() turns any label other than 'no'/'yes' into NaN; fail here with a clear
# message instead of later inside model fitting.
assert y.notna().all(), "Target column 'y' contains labels other than 'no'/'yes'"
X = df.drop(columns=['y'])

# Split the feature columns by dtype. 'number' matches every numeric width
# (not only int64/float64), and every non-numeric column is treated as
# categorical, so no column falls outside both lists and gets silently
# discarded by the ColumnTransformer (its default is remainder='drop').
numerical_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(exclude='number').columns.tolist()

# Pipelines
# Numeric columns: fill missing values with the column median.
numeric_transformer = Pipeline([('imputer', SimpleImputer(strategy='median'))])
# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at predict time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])


In [None]:

# Train-test split.
# stratify=y keeps the yes/no ratio the same in the train and test sets, so the
# reported metrics are not skewed by an unlucky split if one class is rare.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


def _fit_and_predict(model):
    """Fit ``model`` behind its own preprocessor copy and score the test set.

    The preprocessor is cloned so each pipeline owns an independent,
    separately fitted transformer; Pipeline does not copy its steps, so
    sharing one instance would let fitting one pipeline refit the other's
    preprocessing step.

    Parameters
    ----------
    model : estimator
        Unfitted classifier exposing ``predict`` and ``predict_proba``.

    Returns
    -------
    tuple
        ``(pipeline, preds, proba)``: the fitted pipeline, its class
        predictions on ``X_test``, and the second ``predict_proba`` column
        for ``X_test``.
    """
    pipeline = Pipeline([
        ('pre', clone(preprocessor)),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    proba = pipeline.predict_proba(X_test)[:, 1]
    return pipeline, preds, proba


# Logistic Regression
logreg_pipeline, log_preds, log_proba = _fit_and_predict(
    LogisticRegression(max_iter=1000)
)

# Random Forest
rf_pipeline, rf_preds, rf_proba = _fit_and_predict(
    RandomForestClassifier(random_state=42)
)


In [None]:

def _print_results(header, preds, proba):
    """Print one model's test-set metrics.

    Emits, in order: the header line, the confusion matrix, the
    classification report, and the ROC AUC computed from ``proba``.
    All metrics are computed against ``y_test``.
    """
    print(header)
    print(confusion_matrix(y_test, preds))
    print(classification_report(y_test, preds))
    print("ROC AUC:", roc_auc_score(y_test, proba))


# Logistic Regression
_print_results("Logistic Regression Results:", log_preds, log_proba)

# Random Forest (the leading newline separates it from the previous report)
_print_results("\nRandom Forest Results:", rf_preds, rf_proba)



## ✅ Findings

- The Random Forest model showed better overall recall and AUC
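- Categorical features like `contact`, `month`, and `education` appeared to be key predictors (feature importances are not computed in this notebook, so treat this as a hypothesis to verify)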
- This model could help marketing teams better target high-conversion leads

## 🚀 Next Steps

- Tune hyperparameters using GridSearchCV
- Try boosting models (XGBoost, LightGBM)
- Deploy the model into the bank's call-strategy system
