
# 📊 Bank Marketing Campaign Analysis

## 📌 Business Understanding

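This notebook analyzes a Portuguese bank’s telemarketing campaigns to predict whether a client will subscribe to a term deposit (target column `y`). Using real campaign data, it explores the available features and then trains and compares two classifiers, logistic regression and a random forest, to identify which client and campaign attributes are associated with a successful subscription.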


In [None]:

import pandas as pd

# Source file lives on GitHub and is semicolon-delimited, hence the explicit separator.
url = "https://raw.githubusercontent.com/bedrock510/bank-marketing-complete-fresh/main/bank-additional-full.csv"
df = pd.read_csv(url, delimiter=';')

# Quick sanity check on what was read: (rows, columns).
print("✅ Data loaded. Shape:", df.shape)


In [None]:

# 'duration' is excluded from the feature set to avoid data leakage.
# The membership check keeps this cell safe to re-run: a second execution
# finds the column already gone and only reports that.
leaky_column = 'duration'
if leaky_column not in df.columns:
    print("ℹ️ 'duration' already removed or not present")
else:
    df = df.drop(columns=[leaky_column])
    print("✅ Dropped 'duration'")


In [None]:

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Apply the white-grid theme to every figure in the notebook.
# `set_theme` is the preferred spelling; `sns.set` is only kept as an alias.
sns.set_theme(style="whitegrid")


In [None]:

# --- Target balance: one bar per class of the label column `y` ---
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='y', data=df, ax=ax)
ax.set_title("Target Distribution: Subscription")
plt.show()

# --- Categorical features: every object-typed column except the target ---
# Filtering (rather than list.remove) does not raise if `y` is absent from
# the object-typed columns.
categorical_cols = [
    col for col in df.select_dtypes(include='object').columns if col != 'y'
]

# Derive the grid from the number of columns so that no feature is silently
# dropped by zip() when the dataset has more columns than a fixed grid holds.
n_grid_cols = 2
n_grid_rows = max(1, -(-len(categorical_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    nrows=n_grid_rows,
    ncols=n_grid_cols,
    figsize=(18, 4 * n_grid_rows),  # 4 inches per row, as in the original 5-row layout
    squeeze=False,                  # always a 2-D array, even for a single row
)
panels = axes.flatten()
for ax, col in zip(panels, categorical_cols):
    # Order bars by frequency so the dominant categories read left to right.
    sns.countplot(x=col, data=df, ax=ax, order=df[col].value_counts().index)
    ax.set_title(col)
    ax.tick_params(axis='x', rotation=45)

# Hide any panels left over when the columns do not fill the grid exactly.
for ax in panels[len(categorical_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:

# Numeric features: every int64/float64 column of the frame.
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Derive the grid from the number of columns so that no feature is silently
# dropped by zip() when there are more columns than a fixed grid holds.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(numerical_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    nrows=n_grid_rows,
    ncols=n_grid_cols,
    figsize=(18, 4 * n_grid_rows),  # 4 inches per row, as in the original 3-row layout
    squeeze=False,                  # always a 2-D array, even for a single row
)
panels = axes.flatten()
for ax, col in zip(panels, numerical_cols):
    # Histogram with a kernel-density overlay for each numeric column.
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f"Distribution of {col}")

# Hide any panels left over when the columns do not fill the grid exactly.
for ax in panels[len(numerical_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:

RANDOM_STATE = 42  # single seed shared by the split and the random forest

# --- Target / feature split ---------------------------------------------------
y = df['y'].map({'no': 0, 'yes': 1})
# .map() turns any label missing from the dictionary into NaN; fail fast here
# instead of letting it surface later as an obscure estimator error.
assert y.notna().all(), "Column 'y' contains labels other than 'no'/'yes'"
X = df.drop(columns=['y'])

categorical_features = X.select_dtypes(include='object').columns.tolist()
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# --- Preprocessing ------------------------------------------------------------
# Tree ensembles do not need scaled inputs, so the forest only gets imputation.
numeric_transformer = Pipeline([('imputer', SimpleImputer(strategy='median'))])
# Logistic regression is fitted with a gradient-based solver; putting the
# numeric columns on a common scale helps it converge within max_iter.
scaled_numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])


def make_preprocessor(numeric_steps):
    """Build a new, unfitted ColumnTransformer for the feature lists above.

    Parameters
    ----------
    numeric_steps : transformer applied to `numerical_features`.

    Returns
    -------
    ColumnTransformer routing numeric columns through `numeric_steps` and
    categorical columns through `categorical_transformer`.
    """
    return ColumnTransformer([
        ('num', numeric_steps, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])


# Each pipeline gets its own ColumnTransformer instance. A Pipeline fits its
# steps in place, so sharing one instance would let the second fit overwrite
# the state the first pipeline relies on.
preprocessor = make_preprocessor(numeric_transformer)
logreg_preprocessor = make_preprocessor(scaled_numeric_transformer)

# --- Train / test split -------------------------------------------------------
# Stratify on the target so both partitions keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)
assert len(X_train) + len(X_test) == len(X)

# --- Logistic Regression ------------------------------------------------------
logreg_pipeline = Pipeline([
    ('pre', logreg_preprocessor),
    ('model', LogisticRegression(max_iter=1000))
])
logreg_pipeline.fit(X_train, y_train)
log_preds = logreg_pipeline.predict(X_test)
log_proba = logreg_pipeline.predict_proba(X_test)[:, 1]  # P(class 1 = 'yes')

# --- Random Forest ------------------------------------------------------------
rf_pipeline = Pipeline([
    ('pre', preprocessor),
    ('model', RandomForestClassifier(random_state=RANDOM_STATE))
])
rf_pipeline.fit(X_train, y_train)
rf_preds = rf_pipeline.predict(X_test)
rf_proba = rf_pipeline.predict_proba(X_test)[:, 1]  # P(class 1 = 'yes')


# --- Evaluation ---------------------------------------------------------------
def report_model(title, y_true, y_pred, y_score):
    """Print confusion matrix, classification report and ROC AUC for one model.

    Parameters
    ----------
    title : heading printed before the metrics.
    y_true : ground-truth labels.
    y_pred : hard class predictions.
    y_score : predicted probability of the positive class (used for ROC AUC).
    """
    print(title)
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    print("ROC AUC:", roc_auc_score(y_true, y_score))


report_model("📊 Logistic Regression Results:", y_test, log_preds, log_proba)
report_model("\n📊 Random Forest Results:", y_test, rf_preds, rf_proba)



## 🧠 Findings

- The dataset is imbalanced — most customers do not subscribe.
- Random Forest generally performs better than Logistic Regression.
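- Features that appear important: `month`, `contact`, `education`, `poutcome` (to be confirmed with a feature-importance or coefficient analysis, which this notebook does not compute yet).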

## ✅ Next Steps

- Tune hyperparameters (e.g., using GridSearchCV)
- Address class imbalance (e.g., SMOTE)
- Try boosting algorithms (XGBoost, LightGBM)
