# Final Classification Project
**Bank Marketing (UCI) — Auto-download + Full ML pipeline**

Run each cell in order (Shift+Enter). This notebook will:
- Download dataset automatically if not present
- Clean and preprocess data (handle 'unknown' as missing)
- Encode categorical features and scale numeric features
- Train Logistic Regression and Random Forest
- Show accuracy, classification report, confusion matrix, and ROC curve
- Save the trained Random Forest pipeline to disk


In [None]:
# Install required packages (runs only if a package cannot be imported).
# `required` holds pip distribution names; `import_names` records the cases
# where the importable module is named differently. Without that mapping
# 'scikit-learn' (imported as `sklearn`) is never found and would be listed
# for installation on every run.
import importlib.util
import sys, subprocess, pkgutil

required = ['pandas','numpy','scikit-learn','matplotlib','seaborn','joblib','wget']
import_names = {'scikit-learn': 'sklearn'}

# importlib.util.find_spec replaces the deprecated pkgutil.find_loader;
# it returns None when the top-level module cannot be located.
to_install = [
    p for p in required
    if importlib.util.find_spec(import_names.get(p, p)) is None
]
if to_install:
    print("Installing:", to_install)
    # Use the kernel's own interpreter so packages land in the environment
    # this notebook is actually running in.
    subprocess.check_call([sys.executable, "-m", "pip", "install", *to_install])
else:
    print("All required packages are already installed.")


In [None]:
# Standard imports
import os
import urllib.request
import zipfile

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import wget

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
print('Imports done.')


In [None]:
# Download dataset automatically if not already present in the notebook directory
data_filename = 'bank-full.csv'
if not os.path.exists(data_filename):
    print("Dataset not found locally. Downloading...")
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip"
    zipname = "bank.zip"
    try:
        wget.download(url, zipname)
        print("\nDownloaded", zipname)
    except Exception as e:
        print("wget failed:", e)
        # Fall back to the standard library. A shell call such as
        # os.system("curl ...") reports failure through its return code, not an
        # exception, so a failed download would go unnoticed until the unzip step.
        try:
            urllib.request.urlretrieve(url, zipname)
            print("Downloaded", zipname)
        except Exception as e2:
            raise RuntimeError(
                f"Could not download {url}: wget failed ({e}) and urllib failed ({e2}). "
                f"Download it manually and place '{data_filename}' in the notebook folder."
            ) from e2
    # unzip into the current working directory
    with zipfile.ZipFile(zipname, 'r') as z:
        z.extractall()
    # Fail here with a clear message rather than later in pd.read_csv
    if not os.path.exists(data_filename):
        raise FileNotFoundError(
            f"'{data_filename}' was not found after extracting {zipname}."
        )
    print("Extracted files. You should now have 'bank-full.csv' in the current folder.")
else:
    print("Dataset already present:", data_filename)


In [None]:
# Load dataset (semicolon separated).
# Read from `data_filename` (set in the download cell) so the file that was
# checked/downloaded is the same one that gets loaded.
data = pd.read_csv(data_filename, sep=';')
# A wrong separator would collapse everything into one column; catch that early.
assert 'y' in data.columns, "target column 'y' not found - check the file and separator"
print("Loaded data shape:", data.shape)
data.head()


In [None]:
# Quick info & distribution of target
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would render an extra "None".
data.info()
# Transposed so each column of the dataset becomes one row of the summary.
display(data.describe(include='all').T)
print("\nTarget value counts (y):")
print(data['y'].value_counts(dropna=False))


In [None]:
# Turn the 'unknown' placeholder string into a real missing value (NaN)
# so the imputers in the preprocessing step can handle it.
data = data.replace(to_replace='unknown', value=np.nan)

# Per-column count of missing entries, largest first.
missing_per_column = data.isna().sum()
print('Missing value counts (first 20 cols):')
print(missing_per_column.sort_values(ascending=False).head(20))


In [None]:
# Define features and target
target = 'y'

# Encode the label into a new Series instead of overwriting data[target].
# Writing the result back into `data` makes the cell non-idempotent: on a
# second run the already-encoded 0/1 values are not keys of the mapping and
# every label silently becomes NaN.
y = data[target].map({'yes': 1, 'no': 0})
if y.isna().any():
    raise ValueError(
        f"Column '{target}' contains values other than 'yes'/'no'; "
        "reload the dataset before running this cell."
    )

X = data.drop(columns=[target])

# Every column must land in exactly one group: ColumnTransformer drops columns
# that are not listed, so anything non-numeric is treated as categorical rather
# than matching a fixed list of dtype names.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(exclude='number').columns.tolist()

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)
print("X shape, y shape:", X.shape, y.shape)


In [None]:
# Preprocessing pipelines
# Numeric columns: fill gaps with the median, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# The dense-output flag of OneHotEncoder was renamed from `sparse` to
# `sparse_output`; recent scikit-learn releases reject the old name with a
# TypeError, while older releases only know the old one. Try the current
# spelling first and fall back for older installations.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical columns: fill gaps with the most frequent category, then one-hot
# encode. handle_unknown='ignore' encodes categories unseen during fit as all zeros.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehot_encoder)
])

# Output column order follows the transformer order: numeric first, then one-hot.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])
print('Preprocessor ready.')


In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# balance the same in both partitions; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.20, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


In [None]:
# Train RandomForest pipeline
# clone() gives this pipeline its own unfitted copy of the preprocessor.
# Pipeline.fit fits its steps in place, so handing the same `preprocessor`
# object to more than one pipeline would leave them all pointing at a single
# transformer that every later fit overwrites.
rf_pipeline = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1))
])

print("Training Random Forest (this can take a minute)...")
rf_pipeline.fit(X_train, y_train)
print("Random Forest training complete.")


In [None]:
# Score the fitted Random Forest on the held-out test set
y_pred = rf_pipeline.predict(X_test)
# Column 1 of predict_proba is the probability of the positive class (label 1)
y_proba = rf_pipeline.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_proba)

# Collect each metric with its heading, then print them in a fixed order
rf_metrics = [
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4)),
    ("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("\nROC AUC:", auc),
]
for heading, metric_value in rf_metrics:
    print(heading, metric_value)

# Plot ROC curve
RocCurveDisplay.from_estimator(rf_pipeline, X_test, y_test)
plt.show()


In [None]:
# Logistic Regression baseline (faster)
# clone() gives this pipeline an independent, unfitted preprocessor. Fitting a
# pipeline fits its steps in place, so reusing the very same `preprocessor`
# object here would refit the transformer held by any other pipeline built
# from it.
log_pipeline = Pipeline([
    ('preprocessor', clone(preprocessor)),
    # max_iter is raised above the library default so the solver has room to converge
    ('clf', LogisticRegression(max_iter=2000))
])
print("Training Logistic Regression...")
log_pipeline.fit(X_train, y_train)
y_pred_log = log_pipeline.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log, digits=4))


In [None]:
# Feature importances from RandomForest.
# The forest only sees the transformed matrix, so the column names have to be
# rebuilt from the fitted preprocessor.
fitted_preprocessor = rf_pipeline.named_steps['preprocessor']
ohe = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot']
ohe_features = list(ohe.get_feature_names_out(cat_cols))

# Same order as the ColumnTransformer definition: numeric block, then one-hot block
feature_names = num_cols + ohe_features
importances = rf_pipeline.named_steps['clf'].feature_importances_
feat_imp = (
    pd.Series(importances, index=feature_names)
    .sort_values(ascending=False)
    .head(20)
)

fig, ax = plt.subplots(figsize=(8, 6))
feat_imp.plot(kind='barh', ax=ax)
ax.invert_yaxis()  # largest importance at the top
ax.set_title('Top 20 Feature Importances (RandomForest)')
fig.tight_layout()
plt.show()


In [None]:
# Persist the fitted Random Forest pipeline (preprocessing + classifier) to disk
model_filename = 'bank_marketing_rf_pipeline.joblib'
joblib.dump(value=rf_pipeline, filename=model_filename)
print("Saved trained pipeline to", model_filename)


### Notes & Next steps
- If you plan to upload to GitHub, include:
  - `Final_Classification_Project.ipynb`
  - `bank_marketing_rf_pipeline.joblib` (optional, can be large)
  - A small `README.md` describing how to run the notebook
- If your network blocks the automatic download, manually download `bank.zip` from the UCI repository, extract `bank-full.csv` from it, and place that file in the notebook folder before running.
- To reduce runtime, you can lower `n_estimators` in RandomForest or use a sample of the dataset for quick testing.
