In [None]:

import zipfile, os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import joblib


# Path of the downloaded archive and the directory it is unpacked into.
zip_path = "bank+marketing.zip"
extract_dir = "bank_marketing_data"

# Make sure the destination exists, then unpack the outer archive into it.
os.makedirs(extract_dir, exist_ok=True)
with zipfile.ZipFile(zip_path) as outer_archive:
    outer_archive.extractall(path=extract_dir)


# Nested archives looked for inside the extraction directory. Each one that is
# present is unpacked into a folder named after the archive (without ".zip").
nested_zips = ["bank.zip", "bank-additional.zip"]
all_csvs = []
for archive_name in nested_zips:
    archive_path = os.path.join(extract_dir, archive_name)
    if not os.path.exists(archive_path):
        # A missing nested archive is skipped without any message.
        continue
    with zipfile.ZipFile(archive_path) as nested:
        target_dir = os.path.join(extract_dir, archive_name.replace(".zip", ""))
        os.makedirs(target_dir, exist_ok=True)
        nested.extractall(path=target_dir)
        # Record the on-disk path of every member ending in .csv or .data.
        all_csvs.extend(
            os.path.join(target_dir, member)
            for member in nested.namelist()
            if member.endswith((".csv", ".data"))
        )

print("CSV files found:", all_csvs)


# Select the data file to load: the first collected path whose *file name*
# contains "full". Matching on the basename keeps a directory component that
# happens to contain "full" from making every file match.
# If several files qualify, the first one in collection order wins.
full_csvs = [c for c in all_csvs if "full" in os.path.basename(c)]
if not full_csvs:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No data file with 'full' in its name was found; candidates: {all_csvs}"
    )
csv_path = full_csvs[0]

# The file is read as semicolon-separated.
df = pd.read_csv(csv_path, sep=';')
print("Dataset shape:", df.shape)


# Encode the target column: 'yes' -> 1, 'no' -> 0.
y = df['y'].map({'yes': 1, 'no': 0})

# Series.map yields NaN for any label outside the mapping (including missing
# values). Stop here with a clear message rather than letting the NaNs surface
# later inside the stratified split or the model fit.
unmapped = y.isna()
if unmapped.any():
    unexpected = sorted(df.loc[unmapped, 'y'].astype(str).unique())
    raise ValueError(f"Unexpected values in target column 'y': {unexpected}")

# Every column other than the target is used as a feature.
# NOTE(review): if this is the UCI Bank Marketing data, its description warns
# that the `duration` column is only known after the call has ended; confirm
# whether it should be dropped before training a realistic model.
X = df.drop(columns=['y'])

# Object-dtype columns are treated as categorical; all remaining columns are
# collected in num_cols, preserving their order in X.
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(exclude='object').columns.tolist()


# One-hot encode the categorical columns and pass every other column through
# unchanged. handle_unknown='ignore' keeps categories that were not seen
# during fitting from raising at transform time.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
preprocess = ColumnTransformer(
    [('cat', categorical_encoder, cat_cols)],
    remainder='passthrough',
)

# Decision tree limited in depth and minimum leaf size, with a fixed seed.
tree_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=8,
    min_samples_leaf=45,
    random_state=40,
)

# Chain preprocessing and the tree so they are fitted and applied together.
clf = Pipeline([
    ('prep', preprocess),
    ('model', tree_model),
])


# Hold out 20% of the rows for testing, stratified on the target and using a
# fixed seed for the shuffle.
split = train_test_split(X, y, test_size=0.2, random_state=41, stratify=y)
X_train, X_test, y_train, y_test = split

# Fit the preprocessing + tree pipeline on the training portion only.
clf.fit(X_train, y_train)


# Predict on the held-out rows and print the headline metrics.
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
report = classification_report(y_test, y_pred)
print(report)

# Plot the confusion matrix with 'No' / 'Yes' as the displayed class labels.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=['No', 'Yes'])
disp.plot(values_format='d')
plt.title("Decision Tree - Confusion Matrix")
plt.show()


# Rebuild the column names of the transformed feature matrix. This assumes the
# transformer emits the one-hot columns first, followed by the passed-through
# columns in num_cols order.
ohe = clf['prep'].named_transformers_['cat']
ohe_features = ohe.get_feature_names_out(cat_cols).tolist()
feature_names = [*ohe_features, *num_cols]

# Pair each feature name with the fitted tree's importance and keep the
# 15 largest.
importances = clf['model'].feature_importances_
importance_table = pd.DataFrame(
    {'feature': feature_names, 'importance': importances}
)
fi = importance_table.sort_values('importance', ascending=False).head(15)
print("Top Features:\n", fi)


# Persist the whole pipeline (preprocessing + tree) to disk with joblib.
joblib.dump(value=clf, filename="decision_tree_bank.pkl")
print(" Model saved as decision_tree_bank.pkl")
