In [None]:
# Decision Tree classifier: complete end-to-end notebook code
# ---------------------------------------------------------
# 1) unzip (if needed), load dataset
# 2) basic cleaning and EDA
# 3) preprocessing (impute, encode)
# 4) train/test split, baseline Decision Tree
# 5) hyperparameter tuning (GridSearchCV)
# 6) evaluate & visualize, save model
# ---------------------------------------------------------

# -----------------------------
# Step 0: Install dependencies (if needed)
# -----------------------------
# !pip install -q scikit-learn joblib seaborn matplotlib pandas

# -----------------------------
# Step 1: Libraries & file paths
# -----------------------------
import zipfile, os, glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder
import joblib

sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (10,6)

# --- Update these paths if needed ---
ZIP_PATH = "C:/Users/Renuka/Desktop/Skillcraft/Dataset/bank+marketing.zip"      # archive that holds the dataset
CSV_PATH = None  # leave as None to auto-discover a CSV after extraction, or set a path to load that file directly

if ZIP_PATH and os.path.exists(ZIP_PATH):
    extract_to = "dataset_extracted"
    os.makedirs(extract_to, exist_ok=True)
    with zipfile.ZipFile(ZIP_PATH, 'r') as z:
        z.extractall(extract_to)

    # An archive may contain only further .zip files, in which case no CSV is
    # visible after the first extraction. Keep unpacking nested archives (each
    # into a folder named after it) until no unprocessed archive is left.
    unpacked = set()
    while True:
        nested = [
            p for p in glob.glob(os.path.join(extract_to, '**', '*.zip'), recursive=True)
            if p not in unpacked
        ]
        if not nested:
            break
        for inner_path in nested:
            unpacked.add(inner_path)
            try:
                with zipfile.ZipFile(inner_path, 'r') as inner:
                    inner.extractall(os.path.splitext(inner_path)[0])
            except zipfile.BadZipFile as err:
                print("Skipping unreadable nested archive:", inner_path, "-", err)

    # Print all extracted files for debugging
    for root, dirs, files in os.walk(extract_to):
        for name in files:
            print("Extracted file:", os.path.join(root, name))

    # Find all CSVs inside the extracted folder. glob returns paths in an
    # unspecified order, so sort them to make "first CSV" reproducible, and
    # ignore macOS resource-fork folders that some archives carry.
    csv_files = sorted(
        p for p in glob.glob(os.path.join(extract_to, '**', '*.csv'), recursive=True)
        if '__MACOSX' not in p
    )
    print("CSV files found:", csv_files)

    # Only auto-select when the user has not chosen a file explicitly.
    if CSV_PATH is None:
        if not csv_files:
            raise FileNotFoundError("No CSV found inside the zip. Please set CSV_PATH manually.")
        CSV_PATH = csv_files[0]  # auto-load first CSV

# Fail with a clear message instead of letting read_csv choke on None.
if CSV_PATH is None:
    raise FileNotFoundError(
        f"ZIP_PATH does not exist ({ZIP_PATH!r}) and CSV_PATH is not set; nothing to load."
    )

print("Loading CSV:", CSV_PATH)
df = pd.read_csv(CSV_PATH)
# A file that parses into a single column whose header contains ';' was almost
# certainly written with ';' as the field separator, so re-read it that way.
if df.shape[1] == 1 and ';' in str(df.columns[0]):
    df = pd.read_csv(CSV_PATH, sep=';')
print("Shape:", df.shape)
display(df.head())

# -----------------------------
# Step 2: Identify target label
# -----------------------------
# Replace the value below with the exact column name that indicates purchase (label)
# Common names: 'Purchase','Purchased','Bought','WillBuy','Buy','Outcome'
TARGET_COLUMN = None  # <-- set this to the name of your label column (string)

# A manually chosen target must actually exist, otherwise later steps fail
# with a confusing KeyError.
if TARGET_COLUMN is not None and TARGET_COLUMN not in df.columns:
    raise ValueError(
        f"TARGET_COLUMN {TARGET_COLUMN!r} is not a column of the dataset. Available columns:\n"
        + ", ".join(map(str, df.columns))
    )

# Try auto-detect if user hasn't set TARGET_COLUMN
if TARGET_COLUMN is None:
    # Look for common names. An exact match wins; otherwise the comparison
    # ignores case and surrounding whitespace.
    # NOTE(review): 'y' is presumably the label column of the CSVs inside the
    # archive that ZIP_PATH points at - confirm for your dataset.
    candidates = ['Purchase','Purchased','Bought','WillBuy','Will_Purchase','Buy','Outcome','Purchased?','Purchase_Flag','y']
    normalized = {}
    for col in df.columns:
        # setdefault keeps the left-most column when two names collide
        normalized.setdefault(str(col).strip().lower(), col)
    for c in candidates:
        if c in df.columns:
            TARGET_COLUMN = c
            break
        if c.lower() in normalized:
            TARGET_COLUMN = normalized[c.lower()]
            break

# fallback: find first column with 2 unique values (binary)
if TARGET_COLUMN is None:
    for col in df.columns:
        if df[col].nunique() == 2:
            TARGET_COLUMN = col
            break

if TARGET_COLUMN is None:
    # map(str, ...) because column labels are not guaranteed to be strings
    raise ValueError("Could not auto-find target column. Please set TARGET_COLUMN to the label column name. Available columns:\n" + ", ".join(map(str, df.columns)))

print("Using target column:", TARGET_COLUMN)
print(df[TARGET_COLUMN].value_counts(dropna=False))

# -----------------------------
# Step 3: Basic cleaning
# -----------------------------
# Remove exact duplicate rows and renumber the index.
df = df.drop_duplicates().reset_index(drop=True)

# Columns in which more than half of the rows are missing are discarded.
# The target is never discarded here; its missing rows are removed below.
thresh = int(0.5 * len(df))
drop_cols = [
    name for name in df.columns
    if name != TARGET_COLUMN and df[name].isnull().sum() > thresh
]
print("Dropping columns with >50% missing:", drop_cols)
df = df.drop(columns=drop_cols)

# A column whose values are all distinct is treated as an identifier and
# removed; the target is exempt.
row_count = len(df)
id_like = [
    name for name in df.columns
    if name != TARGET_COLUMN and df[name].nunique() == row_count
]
print("Dropping ID-like columns:", id_like)
df = df.drop(columns=id_like)

# Report how many values are still missing in each remaining column.
print("\nMissing values per column:")
print(df.isnull().sum())

# Keep only the rows that have a target value.
has_target = df[TARGET_COLUMN].notnull()
df = df[has_target].reset_index(drop=True)

# -----------------------------
# Step 4: Quick EDA (summary & plots)
# -----------------------------
print("\n--- Dataset summary ---")
summary_table = df.describe(include='all').transpose()
display(summary_table)

# Class balance of the target
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x=TARGET_COLUMN, ax=ax)
ax.set_title("Target distribution")
plt.show()

# Numeric feature columns (the target is left out even when it is numeric)
num_cols = [
    c for c in df.select_dtypes(include=[np.number]).columns
    if c != TARGET_COLUMN
]

# One histogram per numeric feature
if num_cols:
    df.hist(column=num_cols, bins=20, figsize=(12, 8))
    plt.suptitle("Histograms for numeric features")
    plt.show()

# Pairwise correlation, only meaningful with at least two numeric features
if len(num_cols) >= 2:
    corr_matrix = df[num_cols].corr()
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
    ax.set_title("Numeric feature correlation")
    plt.show()

# -----------------------------
# Step 5: Preprocessing (impute & encode)
# -----------------------------
# Separate X and y
y = df[TARGET_COLUMN].copy()
X = df.drop(columns=[TARGET_COLUMN])

# Impute missing values.
# NOTE(review): the fill values are computed on the full dataset, i.e. before
# the train/test split below, so test rows influence them. Acceptable for a
# quick notebook; move imputation into a Pipeline fitted on the training data
# for an unbiased evaluation.
for col in X.columns:
    if X[col].dtype == 'object' or X[col].dtype.name == 'category':
        # fill categorical with mode
        X[col] = X[col].fillna(X[col].mode()[0])
    else:
        # fill numerical with median
        X[col] = X[col].fillna(X[col].median())

# Identify categorical columns
cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
print("Categorical columns:", cat_cols)
print("Numerical columns:", num_cols)

# Encoding strategy:
# - If categorical column has many unique values (>20), use LabelEncoder
# - Otherwise use one-hot encoding (get_dummies)
high_card = [c for c in cat_cols if X[c].nunique() > 20]
onehot = [c for c in cat_cols if c not in high_card]

# NOTE(review): OrdinalEncoder is imported here but not used in this step.
from sklearn.preprocessing import OrdinalEncoder
# Label encode high-card columns
for c in high_card:
    X[c] = X[c].astype(str)
    X[c] = LabelEncoder().fit_transform(X[c])

# One-hot encode the remaining categorical columns
if len(onehot) > 0:
    X = pd.get_dummies(X, columns=onehot, drop_first=True)

print("Shape after encoding:", X.shape)

# Encode the target as integer class codes. Every branch yields a pandas
# Series aligned with X, so later steps can rely on Series behaviour
# (LabelEncoder alone would return a bare NumPy array).
if y.dtype == 'object' or y.dtype.name == 'category':
    y_enc = pd.Series(LabelEncoder().fit_transform(y.astype(str)), index=y.index, name=y.name)
else:
    uniq = sorted(y.unique())
    if set(uniq) <= {0,1}:
        # already 0/1 (or boolean)
        y_enc = y.astype(int)
    elif len(uniq) == 2:
        # binary but not 0/1: smaller value -> 0, larger value -> 1
        mapping = {uniq[0]:0, uniq[1]:1}
        y_enc = y.map(mapping)
    else:
        # One class or more than two classes: a two-value mapping would raise
        # or turn the remaining classes into NaN, so assign consecutive codes.
        y_enc = pd.Series(LabelEncoder().fit_transform(y), index=y.index, name=y.name)

y = y_enc

print("Final X shape:", X.shape, "Final y distribution:\n", pd.Series(y).value_counts())

# -----------------------------
# Step 6: Train/Test split
# -----------------------------
# Stratify on the label only when there is more than one class to balance.
stratify_on = y if len(np.unique(y)) > 1 else None
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=stratify_on,
)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# -----------------------------
# Step 7: Baseline Decision Tree
# -----------------------------
# Default hyperparameters; this is the reference point for the tuned model.
dt_baseline = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = dt_baseline.predict(X_test)

baseline_accuracy = accuracy_score(y_test, y_pred)
print("Baseline accuracy:", baseline_accuracy)
baseline_report = classification_report(y_test, y_pred)
print("\nBaseline classification report:\n", baseline_report)

# Confusion matrix on the held-out test rows
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Baseline DT')
plt.show()

# -----------------------------
# Step 8: Cross-validation
# -----------------------------
# Score the baseline tree with 5-fold cross-validation over all of X and y.
cv_scores = cross_val_score(estimator=dt_baseline, X=X, y=y, scoring='accuracy', cv=5)
rounded_folds = np.round(cv_scores, 4)
rounded_mean = np.round(cv_scores.mean(), 4)
print("5-fold CV accuracy:", rounded_folds)
print("CV mean accuracy:", rounded_mean)

# -----------------------------
# Step 9: Hyperparameter tuning (GridSearchCV)
# -----------------------------
# Exhaustive search over depth, split/leaf sizes and split criterion.
param_grid = dict(
    max_depth=[3, 5, 7, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5, 10],
    criterion=['gini', 'entropy'],
)

grid = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

best_dt = grid.best_estimator_
print("Best params:", grid.best_params_)

# Score the selected tree on the held-out test rows
y_pred_best = best_dt.predict(X_test)
best_accuracy = accuracy_score(y_test, y_pred_best)
print("Best model accuracy:", best_accuracy)
best_report = classification_report(y_test, y_pred_best)
print("\nClassification report (best):\n", best_report)

cm2 = confusion_matrix(y_test, y_pred_best)
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm2, annot=True, fmt='d', cmap='Oranges', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Best DT')
plt.show()

# -----------------------------
# Step 10: Feature importances & rules
# -----------------------------
# Plain Python lists of names: the tree export/plot helpers in some
# scikit-learn releases reject a pandas Index for feature_names.
feature_names = [str(c) for c in X.columns]
class_names = [str(c) for c in best_dt.classes_]

feat_imp = pd.Series(best_dt.feature_importances_, index=X.columns).sort_values(ascending=False)
print("Top 15 important features:\n", feat_imp.head(15))

# Plot top features (most important at the top of the chart)
plt.figure(figsize=(8,6))
feat_imp.head(15).plot(kind='barh')
plt.gca().invert_yaxis()
plt.title("Top 15 Feature Importances")
plt.show()

# Export textual rules (good for small trees). This is best-effort output,
# so a failure is reported and the notebook carries on.
try:
    tree_text = export_text(best_dt, feature_names=feature_names)
    print("\nDecision tree rules (truncated):\n")
    print(tree_text[:2000])  # first 2000 chars
except Exception as e:
    print("Could not export textual tree:", e)

# Plot tree (limit depth for readability)
plt.figure(figsize=(20,10))
plot_tree(best_dt, feature_names=feature_names, class_names=class_names, filled=True, max_depth=4, fontsize=8)
plt.show()

# -----------------------------
# Step 11: ROC-AUC (if possible)
# -----------------------------
# Best-effort: any failure while scoring or plotting is reported, not raised.
if hasattr(best_dt, "predict_proba"):
    try:
        # probability assigned to the class in the second column
        y_proba = best_dt.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_proba)
        print("ROC AUC:", auc)

        fpr, tpr, _ = roc_curve(y_test, y_proba)
        fig, ax = plt.subplots(figsize=(6, 4))
        ax.plot(fpr, tpr, label=f"AUC = {auc:.3f}")
        ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
        ax.set_xlabel("False Positive Rate")
        ax.set_ylabel("True Positive Rate")
        ax.set_title("ROC Curve - Best Decision Tree")
        ax.legend()
        plt.show()
    except Exception as e:
        print("ROC/AUC failed:", e)

# -----------------------------
# Step 12: Save model
# -----------------------------
# Persist the tuned tree together with the feature order it was trained on.
MODEL_PATH = "decision_tree_model.joblib"
joblib.dump({'model': best_dt, 'features': X.columns.tolist()}, MODEL_PATH)
print("Saved model to", MODEL_PATH)

# -----------------------------
# Step 13: Sample predictions
# -----------------------------
# First ten test rows with their true and predicted labels side by side.
preview = X_test.iloc[:10]
sample = preview.copy()
# y_test is a NumPy array when the target was label-encoded and a Series
# otherwise; np.asarray handles both (an array has no .values attribute).
sample['actual'] = np.asarray(y_test)[:10]
# Predict from the untouched feature rows rather than from `sample`, so the
# helper columns added above can never leak into the model input.
sample['predicted'] = best_dt.predict(preview)
display(sample)

# -----------------------------
# End: Short summary (print)
# -----------------------------
# Recap of dataset size, tuned hyperparameters and how to reload the model.
n_rows = len(df)
n_features = X.shape[1]
summary_lines = [
    "\n--- Summary ---",
    f"Dataset rows: {n_rows} | Features after preprocess: {n_features}",
    f"Best DT params: {grid.best_params_}",
    "You can load the saved model using joblib.load('decision_tree_model.joblib') and use it for predictions on new data.",
]
for line in summary_lines:
    print(line)


CSV files found: []


FileNotFoundError: No CSV found inside the zip. Please set CSV_PATH manually.

In [None]:
# 📌 Data Cleaning & EDA Template
# Works for any dataset inside a ZIP file

# ----------------------------
# Step 1: Import Libraries
# ----------------------------
import zipfile
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# ----------------------------
# Step 2: Unzip & Load Dataset
# ----------------------------
zip_path = "C:/Users/Renuka/Desktop/Skillcraft/Dataset/titanic.zip"  # your zip file
extract_path = "dataset"
# Number of leading lines to skip before the header row.
# NOTE(review): 4 presumably matches a CSV that starts with a metadata
# preamble; set it to 0 for a file whose first line is the header - confirm
# against the file that actually gets loaded.
SKIP_ROWS = 4

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# list extracted files to see which CSV is inside
# (os.listdir returns names in arbitrary order, so sort for reproducibility;
# the folder also keeps anything left over from earlier extractions)
extracted_files = sorted(os.listdir(extract_path))
print("Extracted Files:", extracted_files)

# Automatically find the first CSV file in the extracted folder:
# regular files only, extension compared case-insensitively.
csv_files = [
    f for f in extracted_files
    if f.lower().endswith('.csv') and os.path.isfile(os.path.join(extract_path, f))
]
if not csv_files:
    raise FileNotFoundError("No CSV file found in the extracted folder.")
csv_filename = csv_files[0]
print(f"Using CSV file: {csv_filename}")

df = pd.read_csv(os.path.join(extract_path, csv_filename), skiprows=SKIP_ROWS)

print("\n🔹 First 5 Rows:")
print(df.head())

# ----------------------------
# Step 3: Data Cleaning
# ----------------------------
print("\n🔹 Missing Values:")
print(df.isnull().sum())

# Remove duplicates
df = df.drop_duplicates()

# Fill missing values: most frequent value for text columns, mean otherwise.
# The result is assigned back to the column; calling fillna(inplace=True) on
# df[col] acts on an intermediate object and is deprecated in pandas.
for col in df.columns:
    if df[col].dtype == 'object':
        modes = df[col].mode()
        # mode() is empty when the whole column is missing: nothing to fill with
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])
    else:
        df[col] = df[col].fillna(df[col].mean())

print("\n✅ Missing values handled")

# Check datatypes
print("\n🔹 Data Types:")
print(df.dtypes)

# ----------------------------
# Step 4: Exploratory Data Analysis (EDA)
# ----------------------------

# Summary statistics
print("\n🔹 Summary Statistics:")
print(df.describe(include='all'))

# Both plots below only make sense for numeric data. Selecting the numeric
# columns once also keeps text columns out of corr(), which otherwise raises
# on recent pandas versions.
numeric_df = df.select_dtypes(include='number')

# Pairplot (only numerical columns)
if numeric_df.shape[1] > 0:
    sns.pairplot(numeric_df)
    plt.show()
else:
    print("No numeric columns available for a pairplot.")

# Correlation heatmap (needs at least two numeric columns to be informative)
if numeric_df.shape[1] > 1:
    plt.figure(figsize=(10,6))
    sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm")
    plt.title("Correlation Heatmap")
    plt.show()
else:
    print("Not enough numeric columns for a correlation heatmap.")

# Optional column-specific views: each runs only when its column is present.
has_age = "Age" in df.columns
has_gender = "Gender" in df.columns

# Age distribution with a density overlay
if has_age:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(df["Age"], kde=True, bins=30, color="skyblue", ax=ax)
    ax.set_title("Distribution of Age")
    plt.show()

# Row count per gender
if has_gender:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(x="Gender", data=df, palette="Set2", ax=ax)
    ax.set_title("Gender Distribution")
    plt.show()

# Mean age within each gender group
if has_gender and has_age:
    print("\n🔹 Average Age by Gender:")
    mean_age_by_gender = df.groupby("Gender")["Age"].mean()
    print(mean_age_by_gender)


Extracted Files: ['API_SP.POP.TOTL_DS2_en_csv_v2_763389.csv', 'bank+marketing.zip', 'gender_submission.csv', 'Metadata_Country_API_SP.POP.TOTL_DS2_en_csv_v2_763389.csv', 'Metadata_Indicator_API_SP.POP.TOTL_DS2_en_csv_v2_763389.csv', 'skillcraftdata.zip', 'test.csv', 'titanic.zip', 'train.csv']
Using CSV file: API_SP.POP.TOTL_DS2_en_csv_v2_763389.csv

🔹 First 5 Rows:
                  Country Name Country Code     Indicator Name Indicator Code  \
0                        Aruba          ABW  Population, total    SP.POP.TOTL   
1  Africa Eastern and Southern          AFE  Population, total    SP.POP.TOTL   
2                  Afghanistan          AFG  Population, total    SP.POP.TOTL   
3   Africa Western and Central          AFW  Population, total    SP.POP.TOTL   
4                       Angola          AGO  Population, total    SP.POP.TOTL   

          1960         1961         1962         1963         1964  \
0      54922.0      55578.0      56320.0      57002.0      57619.0   
1  1

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)



✅ Missing values handled

🔹 Data Types:
Country Name       object
Country Code       object
Indicator Name     object
Indicator Code     object
1960              float64
                   ...   
2021              float64
2022              float64
2023              float64
2024              float64
Unnamed: 69       float64
Length: 70, dtype: object

🔹 Summary Statistics:
       Country Name Country Code     Indicator Name Indicator Code  \
count           266          266                266            266   
unique          266          266                  1              1   
top           Aruba          ABW  Population, total    SP.POP.TOTL   
freq              1            1                266            266   
mean            NaN          NaN                NaN            NaN   
std             NaN          NaN                NaN            NaN   
min             NaN          NaN                NaN            NaN   
25%             NaN          NaN                NaN            N