In [5]:
# -------------------------------------------
# 1. IMPORT LIBRARIES
# -------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import entropy

# -------------------------------------------
# 2. LOAD THE DATA
# -------------------------------------------
# Location of the CSV export -- replace with your dataset
file_path = "E-commerce Customer Behavior - Sheet1 (1).csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Print the header plus the leading rows as a quick sanity check of the load
preview = df.head()
print("\nDataset Preview:", preview, sep="\n")

# -------------------------------------------
# 3. HANDLE MISSING VALUES (Pandas 3.0 Compatible)
# -------------------------------------------
# Impute missing values: median for numeric columns, most frequent value
# (mode) for categorical columns. All replacements are collected into one
# column -> value map and applied with a single fillna, instead of copying
# the whole frame once per column.
num_cols = df.select_dtypes(include='number').columns
cat_cols = df.select_dtypes(include='object').columns

# A median of NaN means the column has no observed values; leave it untouched.
fill_values = {
    col: median
    for col, median in df[num_cols].median().items()
    if pd.notna(median)
}

for col in cat_cols:
    modes = df[col].mode()
    # mode() is empty when every entry is missing; indexing it with [0] would
    # raise, so such columns are skipped rather than crashing the script.
    if not modes.empty:
        fill_values[col] = modes.iloc[0]

if fill_values:
    df = df.fillna(value=fill_values)

# -------------------------------------------
# 4. SEPARATE THE TARGET AND ENCODE THE FEATURES
# -------------------------------------------
# The label is pulled out BEFORE one-hot encoding. Running get_dummies on the
# whole frame would split a categorical label into several indicator columns:
# no single column would hold the label, and the leftover indicators would
# leak it into the feature matrix.
# NOTE(review): "Satisfaction Level" is assumed to be the intended label for
# this dataset (the previous "Loan_Status_Y" does not exist in it) -- confirm.
target_col = "Satisfaction Level"  # Update this if your target column is different
id_cols = ["Customer ID"]  # Row identifiers carry no signal; a tree would only memorise them

# Check if the target column exists
if target_col not in df.columns:
    print(f"\n[ERROR] Column '{target_col}' not found. Verify target column name.")
else:
    # Integer-code the label (0..k-1, classes in sorted order) so that both
    # scikit-learn and integer-only NumPy helpers accept it.
    codes, classes = pd.factorize(df[target_col], sort=True)
    labelled = codes != -1  # factorize marks missing labels with -1; those rows cannot be used
    y = pd.Series(codes[labelled], index=df.index[labelled], name=target_col)
    class_names = [str(c) for c in classes]
    class_ids = np.arange(len(class_names))

    # One-hot encode the features only; `df` itself is left un-encoded.
    features = df.loc[labelled].drop(columns=[target_col, *id_cols], errors="ignore")
    X = pd.get_dummies(features, drop_first=True)

    # Display the feature names to verify the encoding
    print("\nColumns after encoding:")
    print(X.columns)

    # -------------------------------------------
    # 5. SPLIT INTO TRAINING AND TESTING SETS
    # -------------------------------------------
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # -------------------------------------------
    # 6. TRAIN DECISION TREE CLASSIFIER
    # -------------------------------------------
    clf = DecisionTreeClassifier(criterion="entropy", random_state=42)
    clf.fit(X_train, y_train)

    # -------------------------------------------
    # 7. MODEL PREDICTION AND EVALUATION
    # -------------------------------------------
    y_pred = clf.predict(X_test)

    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nAccuracy: {accuracy:.4f}")

    # Confusion Matrix (explicit labels keep the layout stable even when a
    # class happens to be absent from the test split)
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    print("\nConfusion Matrix:")
    print(cm)

    # Classification Report, shown with the original class names
    print("\nClassification Report:")
    print(
        classification_report(
            y_test,
            y_pred,
            labels=class_ids,
            target_names=class_names,
            zero_division=0,
        )
    )

    # -------------------------------------------
    # 8. VISUALIZE THE DECISION TREE
    # -------------------------------------------
    plt.figure(figsize=(16, 8))
    plot_tree(
        clf,
        feature_names=list(X.columns),
        # Name only the classes the tree was actually fitted on, in its own order
        class_names=[class_names[i] for i in clf.classes_],
        filled=True,
    )
    plt.title("Decision Tree Visualization")
    plt.show()

    # -------------------------------------------
    # 9. ENTROPY CALCULATION FUNCTION
    # -------------------------------------------
    def calculate_entropy(y):
        """Return the Shannon entropy, in bits, of the label distribution in *y*.

        Args:
            y: Array-like of class labels. Any dtype NumPy can sort is
                accepted (ints, bools, floats, strings), not only
                non-negative integers.

        Returns:
            The entropy in bits as a float; 0.0 when *y* is empty.
        """
        labels = np.asarray(y)
        if labels.size == 0:
            # No observations means no uncertainty; also avoids dividing by zero.
            return 0.0
        # np.unique counts arbitrary labels, whereas np.bincount only accepts
        # non-negative integers and raises on strings, floats or negatives.
        _, class_counts = np.unique(labels, return_counts=True)
        probabilities = class_counts / labels.size
        return float(entropy(probabilities, base=2))

    # Entropy (base 2) of the target labels over the full dataset, then report it
    dataset_entropy = calculate_entropy(y)
    entropy_report = f"\nDataset Entropy: {dataset_entropy:.4f}"
    print(entropy_report)



Dataset Preview:
   Customer ID  Gender  Age           City Membership Type  Total Spend  \
0          101  Female   29       New York            Gold      1120.20   
1          102    Male   34    Los Angeles          Silver       780.50   
2          103  Female   43        Chicago          Bronze       510.75   
3          104    Male   30  San Francisco            Gold      1480.30   
4          105    Male   27          Miami          Silver       720.40   

   Items Purchased  Average Rating  Discount Applied  \
0               14             4.6              True   
1               11             4.1             False   
2                9             3.4              True   
3               19             4.7             False   
4               13             4.0              True   

   Days Since Last Purchase Satisfaction Level  
0                        25          Satisfied  
1                        18            Neutral  
2                        42        Unsatisfied 