### Import libraries

In [19]:

# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.preprocessing import LabelEncoder
import numpy as np

### Customize

In [21]:
# Input CSV files: historical records for training, new customers to score
past_data_path = "0509 Decision tree past data.csv"
new_data_path = "0509 Decision tree new data.csv"

# Output artefacts written by the later cells
tree_image_path = "0509 decision_tree.png"
rules_file_path = "0509 decision_rules.txt"
decision_file_path = "0509 decision.csv"

# Column the model learns to predict, and the columns it is trained on
target = "Purchase"
features = ["Age", "Income", "Gender"]

### Load and clean past data

In [23]:
# Load past data (the records the decision tree is trained on)
past_data = pd.read_csv(past_data_path)
print("Past Data Columns:", past_data.columns.tolist())  # Debugging line

# Drop unnamed columns if they exist. The .copy() makes the result an
# independent frame so the column assignments below cannot raise
# SettingWithCopyWarning.
past_data = past_data.loc[:, ~past_data.columns.str.contains('^Unnamed')].copy()

# Ensure Age and Income are numeric; unparseable entries become NaN
past_data['Age'] = pd.to_numeric(past_data['Age'], errors='coerce')
past_data['Income'] = pd.to_numeric(past_data['Income'], errors='coerce')

# Drop incomplete rows BEFORE encoding. If Gender were encoded first, a missing
# Gender would be turned into a class of its own (or make the encoder fail) and
# would no longer be NaN by the time rows are dropped. Only the model columns
# are checked, so a gap in an unrelated column does not discard a usable row.
past_data = past_data.dropna(subset=features + [target])

# Encode categorical variable (Gender). Any non-numeric dtype is encoded
# (object, pandas string, categorical), not just 'object'.
label_encoders = {}
if not pd.api.types.is_numeric_dtype(past_data['Gender']):
    le = LabelEncoder()
    past_data['Gender'] = le.fit_transform(past_data['Gender'])
    # Keep the fitted encoder so new data can be encoded with the same codes
    label_encoders['Gender'] = le
    print("Recognized Gender Classes in Past Data:", le.classes_)  # Debugging line

Past Data Columns: ['Cust_Id', 'Age', 'Income', 'Gender', 'Purchase']


### Train and save Decision Tree and decision rules

In [25]:
# Train Decision Tree
X_train = past_data[features]
y_train = past_data[target]
# A fixed random_state makes the fitted tree reproducible: scikit-learn
# permutes the candidate features at every split, so without a seed two runs
# on the same data can produce different trees, rules and predictions.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# plot_tree indexes class_names by position in dt_model.classes_ (sorted
# ascending). The friendly labels are only valid for a two-class target;
# otherwise fall back to the raw class values so the plot neither fails nor
# mislabels a node.
# NOTE(review): assumes the lower-sorted class means "no purchase" - confirm.
if len(dt_model.classes_) == 2:
    class_names = ["No Purchase", "Purchase"]
else:
    class_names = [str(label) for label in dt_model.classes_]

# Save decision tree diagram (explicit figure/axes, closed afterwards so the
# large 300-dpi canvas is released and not rendered inline)
fig, ax = plt.subplots(figsize=(16, 10), dpi=300)
plot_tree(dt_model, feature_names=features, class_names=class_names, filled=True, fontsize=10, rounded=True, precision=2, ax=ax)
fig.savefig(tree_image_path, bbox_inches='tight')
plt.close(fig)
print(f"Decision tree diagram saved to {tree_image_path}")

# Convert decision rules to plain English
def explain_decision_tree(rules):
    """Rewrite scikit-learn ``export_text`` output as indented IF/THEN lines.

    Each input line looks like ``|   |--- Age <= 42.50`` (a split) or
    ``|   |--- class: 1`` (a leaf). The tree-drawing prefix is removed, the
    number of ``|`` characters in that prefix sets the indentation (two spaces
    per pipe), and the remaining text becomes either ``IF <condition>`` or
    ``THEN the decision is: <class>``.

    Parameters
    ----------
    rules : str
        Text produced by ``sklearn.tree.export_text``.

    Returns
    -------
    str
        One explanation line per non-blank input line, joined with newlines.
    """
    explanation = []
    for line in rules.splitlines():
        # export_text ends with a newline; skip blank lines instead of
        # emitting an empty "IF " statement for them.
        if not line.strip():
            continue

        # Strip only the LEADING "|   |--- " drawing characters. Splitting on
        # every "-" would cut negative thresholds ("Age <= -0.50") and
        # hyphenated feature names in half.
        body = line.lstrip("|- ")
        prefix = line[:len(line) - len(body)]
        # Count pipes in the prefix only, so a "|" inside a feature name does
        # not inflate the depth.
        depth = prefix.count("|")
        condition = body.strip()
        indent = "  " * depth

        # A leaf line starts with "class:" (or "weights:" when export_text is
        # called with show_weights=True). Testing the start of the text keeps
        # features such as "Pclass" from being mistaken for a leaf.
        if condition.startswith(("class:", "weights:")):
            decision = condition.replace('class:', '').strip()
            explanation.append(indent + f"THEN the decision is: {decision}")
        else:
            explanation.append(indent + f"IF {condition}")
    return "\n".join(explanation)

# Export the fitted tree as text, then rewrite it as IF/THEN statements
rules = export_text(dt_model, feature_names=features, spacing=3)
plain_english_rules = explain_decision_tree(rules)

# Save decision rules in plain English to a file. The encoding is given
# explicitly so the file is written the same way on every platform, even if
# a feature name contains non-ASCII characters.
with open(rules_file_path, "w", encoding="utf-8") as rules_file:
    rules_file.write(plain_english_rules)
print(f"Decision rules saved to {rules_file_path}")

Decision tree diagram saved to 0509 decision_tree.png
Decision rules saved to 0509 decision_rules.txt


### Use decision tree on new data and print results 

In [27]:
# Load new customer data
new_data = pd.read_csv(new_data_path)

print("New Data Columns:", new_data.columns.tolist())  # Debugging line

# Drop unnamed columns if they exist. The .copy() makes the result an
# independent frame so the column assignments below cannot raise
# SettingWithCopyWarning.
new_data = new_data.loc[:, ~new_data.columns.str.contains('^Unnamed')].copy()

# Ensure new data has the same structure
if not all(col in new_data.columns for col in features):
    raise ValueError("Missing expected columns in new customer data")

# Convert Age and Income to numeric; unparseable entries become NaN
new_data['Age'] = pd.to_numeric(new_data['Age'], errors='coerce')
new_data['Income'] = pd.to_numeric(new_data['Income'], errors='coerce')

print("New Data Before Gender Encoding:\n", new_data.head())  # Debugging line

# Encode Gender using the same encoder that was fitted on the past data
if 'Gender' in label_encoders:
    known_genders = label_encoders['Gender'].classes_
    print("Recognized Gender Classes in Model:", known_genders)  # Debugging line
    # LabelEncoder assigns each class its position in classes_, so a single
    # vectorized lookup replaces a transform() call per row. Values the model
    # has never seen (including missing ones) map to NaN and are dropped below.
    gender_codes = {label: code for code, label in enumerate(known_genders)}
    new_data['Gender'] = new_data['Gender'].map(gender_codes)
else:
    # No encoder means Gender was already numeric in the past data. Coerce the
    # new values too, so stray text becomes NaN instead of reaching predict().
    new_data['Gender'] = pd.to_numeric(new_data['Gender'], errors='coerce')

print("New Data After Gender Encoding:\n", new_data.head())  # Debugging line

# Drop rows with NaN values in the model features. Only the features are
# checked, so a customer with a gap in an unrelated field is still scored.
unusable_rows = new_data[features].isna().any(axis=1)
print("Dropped Rows:\n", new_data.loc[unusable_rows])  # Debugging line
new_data = new_data.loc[~unusable_rows]

new_data_copy = new_data.copy()

# Ensure new data has the same features as training data
new_data = new_data[features]

# Preserve all original fields while extracting only features for prediction
X_new = new_data_copy[features]

# Predict purchase decision for new customers
if not X_new.empty:
    predictions = dt_model.predict(X_new)
    new_data_copy['Purchase Decision'] = predictions
    print("Predicted Purchase Decisions for New Customers:")
    print(new_data_copy)

    # Save predictions with all original fields
    new_data_copy.to_csv(decision_file_path, index=False)
    print(f"Predictions saved to {decision_file_path}")
else:
    print("No valid data available for prediction after encoding.")


New Data Columns: ['Cust_Id', 'Age', 'Income', 'Gender']
New Data Before Gender Encoding:
    Cust_Id  Age  Income  Gender
0     1001   42  129521       1
1     1002   39   89277       0
2     1003   34   83659       0
3     1004   56  188839       0
4     1005   47   99501       0
New Data After Gender Encoding:
    Cust_Id  Age  Income  Gender
0     1001   42  129521       1
1     1002   39   89277       0
2     1003   34   83659       0
3     1004   56  188839       0
4     1005   47   99501       0
Dropped Rows:
 Empty DataFrame
Columns: [Cust_Id, Age, Income, Gender]
Index: []
Predicted Purchase Decisions for New Customers:
    Cust_Id  Age  Income  Gender  Purchase Decision
0      1001   42  129521       1                  0
1      1002   39   89277       0                  1
2      1003   34   83659       0                  1
3      1004   56  188839       0                  0
4      1005   47   99501       0                  1
5      1006   43  126594       1                  1