# CA03 – Decision Tree **Algorithm**

# **1. Data Source and Contents**

In [None]:
import pandas as pd
import numpy as np

In [None]:
# The census extract is hosted on GitHub; pandas reads it directly over HTTPS.
# The two string pieces below concatenate to the original single URL.
url = (
    "https://github.com/ArinB/MSBA-CA-03-Decision-Trees/"
    "blob/master/census_data.csv?raw=true"
)
df = pd.read_csv(url)

# Preview the first rows to confirm the load worked.
df.head()

# **2. Data Quality Analysis (DQA)**

In [None]:
# Dimensions and per-column dtypes of the loaded frame
print(f"Shape: {df.shape}")
print(f"\nData Types:\n {df.dtypes}")


In [None]:
# describe() with default arguments summarises the frame
print(f"\nSummary Statistics:\n {df.describe()}")


In [None]:
# Count of null entries in each column
print(f"\nMissing Values:\n {df.isna().sum()}")

In [None]:
# Number of rows that exactly repeat an earlier row
print(f"\nDuplicate Rows: {df.duplicated().sum()}")

In [None]:
#using ydataprofiling library to generate report and correlation matrix.
!pip install ydata-profiling
from ydata_profiling import ProfileReport

profile = ProfileReport(df, title="Census Data Report", explorative=True)
profile.to_file("census_report.html")

from google.colab import files
files.download("census_report.html")
#file will download. Correlation matrix and bar charts of frequency per instance are included.

In [None]:
# Descriptive statistics for every column, whatever its dtype
df.describe(include="all")

**Q.1 Why does it make sense to discretize columns for this problem?**

It makes sense to discretize columns for this problem because several features, such as age, hours worked per week, and capital gains, are continuous numerical variables. If these values were left continuous, the decision tree could create splits based on very specific numeric thresholds, resulting in a large number of branches and a more complex tree. By discretizing these values into bins, ranges of values are grouped together, reducing unnecessary splits and simplifying the model. This also reflects real-world reasoning, where small differences (for example, between a 50-year-old and a 51-year-old) are unlikely to meaningfully affect income outcomes. Overall, discretization helps reduce model complexity and lowers the risk of overfitting.

**Q.2 What might be the issues (if any) if we DID NOT discretize the columns?**

If the columns were not discretized, the decision tree would be more prone to overfitting, as it could create splits based on every distinct numeric value. Small variations in age, capital gain, or hours worked per week, which may have minimal real-world impact, could lead to very different split decisions in the tree. This would result in a highly complex model that fits the training data too closely and may not generalize well to new data.

In [None]:
# Data quality report: one row per column of df
null_counts = df.isnull().sum()  # computed once, used for both the count and the percentage

dq_report = pd.DataFrame({
    'Data Type': df.dtypes,
    'Non-Null Count': df.count(),
    'Missing Values': null_counts,
    'Missing %': (null_counts / len(df)) * 100,
    'Unique Values': df.nunique(),
})


In [None]:
# Outlier check: count values outside the 1.5 * IQR fences, per numeric column
numeric_df = df.select_dtypes(include=['int64', 'float64'])

Q1, Q3 = numeric_df.quantile(0.25), numeric_df.quantile(0.75)
IQR = Q3 - Q1

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Column-wise comparison against the per-column fences
outliers = numeric_df.lt(lower_bound) | numeric_df.gt(upper_bound)

# Assignment aligns on column name, so columns that are not in numeric_df get NaN
dq_report['Outlier'] = outliers.sum()
dq_report


**Data cleaning**

This data set is made up of binned (categorical) values rather than continuous variables, so duplicate rows are expected: each variable holds a category rather than an individual number, and different individuals can fall into exactly the same combination of bins. Dropping the duplicates would skew the data, because the duplicates represent category combinations that often appear together.

In [None]:
#data cleaning

# Drop one of the two education columns since they are highly correlated.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError because the
# column is already gone.
df = df.drop(columns=['education_num_bin'], errors='ignore')


In [None]:
#label encoding and splitting the data

from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the target column first, then overwrite the column with
# its integer codes; the fitted encoder stays available as le_income.
le_income = LabelEncoder().fit(df['y'])
df['y'] = le_income.transform(df['y'])


In [None]:
# All object-dtype columns are encoded except 'flag', which is needed as text
# for the train/test partition further down.
categorical_cols = df.select_dtypes(include='object').columns
categorical_cols = categorical_cols.drop('flag')

# One fitted encoder per column, kept so new rows can be encoded the same way.
label_encoders = {col: LabelEncoder().fit(df[col]) for col in categorical_cols}

# Replace each text column with its integer codes.
for col, le in label_encoders.items():
    df[col] = le.transform(df[col])


In [None]:
# The 'flag' column marks which partition each row belongs to
train_df = df.loc[df['flag'].eq('train')]
test_df = df.loc[df['flag'].eq('test')]

# Features: everything except the target 'y' and the partition marker
x_train = train_df.drop(columns=['y', 'flag'])
y_train = train_df['y']

x_test = test_df.drop(columns=['y', 'flag'])
y_test = test_df['y']


# **3. Build Decision Tree Classifier Models**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline tree with hand-picked constraints; fit() returns the estimator
# itself, so construction and training are chained.
dtree = DecisionTreeClassifier(
    max_depth=10,
    min_samples_leaf=15,
    max_features=None,
    random_state=101,
).fit(x_train, y_train)

# Predictions on the held-out rows, used by the evaluation cells
y_pred = dtree.predict(x_test)

# **4. Evaluate Decision Tree Performance**

In [None]:
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)

# Confusion matrix of actual vs. predicted labels on the test rows
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(f"Confusion Matrix:\n {cm}")


In [None]:
# Unpack the 2x2 matrix row by row: first row is (TN, FP), second is (FN, TP)
(TN, FP), (FN, TP) = cm

print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")
print(f"True Positives (TP): {TP}")


In [None]:
# Accuracy of the baseline predictions on the test rows
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# Precision of the baseline predictions on the test rows
precision = precision_score(y_true=y_test, y_pred=y_pred)
print(f"Precision: {precision}")

In [None]:
# Recall of the baseline predictions on the test rows
recall = recall_score(y_true=y_test, y_pred=y_pred)
print(f"Recall: {recall}")

In [None]:
# F1 score of the baseline predictions on the test rows
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f"F1 Score: {f1}")

# **5. Tune Decision Tree Performance**

**Q.3 Decision Tree Hyper-parameter variation vs. performance**

In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Run 1: compare split criteria, leaving every other hyper-parameter at its default.
# NOTE(review): the winner is picked by accuracy on x_test / y_test, the same
# rows later used to report final performance - confirm this is intended.
run1_results = []

for criter in ("gini", "entropy"):
    classf = DecisionTreeClassifier(criterion=criter, random_state=101).fit(x_train, y_train)
    pred = classf.predict(x_test)

    tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

    # One record per candidate; keys become the columns of run1_df
    run1_results.append(dict(
        criterion=criter,
        accuracy=accuracy_score(y_test, pred),
        precision=precision_score(y_test, pred),
        recall=recall_score(y_test, pred),
        f1=f1_score(y_test, pred),
        TP=tp,
        TN=tn,
        FP=fp,
        FN=fn,
    ))

run1_df = pd.DataFrame(run1_results)
display(run1_df)

# Highest-accuracy row first, then read its criterion
best_criterion = run1_df.sort_values(by="accuracy", ascending=False)["criterion"].iloc[0]
print("Best criterion:", best_criterion)

In [None]:
# Run 2: sweep min_samples_leaf with the criterion fixed to the Run 1 winner.
# NOTE(review): selection uses accuracy on x_test / y_test - confirm this is intended.
run2_results = []

for leaf in (5, 10, 15, 20, 25, 30, 35, 40):
    classf = DecisionTreeClassifier(
        criterion=best_criterion,
        min_samples_leaf=leaf,
        random_state=101,
    ).fit(x_train, y_train)
    pred = classf.predict(x_test)

    tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

    # One record per candidate; keys become the columns of run2_df
    run2_results.append(dict(
        min_samples_leaf=leaf,
        accuracy=accuracy_score(y_test, pred),
        precision=precision_score(y_test, pred),
        recall=recall_score(y_test, pred),
        f1=f1_score(y_test, pred),
        TP=tp,
        TN=tn,
        FP=fp,
        FN=fn,
    ))

run2_df = pd.DataFrame(run2_results)
display(run2_df)

# Highest-accuracy row first; int() turns the pandas scalar into a plain int
best_leaf = int(run2_df.sort_values(by="accuracy", ascending=False)["min_samples_leaf"].iloc[0])
print("Best min_samples_leaf:", best_leaf)

In [None]:
# Accuracy as a function of min_samples_leaf (Run 2)
fig, ax = plt.subplots()
ax.plot(run2_df["min_samples_leaf"], run2_df["accuracy"], marker="o")
ax.set_xlabel("min_samples_leaf")
ax.set_ylabel("accuracy")
ax.set_title("Run 2: min_samples_leaf vs accuracy")
plt.show()

In [None]:
# Run 3: sweep max_features with criterion and min_samples_leaf fixed to the
# winners of the previous runs.
# NOTE(review): selection uses accuracy on x_test / y_test - confirm this is intended.
run3_results = []

for mf in (None, "sqrt", 0.3, 0.4, 0.5, 0.6, 0.7, 0.8):
    classf = DecisionTreeClassifier(
        criterion=best_criterion,
        min_samples_leaf=best_leaf,
        max_features=mf,
        random_state=101,
    ).fit(x_train, y_train)
    pred = classf.predict(x_test)

    tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

    # One record per candidate; keys become the columns of run3_df
    run3_results.append(dict(
        max_features=mf,
        accuracy=accuracy_score(y_test, pred),
        precision=precision_score(y_test, pred),
        recall=recall_score(y_test, pred),
        f1=f1_score(y_test, pred),
        TP=tp,
        TN=tn,
        FP=fp,
        FN=fn,
    ))

run3_df = pd.DataFrame(run3_results)
display(run3_df)

# The candidates mix None, a string and floats, so the winning value is read
# from the top row as-is, with no numeric conversion.
run3_ranked = run3_df.sort_values(by="accuracy", ascending=False)
best_max_features = run3_ranked.iloc[0]["max_features"]
print("Best max_features:", best_max_features)


In [None]:
# Accuracy per max_features candidate (Run 3); candidates are cast to str so
# the mixed None / string / float values can share one axis.
fig, ax = plt.subplots()
ax.plot(run3_df["max_features"].astype(str), run3_df["accuracy"], marker="o")
ax.set_xlabel("max_features")
ax.set_ylabel("accuracy")
ax.set_title("Run 3: max_features vs accuracy")
plt.show()

In [None]:
# Run 4: sweep max_depth with the three previously selected hyper-parameters fixed.
# NOTE(review): selection uses accuracy on x_test / y_test - confirm this is intended.
run4_results = []

for depth in (2, 4, 6, 8, 10, 12, 14, 16):
    classf = DecisionTreeClassifier(
        criterion=best_criterion,
        min_samples_leaf=best_leaf,
        max_features=best_max_features,
        max_depth=depth,
        random_state=101,
    ).fit(x_train, y_train)
    pred = classf.predict(x_test)

    tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

    # One record per candidate; keys become the columns of run4_df
    run4_results.append(dict(
        max_depth=depth,
        accuracy=accuracy_score(y_test, pred),
        precision=precision_score(y_test, pred),
        recall=recall_score(y_test, pred),
        f1=f1_score(y_test, pred),
        TP=tp,
        TN=tn,
        FP=fp,
        FN=fn,
    ))

run4_df = pd.DataFrame(run4_results)
display(run4_df)

# Highest-accuracy row first; int() turns the pandas scalar into a plain int
best_depth = int(run4_df.sort_values(by="accuracy", ascending=False)["max_depth"].iloc[0])
print("Best max_depth:", best_depth)

In [None]:
# Accuracy as a function of max_depth (Run 4)
fig, ax = plt.subplots()
ax.plot(run4_df["max_depth"], run4_df["accuracy"], marker="o")
ax.set_xlabel("max_depth")
ax.set_ylabel("accuracy")
ax.set_title("Run 4: max_depth vs accuracy")
plt.show()

In [None]:
# Train the final tree with the winners of the four tuning runs
final_model = DecisionTreeClassifier(
    criterion=best_criterion,
    min_samples_leaf=best_leaf,
    max_features=best_max_features,
    max_depth=best_depth,
    random_state=101,
).fit(x_train, y_train)

final_pred = final_model.predict(x_test)

# Report test accuracy together with the configuration that produced it
print(f"Final Accuracy: {accuracy_score(y_test, final_pred)}")
print("Best Parameters:")
print(f"criterion = {best_criterion}")
print(f"min_samples_leaf = {best_leaf}")
print(f"max_features = {best_max_features}")
print(f"max_depth = {best_depth}")

In [None]:
import time

# Measure how long one fit of the final model takes.
# perf_counter() is a monotonic, high-resolution clock meant for timing short
# intervals; time.time() is wall-clock time, can be coarse on some platforms,
# and can jump if the system clock is adjusted.
start_time = time.perf_counter()

final_model.fit(x_train, y_train)

end_time = time.perf_counter()

total_time = end_time - start_time

print("Total training time (seconds):", total_time)


In [None]:
!pip install graphviz

# **6. Visualize Your Best Decision Tree using GraphViz**

In [None]:
import re
from sklearn.tree import export_graphviz
import graphviz

# Replace every run of characters outside [0-9a-zA-Z_] in a column name with a
# single underscore before the names are written into the DOT source.
unsafe_chars = re.compile(r'[^0-9a-zA-Z_]+')
safe_feature_names = [unsafe_chars.sub('_', str(name)) for name in x_train.columns]

# out_file=None makes export_graphviz return the DOT source as a string
dot_data = export_graphviz(
    final_model,
    out_file=None,
    feature_names=safe_feature_names,
    class_names=["LE_50K", "GT_50K"],
    filled=True,
    rounded=True,
    special_characters=False,
)

# Last expression of the cell, so the notebook renders the tree inline
graphviz.Source(dot_data)

# The visual is too large to upload to GitHub. Please run the code to see the final decision tree visualization.

# **7. Conclusion**

**Q.4 How long was your total run time to train the best model?**

Based on the time measurement cell in our notebook, the total run time to train the best-performing decision tree was approximately 0.021 seconds. Given the dataset size and the selected hyperparameters, this indicates that training was computationally efficient. Since decision trees do not require iterative optimization like some other models, the training process was relatively fast even with tuning applied.

**Q.5 Did you find the BEST TREE?**

Not necessarily. We selected the best-performing tree from the specific hyperparameter combinations we tested (including max_depth = 16, min_samples_leaf = 40, max_features = 0.6, and criterion = gini). However, this does not guarantee that it is the absolute best possible tree overall.

There are additional hyperparameters (such as min_samples_split, max_leaf_nodes, or different depth limits) that were not explored in this analysis. It is possible that a different combination outside our tested grid could achieve better performance. The final model achieved approximately 82.35% test accuracy, which shows strong predictive ability but confirms that the model is not perfect and could potentially be further optimized.

**Q.6 Write your observations from the visualization of the best tree**

From the visualization of the best-performing decision tree, the root node begins with the MSR (Marriage Status & Relationship) bin, indicating that it was the most informative feature for the first split based on Gini impurity. If that condition is false, the tree evaluates capital gains; if true, it moves to occupation_bin. This suggests that relationship status, capital gains, and occupation are strong predictors of income category in this dataset.

The tree was limited to a maximum depth of 16, which was one of the selected hyperparameter constraints. Although the tree is relatively deep, several terminal nodes still show non-zero Gini impurity values, meaning the leaves are not perfectly pure. This indicates that the model does not completely separate the income classes.

Additionally, occupation_bin appears more frequently in upper and mid-level splits compared to race_sex_bin, suggesting occupation has greater predictive influence. Toward the bottom of the tree, age_bin appears more often, indicating that age refines predictions after broader splits have already been made.

**Q.7 Will this Tree “overfit”? (Hint: Is this tree “fully grown”)**

This tree is not fully grown. A fully grown decision tree would have no constraints on parameters such as max_depth, min_samples_leaf, or min_samples_split, allowing it to continue splitting until all leaves are pure.

In our model, we imposed constraints including max_depth = 16 and min_samples_leaf = 40, which prevent excessive splitting. Because of these limitations, the tree cannot perfectly memorize the training data. The presence of non-zero Gini values in the leaf nodes further confirms that the model is not fully grown. While some risk of overfitting may still exist due to the tree’s depth, the imposed constraints help reduce that risk.

# **8. Prediction using your “trained” Decision Tree Model**

In [None]:
import pandas as pd
import re


# Reuse the tuned tree (final_model) under a shorter name for the prediction steps below.
model = final_model

def normalize(s: str) -> str:
    """Return a canonical form of ``s`` for tolerant label matching.

    The value is converted with ``str()``, en/em dashes become ASCII hyphens,
    every run of whitespace (spaces, tabs, newlines) collapses to a single
    space, surrounding whitespace is removed and the text is lower-cased.

    Args:
        s: Any value; non-strings are converted with ``str()``.

    Returns:
        The normalized, lower-cased string.
    """
    text = str(s).replace("–", "-").replace("—", "-")
    # split()/join collapses runs of ANY length. A single replace("  ", " ")
    # pass leaves two spaces behind wherever there were three or more.
    return " ".join(text.split()).lower()

def pick_label_contains(col, target_text):
    """Return the original class label of ``label_encoders[col]`` that best matches ``target_text``.

    Matching is done on normalized text, in this order of preference:

    1. the whole label equals the target;
    2. the label equals the target once a leading ordinal prefix such as
       ``"c. "`` is removed (labels in this notebook look like ``'c. > 0'``);
    3. the first label that merely contains the target (previous behaviour).

    Steps 1 and 2 stop a short target from resolving to a longer label that
    happens to contain it and sorts earlier, e.g. "High" picking a label such
    as "Mid - High", or "income" picking a label such as "no income".

    Args:
        col: Key into ``label_encoders``.
        target_text: Human-readable text to look for.

    Returns:
        The matching entry of ``classes_``, unchanged, so it can be passed to
        the encoder's ``transform``.

    Raises:
        ValueError: If no class label contains ``target_text``.
    """
    target = normalize(target_text)
    classes = list(label_encoders[col].classes_)
    normalized = [normalize(c) for c in classes]

    # Pass 1: exact match, with or without the leading "x. " ordinal prefix.
    for c, norm in zip(classes, normalized):
        if target == norm or target == re.sub(r'^[a-z]\.\s*', '', norm):
            return c

    # Pass 2: fall back to the original first-substring-match behaviour.
    for c, norm in zip(classes, normalized):
        if target in norm:
            return c

    raise ValueError(f"Could not find a class in {col} containing '{target_text}'. Classes were: {classes}")

def pick_range_label(value, classes):
    """Map a numeric value to the first bin label whose range contains it.

    Labels look like ``'a. 0-30'`` or ``'d. 41-50 & 61-70'``; a label may hold
    several ``lo-hi`` ranges and matches if the value lies in any of them
    (bounds inclusive). En and em dashes in a label are treated like ASCII
    hyphens, so a label written as ``'41–50'`` is parsed the same way as
    ``'41-50'``.

    Args:
        value: Number (or numeric string) to place in a bin.
        classes: Iterable of bin labels, e.g. an encoder's ``classes_``.

    Returns:
        The matching label exactly as it appears in ``classes``.

    Raises:
        ValueError: If no label contains a range covering ``value``.
    """
    v = float(value)
    for lab in classes:
        # Parse a dash-normalized copy; the original label is what gets returned.
        txt = str(lab).replace("–", "-").replace("—", "-")

        # Every "lo-hi" pair inside the label, e.g. 41-50 and 61-70
        for lo, hi in re.findall(r'(\d+)\s*-\s*(\d+)', txt):
            if float(lo) <= v <= float(hi):
                return lab

    raise ValueError(f"Could not map value={value} to any bin label in: {list(classes)}")

# 1) Describe the new individual with the raw (un-encoded) bin labels.
#    row_labels keeps those labels, so it can be inspected to confirm that each
#    lookup picked the intended class.
row_labels = {
    # hours worked per week = 48
    "hours_per_week_bin": pick_range_label(48, label_encoders["hours_per_week_bin"].classes_),
    # occupation = Mid - Low
    "occupation_bin": pick_label_contains("occupation_bin", "Mid - Low"),
    # marriage status & relationships = High
    "msr_bin": pick_label_contains("msr_bin", "High"),
    # capital gain = Yes -> capital_gl_bin classes look like: 'a. = 0', 'b. < 0', 'c. > 0'
    "capital_gl_bin": pick_label_contains("capital_gl_bin", "> 0"),
    # race-sex group = Mid
    "race_sex_bin": pick_label_contains("race_sex_bin", "Mid"),
    # education category = High
    "education_bin": pick_label_contains("education_bin", "High"),
    # work class = Income
    "workclass_bin": pick_label_contains("workclass_bin", "income"),
    # age = 58
    "age_bin": pick_range_label(58, label_encoders["age_bin"].classes_),
}

# 2) One-row frame with columns in the same order as the training features
new_df = pd.DataFrame([row_labels]).reindex(columns=x_train.columns)

# 3) Encode each column with the encoder that was fitted for it
for col in new_df.columns:
    if col not in label_encoders:
        continue
    new_df[col] = label_encoders[col].transform(new_df[col].astype(str))

# 4) Predicted class and class probabilities for the single row
pred_num = model.predict(new_df)[0]
proba = model.predict_proba(new_df)[0]          # [P(class0), P(class1)]

if pred_num == 0:
    pred_label = "<=50K"
else:
    pred_label = ">50K"

# Probability the model assigns to the class it predicted
prob_correct = float(proba.max())

print(f"Q8 Prediction: {pred_label}")
print(f"Probability prediction is correct: {prob_correct}")
print(f"Probabilities [<=50K, >50K]: {proba}")
print("Row used (encoded):")
display(new_df)


chatgpt link. Please scroll up to see project related chats.
https://chatgpt.com/share/e/6994dd80-f6a4-800d-89c1-2dc0231349b3
