In [39]:
%pip install scikit-learn pandas numpy matplotlib

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the data
data = pd.read_csv("data/titanic.csv")

# Select features from PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
X = data[features]
y = data["Survived"]

X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


In [2]:
# Handle missing values
X["Age"].fillna(X["Age"].median(), inplace=True)
X["Embarked"].fillna(X["Embarked"].mode()[0], inplace=True)

# Encode categorical variables
le = LabelEncoder()
X["Sex"] = le.fit_transform(X["Sex"]).astype(np.int8)
X["Embarked"] = le.fit_transform(X["Embarked"]).astype(np.int8)

# Convert to numpy arrays
X = X.values
y = y.values

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X["Age"].fillna(X["Age"].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Age"].fillna(X["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(val

In [3]:
# Assuming the SimpleDecisionTree class is already defined
from SimpleDecisionTree import SimpleDecisionTree

tree = SimpleDecisionTree(max_depth=5)
tree.fit(X_train, y_train)

In [4]:
y_pred = tree.predict(X_test)

In [5]:
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.80

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.90      0.84       105
           1       0.83      0.66      0.74        74

    accuracy                           0.80       179
   macro avg       0.81      0.78      0.79       179
weighted avg       0.81      0.80      0.80       179



In [6]:
def visualize_tree(node, depth=0):
    if "value" in node:
        print("  " * depth + f"Predict: {node['value']}")
    else:
        feature_names = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
        print(
            "  " * depth
            + f"Split on {feature_names[node['feature_index']]} < {node['threshold']:.2f}"
        )
        print("  " * (depth + 1) + "Left:")
        visualize_tree(node["left"], depth + 2)
        print("  " * (depth + 1) + "Right:")
        visualize_tree(node["right"], depth + 2)


visualize_tree(tree.tree)

Split on Sex < 1.00
  Left:
    Split on Pclass < 3.00
      Left:
        Split on Age < 3.00
          Left:
            Split on Pclass < 2.00
              Left:
                Predict: 0
              Right:
                Predict: 1
          Right:
            Split on Age < 28.00
              Left:
                Split on Age < 24.00
                  Left:
                    Predict: 1
                  Right:
                    Predict: 1
              Right:
                Split on Age < 50.00
                  Left:
                    Predict: 1
                  Right:
                    Predict: 1
      Right:
        Split on Fare < 23.45
          Left:
            Split on Embarked < 2.00
              Left:
                Split on Fare < 15.50
                  Left:
                    Predict: 1
                  Right:
                    Predict: 1
              Right:
                Split on Fare < 7.75
                  Left:
                    Predi

In [7]:
def calculate_feature_importance(tree):
    def count_feature_usage(node, counts):
        if "feature_index" in node:
            counts[node["feature_index"]] += 1
            count_feature_usage(node["left"], counts)
            count_feature_usage(node["right"], counts)

    feature_counts = [0] * tree.n_features
    count_feature_usage(tree.tree, feature_counts)
    total = sum(feature_counts)
    return [count / total for count in feature_counts]


feature_importance = calculate_feature_importance(tree)
for i, importance in enumerate(feature_importance):
    print(f"Feature {features[i]}: {importance:.4f}")

Feature Pclass: 0.1304
Feature Sex: 0.0435
Feature Age: 0.4783
Feature SibSp: 0.0435
Feature Parch: 0.0000
Feature Fare: 0.2609
Feature Embarked: 0.0435
