Decision tree classifier

Mathematics into coding

In [None]:
import numpy as np
import pandas as pd

# Toy dataset: one categorical feature (colour) and the fruit label to predict
data = {
    "Feature": ["Red", "Red", "Green", "Green", "Red"],
    "Label": ["Apple", "Apple", "Grape", "Grape", "Apple"]
}
df = pd.DataFrame(data)

# Encode the categorical variables as integers
from sklearn.preprocessing import LabelEncoder

# LabelEncoder numbers classes in sorted order, so 'Green' -> 0 and 'Red' -> 1.
# Each column gets its own encoder: re-fitting a single shared encoder on the
# label column would throw away the feature mapping, making it impossible to
# encode new samples or decode values later via `classes_` / `inverse_transform`.
feature_encoder = LabelEncoder()
label_encoder = LabelEncoder()
df["Feature"] = feature_encoder.fit_transform(df["Feature"])  # 'Green' -> 0, 'Red' -> 1
df["Label"] = label_encoder.fit_transform(df["Label"])        # 'Apple' -> 0, 'Grape' -> 1
encoder = label_encoder  # original name; still refers to the encoder fitted on the labels

print("Encoded Dataset:")
print(df)

# Gini Impurity function
def gini_impurity(groups, classes):
    """Return the size-weighted Gini impurity of a candidate split.

    For every non-empty group the impurity is ``1 - sum(p_c ** 2)``, where
    ``p_c`` is the fraction of rows in the group whose label equals class
    ``c``. Each group's impurity is weighted by its share of all samples.

    Args:
        groups: Sequence of groups (e.g. the two sides of a split). Each group
            is a sequence of rows whose last element is the class label.
        classes: Class labels to count within each group.

    Returns:
        float: 0.0 when every non-empty group contains a single class; larger
        values mean more mixing. Also 0.0 when all groups are empty.
    """
    total_samples = sum(len(group) for group in groups)

    gini = 0.0
    for group in groups:
        size = len(group)
        if size == 0:  # An empty group contributes nothing (and would divide by zero)
            continue

        # Extract the labels once per group rather than once per class.
        labels = [row[-1] for row in group]

        score = 0.0
        for class_val in classes:
            proportion = labels.count(class_val) / size
            score += proportion ** 2
        gini += (1 - score) * (size / total_samples)

    return gini

# Split  dataset
def test_split(index, value, dataset):
    left, right = [], []
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

# Find  best split
def get_split(dataset):
    """Exhaustively search for the split with the lowest Gini impurity.

    Every feature column (all columns except the last, which holds the label)
    is tried with every value that occurs in that column as the threshold.
    The first candidate reaching the lowest impurity wins.

    Returns:
        dict: ``{"index": column, "value": threshold, "groups": (left, right)}``.
    """
    class_values = list(set(row[-1] for row in dataset))

    # Sentinel "no split found yet" state; 999 is above any possible Gini score.
    chosen = {"index": 999, "value": 999, "groups": None}
    lowest_gini = 999

    feature_count = len(dataset[0]) - 1
    for column in range(feature_count):
        for candidate in dataset:
            threshold = candidate[column]
            partition = test_split(column, threshold, dataset)
            impurity = gini_impurity(partition, class_values)
            if impurity < lowest_gini:
                lowest_gini = impurity
                chosen = {"index": column, "value": threshold, "groups": partition}
    return chosen

# Build  tree recursively
def to_terminal(group):
    """Return the most frequent class label (last element of each row) in ``group``."""
    labels = [record[-1] for record in group]
    distinct_labels = set(labels)
    return max(distinct_labels, key=lambda label: labels.count(label))

def split(node, max_depth, min_size, depth):
    """Grow the tree below ``node`` in place.

    ``node`` must carry a ``"groups"`` entry holding its (left, right) rows.
    That entry is removed and replaced by ``"left"`` / ``"right"`` children,
    each either a class label (leaf) or a nested split dict.

    Args:
        node: Split dict as produced by ``get_split``.
        max_depth: Depth at which both children are forced to be leaves.
        min_size: A side with this many rows or fewer becomes a leaf.
        depth: Depth of ``node`` itself (the root is 1).
    """
    left, right = node["groups"]
    del node["groups"]

    # One side is empty: the split separated nothing, so both children share one leaf.
    if not (left and right):
        leaf = to_terminal(left + right)
        node["left"] = leaf
        node["right"] = leaf
        return

    # Depth budget exhausted: close both sides off as leaves.
    if depth >= max_depth:
        left_leaf, right_leaf = to_terminal(left), to_terminal(right)
        node["left"] = left_leaf
        node["right"] = right_leaf
        return

    # Otherwise handle each side in turn, left subtree completely before right.
    for side, rows in (("left", left), ("right", right)):
        if len(rows) > min_size:
            node[side] = get_split(rows)
            split(node[side], max_depth, min_size, depth + 1)
        else:
            node[side] = to_terminal(rows)

def build_tree(train, max_depth, min_size):
    """Fit a decision tree on ``train`` and return its root node.

    The root split is chosen with ``get_split`` and then expanded recursively
    by ``split``, starting at depth 1.
    """
    tree = get_split(train)
    split(tree, max_depth=max_depth, min_size=min_size, depth=1)
    return tree

#  predictions
def predict(node, row):
    """Walk the tree from ``node`` and return the leaf label reached by ``row``.

    At each split the row goes left when ``row[index] < value`` and right
    otherwise; descent stops at the first child that is not a dict.
    """
    current = node
    while True:
        branch = "left" if row[current["index"]] < current["value"] else "right"
        child = current[branch]
        if not isinstance(child, dict):
            return child
        current = child

# Flatten the encoded DataFrame into plain [feature, label] row lists,
# the format the pure-Python tree functions operate on
dataset = df.to_numpy().tolist()

# Fit the hand-written decision tree and show its nested-dict structure
tree = build_tree(dataset, max_depth=3, min_size=1)
print("Decision Tree:", tree)

# Sanity check: predict each training row and compare with its true label
for row in dataset:
    prediction = predict(tree, row)
    print(f"Expected={row[-1]}, Predicted={prediction}")


Encoded Dataset:
   Feature  Label
0        1      0
1        1      0
2        0      1
3        0      1
4        1      0
Decision Tree: {'index': 0, 'value': 1, 'left': {'index': 0, 'value': 0, 'left': 1, 'right': 1}, 'right': {'index': 0, 'value': 1, 'left': 0, 'right': 0}}
Expected=0, Predicted=0
Expected=0, Predicted=0
Expected=1, Predicted=1
Expected=1, Predicted=1
Expected=0, Predicted=0


Experiments

Experiment1

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Space Exploration Dataset
data_space = {
    "Distance_from_Star_AU": [0.39, 0.72, 1.0, 1.5, 5.2],
    "Planet_Size_Earth_Radius": [0.38, 0.95, 1.0, 0.53, 11.2],
    "Atmosphere_Composition": ["O2", "CO2", "N2O", "CO2", "H2He"],
    "Habitable": ["No", "No", "Yes", "No", "No"]
}

df_space = pd.DataFrame(data_space)

# Encode categorical columns. The fitted encoders are kept by name so new samples
# can be encoded from their raw values and predictions decoded back to labels.
# Note: integer codes impose an arbitrary order on the atmosphere categories.
atmosphere_encoder = LabelEncoder()
habitable_encoder = LabelEncoder()
df_space["Atmosphere_Composition"] = atmosphere_encoder.fit_transform(df_space["Atmosphere_Composition"])
df_space["Habitable"] = habitable_encoder.fit_transform(df_space["Habitable"])  # 'No' -> 0, 'Yes' -> 1

space_features = ["Distance_from_Star_AU", "Planet_Size_Earth_Radius", "Atmosphere_Composition"]
X = df_space[space_features]
y = df_space["Habitable"]

# random_state fixes how ties between equally good splits are broken, so the
# fitted tree (and the prediction below) is reproducible across runs.
model_space = DecisionTreeClassifier(max_depth=3, random_state=42)
model_space.fit(X, y)

# Test sample with feature names; the atmosphere code comes from the fitted
# encoder instead of a hard-coded integer ("H2He" encodes to 1).
test_data = pd.DataFrame(
    [[1.2, 1.1, atmosphere_encoder.transform(["H2He"])[0]]],
    columns=space_features,
)
prediction_space = model_space.predict(test_data)
print("Prediction (Space Exploration):", prediction_space)
print("Decoded label:", habitable_encoder.inverse_transform(prediction_space))


Prediction (Space Exploration): [1]


Experiment2

In [None]:
# Cybersecurity Dataset
data_hacking = {
    "Login_Frequency": [10, 200, 3, 50, 1],
    "Country": ["US", "RU", "IN", "CN", "BR"],
    "Access_Time": ["Day", "Night", "Day", "Night", "Day"],
    "Hacker": ["No", "Yes", "No", "Yes", "No"]
}

df_hacking = pd.DataFrame(data_hacking)

# Encode categorical columns. The fitted encoders are kept by name so new samples
# can be encoded from their raw values and predictions decoded back to labels.
# Note: integer codes impose an arbitrary order on the country categories.
country_encoder = LabelEncoder()
access_time_encoder = LabelEncoder()
hacker_encoder = LabelEncoder()
df_hacking["Country"] = country_encoder.fit_transform(df_hacking["Country"])
df_hacking["Access_Time"] = access_time_encoder.fit_transform(df_hacking["Access_Time"])
df_hacking["Hacker"] = hacker_encoder.fit_transform(df_hacking["Hacker"])  # 'No' -> 0, 'Yes' -> 1

hacking_features = ["Login_Frequency", "Country", "Access_Time"]
X = df_hacking[hacking_features]
y = df_hacking["Hacker"]

# random_state fixes how ties between equally good splits are broken, so the
# fitted tree (and the prediction below) is reproducible across runs.
model_hacking = DecisionTreeClassifier(max_depth=3, random_state=42)
model_hacking.fit(X, y)

# Test sample with feature names; category codes come from the fitted encoders
# instead of hard-coded integers ("IN" encodes to 2, "Night" to 1).
test_data = pd.DataFrame(
    [[
        30,
        country_encoder.transform(["IN"])[0],
        access_time_encoder.transform(["Night"])[0],
    ]],
    columns=hacking_features,
)
prediction_hacking = model_hacking.predict(test_data)
print("Prediction (Cybersecurity):", prediction_hacking)
print("Decoded label:", hacker_encoder.inverse_transform(prediction_hacking))

Prediction (Cybersecurity): [1]


Experiment3

In [None]:
# Evolutionary Biology Dataset
data_evolution = {
    "Bone_Density": [1.2, 0.8, 0.5, 1.1, 0.7],
    "Leg_Count": [4, 4, 2, 4, 4],
    "Egg_Laying": ["No", "Yes", "Yes", "No", "Yes"],
    "Category": ["Mammal", "Reptile", "Bird", "Mammal", "Reptile"]
}

df_evolution = pd.DataFrame(data_evolution)

# Encode categorical columns. The fitted encoders are kept by name so new samples
# can be encoded from their raw values and predictions decoded back to labels.
egg_laying_encoder = LabelEncoder()
category_encoder = LabelEncoder()
df_evolution["Egg_Laying"] = egg_laying_encoder.fit_transform(df_evolution["Egg_Laying"])  # 'No' -> 0, 'Yes' -> 1
df_evolution["Category"] = category_encoder.fit_transform(df_evolution["Category"])  # 'Bird' -> 0, 'Mammal' -> 1, 'Reptile' -> 2

evolution_features = ["Bone_Density", "Leg_Count", "Egg_Laying"]
X = df_evolution[evolution_features]
y = df_evolution["Category"]

# random_state fixes how ties between equally good splits are broken, so the
# fitted tree (and the prediction below) is reproducible across runs.
model_evolution = DecisionTreeClassifier(max_depth=3, random_state=42)
model_evolution.fit(X, y)

# Test sample with feature names; the egg-laying code comes from the fitted
# encoder instead of a hard-coded integer ("Yes" encodes to 1).
test_data = pd.DataFrame(
    [[0.6, 2, egg_laying_encoder.transform(["Yes"])[0]]],
    columns=evolution_features,
)
prediction_evolution = model_evolution.predict(test_data)
print("Prediction (Evolutionary Biology):", prediction_evolution)
print("Decoded label:", category_encoder.inverse_transform(prediction_evolution))

Prediction (Evolutionary Biology): [0]


Experiment4

In [None]:
# Simple Biology Dataset
data_biology = {
    "Height_cm": [500, 50, 150, 3, 20],
    "Leaf_Size_cm2": [30, 5, 10, 0.5, 2],
    "Flowering_Season": ["Spring", "Spring", "Summer", "Summer", "Winter"],
    "Plant_Type": ["Tree", "Shrub", "Shrub", "Grass", "Grass"]
}

df_biology = pd.DataFrame(data_biology)

# Encode categorical columns. The fitted encoders are kept by name so new samples
# can be encoded from their raw values and predictions decoded back to labels.
# Note: integer codes impose an arbitrary order on the season categories.
season_encoder = LabelEncoder()
plant_type_encoder = LabelEncoder()
df_biology["Flowering_Season"] = season_encoder.fit_transform(df_biology["Flowering_Season"])
df_biology["Plant_Type"] = plant_type_encoder.fit_transform(df_biology["Plant_Type"])  # 'Grass' -> 0, 'Shrub' -> 1, 'Tree' -> 2

biology_features = ["Height_cm", "Leaf_Size_cm2", "Flowering_Season"]
X = df_biology[biology_features]
y = df_biology["Plant_Type"]

# random_state fixes how ties between equally good splits are broken, so the
# fitted tree (and the prediction below) is reproducible across runs.
model_biology = DecisionTreeClassifier(max_depth=3, random_state=42)
model_biology.fit(X, y)

# Test sample with feature names; the season code comes from the fitted
# encoder instead of a hard-coded integer ("Summer" encodes to 1).
test_data = pd.DataFrame(
    [[300, 15, season_encoder.transform(["Summer"])[0]]],
    columns=biology_features,
)
prediction_biology = model_biology.predict(test_data)
print("Prediction (Simple Biology):", prediction_biology)
print("Decoded label:", plant_type_encoder.inverse_transform(prediction_biology))



Prediction (Simple Biology): [1]


Experiment5

In [None]:
# Alien Life Dataset
data_alien = {
    "Body_Shape": ["Humanoid", "Insectoid", "Humanoid", "Blob", "Insectoid"],
    "Limb_Count": [4, 6, 4, 0, 8],
    "Habitat": ["Land", "Land", "Water", "Water", "Land"],
    "Species_Type": ["TypeA", "TypeB", "TypeA", "TypeC", "TypeB"]
}

df_alien = pd.DataFrame(data_alien)

# Encode categorical columns. The fitted encoders are kept by name so new samples
# can be encoded from their raw values and predictions decoded back to labels.
# Note: integer codes impose an arbitrary order on the body-shape categories.
body_shape_encoder = LabelEncoder()
habitat_encoder = LabelEncoder()
species_encoder = LabelEncoder()
df_alien["Body_Shape"] = body_shape_encoder.fit_transform(df_alien["Body_Shape"])
df_alien["Habitat"] = habitat_encoder.fit_transform(df_alien["Habitat"])
df_alien["Species_Type"] = species_encoder.fit_transform(df_alien["Species_Type"])  # 'TypeA' -> 0, 'TypeB' -> 1, 'TypeC' -> 2

alien_features = ["Body_Shape", "Limb_Count", "Habitat"]
X = df_alien[alien_features]
y = df_alien["Species_Type"]

# random_state fixes how ties between equally good splits are broken, so the
# fitted tree (and the prediction below) is reproducible across runs.
model_alien = DecisionTreeClassifier(max_depth=3, random_state=42)
model_alien.fit(X, y)

# Test sample with feature names; category codes come from the fitted encoders
# instead of hard-coded integers ("Blob" encodes to 0, "Water" to 1).
test_data = pd.DataFrame(
    [[
        body_shape_encoder.transform(["Blob"])[0],
        4,
        habitat_encoder.transform(["Water"])[0],
    ]],
    columns=alien_features,
)
prediction_alien = model_alien.predict(test_data)
print("Prediction (Alien Life):", prediction_alien)
print("Decoded label:", species_encoder.inverse_transform(prediction_alien))

Prediction (Alien Life): [2]
