In [49]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv(
    "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
)

In [4]:
df = df.drop(["Name", "Ticket", "Cabin"], axis=1)

In [230]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.2500,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.9250,S
3,4,1,1,female,35.0,1,0,53.1000,S
4,5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,13.0000,S
887,888,1,1,female,19.0,0,0,30.0000,S
888,889,0,3,female,,1,2,23.4500,S
889,890,1,1,male,26.0,0,0,30.0000,C


In [6]:
cat_col = ["Pclass", "Sex", "Embarked"]
num_col = ["Age", "SibSp", "Parch", "Fare"]

In [7]:
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

In [119]:
preprocessing = make_column_transformer(
    (
        OrdinalEncoder(
            handle_unknown="use_encoded_value",
            unknown_value=-1,
            encoded_missing_value=-2,
        ),
        cat_col,
    ),
    (SimpleImputer(strategy="mean"), num_col),
    remainder="passthrough",
)

In [120]:
test = pd.DataFrame(
    preprocessing.fit_transform(df), columns=preprocessing.get_feature_names_out()
)

In [121]:
Y = test.remainder__Survived

In [122]:
x = test.drop("remainder__Survived", axis=1)

In [126]:
x

Unnamed: 0,ordinalencoder__Pclass,ordinalencoder__Sex,ordinalencoder__Embarked,simpleimputer__Age,simpleimputer__SibSp,simpleimputer__Parch,simpleimputer__Fare,remainder__PassengerId
0,2.0,1.0,2.0,22.000000,1.0,0.0,7.2500,1.0
1,0.0,0.0,0.0,38.000000,1.0,0.0,71.2833,2.0
2,2.0,0.0,2.0,26.000000,0.0,0.0,7.9250,3.0
3,0.0,0.0,2.0,35.000000,1.0,0.0,53.1000,4.0
4,2.0,1.0,2.0,35.000000,0.0,0.0,8.0500,5.0
...,...,...,...,...,...,...,...,...
886,1.0,1.0,2.0,27.000000,0.0,0.0,13.0000,887.0
887,0.0,0.0,2.0,19.000000,0.0,0.0,30.0000,888.0
888,2.0,0.0,2.0,29.699118,1.0,2.0,23.4500,889.0
889,0.0,1.0,0.0,26.000000,0.0,0.0,30.0000,890.0


In [207]:
def get_gini_score_cat(note_split: pd.Series, y: pd.Series):

    gini_list = []

    for i in np.sort(note_split.unique()):
        child_left_y = y[note_split == i]
        child_right_y = y[~(note_split == i)]

        left_impu = (
            1
            - (child_left_y.sum() / child_left_y.count()) ** 2
            - ((child_left_y.count() - child_left_y.sum()) / child_left_y.count()) ** 2
        )

        right_impu = (
            1
            - (child_right_y.sum() / child_right_y.count()) ** 2
            - ((child_right_y.count() - child_right_y.sum()) / child_right_y.count())
            ** 2
        )

        total_impu = (
            child_left_y.sum() / y.count() * left_impu
            + child_right_y.sum() / y.count() * right_impu
        )
        gini_list.append([note_split.name, i, total_impu])

    return gini_list

In [208]:
def get_gini_score_num(note_split: pd.Series, y: pd.Series):

    val_list = np.unique((np.sort(note_split)[:-1] + np.sort(note_split)[1:]) / 2)

    gini_list = []

    for i in val_list:
        child_left_y = y[note_split <= i]
        child_right_y = y[~(note_split <= i)]

        left_impu = (
            1
            - (child_left_y.sum() / child_left_y.count()) ** 2
            - ((child_left_y.count() - child_left_y.sum()) / child_left_y.count()) ** 2
        )

        right_impu = (
            1
            - (child_right_y.sum() / child_right_y.count()) ** 2
            - ((child_right_y.count() - child_right_y.sum()) / child_right_y.count())
            ** 2
        )

        total_impu = (
            child_left_y.sum() / y.count() * left_impu
            + child_right_y.sum() / y.count() * right_impu
        )

        gini_list.append([note_split.name, i, total_impu])

    return gini_list

In [236]:
np.argmin(np.array(get_gini_score_cat(x.simpleimputer__Fare, Y))[:, 2])

38

In [237]:
get_gini_score_cat(x.simpleimputer__Fare, Y)[38]

['simpleimputer__Fare', 7.8792, 0.17894138604029308]

In [278]:
def find_node(x: pd.DataFrame, y: pd.Series):
    split_list = []

    for i in x.columns:
        split_list.extend(get_gini_score_num(x[i], y))

    best_split = split_list[np.argmin(np.array(split_list)[:, 2])]
    return best_split

In [282]:
find_node(x, Y)

  - (child_right_y.sum() / child_right_y.count()) ** 2
  - ((child_right_y.count() - child_right_y.sum()) / child_right_y.count())


['ordinalencoder__Sex', 0.0, 0.13760148246050433]

In [281]:
test

['ordinalencoder__Sex', 0.0, 0.13760148246050433]

In [None]:
class DecisionTree:
    def __init__(self, node):
        self.value = node
        self.true_left = None
        self.false_right = None