In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [None]:
def load_and_preprocess(path):
    """Load the Titanic CSV at ``path`` and return a train/test split.

    Processing steps, in order:

    1. Keep only the columns Pclass, Gender, Age, Fare, Embarked, Survived.
    2. Drop every row that has a missing value in any of those columns.
    3. Replace the numeric ``Age`` column with a categorical ``AgeGroup``
       ('Child' when Age < 16, otherwise 'Adult').
    4. One-hot encode Gender, Embarked and AgeGroup with ``pd.get_dummies``.
    5. Separate the ``Survived`` target from the features and hold out 20%
       of the rows as a test set (``random_state=42`` keeps the split
       reproducible).

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as produced by
        ``train_test_split``.

    Raises
    ------
    KeyError
        Raised by pandas when one of the required columns is absent.
    """
    required_columns = ['Pclass', 'Gender', 'Age', 'Fare', 'Embarked', 'Survived']
    categorical_columns = ['Gender', 'Embarked', 'AgeGroup']

    raw = pd.read_csv(path)

    # Build the cleaned frame as one non-mutating chain. Writing a new column
    # into the result of a selection + dropna() (``df['AgeGroup'] = ...``) is
    # the chained-assignment pattern that can emit SettingWithCopyWarning;
    # .assign() always returns a fresh frame instead.
    cleaned = (
        raw[required_columns]
        .dropna()
        # Vectorized equivalent of applying "'Child' if x < 16 else 'Adult'"
        # to every row.
        .assign(AgeGroup=lambda frame: np.where(frame['Age'] < 16, 'Child', 'Adult'))
        .drop(columns='Age')
    )

    # One-hot encoding: each category becomes its own indicator column.
    encoded = pd.get_dummies(cleaned, columns=categorical_columns)

    X = encoded.drop(columns='Survived')
    y = encoded['Survived']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test



In [None]:
def entropy(y):
    """Return the Shannon entropy, in bits, of the class labels in ``y``.

    Parameters
    ----------
    y : array-like
        One label per sample. Any label type that ``np.unique`` can sort is
        accepted (non-negative ints, negative ints, strings, booleans).

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed label frequencies ``p``.
        0.0 for an empty input or when every sample has the same label.
    """
    n_samples = len(y)
    if n_samples == 0:
        # No samples means no uncertainty; also avoids dividing by zero.
        return 0.0

    # np.unique counts occurrences of each distinct label. Unlike np.bincount
    # it is not restricted to non-negative integer labels.
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / n_samples

    # Every count returned by np.unique is at least 1, so log2 never receives
    # a zero probability. Adding 0.0 turns the -0.0 that a pure node would
    # produce into a plain 0.0.
    return -np.sum(probabilities * np.log2(probabilities)) + 0.0




In [None]:
def information_gain(X_column, y, threshold):
    """Information gain obtained by splitting ``y`` on ``X_column`` at ``threshold``.

    Samples whose feature value is strictly below ``threshold`` form the left
    group; samples at or above it form the right group. The gain is the
    entropy of ``y`` minus the size-weighted entropy of the two groups.

    Parameters
    ----------
    X_column : array-like
        Feature values, aligned element-for-element with ``y``.
    y : array-like
        Class labels; must support boolean-mask indexing.
    threshold : scalar
        Candidate split point.

    Returns
    -------
    number
        The information gain, or 0 when either group would be empty.
    """
    entropy_before = entropy(y)

    below = y[X_column < threshold]
    at_or_above = y[X_column >= threshold]

    # A split that leaves one side empty separates nothing.
    if not (len(below) and len(at_or_above)):
        return 0

    total = len(y)
    weight_below = len(below) / total
    weight_above = len(at_or_above) / total
    entropy_after = weight_below * entropy(below) + weight_above * entropy(at_or_above)

    return entropy_before - entropy_after


In [None]:
def best_split(X, y):
    """Search every feature and every observed value for the best split.

    Each distinct value of each column in ``X`` is tried as a threshold and
    scored with ``information_gain``. The first candidate reaching the highest
    gain wins (later candidates must be strictly better to replace it).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; columns are iterated in their existing order.
    y : array-like
        Class labels aligned with the rows of ``X``.

    Returns
    -------
    dict
        ``{'feature': ..., 'threshold': ..., 'info_gain': ...}`` for the
        winning candidate, or an empty dict when no candidate was evaluated.
    """
    winner = {}
    # Gains are never this low, so the first evaluated candidate is always
    # recorded, even when its gain is 0.
    top_gain = -1

    for column_name in X.columns:
        column_values = X[column_name]
        for candidate in np.unique(column_values):
            gain = information_gain(column_values, y, candidate)
            if gain > top_gain:
                top_gain = gain
                winner = dict(feature=column_name, threshold=candidate, info_gain=gain)

    return winner


In [None]:
class Node:
    """A single node of the binary decision tree.

    An internal node carries a split rule (``feature``, ``threshold``) and two
    children; a leaf carries only ``value``. The tree code in this notebook
    treats any node whose ``value`` is not ``None`` as a leaf.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split rule: samples with x[feature] < threshold are routed to `left`,
        # all others to `right`.
        self.feature, self.threshold = feature, threshold
        # Child subtrees (None on a leaf).
        self.left, self.right = left, right
        # Predicted label stored at a leaf (None on an internal node).
        self.value = value

In [None]:
def build_tree(X, y, depth=0, max_depth=5):
    """Recursively grow a binary decision tree and return its root ``Node``.

    At each node the split with the highest information gain is chosen via
    ``best_split``. Rows with ``feature < threshold`` go to the left child,
    all remaining rows to the right child. A leaf predicting the most common
    label is produced when:

    * the depth budget is used up (``depth >= max_depth``),
    * all labels in ``y`` are identical (no split can help), or
    * no split yields a positive information gain.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix for the samples reaching this node.
    y : pandas.Series
        Labels aligned with the rows of ``X``.
    depth : int, default 0
        Depth of the node being built; callers normally leave this at 0.
    max_depth : int, default 5
        Maximum depth of the tree.

    Returns
    -------
    Node
        Root of the (sub)tree.

    Raises
    ------
    ValueError
        If ``y`` is empty, since there is no label to predict.
    """
    label_counts = Counter(y)
    if not label_counts:
        # Fail with a clear message instead of an IndexError / KeyError from
        # deeper in the function.
        raise ValueError("build_tree() requires at least one training sample")
    majority_label = label_counts.most_common(1)[0][0]

    # `>=` (rather than `==`) also stops the recursion when the function is
    # entered with depth already beyond max_depth. A pure node is turned into
    # a leaf right away: every split of it has zero gain, so searching for one
    # would only cost time.
    if depth >= max_depth or len(label_counts) == 1:
        return Node(value=majority_label)

    split = best_split(X, y)
    # best_split returns an empty dict when it had nothing to evaluate (for
    # example, X has no columns); treat that the same as "no useful split".
    if split.get('info_gain', 0) <= 0:
        return Node(value=majority_label)

    goes_left = X[split['feature']] < split['threshold']
    goes_right = ~goes_left
    left_subtree = build_tree(X[goes_left], y[goes_left], depth + 1, max_depth)
    right_subtree = build_tree(X[goes_right], y[goes_right], depth + 1, max_depth)
    return Node(split['feature'], split['threshold'], left_subtree, right_subtree)


In [None]:
def predict_one(x, node):
    """Predict the label of a single sample by walking the tree from ``node``.

    Parameters
    ----------
    x : mapping
        Feature values of one sample, keyed by feature name.
    node : Node
        Node to start from (normally the root of the tree).

    Returns
    -------
    The ``value`` stored at the leaf the sample ends up in.
    """
    current = node
    # Descend until a node that carries a prediction is reached.
    while current.value is None:
        # Same routing rule as during training: strictly below the threshold
        # goes left, everything else goes right.
        if x[current.feature] < current.threshold:
            current = current.left
        else:
            current = current.right
    return current.value


In [None]:
def predict(X, tree):
    """Predict a label for every row of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Samples to classify, one per row.
    tree : Node
        Root of a tree built by ``build_tree``.

    Returns
    -------
    list
        One prediction per row, in row order.
    """
    predictions = []
    # Each row is converted to a {column name: value} dict, the form
    # predict_one indexes by feature name.
    for record in X.to_dict(orient='records'):
        predictions.append(predict_one(record, tree))
    return predictions


In [None]:
if __name__ == "__main__":
    # End-to-end run: load and split the data, fit the tree on the training
    # part, then evaluate it on the held-out part.
    # NOTE(review): "/titanic.csv" is an absolute path at the filesystem root;
    # confirm it matches where the dataset lives in the runtime being used.
    X_train, X_test, y_train, y_test = load_and_preprocess("/titanic.csv")

    # max_depth=5 is the same value as build_tree's default, spelled out here.
    tree = build_tree(X_train, y_train, max_depth=5)
    y_pred = predict(X_test, tree)

    # Overall accuracy followed by per-class precision / recall / F1.
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

Accuracy: 0.7972027972027972
              precision    recall  f1-score   support

           0       0.79      0.88      0.83        80
           1       0.81      0.70      0.75        63

    accuracy                           0.80       143
   macro avg       0.80      0.79      0.79       143
weighted avg       0.80      0.80      0.79       143

