In [None]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
# Load the raw observations. The file is tab-separated even though it is named .csv.
observation = pd.read_csv(
    "./094/observation.csv",
    sep='\t',
    engine="python",
)

In [None]:
# Separate the predictors from the 'oximetry' target; drop the target once and
# reuse the resulting frame for both the value matrix and the column labels.
observation_features = observation.drop(columns=['oximetry'])

feature_names = observation_features.columns
X = observation_features.values
y = observation['oximetry'].values

In [None]:
from sklearn.preprocessing import QuantileTransformer, StandardScaler
from sklearn.impute import SimpleImputer

# Positional column indices (into X) handled by each transformer:
# two columns get the quantile transform, every other index 0..21 is standardised.
quantile_features = [20, 13]
scaled_features = [*range(0, 13), *range(14, 20), 21]

# Preprocessing objects; they are fitted later, on the training split.
imputer = SimpleImputer(strategy="median")
scaler = StandardScaler()
# NOTE(review): n_quantiles=2 leaves only two reference quantiles for the
# mapping — confirm this is intended rather than a workaround.
qt = QuantileTransformer(
    n_quantiles=2,
    output_distribution='normal',
    random_state=42,
)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the imputer on the training split only, then fill missing values in both
# splits with the statistics learned from training data.
imputer.fit(X_train)

X_train_imp = imputer.transform(X_train)
X_test_imp  = imputer.transform(X_test)

# The imputed matrices are 2-D NumPy arrays, so a feature subset has to be
# taken along axis 1 with `[:, cols]`. Indexing with `[cols]` alone selects
# *rows*, which would fit the transformers on a handful of samples and leave
# almost every row untransformed.
qt.fit(X_train_imp[:, quantile_features])
scaler.fit(X_train_imp[:, scaled_features])

# Work on copies so the imputed-but-unscaled matrices stay available.
X_train_tr = X_train_imp.copy()
X_test_tr  = X_test_imp.copy()

# Quantile-transform the selected columns (parameters come from the train split).
X_train_tr[:, quantile_features] = qt.transform(X_train_imp[:, quantile_features])
X_test_tr[:, quantile_features]  = qt.transform(X_test_imp[:, quantile_features])

# Standardise the remaining columns (parameters come from the train split).
X_train_tr[:, scaled_features] = scaler.transform(X_train_imp[:, scaled_features])
X_test_tr[:, scaled_features]  = scaler.transform(X_test_imp[:, scaled_features])

## Functions

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Collects one metrics dict per evaluated model; re-running this cell empties it.
results = []

def print_metrics(name, y_true, y_pred):
    """Print accuracy plus macro-averaged precision and recall under a heading.

    Parameters
    ----------
    name : str
        Text shown in the ``=== ... ===`` heading line.
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    None
        The report is written to stdout and followed by a blank line.
    """
    macro = {"average": 'macro', "zero_division": 0}
    # (label, metric function, extra keyword arguments), printed in this order.
    report_rows = (
        ("Accuracy: ", accuracy_score, {}),
        ("Precision (macro):", precision_score, macro),
        ("Recall (macro):   ", recall_score, macro),
    )

    print(f"=== {name} ===")
    for label, metric, extra in report_rows:
        print(label, metric(y_true, y_pred, **extra))
    print()

def evaluate_model(name, y_train, y_train_pred, y_test, y_test_pred, average='binary'):
    """Compute accuracy, precision and recall on the train and test splits.

    Parameters
    ----------
    name : str
        Model label stored under the ``"model"`` key.
    y_train, y_train_pred : array-like
        True and predicted labels for the training split.
    y_test, y_test_pred : array-like
        True and predicted labels for the test split.
    average : str, default='binary'
        Averaging mode forwarded to ``precision_score`` and ``recall_score``.

    Returns
    -------
    dict
        Keys, in order: ``model``, ``accuracy_train``, ``precision_train``,
        ``recall_train``, ``accuracy_test``, ``precision_test``, ``recall_test``.
    """
    row = {"model": name}
    splits = (
        ("train", y_train, y_train_pred),
        ("test", y_test, y_test_pred),
    )
    for split, y_true, y_pred in splits:
        row[f"accuracy_{split}"] = accuracy_score(y_true, y_pred)
        # zero_division=0 matches print_metrics: an undefined precision/recall
        # is reported as 0 without emitting a warning.
        row[f"precision_{split}"] = precision_score(
            y_true, y_pred, average=average, zero_division=0
        )
        row[f"recall_{split}"] = recall_score(
            y_true, y_pred, average=average, zero_division=0
        )
    return row

## 3.1 A

In [None]:
def entropy(y):
    """Return the Shannon entropy, in bits, of the labels in ``y``.

    Parameters
    ----------
    y : array-like of non-negative int
        Class labels; they are counted with ``np.bincount``.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the classes that actually occur in ``y``.
    """
    class_counts = np.bincount(y)
    # Classes that never occur are dropped so that log2(0) is never evaluated.
    observed = class_counts[np.flatnonzero(class_counts)]
    fractions = observed / len(y)
    return -(fractions * np.log2(fractions)).sum()

def information_gain(y, y_left, y_right):
    """Return the entropy reduction obtained by splitting ``y`` in two.

    Parameters
    ----------
    y : array-like
        Labels before the split.
    y_left, y_right : array-like
        Labels that fall on each side of the split.

    Returns
    -------
    float
        Entropy of ``y`` minus the size-weighted entropy of the two children,
        where each weight is the child's length divided by ``len(y)``.
    """
    parent_entropy = entropy(y)
    total = len(y)
    weight_left = len(y_left) / total
    weight_right = len(y_right) / total
    children_entropy = weight_left * entropy(y_left) + weight_right * entropy(y_right)
    return parent_entropy - children_entropy

In [None]:
class TreeNode:
    """A single node of a binary decision tree.

    All constructor arguments are keyword-only. A node created with a
    ``value`` acts as a leaf; otherwise it carries a split description
    (``feature_index``, ``threshold``) and its ``left`` / ``right`` children.
    """

    def __init__(self, *, feature_index=None, threshold=None, left=None, right=None, value=None):
        # Leaf payload (None for internal nodes).
        self.value = value
        # Split rule and the two subtrees (None for leaves).
        self.feature_index, self.threshold = feature_index, threshold
        self.left, self.right = left, right

    def is_leaf_node(self):
        """Return True when the node stores a value, i.e. ``value`` is not None."""
        return not (self.value is None)

In [None]:
class ID3Classifier:
    """Decision-tree classifier that greedily splits on the highest information gain.

    Every internal node tests ``x[feature_index] <= threshold``; the candidate
    thresholds for a column are the distinct values found in that column.

    Parameters
    ----------
    max_depth : int, default=3
        Depth at which growth stops. The root is built at depth 1 and a node
        becomes a leaf once ``depth >= max_depth``, so ``max_depth=d`` allows
        at most ``d - 1`` levels of splits.
    min_samples_split : int, default=2
        A node holding fewer samples than this becomes a leaf.
    """

    def __init__(self, max_depth=3, min_samples_split=2):
        self.root = None
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def fit(self, X, y):
        """Build the tree from a 2-D feature matrix ``X`` and labels ``y``.

        Labels are cast to ``int``. Returns ``self``.
        """
        features = np.array(X)
        labels = np.array(y).astype(int)
        self.n_classes_ = len(np.unique(labels))
        self.n_features_ = features.shape[1]
        self.root = self._grow_tree(features, labels, depth=1)
        return self

    def _grow_tree(self, X, y, depth):
        """Recursively build and return the subtree for the samples ``(X, y)``."""
        n_rows, _ = X.shape
        n_distinct_labels = len(np.unique(y))

        # Stop on the depth limit, a pure node, or too few samples to split.
        must_stop = (
            depth >= self.max_depth
            or n_distinct_labels == 1
            or n_rows < self.min_samples_split
        )
        if must_stop:
            return TreeNode(value=self._most_common_label(y))

        feature, threshold, gain = self._id3(X, y)

        # No split improves on the parent: fall back to a majority-vote leaf.
        if gain == 0 or feature is None:
            return TreeNode(value=self._most_common_label(y))

        goes_left = X[:, feature] <= threshold
        goes_right = ~goes_left

        return TreeNode(
            feature_index=feature,
            threshold=threshold,
            left=self._grow_tree(X[goes_left], y[goes_left], depth + 1),
            right=self._grow_tree(X[goes_right], y[goes_right], depth + 1),
        )

    def _id3(self, X, y):
        """Search every (feature, threshold) pair for the largest information gain.

        Returns ``(best_feature, best_threshold, best_gain)``; the first two
        are ``None`` and the gain is ``0.0`` when no split has positive gain.
        """
        winner = (None, None, 0.0)  # (feature index, threshold, gain)
        _, n_columns = X.shape

        for column_index in range(n_columns):
            column = X[:, column_index]

            for cut in np.unique(column):
                below = column <= cut
                above = column > cut

                # A split that leaves one side empty is not a split.
                if not below.any() or not above.any():
                    continue

                gain = information_gain(y, y[below], y[above])
                if gain > winner[2]:
                    winner = (column_index, cut, gain)

        return winner

    def _most_common_label(self, y):
        """Return the label that occurs most often in ``y``."""
        label, _count = Counter(y).most_common(1)[0]
        return label

    def predict(self, X):
        """Return an array holding the predicted label for each row of ``X``."""
        rows = np.array(X)
        predictions = [self._predict(row, self.root) for row in rows]
        return np.array(predictions)

    def _predict(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its value."""
        current = node
        while not current.is_leaf_node():
            if x[current.feature_index] <= current.threshold:
                current = current.left
            else:
                current = current.right
        return current.value

In [None]:
id3 = ID3Classifier(max_depth=6, min_samples_split=2)
# Train on the same preprocessed matrix that is used for prediction below.
# Thresholds learned on the raw X_train would not correspond to the
# imputed/transformed values in X_train_tr and X_test_tr.
id3.fit(X_train_tr, y_train)

y_pred_train = id3.predict(X_train_tr)
y_pred_test = id3.predict(X_test_tr)

max_depth=3, min_samples_split=2 => train_acc=0.806, test_acc=0.798

max_depth=3, min_samples_split=5 => train_acc=0.806, test_acc=0.798

max_depth=3, min_samples_split=10 => train_acc=0.806, test_acc=0.798

max_depth=3, min_samples_split=20 => train_acc=0.806, test_acc=0.798

max_depth=4, min_samples_split=2 => train_acc=0.816, test_acc=0.817

max_depth=4, min_samples_split=5 => train_acc=0.816, test_acc=0.817

max_depth=4, min_samples_split=10 => train_acc=0.816, test_acc=0.817

max_depth=4, min_samples_split=20 => train_acc=0.816, test_acc=0.817

max_depth=5, min_samples_split=2 => train_acc=0.849, test_acc=0.843

max_depth=5, min_samples_split=5 => train_acc=0.849, test_acc=0.843

max_depth=5, min_samples_split=10 => train_acc=0.849, test_acc=0.843

max_depth=5, min_samples_split=20 => train_acc=0.848, test_acc=0.843

max_depth=6, min_samples_split=2 => train_acc=0.871, test_acc=0.865

max_depth=6, min_samples_split=5 => train_acc=0.871, test_acc=0.865

max_depth=6, min_samples_split=10 => train_acc=0.871, test_acc=0.865

max_depth=6, min_samples_split=20 => train_acc=0.871, test_acc=0.865

max_depth=8, min_samples_split=2 => train_acc=0.914, test_acc=0.889

max_depth=8, min_samples_split=5 => train_acc=0.914, test_acc=0.890

max_depth=8, min_samples_split=10 => train_acc=0.913, test_acc=0.889

max_depth=8, min_samples_split=20 => train_acc=0.912, test_acc=0.889

max_depth=10, min_samples_split=2 => train_acc=0.944, test_acc=0.904

max_depth=10, min_samples_split=5 => train_acc=0.944, test_acc=0.904

max_depth=10, min_samples_split=10 => train_acc=0.941, test_acc=0.902

max_depth=10, min_samples_split=20 => train_acc=0.937, test_acc=0.902

## 3.1 B

In [None]:
# Console report for both splits.
print_metrics(name="ID3Classifier (Train)", y_true=y_train, y_pred=y_pred_train)
print_metrics(name="ID3Classifier (Test)", y_true=y_test, y_pred=y_pred_test)

# Keep the train/test metrics for the comparison table.
results.append(
    evaluate_model(
        name="ID3Classifier",
        y_train=y_train,
        y_train_pred=y_pred_train,
        y_test=y_test,
        y_test_pred=y_pred_test,
        average='binary',
    )
)

## 3.1 C

Accuracy: **0.8709** - **0.8626** = **0.0083** => **0.83%**

Precision: **0.8806** - **0.8683** = **0.0123** => **1.23%**

Recall: **0.8538** - **0.8411** = **0.0127** => **1.27%**

The differences between the training and test metrics are small (under 2 percentage points), which suggests that the model is not overfitting and generalizes well to unseen data. We achieved this stable performance by selecting appropriate hyperparameters for the ID3 classifier: *max_depth=6, min_samples_split=2*.

## 3.2 A

In [None]:
from sklearn.tree import DecisionTreeClassifier

# scikit-learn decision tree using the entropy criterion, fitted on the
# preprocessed training features.
dt = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=6,
    min_samples_split=20,
    min_samples_leaf=2,
    random_state=42,
)
dt.fit(X_train_tr, y_train)

# Predictions on both splits.
y_pred_dt_train = dt.predict(X_train_tr)
y_pred_dt_test = dt.predict(X_test_tr)

# Console report for both splits.
print_metrics(name="DecisionTreeClassifier (Train)", y_true=y_train, y_pred=y_pred_dt_train)
print_metrics(name="DecisionTreeClassifier (Test)", y_true=y_test, y_pred=y_pred_dt_test)

# Keep the train/test metrics for the comparison table.
results.append(
    evaluate_model(
        name="DecisionTreeClassifier",
        y_train=y_train,
        y_train_pred=y_pred_dt_train,
        y_test=y_test,
        y_test_pred=y_pred_dt_test,
        average='binary',
    )
)

## 3.2 B

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline fitted on the preprocessed training features.
log_reg = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    n_jobs=-1,
)
log_reg.fit(X_train_tr, y_train)

# Predictions on both splits.
y_pred_lr_train = log_reg.predict(X_train_tr)
y_pred_lr_test = log_reg.predict(X_test_tr)

# Console report for both splits.
print_metrics(name="LogisticRegression (Train)", y_true=y_train, y_pred=y_pred_lr_train)
print_metrics(name="LogisticRegression (Test)", y_true=y_test, y_pred=y_pred_lr_test)

# Keep the train/test metrics for the comparison table.
results.append(
    evaluate_model(
        name="LogisticRegression",
        y_train=y_train,
        y_train_pred=y_pred_lr_train,
        y_test=y_test,
        y_test_pred=y_pred_lr_test,
        average='binary',
    )
)

## 3.2 C

DecisionTreeClassifier performed best. It is fast, well optimized, and supports splitting by numerical thresholds. ID3 also performed well: it is slightly weaker than the scikit-learn tree, but the difference is not significant. LogisticRegression has the lowest accuracy; as a linear model, it cannot capture nonlinear relationships as well as trees can.

One more thing – speed. ID3 was the slowest of all models.

## 3.2 D

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
import numpy as np

# Draw the fitted scikit-learn tree on a wide canvas so node text stays legible.
plt.figure(figsize=(20, 10))
plot_tree(
    dt,
    # Pass a plain list: some scikit-learn releases validate `feature_names`
    # as a list and reject a pandas Index.
    feature_names=list(feature_names),
    class_names=[str(c) for c in np.unique(y_train)],
    filled=True,
    rounded=True,
    fontsize=8
)
plt.tight_layout()
plt.show()

## 3.2 E

In [None]:
import pandas as pd

# One row per evaluated model; the columns are the keys of the collected metrics dicts.
results_df = pd.DataFrame(data=results)
results_df

## 3.3 A