In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_linnerud
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_percentage_error
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from mlxtend.plotting import plot_decision_regions
from copy import deepcopy
from pprint import pprint

In [2]:
df_titanic = pd.read_csv('/content/train.csv')
df_boston = pd.read_csv('/content/BostonHousing.csv')

In [3]:
df_titanic_factorized = pd.DataFrame(df_titanic)
df_titanic = df_titanic_factorized
for column in df_titanic_factorized.select_dtypes(include=['object']).columns:
    df_titanic_factorized[column], unique = pd.factorize(df_titanic_factorized[column])

In [4]:
df_titanic_factorized.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,1.0,-0.005007,-0.035144,1.0,-0.042939,0.036847,-0.057527,-0.001652,0.760875,0.012658,0.241918,-0.030323
Survived,-0.005007,1.0,-0.338481,-0.005007,0.543351,-0.077221,-0.035322,0.081629,-0.047298,0.257307,0.270495,0.101849
Pclass,-0.035144,-0.338481,1.0,-0.035144,-0.1319,-0.369226,0.083081,0.018443,-0.017489,-0.5495,-0.623554,0.050992
Name,1.0,-0.005007,-0.035144,1.0,-0.042939,0.036847,-0.057527,-0.001652,0.760875,0.012658,0.241918,-0.030323
Sex,-0.042939,0.543351,-0.1319,-0.042939,1.0,-0.093254,0.114631,0.245489,-0.132709,0.182333,0.082104,0.111249
Age,0.036847,-0.077221,-0.369226,0.036847,-0.093254,1.0,-0.308247,-0.189119,0.133553,0.096067,0.231448,0.002626
SibSp,-0.057527,-0.035322,0.083081,-0.057527,0.114631,-0.308247,1.0,0.414838,-0.303229,0.159651,-0.058893,-0.058008
Parch,-0.001652,0.081629,0.018443,-0.001652,0.245489,-0.189119,0.414838,1.0,-0.273002,0.216225,-0.003678,-0.076625
Ticket,0.760875,-0.047298,-0.017489,0.760875,-0.132709,0.133553,-0.303229,-0.273002,1.0,-0.142578,0.212438,-0.020135
Fare,0.012658,0.257307,-0.5495,0.012658,0.182333,0.096067,0.159651,0.216225,-0.142578,1.0,0.397105,0.058462


In [5]:
df_titanic = df_titanic_factorized.drop(columns = ['PassengerId', 'Name'] )

In [6]:
X = df_titanic.drop(columns=['Survived'])
y = df_titanic['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state=3, stratify=y)

In [18]:

class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def gini_index(self, labels):
        _, counts = np.unique(labels, return_counts=True)
        probabilities = counts / counts.sum()
        gini = 1 - np.sum(probabilities ** 2)
        return gini

    def entropy(self, labels):
        _, counts = np.unique(labels, return_counts=True)
        probabilities = counts / counts.sum()
        ent = -np.sum(probabilities * np.log2(probabilities))
        return ent

    def chosen_feature(self, X, y):
        ind = []
        for i in range(X.shape[1]):
            feature = X.iloc[:, i]
            _, gini = self.find_optimal_split(feature, y)
            ind.append(gini)
        min_index = ind.index(min(ind))
        column_name = X.columns[min_index]
        return column_name

    def find_optimal_split(self, feature, labels):
        sorted_indices = np.argsort(feature)
        feature = np.array(feature)[sorted_indices]
        labels = np.array(labels)[sorted_indices]

        min_gini = float("inf")
        optimal_threshold = None

        for i in range(1, len(feature)):
            threshold = (feature[i - 1] + feature[i]) / 2

            left_labels = labels[:i]
            right_labels = labels[i:]

            gini_left = self.gini_index(left_labels)
            gini_right = self.gini_index(right_labels)

            gini_total = (len(left_labels) * gini_left + len(right_labels) * gini_right) / len(labels)

            if gini_total < min_gini:
                min_gini = gini_total
                optimal_threshold = threshold

        return optimal_threshold, min_gini

    def split_data(self, feature, labels, optimal_threshold):

        left_data = []
        left_labels = []
        right_data = []
        right_labels = []

        for i in range(len(feature)):
            if feature[i] <= optimal_threshold:
                left_data.append(feature[i])
                left_labels.append(labels[i])
            else:
                right_data.append(feature[i])
                right_labels.append(labels[i])

        return (left_data, left_labels), (right_data, right_labels)

    def build_tree(self, X, y, depth=0):
        # Базовый случай: если глубина превышена или все метки одинаковы
        if len(set(y)) == 1 or (self.max_depth and depth >= self.max_depth):
            return {"label": y[0]}

        # Выбор лучшего признака
        best_feature = self.chosen_feature(X, y)
        feature_index = X.columns.get_loc(best_feature)

        # Оптимальный порог для разделения
        threshold, _ = self.find_optimal_split(X.iloc[:, feature_index], y)

        # Разделение данных
        left_indices = X.iloc[:, feature_index] <= threshold
        right_indices = ~left_indices

        left_tree = self.build_tree(X[left_indices], y[left_indices], depth + 1)
        right_tree = self.build_tree(X[right_indices], y[right_indices], depth + 1)

        tree = {
            "feature": best_feature,
            "threshold": threshold,
            "left": left_tree,
            "right": right_tree,
        }
        return tree

    def fit(self, X, y):
        self.tree = self.build_tree(X, y)

    def predict_sample(self, sample, tree):
        if "label" in tree:
            return tree["label"]

        feature = tree["feature"]
        threshold = tree["threshold"]
        if sample[feature] <= threshold:
            return self.predict_sample(sample, tree["left"])
        else:
            return self.predict_sample(sample, tree["right"])

    def predict(self, X):
        return [self.predict_sample(sample, self.tree) for _, sample in X.iterrows()]

In [19]:
tree = DecisionTree(max_depth=3)

In [20]:
tree.fit(X_train, y_train)

  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)


KeyError: 0

In [None]:
tree.predict(X_test)

**RandomForest**

In [None]:
import random

class RandomForest:
    def __init__(self, n_trees=10, max_depth=None, sample_size=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.sample_size = sample_size
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            sample_indices = random.sample(range(len(X)), self.sample_size or len(X))
            sample_X = [X[i] for i in sample_indices]
            sample_y = [y[i] for i in sample_indices]

            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(sample_X, sample_y)
            self.trees.append(tree)

    def predict(self, X):
        tree_predictions = [tree.predict(X) for tree in self.trees]
        # Усреднение предсказаний деревьев (например, для классификации)
        return [max(set(preds), key=preds.count) for preds in zip(*tree_predictions)]
