In [5]:
# Импорт необходимых библиотек
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Загрузка данных для Titanic и House Prices
def load_and_preprocess_titanic(filepath):
    data = pd.read_csv(filepath)
    data['Age'].fillna(data['Age'].median(), inplace=True)
    data['Fare'].fillna(data['Fare'].median(), inplace=True)
    data.dropna(subset=['Embarked'], inplace=True)
    data = pd.get_dummies(data, columns=['Sex', 'Embarked'])
    features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
    X = data[features]
    y = data['Survived']
    return train_test_split(X, y, test_size=0.2, random_state=42)

def load_and_preprocess_house_data(filepath):
    data = pd.read_csv(filepath)
    data['LotFrontage'].fillna(data['LotFrontage'].median(), inplace=True)
    data = pd.get_dummies(data)
    features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', '1stFlrSF']
    X = data[features]
    y = data['SalePrice']
    return train_test_split(X, y, test_size=0.2, random_state=42)

# Загрузка и разделение данных
X_train_titanic, X_test_titanic, y_train_titanic, y_test_titanic = load_and_preprocess_titanic('titanic/train.csv')
X_train_house, X_test_house, y_train_house, y_test_house = load_and_preprocess_house_data('house-prices-advanced-regression-techniques/train.csv')

In [3]:
# Обучение модели Decision Tree для Titanic
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_titanic, y_train_titanic)

# Предсказания
y_pred_titanic = clf.predict(X_test_titanic)

# Оценка качества модели
print("Decision Tree Classifier - Titanic Dataset:")
print("Accuracy:", accuracy_score(y_test_titanic, y_pred_titanic))
print("Precision:", precision_score(y_test_titanic, y_pred_titanic))
print("Recall:", recall_score(y_test_titanic, y_pred_titanic))
print("F1 Score:", f1_score(y_test_titanic, y_pred_titanic))

Decision Tree Classifier - Titanic Dataset:
Accuracy: 0.7415730337078652
Precision: 0.6455696202531646
Recall: 0.7391304347826086
F1 Score: 0.6891891891891891


In [6]:
# Обучение модели Decision Tree для House Prices
regr = DecisionTreeRegressor(random_state=42)
regr.fit(X_train_house, y_train_house)

# Предсказания
y_pred_house = regr.predict(X_test_house)

# Оценка качества модели
print("\nDecision Tree Regressor - House Prices Dataset:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test_house, y_pred_house))
print("Mean Squared Error (MSE):", mean_squared_error(y_test_house, y_pred_house))
print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test_house, y_pred_house)))
print("R-squared (R²):", r2_score(y_test_house, y_pred_house))


Decision Tree Regressor - House Prices Dataset:
Mean Absolute Error (MAE): 25180.48595890411
Mean Squared Error (MSE): 1256737448.1441362
Root Mean Squared Error (RMSE): 35450.49291821111
R-squared (R²): 0.8361559320688906


In [16]:
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score

# Гипотеза 1: Тщательный препроцессинг
def preprocess_titanic_data_extended(filepath):
    data = pd.read_csv(filepath)
    # Заполнение пропусков более интеллектуальным методом
    data['Age'].fillna(data.groupby('Pclass')['Age'].transform('median'), inplace=True)
    data['Fare'].fillna(data['Fare'].median(), inplace=True)
    data.dropna(subset=['Embarked'], inplace=True)
    # Предположим, что Cabin было заменено, если неизвестно
    data['Cabin'] = data['Cabin'].apply(lambda x: 0 if pd.isna(x) else 1)
    data = pd.get_dummies(data, columns=['Sex', 'Embarked'])
    # Лог-преобразование Fare
    data['Fare'] = np.log1p(data['Fare'])
    features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Cabin']
    X = data[features]
    y = data['Survived']
    return train_test_split(X, y, test_size=0.2, random_state=42)

# Обновляем данные для Titanic
X_train_titanic_ext, X_test_titanic_ext, y_train_titanic_ext, y_test_titanic_ext = preprocess_titanic_data_extended('titanic/train.csv')

# Гипотеза 4: Подбор гиперпараметров
param_grid_classifier = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

clf = DecisionTreeClassifier(random_state=42)
grid_clf = GridSearchCV(clf, param_grid_classifier, cv=5, scoring='accuracy')
grid_clf.fit(X_train_titanic_ext, y_train_titanic_ext)

print("Best parameters for Decision Tree Classifier:", grid_clf.best_params_)

Best parameters for Decision Tree Classifier: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}


In [18]:
def preprocess_house_data_extended(filepath):
    data = pd.read_csv(filepath)
    data['LotFrontage'].fillna(data['LotFrontage'].median(), inplace=True)
    # Лог-преобразование SalePrice
    data['SalePrice'] = np.log1p(data['SalePrice'])
    data = pd.get_dummies(data)
    features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', '1stFlrSF']
    X = data[features]
    y = data['SalePrice']
    return train_test_split(X, y, test_size=0.2, random_state=42)

# Обновляем данные для House Prices
X_train_house_ext, X_test_house_ext, y_train_house_ext, y_test_house_ext = preprocess_house_data_extended('house-prices-advanced-regression-techniques/train.csv')

param_grid_regressor = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

regr = DecisionTreeRegressor(random_state=42)
grid_regr = GridSearchCV(regr, param_grid_regressor, cv=5, scoring='neg_mean_squared_error')
grid_regr.fit(X_train_house_ext, y_train_house_ext)

print("Best parameters for Decision Tree Regressor:", grid_regr.best_params_)

Best parameters for Decision Tree Regressor: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}


In [19]:
# Обучение и оценка улучшенной модели Decision Tree для Titanic
best_clf = grid_clf.best_estimator_
y_pred_titanic_ext = best_clf.predict(X_test_titanic_ext)

print("\nImproved Decision Tree Classifier - Titanic Dataset:")
print("Accuracy:", accuracy_score(y_test_titanic_ext, y_pred_titanic_ext))
print("Precision:", precision_score(y_test_titanic_ext, y_pred_titanic_ext))
print("Recall:", recall_score(y_test_titanic_ext, y_pred_titanic_ext))
print("F1 Score:", f1_score(y_test_titanic_ext, y_pred_titanic_ext))

# Обучение и оценка улучшенной модели Decision Tree для House Prices
best_regr = grid_regr.best_estimator_
y_pred_house_ext = best_regr.predict(X_test_house_ext)

print("\nImproved Decision Tree Regressor - House Prices Dataset:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test_house_ext, y_pred_house_ext))
print("Mean Squared Error (MSE):", mean_squared_error(y_test_house_ext, y_pred_house_ext))
print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test_house_ext, y_pred_house_ext)))
print("R-squared (R²):", r2_score(y_test_house_ext, y_pred_house_ext))


Improved Decision Tree Classifier - Titanic Dataset:
Accuracy: 0.7865168539325843
Precision: 0.7384615384615385
Recall: 0.6956521739130435
F1 Score: 0.7164179104477613

Improved Decision Tree Regressor - House Prices Dataset:
Mean Absolute Error (MAE): 0.14482998825841226
Mean Squared Error (MSE): 0.04099732501375583
Root Mean Squared Error (RMSE): 0.20247796179771227
R-squared (R²): 0.7803059438046104


In [23]:
from collections import Counter

class SimpleDecisionTreeClassifier:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        if (self.max_depth is not None and depth >= self.max_depth) or len(set(y)) == 1:
            return self._create_leaf(y)

        feature, threshold = self._find_best_split(X, y)
        if feature is None:
            return self._create_leaf(y)

        left_indices = X[:, feature] <= threshold
        right_indices = X[:, feature] > threshold

        left_tree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_tree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return feature, threshold, left_tree, right_tree

    def _find_best_split(self, X, y):
        best_gini = float('inf')
        best_feature = None
        best_threshold = None

        for feature in range(X.shape[1]):
            thresholds, ginis = self._calculate_gini_index(X[:, feature], y)
            if ginis:
                min_gini = min(ginis)
                if min_gini < best_gini:
                    best_gini = min_gini
                    best_feature = feature
                    best_threshold = thresholds[ginis.index(min_gini)]

        return best_feature, best_threshold

    def _calculate_gini_index(self, feature, y):
        thresholds = sorted(set(feature))
        ginis = []

        for threshold in thresholds:
            left_indices = feature <= threshold
            right_indices = feature > threshold

            left_gini = self._gini(y[left_indices])
            right_gini = self._gini(y[right_indices])
            weighted_gini = (left_gini * sum(left_indices) + right_gini * sum(right_indices)) / len(y)

            ginis.append(weighted_gini)

        return thresholds, ginis

    def _gini(self, y):
        proportions = [np.sum(y == c) / len(y) for c in np.unique(y)]
        return 1 - sum([p**2 for p in proportions])

    def _create_leaf(self, y):
        return Counter(y).most_common(1)[0][0]

    def predict(self, X):
        return [self._predict_single(instance, self.tree) for instance in X]

    def _predict_single(self, instance, node):
        if isinstance(node, (np.integer, int)):
            return node
        feature, threshold, left_tree, right_tree = node
        if instance[feature] <= threshold:
            return self._predict_single(instance, left_tree)
        else:
            return self._predict_single(instance, right_tree)

In [24]:
class SimpleDecisionTreeRegressor:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        if (self.max_depth is not None and depth >= self.max_depth) or len(set(y)) == 1:
            return np.mean(y)

        feature, threshold = self._find_best_split(X, y)
        if feature is None:
            return np.mean(y)

        left_indices = X[:, feature] <= threshold
        right_indices = X[:, feature] > threshold

        left_tree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_tree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return feature, threshold, left_tree, right_tree

    def _find_best_split(self, X, y):
        best_mse = float('inf')
        best_feature = None
        best_threshold = None

        for feature in range(X.shape[1]):
            thresholds, mses = self._calculate_mse_split(X[:, feature], y)
            if mses:
                min_mse = min(mses)
                if min_mse < best_mse:
                    best_mse = min_mse
                    best_feature = feature
                    best_threshold = thresholds[mses.index(min_mse)]

        return best_feature, best_threshold

    def _calculate_mse_split(self, feature, y):
        thresholds = sorted(set(feature))
        mses = []

        for threshold in thresholds:
            left_indices = feature <= threshold
            right_indices = feature > threshold

            left_mse = self._mse(y[left_indices])
            right_mse = self._mse(y[right_indices])
            weighted_mse = (left_mse * sum(left_indices) + right_mse * sum(right_indices)) / len(y)

            mses.append(weighted_mse)

        return thresholds, mses

    def _mse(self, y):
        mean = np.mean(y)
        return np.mean((y - mean) ** 2)

    def predict(self, X):
        return [self._predict_single(instance, self.tree) for instance in X]

    def _predict_single(self, instance, node):
        if isinstance(node, (np.float32, np.float64, float)):
            return node
        feature, threshold, left_tree, right_tree = node
        if instance[feature] <= threshold:
            return self._predict_single(instance, left_tree)
        else:
            return self._predict_single(instance, right_tree)


In [25]:
# Обучение самописной модели Decision Tree для Titanic
simple_clf = SimpleDecisionTreeClassifier(max_depth=3)
simple_clf.fit(X_train_titanic.values, y_train_titanic.values)
y_pred_titanic_simple = simple_clf.predict(X_test_titanic.values)

# Обучение самописной модели Decision Tree для House Prices
simple_regr = SimpleDecisionTreeRegressor(max_depth=3)
simple_regr.fit(X_train_house.values, y_train_house.values)
y_pred_house_simple = simple_regr.predict(X_test_house.values)


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [26]:
print("Simple Decision Tree Classifier - Titanic Dataset:")
print("Accuracy:", accuracy_score(y_test_titanic, y_pred_titanic_simple))
print("Precision:", precision_score(y_test_titanic, y_pred_titanic_simple))
print("Recall:", recall_score(y_test_titanic, y_pred_titanic_simple))
print("F1 Score:", f1_score(y_test_titanic, y_pred_titanic_simple))

print("\nSimple Decision Tree Regressor - House Prices Dataset:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test_house, y_pred_house_simple))
print("Mean Squared Error (MSE):", mean_squared_error(y_test_house, y_pred_house_simple))
print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test_house, y_pred_house_simple)))
print("R-squared (R²):", r2_score(y_test_house, y_pred_house_simple))


Simple Decision Tree Classifier - Titanic Dataset:
Accuracy: 0.8202247191011236
Precision: 0.7605633802816901
Recall: 0.782608695652174
F1 Score: 0.7714285714285714

Simple Decision Tree Regressor - House Prices Dataset:
Mean Absolute Error (MAE): 30274.09226166296
Mean Squared Error (MSE): 1986959536.6803293
Root Mean Squared Error (RMSE): 44575.324302581685
R-squared (R²): 0.7409550150789492


In [28]:
# Используем функции улучшенной предобработки из пункта 3
X_train_titanic_ext, X_test_titanic_ext, y_train_titanic_ext, y_test_titanic_ext = preprocess_titanic_data_extended('titanic/train.csv')
X_train_house_ext, X_test_house_ext, y_train_house_ext, y_test_house_ext = preprocess_house_data_extended('house-prices-advanced-regression-techniques/train.csv')

In [29]:
# Обучаем улучшенную самописную модель Decision Tree для Titanic
improved_simple_clf = SimpleDecisionTreeClassifier(max_depth=5)  # Используем более глубокое дерево для улучшения.
improved_simple_clf.fit(X_train_titanic_ext.values, y_train_titanic_ext.values)
y_pred_titanic_improved_simple = improved_simple_clf.predict(X_test_titanic_ext.values)

In [30]:
# Обучаем улучшенную самописную модель Decision Tree для House Prices
improved_simple_regr = SimpleDecisionTreeRegressor(max_depth=5)  # Используем более глубокое дерево для улучшения.
improved_simple_regr.fit(X_train_house_ext.values, y_train_house_ext.values)
y_pred_house_improved_simple = improved_simple_regr.predict(X_test_house_ext.values)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [31]:
print("Improved Simple Decision Tree Classifier - Titanic Dataset:")
print("Accuracy:", accuracy_score(y_test_titanic_ext, y_pred_titanic_improved_simple))
print("Precision:", precision_score(y_test_titanic_ext, y_pred_titanic_improved_simple))
print("Recall:", recall_score(y_test_titanic_ext, y_pred_titanic_improved_simple))
print("F1 Score:", f1_score(y_test_titanic_ext, y_pred_titanic_improved_simple))

print("\nImproved Simple Decision Tree Regressor - House Prices Dataset:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test_house_ext, y_pred_house_improved_simple))
print("Mean Squared Error (MSE):", mean_squared_error(y_test_house_ext, y_pred_house_improved_simple))
print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test_house_ext, y_pred_house_improved_simple)))
print("R-squared (R²):", r2_score(y_test_house_ext, y_pred_house_improved_simple))

Improved Simple Decision Tree Classifier - Titanic Dataset:
Accuracy: 0.797752808988764
Precision: 0.7391304347826086
Recall: 0.7391304347826086
F1 Score: 0.7391304347826085

Improved Simple Decision Tree Regressor - House Prices Dataset:
Mean Absolute Error (MAE): 0.1482116937914148
Mean Squared Error (MSE): 0.04184203682180933
Root Mean Squared Error (RMSE): 0.2045532615770507
R-squared (R²): 0.775779351804641
