Модели sklearn

In [2]:
# Импорт необходимых библиотек
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# Загрузка данных для Titanic и House Prices
def load_and_preprocess_titanic(filepath):
    data = pd.read_csv(filepath)
    data['Age'].fillna(data['Age'].median(), inplace=True)
    data['Fare'].fillna(data['Fare'].median(), inplace=True)
    data.dropna(subset=['Embarked'], inplace=True)
    data = pd.get_dummies(data, columns=['Sex', 'Embarked'])
    features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
    X = data[features]
    y = data['Survived']
    return train_test_split(X, y, test_size=0.2, random_state=42)

def load_and_preprocess_house_data(filepath):
    data = pd.read_csv(filepath)
    data['LotFrontage'].fillna(data['LotFrontage'].median(), inplace=True)
    data = pd.get_dummies(data)
    features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', '1stFlrSF']
    X = data[features]
    y = data['SalePrice']
    return train_test_split(X, y, test_size=0.2, random_state=42)

# Загрузка и разделение данных
X_train_titanic, X_test_titanic, y_train_titanic, y_test_titanic = load_and_preprocess_titanic('titanic/train.csv')
X_train_house, X_test_house, y_train_house, y_test_house = load_and_preprocess_house_data('house-prices-advanced-regression-techniques/train.csv')

In [3]:
# Обучение модели градиентного бустинга для классификации
gb_clf = GradientBoostingClassifier(random_state=42)
gb_clf.fit(X_train_titanic, y_train_titanic)

# Предсказания
y_pred_titanic = gb_clf.predict(X_test_titanic)

In [4]:
# Обучение модели градиентного бустинга для регрессии
gb_regr = GradientBoostingRegressor(random_state=42)
gb_regr.fit(X_train_house, y_train_house)

# Предсказания
y_pred_house = gb_regr.predict(X_test_house)

In [6]:
print("Gradient Boosting Classifier - Titanic Dataset:")
print("Accuracy:", accuracy_score(y_test_titanic, y_pred_titanic))
print("Precision:", precision_score(y_test_titanic, y_pred_titanic))
print("Recall:", recall_score(y_test_titanic, y_pred_titanic))
print("F1 Score:", f1_score(y_test_titanic, y_pred_titanic))

print("\nGradient Boosting Regressor - House Prices Dataset:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test_house, y_pred_house))
print("Mean Squared Error (MSE):", mean_squared_error(y_test_house, y_pred_house))
print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test_house, y_pred_house)))
print("R-squared (R²):", r2_score(y_test_house, y_pred_house))

Gradient Boosting Classifier - Titanic Dataset:
Accuracy: 0.8202247191011236
Precision: 0.7681159420289855
Recall: 0.7681159420289855
F1 Score: 0.7681159420289855

Gradient Boosting Regressor - House Prices Dataset:
Mean Absolute Error (MAE): 19818.465123659484
Mean Squared Error (MSE): 871673675.420398
Root Mean Squared Error (RMSE): 29524.120231099147
R-squared (R²): 0.8863576786859942


Улучшения бейзлайна

In [7]:
def enhanced_preprocess_titanic(filepath):
    data = pd.read_csv(filepath)
    # Интеллектуальное заполнение пропусков
    data['Age'].fillna(data['Age'].median(), inplace=True)
    data['Fare'].fillna(data.groupby('Pclass')['Fare'].transform('median'), inplace=True)
    data.dropna(subset=['Embarked'], inplace=True)
    
    # Лог-преобразование тарифа для уменьшения воздействия выбросов
    data['Fare'] = np.log1p(data['Fare'])

    # Кодирование категориальных признаков с использованием get_dummies
    data = pd.get_dummies(data, columns=['Sex', 'Embarked'])

    features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
    X = data[features]
    y = data['Survived']
    return train_test_split(X, y, test_size=0.2, random_state=42)

def enhanced_preprocess_house(filepath):
    data = pd.read_csv(filepath)
    # Интеллектуальное заполнение пропусков
    data['LotFrontage'].fillna(data.groupby('Neighborhood')['LotFrontage'].transform('median'), inplace=True)

    # Лог-преобразование целевой переменной
    data['SalePrice'] = np.log1p(data['SalePrice'])

    important_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', '1stFlrSF']
    data = pd.get_dummies(data)
    X = data[important_features]
    y = data['SalePrice']
    return train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
from sklearn.model_selection import GridSearchCV

param_grid_clf = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}

grid_clf = GridSearchCV(estimator=GradientBoostingClassifier(random_state=42), param_grid=param_grid_clf, cv=5, scoring='accuracy', n_jobs=-1)
grid_clf.fit(X_train_titanic, y_train_titanic)

print("Best parameters for Gradient Boosting Classifier:", grid_clf.best_params_)

Best parameters for Gradient Boosting Classifier: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.8}


In [9]:
# Обучение улучшенной модели градиентного бустинга для Titanic
improved_gb_clf = GradientBoostingClassifier(**grid_clf.best_params_, random_state=42)
improved_gb_clf.fit(X_train_titanic, y_train_titanic)
y_pred_titanic_improved = improved_gb_clf.predict(X_test_titanic)

In [10]:
from sklearn.model_selection import RandomizedSearchCV

param_grid_regr = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}

grid_regr = RandomizedSearchCV(estimator=GradientBoostingRegressor(random_state=42), param_distributions=param_grid_regr, cv=5, n_jobs=-1, scoring='neg_mean_squared_error', n_iter=100, random_state=42)
grid_regr.fit(X_train_house, y_train_house)

improved_gb_regr = GradientBoostingRegressor(**grid_regr.best_params_, random_state=42)
improved_gb_regr.fit(X_train_house, y_train_house)
y_pred_house_improved = improved_gb_regr.predict(X_test_house)




In [11]:
print("Improved Gradient Boosting Classifier - Titanic Dataset:")
print("Accuracy:", accuracy_score(y_test_titanic, y_pred_titanic_improved))
print("Precision:", precision_score(y_test_titanic, y_pred_titanic_improved))
print("Recall:", recall_score(y_test_titanic, y_pred_titanic_improved))
print("F1 Score:", f1_score(y_test_titanic, y_pred_titanic_improved))

print("\nImproved Gradient Boosting Regressor - House Prices Dataset:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test_house, y_pred_house_improved))
print("Mean Squared Error (MSE):", mean_squared_error(y_test_house, y_pred_house_improved))
print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test_house, y_pred_house_improved)))
print("R-squared (R²):", r2_score(y_test_house, y_pred_house_improved))

Improved Gradient Boosting Classifier - Titanic Dataset:
Accuracy: 0.8146067415730337
Precision: 0.7903225806451613
Recall: 0.7101449275362319
F1 Score: 0.7480916030534351

Improved Gradient Boosting Regressor - House Prices Dataset:
Mean Absolute Error (MAE): 21485.396079199447
Mean Squared Error (MSE): 1010335655.8718059
Root Mean Squared Error (RMSE): 31785.77757223828
R-squared (R²): 0.8682799624708115


Собственная имплементация

In [12]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor

class SimpleGradientBoostingClassifier:
    def __init__(self, n_estimators=10, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.models = []

    def fit(self, X, y):
        # Инициализируем модели
        y_pred = np.zeros(shape=y.shape)
        
        for _ in range(self.n_estimators):
            # Вычисляем остатки
            residual = y - 1 / (1 + np.exp(-y_pred))
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, residual)
            y_pred += self.learning_rate * tree.predict(X)
            self.models.append(tree)

    def predict(self, X):
        y_pred = np.zeros(shape=(X.shape[0],))
        for tree in self.models:
            y_pred += self.learning_rate * tree.predict(X)
        return (y_pred > 0.5).astype(int)

In [13]:
class SimpleGradientBoostingRegressor:
    def __init__(self, n_estimators=10, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.models = []

    def fit(self, X, y):
        # Инициализация
        residual = y
        for _ in range(self.n_estimators):
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, residual)
            prediction = tree.predict(X)
            residual -= self.learning_rate * prediction
            self.models.append(tree)

    def predict(self, X):
        y_pred = np.zeros(shape=(X.shape[0],))
        for tree in self.models:
            y_pred += self.learning_rate * tree.predict(X)
        return y_pred

In [14]:
# Обучение простого градиентного бустинга для классификации
simple_gb_clf = SimpleGradientBoostingClassifier(n_estimators=10, learning_rate=0.1, max_depth=3)
simple_gb_clf.fit(X_train_titanic, y_train_titanic)
y_pred_titanic_simple = simple_gb_clf.predict(X_test_titanic)

In [15]:
# Обучение простого градиентного бустинга для регрессии
simple_gb_regr = SimpleGradientBoostingRegressor(n_estimators=10, learning_rate=0.1, max_depth=3)
simple_gb_regr.fit(X_train_house, y_train_house)
y_pred_house_simple = simple_gb_regr.predict(X_test_house)

In [16]:
print("Simple Gradient Boosting Classifier - Titanic Dataset:")
print("Accuracy:", accuracy_score(y_test_titanic, y_pred_titanic_simple))
print("Precision:", precision_score(y_test_titanic, y_pred_titanic_simple))
print("Recall:", recall_score(y_test_titanic, y_pred_titanic_simple))
print("F1 Score:", f1_score(y_test_titanic, y_pred_titanic_simple))

print("\nSimple Gradient Boosting Regressor - House Prices Dataset:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test_house, y_pred_house_simple))
print("Mean Squared Error (MSE):", mean_squared_error(y_test_house, y_pred_house_simple))
print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test_house, y_pred_house_simple)))
print("R-squared (R²):", r2_score(y_test_house, y_pred_house_simple))

Simple Gradient Boosting Classifier - Titanic Dataset:
Accuracy: 0.6123595505617978
Precision: 0.0
Recall: 0.0
F1 Score: 0.0

Simple Gradient Boosting Regressor - House Prices Dataset:
Mean Absolute Error (MAE): 64244.727409513325
Mean Squared Error (MSE): 6557324943.750704
Root Mean Squared Error (RMSE): 80977.31128995765
R-squared (R²): 0.14510481475918646


  _warn_prf(average, modifier, msg_start, len(result))


Улучшения

In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Пример использования стандартизации
scaler = StandardScaler()
X_train_titanic_scaled = scaler.fit_transform(X_train_titanic)
X_test_titanic_scaled = scaler.transform(X_test_titanic)

# Для регрессии
X_train_house_scaled = scaler.fit_transform(X_train_house)
X_test_house_scaled = scaler.transform(X_test_house)

In [20]:
class ImprovedGradientBoostingClassifier:
    def __init__(self, n_estimators=10, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.models = []
        self.init_val = None
    
    def fit(self, X, y):
        # Use log(odds) transformation for better initialization
        y_mean = np.mean(y)
        y_pred = np.full(y.shape, np.log(y_mean / (1 - y_mean)))
        self.init_val = y_pred[0]
        
        for _ in range(self.n_estimators):
            probs = 1 / (1 + np.exp(-y_pred))
            residual = y - probs
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, residual)
            y_pred += self.learning_rate * tree.predict(X)
            self.models.append(tree)
    
    def predict(self, X):
        y_pred = np.full(X.shape[0], self.init_val)
        for tree in self.models:
            y_pred += self.learning_rate * tree.predict(X)
        return (1 / (1 + np.exp(-y_pred)) > 0.5).astype(int)

# Обучение и предсказание
improved_simple_gb_clf = ImprovedGradientBoostingClassifier(n_estimators=10, learning_rate=0.1, max_depth=3)
improved_simple_gb_clf.fit(X_train_titanic_scaled, y_train_titanic)
y_pred_titanic_improved = improved_simple_gb_clf.predict(X_test_titanic_scaled)

In [21]:
class ImprovedGradientBoostingRegressor:
    def __init__(self, n_estimators=10, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.models = []
        self.init_val = None
    
    def fit(self, X, y):
        y_pred = np.full(y.shape, np.mean(y))
        self.init_val = y_pred[0]
        
        residual = y
        for _ in range(self.n_estimators):
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, residual)
            prediction = tree.predict(X)
            residual -= self.learning_rate * prediction
            y_pred += self.learning_rate * prediction
            self.models.append(tree)
    
    def predict(self, X):
        y_pred = np.full(X.shape[0], self.init_val)
        for tree in self.models:
            y_pred += self.learning_rate * tree.predict(X)
        return y_pred

# Обучение и предсказание
improved_simple_gb_regr = ImprovedGradientBoostingRegressor(n_estimators=10, learning_rate=0.1, max_depth=3)
improved_simple_gb_regr.fit(X_train_house_scaled, y_train_house)
y_pred_house_improved = improved_simple_gb_regr.predict(X_test_house_scaled)

In [22]:
print("Improved Simple Gradient Boosting Classifier - Titanic Dataset:")
print("Accuracy:", accuracy_score(y_test_titanic, y_pred_titanic_improved))
print("Precision:", precision_score(y_test_titanic, y_pred_titanic_improved))
print("Recall:", recall_score(y_test_titanic, y_pred_titanic_improved))
print("F1 Score:", f1_score(y_test_titanic, y_pred_titanic_improved))

print("\nImproved Simple Gradient Boosting Regressor - House Prices Dataset:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test_house, y_pred_house_improved))
print("Mean Squared Error (MSE):", mean_squared_error(y_test_house, y_pred_house_improved))
print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test_house, y_pred_house_improved)))
print("R-squared (R²):", r2_score(y_test_house, y_pred_house_improved))

Improved Simple Gradient Boosting Classifier - Titanic Dataset:
Accuracy: 0.7359550561797753
Precision: 0.9230769230769231
Recall: 0.34782608695652173
F1 Score: 0.5052631578947369

Improved Simple Gradient Boosting Regressor - House Prices Dataset:
Mean Absolute Error (MAE): 77277.49204817129
Mean Squared Error (MSE): 10662828698.649317
Root Mean Squared Error (RMSE): 103260.97374443704
R-squared (R²): -0.3901401857796094
