In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split

In [2]:
data = pd.read_csv("house_price_regression_dataset.csv")
data

Unnamed: 0,Square_Footage,Num_Bedrooms,Num_Bathrooms,Year_Built,Lot_Size,Garage_Size,Neighborhood_Quality,House_Price
0,1360,2,1,1981,0.599637,0,5,2.623829e+05
1,4272,3,3,2016,4.753014,1,6,9.852609e+05
2,3592,1,2,2016,3.634823,0,9,7.779774e+05
3,966,1,2,1977,2.730667,1,8,2.296989e+05
4,4926,2,1,1993,4.699073,0,8,1.041741e+06
...,...,...,...,...,...,...,...,...
995,3261,4,1,1978,2.165110,2,10,7.014940e+05
996,3179,1,2,1999,2.977123,1,10,6.837232e+05
997,2606,4,2,1962,4.055067,0,2,5.720240e+05
998,4723,5,2,1950,1.930921,0,7,9.648653e+05


In [3]:
X = data.drop(columns=["House_Price"])
y = data["House_Price"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
X_train.reset_index(inplace=True)
X_test.reset_index(inplace=True)
y_train = y_train.reset_index()["House_Price"]
y_test = y_test.reset_index()["House_Price"]

In [5]:
model = RandomForestRegressor(n_estimators=300, max_depth=10, n_jobs=-1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mean_absolute_error(y_test, y_pred)

np.float64(18210.016692970752)

In [6]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
model = GradientBoostingRegressor(loss='squared_error', learning_rate=0.1, n_estimators=300, max_depth=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mean_absolute_error(y_test, y_pred)

np.float64(12258.747227529284)

In [8]:
from xgboost import XGBRegressor

model = XGBRegressor(learning_rate=0.1, n_estimators=300, max_depth=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mean_absolute_error(y_test, y_pred)

np.float64(11791.362951190544)

In [9]:
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(max_depth=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mean_absolute_error(y_test, y_pred)

np.float64(38227.32963579272)

In [None]:
### Собственная реализация
class MyGradientRegressor:
    def __init__(self, n_estimators: int=300, max_depth: int=3, lr: float=0.1):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.lr = lr
        self.estimators = []

    def fit(self, X_train, y_train):
        X_train = np.array(X_train)
        y_train = np.array(y_train)
        predictions = 0

        for _ in range(self.n_estimators):
            new_model = DecisionTreeRegressor(max_depth=self.max_depth)
            new_target = -2 * (predictions - y_train)
            new_model.fit(X_train, new_target)
            predictions += self.lr * new_model.predict(X_train)
            self.estimators.append(new_model)

    def predict(self, X_test):
        X_test = np.array(X_test)
        curr_pred = 0
        for est in self.estimators:
            curr_pred += self.lr * est.predict(X_test)
        
        return curr_pred


In [11]:
my_model = MyGradientRegressor(n_estimators=300, max_depth=3, lr=0.1)

In [12]:
my_model.fit(X_train, y_train)

In [13]:
pred = my_model.predict(X_test)

In [14]:
mean_absolute_error(y_test, pred)

np.float64(12412.809000551224)

In [None]:
pred0 = pred

loss = (y_pred - y_true)**2

loss' = 2 (y_pred - y_true)
