In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.base import clone
import numpy as np

In [60]:
df = pd.read_csv('bmw.csv')

# Используем основные признаки, по результатам из задачи 1
df = df.dropna(subset=['year', 'engineSize', 'price', 'mpg', 'mileage'])
df = df[['year', 'engineSize', 'price', 'mpg', 'mileage']]
df

Unnamed: 0,year,engineSize,price,mpg,mileage
0,2014,2.0,11200,57.6,67068
1,2018,2.0,27000,42.8,14827
2,2016,3.0,16000,51.4,62794
3,2017,1.5,12750,72.4,26676
4,2014,3.0,14500,50.4,39554
...,...,...,...,...,...
10776,2016,2.0,19000,54.3,40818
10777,2016,2.0,14600,60.1,42947
10778,2017,2.0,13100,42.8,25468
10779,2014,2.0,9930,64.2,45000


In [54]:
X = df.drop(columns=['price'])
y = df['price']

# Проверка данных после разделения
print("X columns:", X.columns)
print("y head:", y.head())

X columns: Index(['year', 'engineSize', 'mpg', 'mileage'], dtype='object')
y head: 0    11200
1    27000
2    16000
3    12750
4    14500
Name: price, dtype: int64


In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [56]:
base_models = [
    SGDRegressor(alpha=1, max_iter=1000, penalty='elasticnet', random_state=42),
    RandomForestRegressor(max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200, random_state=42)
]
final_model = CatBoostRegressor(n_estimators=200, random_state=42, verbose=0)  

In [57]:
def fit_stacking(x_train, y_train, base_models, final_model):
    base_models_ = [clone(model) for model in base_models]
    final_model_ = clone(final_model)
    for model in base_models_:
        model.fit(x_train, y_train)
    meta_features = np.column_stack([model.predict(x_train) for model in base_models_])
    final_model_.fit(meta_features, y_train)
    
    return base_models_, final_model_

In [58]:
def predict_stacking(x_test, base_models, final_model):
    meta_features = np.column_stack([model.predict(x_test) for model in base_models])
    return final_model.predict(meta_features)

In [59]:
# Обучение модели
base_models_, final_model_ = fit_stacking(X_train_scaled, y_train, base_models, final_model)

# Предсказание модели
y_pred_stacking = predict_stacking(X_test_scaled, base_models_, final_model_)

# Оценка качества
stacking_r2 = r2_score(y_test, y_pred_stacking)
print(f'Stacking Regressor R2 Score: {stacking_r2:.4f}')

Stacking Regressor R2 Score: 0.8896


Значение 0,8896 является хорошим для 4х выбранных признаков образования цены