In [72]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.base import clone
import numpy as np

In [73]:
df = pd.read_csv('bmw.csv')

# Используем основные признаки, по результатам из задачи 1
#df = df.dropna(subset=['year', 'engineSize', 'price', 'mpg', 'mileage'])
#df = df[['year', 'engineSize', 'price', 'mpg', 'mileage']]
df = pd.get_dummies(df, drop_first=True)
df

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,model_ 2 Series,model_ 3 Series,model_ 4 Series,model_ 5 Series,...,model_ Z3,model_ Z4,model_ i3,model_ i8,transmission_Manual,transmission_Semi-Auto,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol
0,2014,11200,67068,125,57.6,2.0,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
1,2018,27000,14827,145,42.8,2.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,2016,16000,62794,160,51.4,3.0,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3,2017,12750,26676,145,72.4,1.5,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2014,14500,39554,160,50.4,3.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10776,2016,19000,40818,150,54.3,2.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10777,2016,14600,42947,125,60.1,2.0,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
10778,2017,13100,25468,200,42.8,2.0,False,True,False,False,...,False,False,False,False,True,False,False,False,False,True
10779,2014,9930,45000,30,64.2,2.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [74]:
X = df.drop(columns=['price'])
y = df['price']

# Проверка данных после разделения
print("X columns:", X.columns)
print("y head:", y.head())

X columns: Index(['year', 'mileage', 'tax', 'mpg', 'engineSize', 'model_ 2 Series',
       'model_ 3 Series', 'model_ 4 Series', 'model_ 5 Series',
       'model_ 6 Series', 'model_ 7 Series', 'model_ 8 Series', 'model_ M2',
       'model_ M3', 'model_ M4', 'model_ M5', 'model_ M6', 'model_ X1',
       'model_ X2', 'model_ X3', 'model_ X4', 'model_ X5', 'model_ X6',
       'model_ X7', 'model_ Z3', 'model_ Z4', 'model_ i3', 'model_ i8',
       'transmission_Manual', 'transmission_Semi-Auto', 'fuelType_Electric',
       'fuelType_Hybrid', 'fuelType_Other', 'fuelType_Petrol'],
      dtype='object')
y head: 0    11200
1    27000
2    16000
3    12750
4    14500
Name: price, dtype: int64


In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [76]:
base_models = [
    SGDRegressor(alpha=1, max_iter=1000, penalty='elasticnet', random_state=42),
    RandomForestRegressor(max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200, random_state=42)
]
final_model = CatBoostRegressor(n_estimators=200, random_state=42, verbose=0)  

In [77]:
def fit_stacking(x_train, y_train, base_models, final_model):
    base_models_ = [clone(model) for model in base_models]
    final_model_ = clone(final_model)
    for model in base_models_:
        model.fit(x_train, y_train)
    meta_features = np.column_stack([model.predict(x_train) for model in base_models_])
    final_model_.fit(meta_features, y_train)
    
    return base_models_, final_model_

In [78]:
def predict_stacking(x_test, base_models, final_model):
    meta_features = np.column_stack([model.predict(x_test) for model in base_models])
    return final_model.predict(meta_features)

In [79]:
# Обучение модели
base_models_, final_model_ = fit_stacking(X_train_scaled, y_train, base_models, final_model)

# Предсказание модели
y_pred_stacking = predict_stacking(X_test_scaled, base_models_, final_model_)

# Оценка качества
stacking_r2 = r2_score(y_test, y_pred_stacking)
print(f'Stacking Regressor R2 Score: {stacking_r2:.4f}')

Stacking Regressor R2 Score: 0.9398


Значение 0,9398 является отличным. Если сравнивать с качеством из предыдущих задач, то результат был 0.9325.