In [15]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [16]:
# Synthetic regression data: 400 samples, 5 features, fixed seed for reproducibility.
x, y = make_regression(n_samples=400, n_features=5, noise=15.0, random_state=42)

# Keep 20% of the samples aside as a test set (seeded split).
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Fit plain least-squares regression on the training part; fit() returns the estimator.
linear = LinearRegression().fit(X_train, Y_train)
y_pred = linear.predict(X_test)

# Error metrics on the held-out test set.
mae = mean_absolute_error(Y_test, y_pred)
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is derived from MSE so it is in the units of the target
r2 = r2_score(Y_test, y_pred)

# Print every metric with the same "<name>: <value>" layout, three decimals.
for metric_name, metric_value in (
    ("MAE", mae),
    ("MSE", mse),
    ("RMSE", rmse),
    ("R2", r2),
):
    print(f"{metric_name}: {metric_value: .3f}")


MAE:  12.774
MSE:  248.775
RMSE:  15.773
R2:  0.973


Обучение алгоритма регрессии:
1. Предобработка
2. Разделение на train/test
3. Выбор модели и гиперпараметров
4. Оптимизация параметров (обучение)
5. Оценка по нескольким метрикам, проверка переобучения
6. Кросс-валидация (CV) для более стабильной оценки

In [19]:
# Standardize the features, then fit linear regression, as a single estimator.
pipeline = make_pipeline(StandardScaler(), LinearRegression())

# 5-fold cross-validation over the full dataset; with no `scoring` argument the
# estimator's own default score is used (reported below as R^2).
cv_scores = cross_val_score(pipeline, x, y, cv=5)

# Summarize the per-fold scores as mean and standard deviation.
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print(f"CV R^2 mean: {cv_mean: .3f} (+/-{cv_std: .3f})")

CV R^2 mean:  0.977 (+/- 0.004)


In [20]:
# Refit the pipeline on the training split only, then score it on the held-out
# test split; fit() returns the pipeline itself, so the calls can be chained.
test_r2 = pipeline.fit(X_train, Y_train).score(X_test, Y_test)
print(f"Test R^2: {test_r2:.3f}")

Test R^2: 0.973
