In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [None]:
from sklearn.model_selection import train_test_split

def prepare_data():
    """Build the toy regression dataset used by the polynomial-fit cells.

    Seeds the global NumPy RNG with a fixed value, draws 60 x-values uniformly
    from [0, 1] and sets y = cos(1.5 * pi * x) plus Gaussian noise (scale 0.1).
    The noisy samples are then split by three successive ``train_test_split``
    calls that each carve off 10 rows, leaving 30 rows for training.

    Returns:
        dict with DataFrames (columns "x", "y") under the keys "train", "val",
        "test" and "online_test", plus "real": the noise-free curve evaluated
        on 100 evenly spaced points of [0, 1].
    """
    def target(points):
        # Noise-free function the samples are generated from.
        return np.cos(1.5 * np.pi * points)

    # Sampling order matters: uniform draw first, then the noise draw.
    np.random.seed(14300631)
    noisy_x = np.random.uniform(0, 1, size=60)
    noisy_y = target(noisy_x) + np.random.normal(scale=0.1, size=noisy_x.shape)
    samples = pd.DataFrame({"x": noisy_x, "y": noisy_y})

    # 60 -> 50 + 10 (online test) -> 40 + 10 (test) -> 30 (train) + 10 (val)
    rest, online_test_part = train_test_split(samples, test_size=10)
    rest, test_part = train_test_split(rest, test_size=10)
    train_part, val_part = train_test_split(rest, test_size=10)

    grid = np.linspace(0, 1, 100)
    return {
        "train": train_part,
        "val": val_part,
        "test": test_part,
        "online_test": online_test_part,
        "real": pd.DataFrame({"x": grid, "y": target(grid)}),
    }

In [None]:
# Build the synthetic dataset: a dict of DataFrames, one per split.
data = prepare_data()

In [None]:
# Show which splits are available in the dict.
data.keys()

dict_keys(['train', 'val', 'test', 'online_test', 'real'])

In [None]:
# Take a copy: later cells add polynomial feature columns ("x2", "x3") to this
# frame. Copying keeps the frame stored in `data` untouched and avoids pandas'
# SettingWithCopyWarning when those columns are assigned.
train_df = data["train"].copy()

In [None]:
# Summary of the training split: row count, columns, dtypes, non-null counts.
train_df.info()

In [None]:
# Scatter plot of the raw training samples (x against noisy y).
fig, ax = plt.subplots(figsize=(16, 9))
ax.scatter(train_df["x"], train_df["y"])

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares on the single raw feature x.
model = LinearRegression()
# Selecting with a list yields a one-column frame, so .values is already the
# 2-D (n_samples, 1) array that fit() expects.
model.fit(train_df[["x"]].values, train_df["y"])

In [None]:
# Predictions of the one-feature model on the points it was trained on.
train_predictions = model.predict(train_df[["x"]].values)

In [None]:
# Overlay the model's predictions on the training samples.
fig, ax = plt.subplots(figsize=(16, 9))
ax.scatter(train_df["x"], train_df["y"], label="train_data")
ax.scatter(train_df["x"], train_predictions, label="train_predictions")
ax.legend()

In [None]:
from sklearn.metrics import mean_squared_error

# Training error of the one-feature model.
train_mse = mean_squared_error(y_true=train_df["y"], y_pred=train_predictions)
print(f"Train MSE (one feature) = {train_mse}")

In [None]:
# Second feature: the square of x.
train_df["x2"] = train_df["x"].pow(2)

In [None]:
# Refit on two features, x and x**2 (a quadratic in x).
model = LinearRegression()
model.fit(X=train_df[["x", "x2"]], y=train_df["y"])

In [None]:
# Predictions of the two-feature model on the training points.
train_predictions = model.predict(train_df[["x", "x2"]])

In [None]:
# Training samples against the predictions of the two-feature model.
fig, ax = plt.subplots(figsize=(16, 9))
ax.scatter(train_df["x"], train_df["y"], label="train_data")
ax.scatter(train_df["x"], train_predictions, label="train_predictions")
ax.legend()

In [None]:
# Training error of the two-feature model.
train_mse = mean_squared_error(y_true=train_df["y"], y_pred=train_predictions)
print(f"Train MSE (two features) = {train_mse}")

In [None]:
# Third feature: the cube of x; then refit and predict on x, x**2, x**3.
train_df["x3"] = train_df["x"].pow(3)

model = LinearRegression().fit(X=train_df[["x", "x2", "x3"]], y=train_df["y"])
train_predictions = model.predict(train_df[["x", "x2", "x3"]])

In [None]:
# Training samples against the predictions of the three-feature model.
fig, ax = plt.subplots(figsize=(16, 9))
ax.scatter(train_df["x"], train_df["y"], label="train_data")
ax.scatter(train_df["x"], train_predictions, label="train_predictions")
ax.legend()

In [None]:
# Training error of the three-feature model.
train_mse = mean_squared_error(y_true=train_df["y"], y_pred=train_predictions)
print(f"Train MSE (three features) = {train_mse}")

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Fit one least-squares model per polynomial degree 1 .. max_degree - 1 and
# record its training MSE.
max_degree = 21
train_mses = []
# Loop over the same range that is plotted below, so that changing max_degree
# keeps the x values and the list of errors the same length.
for i in range(1, max_degree):
    # include_bias=False: LinearRegression already fits an intercept.
    poly = PolynomialFeatures(degree=i, include_bias=False)
    X = poly.fit_transform(train_df[["x"]])
    model = LinearRegression().fit(X, train_df["y"])
    train_predictions = model.predict(X)
    train_mses.append(mean_squared_error(train_df["y"], train_predictions))

fig, ax = plt.subplots(figsize=(16, 9))
ax.scatter(range(1, max_degree), train_mses)
ax.set_xlabel("polynomial degree")
ax.set_ylabel("train MSE")
# Errors span several orders of magnitude, hence the log scale.
ax.set_yscale('log')

In [None]:
# Training MSE of the last (highest-degree) model in the sweep.
train_mses[-1]

In [None]:
# Compare an underfit (degree 1), a reasonable (degree 4) and an overfit
# (degree 20) polynomial on the training data.
fig, axes = plt.subplots(1, 3, figsize=(25, 9))
# Dense grid for drawing each fitted curve; it is the same for every degree.
xspace = np.linspace(0, 1, 100)
for ax, degree in zip(axes, [1, 4, 20]):
    # include_bias=False matches the degree sweeps in this notebook:
    # LinearRegression already fits an intercept.
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    X = poly.fit_transform(train_df[["x"]])
    model = LinearRegression().fit(X, train_df["y"])
    train_predictions = model.predict(X)
    train_mse = mean_squared_error(train_df["y"], train_predictions)

    # The transformer is fitted on the training data only; the grid is merely
    # transformed. It is wrapped in a DataFrame with the column name used at
    # fit time.
    Xspace = poly.transform(pd.DataFrame({"x": xspace}))
    yspace = model.predict(Xspace)

    ax.scatter(train_df["x"], train_df["y"], label="train_points")
    ax.scatter(train_df["x"], train_predictions, label="predictions")
    ax.plot(xspace, yspace, label="model_function", color="red")
    ax.set_title(f"Degree: {degree}, MSE: {train_mse}")
    ax.legend()
plt.show()

In [None]:
val_df = data["val"]

# Same three models as in the previous figure, now also scored on the
# validation split.
fig, axes = plt.subplots(1, 3, figsize=(25, 9))
# Dense grid for drawing each fitted curve; it is the same for every degree.
xspace = np.linspace(0, 1, 100)
for ax, degree in zip(axes, [1, 4, 20]):
    # include_bias=False matches the degree sweeps in this notebook:
    # LinearRegression already fits an intercept.
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    X = poly.fit_transform(train_df[["x"]])
    model = LinearRegression().fit(X, train_df["y"])
    train_predictions = model.predict(X)
    train_mse = mean_squared_error(train_df["y"], train_predictions)

    # Fit the transformer on training data only; other inputs are transformed.
    # The grid is wrapped in a DataFrame with the column name used at fit time.
    Xspace = poly.transform(pd.DataFrame({"x": xspace}))
    yspace = model.predict(Xspace)

    X_val = poly.transform(val_df[["x"]])
    val_predictions = model.predict(X_val)
    val_mse = mean_squared_error(val_df["y"], val_predictions)

    ax.scatter(train_df["x"], train_df["y"], label="train_points")
    ax.scatter(val_df["x"], val_df["y"], label="val_points")

    ax.scatter(train_df["x"], train_predictions, label="predictions")
    ax.scatter(val_df["x"], val_predictions, label="val_predictions")

    ax.plot(xspace, yspace, label="model_function", color="red")
    ax.set_title(f"Degree: {degree}, MSE: {train_mse}, Val MSE: {val_mse}")
    ax.legend()
plt.show()

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Fit one model per polynomial degree 1 .. max_degree - 1 and record both the
# training and the validation MSE.
max_degree = 21
train_mses = []
val_mses = []
# Loop over the same range that is plotted below, so that changing max_degree
# keeps the x values and the error lists the same length.
for i in range(1, max_degree):
    # include_bias=False: LinearRegression already fits an intercept.
    poly = PolynomialFeatures(degree=i, include_bias=False)
    X = poly.fit_transform(train_df[["x"]])
    model = LinearRegression().fit(X, train_df["y"])
    train_predictions = model.predict(X)
    train_mses.append(mean_squared_error(train_df["y"], train_predictions))

    # Validation data is only transformed; the transformer is never re-fitted
    # on it.
    X_val = poly.transform(val_df[["x"]])
    val_predictions = model.predict(X_val)
    val_mses.append(mean_squared_error(val_df["y"], val_predictions))

fig, ax = plt.subplots(figsize=(16, 9))
ax.scatter(range(1, max_degree), train_mses, label="train mse")
ax.scatter(range(1, max_degree), val_mses, label="validation mse")
ax.set_xlabel("polynomial degree")
ax.set_ylabel("MSE")
ax.legend()
# Errors span several orders of magnitude, hence the log scale.
ax.set_yscale('log')

# Boston Housing dataset

http://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html

In [None]:
!wget https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv

In [None]:
# Load the downloaded CSV and preview the first rows.
df = pd.read_csv("BostonHousing.csv")
df.head()

In [None]:
# Column dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as the test set, then 20% of the remainder for
# validation. A fixed random_state makes both splits identical on every run.
# Note: train_df and val_df now refer to the housing data and replace the toy
# frames of the same name used in the cells above.
trainval_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_df, val_df = train_test_split(trainval_df, test_size=0.2, random_state=42)

In [None]:
# Separate the "medv" column (used as the regression target below) from the
# remaining feature columns in each split.
X_train, y_train = train_df.drop(columns="medv"), train_df["medv"]

X_val, y_val = val_df.drop(columns="medv"), val_df["medv"]

X_test, y_test = test_df.drop(columns="medv"), test_df["medv"]

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# scaler to both the training and the validation features.
scl = StandardScaler().fit(X_train)
X_train_scaled = scl.transform(X_train)
X_val_scaled = scl.transform(X_val)

In [None]:
# Linear regression on the scaled training features.
model = LinearRegression().fit(X_train_scaled, y_train)

In [None]:
from sklearn.metrics import mean_squared_error

# Score the fitted model on the training and the validation split.
train_pred = model.predict(X_train_scaled)
val_pred = model.predict(X_val_scaled)

print(f"Train MSE: {mean_squared_error(y_true=y_train, y_pred=train_pred)}")
print(f"Validation MSE: {mean_squared_error(y_true=y_val, y_pred=val_pred)}")

In [None]:
# Visualisation of linear-regression coefficients.
def visualize_coefficients(coefs, feature_names, top_n):
    """Draw a bar chart of the most negative and most positive coefficients.

    Parameters:
        coefs: 1-D sequence of model coefficients (e.g. ``model.coef_``).
        feature_names: feature names in the same order as ``coefs``
            (e.g. ``X_train.columns``).
        top_n: how many of the smallest and how many of the largest
            coefficients to show. If ``2 * top_n`` exceeds the number of
            coefficients, all of them are shown, split into a lower and an
            upper half.

    Raises:
        ValueError: if ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")
    # Convert both inputs so that plain lists work with the index arrays below.
    coefs = np.asarray(coefs)
    feature_names = np.asarray(feature_names)
    if top_n * 2 > len(coefs):
        n_pos = len(coefs) // 2
        n_neg = len(coefs) - n_pos
    else:
        n_pos, n_neg = top_n, top_n
    # Sort once: the first n_neg indices are the smallest coefficients, the
    # last n_pos indices the largest.
    order = np.argsort(coefs)
    min_coef_idxs = order[:n_neg]
    max_coef_idxs = order[len(coefs) - n_pos:]
    # Names of the selected features, in plotting order.
    top_feature_names = np.concatenate((feature_names[min_coef_idxs], feature_names[max_coef_idxs]))
    # Smallest coefficients on the left, largest on the right.
    fig, ax = plt.subplots(figsize=(16, 9))
    ax.bar(np.arange(n_neg), coefs[min_coef_idxs], color=sns.xkcd_rgb["mauve"], hatch="/")
    ax.bar(np.arange(n_neg, n_neg + n_pos), coefs[max_coef_idxs], color=sns.xkcd_rgb["teal"], hatch="\\")
    ax.set_xticks(np.arange(0, n_neg + n_pos))
    ax.set_xticklabels(top_feature_names, rotation=45, ha="right", fontsize=14)
    plt.show()

In [None]:
# top_n is a required argument; passing the number of coefficients plots all
# of them, split into a lower and an upper half.
visualize_coefficients(model.coef_, X_train.columns, top_n=len(model.coef_))

In [None]:
# Creates a fresh, unfitted estimator and displays it; the result is not
# assigned to any name.
LinearRegression()