In [None]:
from sklearn.linear_model import HuberRegressor, LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
import plotly.express as px
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neural_network import MLPRegressor
import warnings
 
# Silence every Python warning for the rest of the session so that the cell
# outputs below contain only the plots and the result tables.
# NOTE(review): this also hides any warning an estimator emits while fitting
# (e.g. about convergence or target shape) — confirm that this is intended.
warnings.simplefilter("ignore")

### Generowanie danych

In [None]:
def generate_dataset(n_samples, liniar, noise, test_size=0.75, n_outliers=7, outlier_shift=300, random_state=None):
  """Generate a noisy one-feature regression problem and split it into train/test parts.

  Parameters
  ----------
  n_samples : int
      Number of points drawn before the split.
  liniar : bool
      If true the target follows the line ``3.5 * X + 3``; otherwise the cubic
      ``1.5 * X**3 - X + 2``. (The spelling of the name is kept for existing callers.)
  noise : bool
      If true, outliers are injected into the *training* targets only.
  test_size : float or int, default=0.75
      Forwarded to ``train_test_split``.
  n_outliers : int, default=7
      Number of distinct training points turned into outliers (capped at the
      size of the training subset).
  outlier_shift : float, default=300
      Value added to the target of every outlier. The default equals what the
      previous expression ``100 * randint(3, 4)`` always produced, because the
      upper bound of ``randint`` is exclusive.
  random_state : int or None, default=None
      Seed for the data generation. ``None`` keeps using the global NumPy
      random state, so ``np.random.seed`` still works.

  Returns
  -------
  X_train, X_test, y_train, y_test : ndarray
      Column vectors (shape ``(n, 1)``).
  """
  # With no seed fall back to the module-level generator (legacy behaviour).
  rng = np.random if random_state is None else np.random.RandomState(random_state)

  X = rng.normal(-2, 2, n_samples)
  # The additive noise is drawn from N(-1, 1); for the cubic it is scaled by 20.
  if liniar:
    a, b = 3.5, 3
    y = a * X + b + rng.normal(-1, 1, size=n_samples)
  else:
    a, b, c = 1.5, -1, 2
    y = a * X**3 + b * X + c + rng.normal(-1, 1, size=n_samples) * 20

  X_train, X_test, y_train, y_test = train_test_split(
      X.reshape((-1, 1)), y.reshape((-1, 1)), test_size=test_size, random_state=0)

  if noise:
    # Sample WITHOUT replacement: duplicated indices would be shifted only
    # once by the fancy-indexed "+=", giving fewer outliers than requested.
    n_picked = min(n_outliers, len(X_train))
    outlier_idx = rng.choice(len(X_train), size=n_picked, replace=False)
    y_train[outlier_idx] += outlier_shift
  return X_train, X_test, y_train, y_test

### Strojenie modeli

In [None]:
def mse_mae(model, X_train, X_test, y_train, y_test):
  """Fit ``model`` on the training subset and score it on both subsets.

  Returns a pair ``(scores, y_test_pred)`` where ``scores`` is the array
  ``[mse_train, mse_test, mae_train, mae_test]`` rounded to three decimals
  and ``y_test_pred`` are the predictions for ``X_test``.
  Note that ``model`` is fitted in place.
  """
  model.fit(X_train, y_train)
  # (true targets, predictions) per subset; train is predicted first.
  pairs = {
      "train": (y_train, model.predict(X_train)),
      "test": (y_test, model.predict(X_test)),
  }
  scores = [
      metric(*pairs[subset])
      for metric in (mean_squared_error, mean_absolute_error)
      for subset in ("train", "test")
  ]
  return np.round(scores, 3), pairs["test"][1]

# Header of every results table: the model label followed by MSE and MAE,
# each reported on the train and on the test subset.
columns = ["model"] + [
    f"{metric} on {subset} subset"
    for metric in ("mse", "mae")
    for subset in ("train", "test")
]

# Estimators compared on the linear data sets, keyed by the label shown in
# the plot legend and in the results table (insertion order = table order).
models = dict([
    ("Linear Regression", LinearRegression()),
    ("Ridge", Ridge(alpha=0.7, solver="svd")),
    ("Lasso", Lasso()),
    ("ElasticNet", ElasticNet(l1_ratio=0.9)),
    ("Huber Regressor", HuberRegressor(epsilon=1)),
    ("SGD Regressor mse", SGDRegressor(penalty='l2', learning_rate='constant', tol=1e-5)),
    ("SGD Regressor huber", SGDRegressor(penalty='l2', learning_rate='constant', loss="huber", alpha=1e-5, tol=1e-5)),
])

### Dane liniowe bez outlierów

In [None]:
X_train, X_test, y_train, y_test = generate_dataset(50, True, False)

rows = []
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.ravel(X_train), y=np.ravel(y_train), mode='markers', name="Train subset", marker=dict(size=9)))
fig.add_trace(go.Scatter(x=np.ravel(X_test), y=np.ravel(y_test), mode='markers', name="Test subset", marker=dict(size=9)))
for model_name, estimator in models.items():
  # Every estimator gets 1-D targets. The previous special case compared the
  # label with "SGD Regressor", which matches none of the dictionary keys,
  # so the SGD models were silently fed column vectors.
  row, y_pred = mse_mae(estimator, X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  # Keep the metrics numeric: np.concatenate with the label would cast
  # every number to a string and make the table unusable for sorting/maths.
  rows.append([model_name, *row])
  fig.add_trace(go.Scatter(x=np.ravel(X_test), y=np.ravel(y_pred), mode='lines', name=model_name, marker=dict(size=9)))

fig.update_layout(xaxis_title="X", yaxis_title="y", showlegend=True, title="Dane liniowe bez outlierów", autosize=False, width=1200, height=700)
fig.show()

# One row per model: label + MSE/MAE on the train and test subsets.
df_lin_nonoise = pd.DataFrame(rows, columns=columns)
df_lin_nonoise

Unnamed: 0,model,mse on train subset,mse on test subset,mae on train subset,mae on test subset
0,Linear Regression,0.513,0.909,0.584,0.786
1,Ridge,0.521,0.936,0.597,0.792
2,Lasso,0.718,1.185,0.737,0.889
3,ElasticNet,0.816,1.29,0.795,0.927
4,Huber Regressor,0.553,0.87,0.552,0.766
5,SGD Regressor mse,0.527,1.031,0.616,0.827
6,SGD Regressor huber,0.536,0.889,0.56,0.777


#### Modele Lasso oraz ElasticNet są najgorzej dopasowane do zbioru. W każdym z nich użyto regularyzacji l1. Modele bez regularyzacji lub z regularyzacją l2 dobrze dopasowały się do zbioru.

### Dane liniowe, zbiór do uczenia zawiera outliery

In [None]:
X_train, X_test, y_train, y_test = generate_dataset(100, True, True)

rows = []
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.ravel(X_train), y=np.ravel(y_train), mode='markers', name="Train subset", marker=dict(size=9)))
fig.add_trace(go.Scatter(x=np.ravel(X_test), y=np.ravel(y_test), mode='markers', name="Test subset", marker=dict(size=9)))
for model_name, estimator in models.items():
  # Every estimator gets 1-D targets. The previous special case compared the
  # label with "SGD Regressor", which matches none of the dictionary keys,
  # so the SGD models were silently fed column vectors.
  row, y_pred = mse_mae(estimator, X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  # Keep the metrics numeric instead of casting them to strings.
  rows.append([model_name, *row])
  # mode='lines' is set explicitly so the trace does not depend on plotly's
  # size-dependent default and matches the other experiments.
  fig.add_trace(go.Scatter(x=np.ravel(X_test), y=np.ravel(y_pred), mode='lines', name=model_name, marker=dict(size=9)))

fig.update_layout(xaxis_title="X", yaxis_title="y", showlegend=True, title="Dane liniowe, zbiór do uczenia zawiera outliery", autosize=False, width=1200, height=700)
fig.show()

# One row per model: label + MSE/MAE on the train and test subsets.
df_lin_noise = pd.DataFrame(rows, columns=columns)
df_lin_noise

Unnamed: 0,model,mse on train subset,mse on test subset,mae on train subset,mae on test subset
0,Linear Regression,14315.375,3406.981,95.428,57.851
1,Ridge,14315.375,3407.139,95.429,57.854
2,Lasso,14315.584,3414.158,95.456,57.961
3,ElasticNet,14315.568,3413.87,95.455,57.956
4,Huber Regressor,17917.433,1.634,60.667,1.038
5,SGD Regressor mse,14328.812,3073.8,93.669,54.576
6,SGD Regressor huber,17985.398,1.353,60.781,0.944


#### Modele z funkcją straty MSE są podatne na dane odstające, natomiast model Huber Regressor jest odporny na outliery. Jako kryterium wyboru funkcji straty ustawiono epsilon na 1, czyli najczęściej będzie wybierana MAE, która jest odporna na wartości odstające.

### Dane nieliniowe bez outlierów, stopień wielomianu = 2

In [None]:
X_train, X_test, y_train, y_test = generate_dataset(200, False, False)

# Degree of the polynomial feature expansion placed in front of every estimator.
deg = 2

columns = ["model", "mse on train subset", "mse on test subset", "mae on train subset", "mae on test subset"]
# The MLP is stored as a real estimator (previously a "none" placeholder that
# the loop had to special-case); hidden_layer_sizes is an explicit 1-tuple.
models = {"Linear Regression": LinearRegression(),
          "Ridge": Ridge(alpha=0.7, solver="svd"),
          "Lasso": Lasso(),
          "ElasticNet": ElasticNet(l1_ratio=0.9),
          "Huber Regressor": HuberRegressor(epsilon=1),
          "SGD Regressor mse": SGDRegressor(penalty='l2', learning_rate='constant', tol=1e-5),
          "SGD Regressor hubber": SGDRegressor(penalty='l2', learning_rate='constant', loss="huber", alpha=1e-1, tol=1e-7, eta0=1e-2),
          "MLP Regressor": MLPRegressor(hidden_layer_sizes=(5 * deg,), max_iter=500),
}

rows = []
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.ravel(X_train), y=np.ravel(y_train), mode='markers', name="Train subset", marker=dict(size=9)))
fig.add_trace(go.Scatter(x=np.ravel(X_test), y=np.ravel(y_test), mode='markers', name="Test subset", marker=dict(size=9)))
for model_name, estimator in models.items():
  # All estimators get 1-D targets; the old check for "SGD Regressor" never
  # matched a key, so the SGD models were fed column vectors.
  pipeline = make_pipeline(PolynomialFeatures(deg), estimator)
  row, y_pred = mse_mae(pipeline, X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  # Keep the metrics numeric instead of casting them to strings.
  rows.append([model_name, *row])
  # Sort by X so the prediction is drawn as one curve, not a zig-zag.
  xy = np.c_[np.ravel(X_test), y_pred]
  xy = xy[xy[:, 0].argsort()]
  fig.add_trace(go.Scatter(x=xy[:, 0], y=xy[:, 1], mode='lines', name=model_name, marker=dict(size=9)))

fig.update_layout(xaxis_title="X", yaxis_title="y", showlegend=True, title=f"Dane nieliniowe bez outlierów, stopień wielomianu = {deg}", autosize=False, width=1200, height=700)
fig.show()

# One row per model: label + MSE/MAE on the train and test subsets.
df_nonlin_nonoise = pd.DataFrame(rows, columns=columns)
df_nonlin_nonoise

Unnamed: 0,model,mse on train subset,mse on test subset,mae on train subset,mae on test subset
0,Linear Regression,648.551,2206.229,20.455,24.223
1,Ridge,648.566,2202.682,20.442,24.218
2,Lasso,650.199,2171.234,20.336,24.188
3,ElasticNet,651.878,2160.157,20.299,24.182
4,Huber Regressor,835.228,2713.117,19.571,25.303
5,SGD Regressor mse,5810.471,12361.389,53.981,69.235
6,SGD Regressor hubber,1886.882,4922.715,27.83,35.827
7,MLP Regressor,1504.368,4389.183,23.383,31.518


#### Najlepiej do próbek testowych dopasowała się zwykła regresja liniowa oraz regresja z regularyzacją l2 (Ridge).

### Dane nieliniowe bez outlierów, stopień wielomianu = 3

In [None]:
# Degree of the polynomial feature expansion placed in front of every estimator.
deg = 3

# The MLP is stored as a real estimator (previously a "none" placeholder that
# the loop had to special-case); hidden_layer_sizes is an explicit 1-tuple.
models = {"Linear Regression": LinearRegression(),
          "Ridge": Ridge(alpha=0.7, solver="svd"),
          "Lasso": Lasso(),
          "ElasticNet": ElasticNet(l1_ratio=0.9),
          "Huber Regressor": HuberRegressor(epsilon=1),
          "SGD Regressor hubber": SGDRegressor(penalty='l2', learning_rate='constant', loss="huber", alpha=1e-1, tol=1e-7, eta0=1e-3),
          "MLP Regressor": MLPRegressor(hidden_layer_sizes=(deg,), max_iter=500),
}

# Reuses X_train / X_test / y_train / y_test produced by the previous cell.
rows = []
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.ravel(X_train), y=np.ravel(y_train), mode='markers', name="Train subset", marker=dict(size=9)))
fig.add_trace(go.Scatter(x=np.ravel(X_test), y=np.ravel(y_test), mode='markers', name="Test subset", marker=dict(size=9)))
for model_name, estimator in models.items():
  # All estimators get 1-D targets; the old check for "SGD Regressor" never
  # matched a key, so the SGD model was fed column vectors.
  pipeline = make_pipeline(PolynomialFeatures(deg), estimator)
  row, y_pred = mse_mae(pipeline, X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  # Keep the metrics numeric instead of casting them to strings.
  rows.append([model_name, *row])
  # Sort by X so the prediction is drawn as one curve, not a zig-zag.
  xy = np.c_[np.ravel(X_test), y_pred]
  xy = xy[xy[:, 0].argsort()]
  fig.add_trace(go.Scatter(x=xy[:, 0], y=xy[:, 1], mode='lines', name=model_name, marker=dict(size=9)))

fig.update_layout(xaxis_title="X", yaxis_title="y", showlegend=True, title=f"Dane nieliniowe bez outlierów, stopień wielomianu = {deg}", autosize=False, width=1200, height=700)
fig.show()

# One row per model: label + MSE/MAE on the train and test subsets.
df_nonlin_nonoise = pd.DataFrame(rows, columns=columns)
df_nonlin_nonoise

Unnamed: 0,model,mse on train subset,mse on test subset,mae on train subset,mae on test subset
0,Linear Regression,421.186,360.186,16.731,15.47
1,Ridge,421.187,360.203,16.729,15.471
2,Lasso,421.813,354.168,16.772,15.399
3,ElasticNet,421.779,354.969,16.772,15.41
4,Huber Regressor,422.84,354.874,16.535,15.434
5,SGD Regressor hubber,723.673,521.046,21.631,18.91
6,MLP Regressor,1132.125,1890.094,25.988,28.904


#### Regresory z funkcją straty MAE dobrze radzą sobie z próbkami odstającymi. Regularyzacja l2 nieco lepiej pomogła dopasować się modelom niż l1.


### Dane nieliniowe, zbiór do uczenia z outlierami, stopień wielomianu = 2

In [None]:
# Degree of the polynomial feature expansion placed in front of every estimator.
deg = 2

# The MLP is stored as a real estimator (previously a "none" placeholder that
# the loop had to special-case); hidden_layer_sizes is an explicit 1-tuple.
models = {"Linear Regression": LinearRegression(),
          "Ridge": Ridge(alpha=0.7, solver="svd"),
          "Lasso": Lasso(),
          "ElasticNet": ElasticNet(l1_ratio=0.9),
          "Huber Regressor": HuberRegressor(epsilon=1),
          "SGD Regressor hubber l1": SGDRegressor(penalty='l1', learning_rate='constant', loss="huber", alpha=1e-3, tol=1e-7),
          "MLP Regressor": MLPRegressor(hidden_layer_sizes=(deg,), max_iter=500),
}

X_train, X_test, y_train, y_test = generate_dataset(200, False, True)

rows = []
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.ravel(X_train), y=np.ravel(y_train), mode='markers', name="Train subset", marker=dict(size=9)))
fig.add_trace(go.Scatter(x=np.ravel(X_test), y=np.ravel(y_test), mode='markers', name="Test subset", marker=dict(size=9)))
for model_name, estimator in models.items():
  # All estimators get 1-D targets; the old check for "SGD Regressor" never
  # matched a key, so the SGD model was fed column vectors.
  pipeline = make_pipeline(PolynomialFeatures(deg), estimator)
  row, y_pred = mse_mae(pipeline, X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  # Keep the metrics numeric instead of casting them to strings.
  rows.append([model_name, *row])
  # Sort by X so the prediction is drawn as one curve, not a zig-zag.
  xy = np.c_[np.ravel(X_test), y_pred]
  xy = xy[xy[:, 0].argsort()]
  fig.add_trace(go.Scatter(x=xy[:, 0], y=xy[:, 1], mode='lines', name=model_name, marker=dict(size=9)))

fig.update_layout(xaxis_title="X", yaxis_title="y", showlegend=True, title=f"Dane nieliniowe, zbiór do uczenia z outlierami, stopień wielomianu = {deg}", autosize=False, width=1200, height=700)
fig.show()

# One row per model: label + MSE/MAE on the train and test subsets.
df_nonlin_nonoise = pd.DataFrame(rows, columns=columns)
df_nonlin_nonoise

Unnamed: 0,model,mse on train subset,mse on test subset,mae on train subset,mae on test subset
0,Linear Regression,11271.201,3916.457,70.408,50.537
1,Ridge,11271.279,3912.11,70.433,50.544
2,Lasso,11271.992,3902.044,70.497,50.549
3,ElasticNet,11277.507,3891.869,70.692,50.585
4,Huber Regressor,13314.96,3562.037,56.554,25.623
5,SGD Regressor hubber l1,13534.834,2417.951,59.233,23.744
6,MLP Regressor,14773.513,5924.439,79.655,47.204


#### Najbardziej odporne na outliery okazały się modele Huber oraz SGD Huber.

### Dane nieliniowe, zbiór do uczenia z outlierami, stopień wielomianu = 15

In [None]:
# Only the closed-form / coordinate-descent models and the Huber regressor
# are compared here; the SGD and MLP regressors are left out. The cells
# further down iterate over this same dictionary without redefining it.
models = {"Linear Regression": LinearRegression(),
          "Ridge": Ridge(alpha=0.7, solver="svd"),
          "Lasso": Lasso(),
          "ElasticNet": ElasticNet(l1_ratio=0.9),
          "Huber Regressor": HuberRegressor(epsilon=1),
}

X_train, X_test, y_train, y_test = generate_dataset(200, False, True)
# Degree of the polynomial feature expansion placed in front of every estimator.
deg = 15
rows = []
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.ravel(X_train), y=np.ravel(y_train), mode='markers', name="Train subset", marker=dict(size=9)))
fig.add_trace(go.Scatter(x=np.ravel(X_test), y=np.ravel(y_test), mode='markers', name="Test subset", marker=dict(size=9)))
for model_name, estimator in models.items():
  # All estimators get 1-D targets. The former "SGD Regressor" and
  # "MLP Regressor" branches could never run with this dictionary.
  pipeline = make_pipeline(PolynomialFeatures(deg), estimator)
  row, y_pred = mse_mae(pipeline, X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  # Keep the metrics numeric instead of casting them to strings.
  rows.append([model_name, *row])
  # Sort by X so the prediction is drawn as one curve, not a zig-zag.
  xy = np.c_[np.ravel(X_test), y_pred]
  xy = xy[xy[:, 0].argsort()]
  fig.add_trace(go.Scatter(x=xy[:, 0], y=xy[:, 1], mode='lines', name=model_name, marker=dict(size=9)))

# The title is derived from `deg`; it used to be hard-coded to "= 3" while
# the cell actually fitted polynomials of degree 15.
fig.update_layout(xaxis_title="X", yaxis_title="y", showlegend=True, title=f"Dane nieliniowe, zbiór do uczenia z outlierami, stopień wielomianu = {deg}", autosize=False, width=1200, height=700)
fig.show()

# One row per model: label + MSE/MAE on the train and test subsets.
df_nonlin_nonoise = pd.DataFrame(rows, columns=columns)
df_nonlin_nonoise

Unnamed: 0,model,mse on train subset,mse on test subset,mae on train subset,mae on test subset
0,Linear Regression,7198.337,206612932469294.7,57.51,1277281.803
1,Ridge,9982.329,1977705893446.607,71.703,151487.498
2,Lasso,11288.868,20166.191,73.804,63.666
3,ElasticNet,11294.815,17504.594,73.728,62.009
4,Huber Regressor,13058.016,37228612.327,77.644,658.95


### Dane nieliniowe, zbiór do uczenia z outlierami, stopień wielomianu = 5

In [None]:
# Degree of the polynomial feature expansion placed in front of every estimator.
deg = 5
# Reuses `models` and the train/test split defined in the preceding cell.
rows = []
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.ravel(X_train), y=np.ravel(y_train), mode='markers', name="Train subset", marker=dict(size=9)))
fig.add_trace(go.Scatter(x=np.ravel(X_test), y=np.ravel(y_test), mode='markers', name="Test subset", marker=dict(size=9)))
for model_name, estimator in models.items():
  # All estimators get 1-D targets. The former "SGD Regressor" skip and the
  # "MLP Regressor" branch never matched a key of `models` and were removed.
  pipeline = make_pipeline(PolynomialFeatures(deg), estimator)
  row, y_pred = mse_mae(pipeline, X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  # Keep the metrics numeric instead of casting them to strings.
  rows.append([model_name, *row])
  # Sort by X so the prediction is drawn as one curve, not a zig-zag.
  xy = np.c_[np.ravel(X_test), y_pred]
  xy = xy[xy[:, 0].argsort()]
  fig.add_trace(go.Scatter(x=xy[:, 0], y=xy[:, 1], mode='lines', name=model_name, marker=dict(size=9)))

fig.update_layout(xaxis_title="X", yaxis_title="y", showlegend=True, title=f"Dane nieliniowe, zbiór do uczenia z outlierami, stopień wielomianu = {deg}", autosize=False, width=1200, height=700)
fig.show()

# One row per model: label + MSE/MAE on the train and test subsets.
df_nonlin_nonoise = pd.DataFrame(rows, columns=columns)
df_nonlin_nonoise

Unnamed: 0,model,mse on train subset,mse on test subset,mae on train subset,mae on test subset
0,Linear Regression,11287.839,16341.739,73.837,58.25
1,Ridge,11287.971,15630.249,73.832,57.909
2,Lasso,11307.365,6891.126,73.552,51.317
3,ElasticNet,11311.725,6631.309,73.486,50.566
4,Huber Regressor,12821.031,4669.044,57.323,27.908


### Dane nieliniowe, zbiór do uczenia z outlierami, stopień wielomianu = 8

In [None]:
# Degree of the polynomial feature expansion placed in front of every estimator.
deg = 8
# Reuses `models` and the train/test split defined in an earlier cell.
rows = []
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.ravel(X_train), y=np.ravel(y_train), mode='markers', name="Train subset", marker=dict(size=9)))
fig.add_trace(go.Scatter(x=np.ravel(X_test), y=np.ravel(y_test), mode='markers', name="Test subset", marker=dict(size=9)))
for model_name, estimator in models.items():
  # The Huber regressor is left out of this experiment, exactly as before;
  # the unreachable fit that followed the `continue` has been dropped.
  # NOTE(review): the reason for skipping it at this degree is not recorded.
  if model_name == "Huber Regressor":
    continue
  pipeline = make_pipeline(PolynomialFeatures(deg), estimator)
  row, y_pred = mse_mae(pipeline, X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  # Keep the metrics numeric instead of casting them to strings.
  rows.append([model_name, *row])
  # Sort by X so the prediction is drawn as one curve, not a zig-zag.
  xy = np.c_[np.ravel(X_test), y_pred]
  xy = xy[xy[:, 0].argsort()]
  fig.add_trace(go.Scatter(x=xy[:, 0], y=xy[:, 1], mode='lines', name=model_name, marker=dict(size=9)))

fig.update_layout(xaxis_title="X", yaxis_title="y", showlegend=True, title=f"Dane nieliniowe, zbiór do uczenia z outlierami, stopień wielomianu = {deg}", autosize=False, width=1200, height=700)
fig.show()

# One row per model: label + MSE/MAE on the train and test subsets.
df_nonlin_nonoise = pd.DataFrame(rows, columns=columns)
df_nonlin_nonoise

Unnamed: 0,model,mse on train subset,mse on test subset,mae on train subset,mae on test subset
0,Linear Regression,11262.443,318700.412,73.806,119.812
1,Ridge,11263.088,188365.772,73.873,101.779
2,Lasso,11296.307,8502.815,73.747,55.654
3,ElasticNet,11302.284,8115.798,73.686,55.012


### Wnioski:
- Funkcja straty MAE jest bardziej odporna na dane odstające (Huber Regressor, SGD Regressor z funkcją straty Hubera).
- Regularyzacja l1 (Lasso) lepiej pomaga zminimalizować funkcję straty niż l2 (Ridge).
