In [102]:
from sklearn.linear_model import HuberRegressor, LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
import plotly.express as px
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neural_network import MLPRegressor
import warnings
 
warnings.simplefilter("ignore")

### Generowanie danych

In [103]:
def generate_dataset(n_samples, liniar, noise):
  X = np.random.uniform(-2, 2, n_samples)
  if liniar:
    [a, b] = [3.5, 3]
    y = a * X + b + np.random.normal(-1, 1, size=n_samples)
  else:
    [a, b, c] = [1.5, -1, 2]
    y = a * X**3 + b * X + c + np.random.normal(-1, 1, size=n_samples)
  X = X.reshape((-1, 1))
  y = y.reshape((-1, 1))
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
  if noise:
    n_outliers = 5
    random_noise = np.random.randint(0, 99, (n_outliers))
    y_train[np.c_[random_noise]] += 2 * np.random.normal(-1, 1, size=y_train[np.c_[random_noise]].shape)
    X_train[np.c_[random_noise]] += -2 * np.random.normal(-1, 1, size=X_train[np.c_[random_noise]].shape)
  return X_train, X_test, y_train, y_test

### Strojenie modeli

In [104]:
def mse_mae(model, X_train, X_test, y_train, y_test):
  model.fit(X_train, y_train)
  y_train_pred = model.predict(X_train)
  y_test_pred = model.predict(X_test)
  mse_train = mean_squared_error(y_train, y_train_pred)
  mse_test = mean_squared_error(y_test, y_test_pred)
  mae_train = mean_absolute_error(y_train, y_train_pred)
  mae_test = mean_absolute_error(y_test, y_test_pred)
  return np.round([mse_train, mse_test, mae_train, mae_test], 3), y_test_pred

columns = ["model", "mse on train subset", "mse on test subset", "mae on train subset", "mae on test subset"]
models = {"Linear Regression": LinearRegression(),  
          "Ridge": Ridge(alpha=0.7, solver="svd"), 
          "Lasso": Lasso(), 
          "ElasticNet": ElasticNet(l1_ratio=0.9), 
          "Huber Regressor": HuberRegressor(epsilon=1), 
          "SGD Regressor": SGDRegressor(penalty='elasticnet', learning_rate='constant')
}

### Dane liniowe bez outlierów

In [109]:
X_train, X_test, y_train, y_test = generate_dataset(300, True, False)

rows = []
fig = go.Figure()
fig = fig.add_trace(go.Scatter(x=np.ravel(X_train), y=np.ravel(y_train), mode='markers', name="Train subset", marker=dict(size=9)))
fig = fig.add_trace(go.Scatter(x=np.ravel(X_test), y=np.ravel(y_test), mode='markers', name="Test subset", marker=dict(size=9)))
for model_name in models.keys():
  if model_name in ["Huber Regressor", "SGD Regressor"]:
    row, y_pred = mse_mae(models[model_name], X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  else:
    row, y_pred = mse_mae(models[model_name], X_train, X_test, y_train, y_test)
  rows.append(np.concatenate([[model_name], row]))
  fig = fig.add_trace(go.Scatter(x=np.ravel(X_test), y=np.ravel(y_pred), mode='lines', name=model_name, marker=dict(size=9)))

fig.update_layout(xaxis_title="X", yaxis_title="y", showlegend=True, title="Dane liniowe bez outlierów", autosize=False, width=1200, height=700)
fig.show()

df_lin_nonoise = pd.DataFrame(rows, columns=columns)
df_lin_nonoise

Unnamed: 0,model,mse on train subset,mse on test subset,mae on train subset,mae on test subset
0,Linear Regression,0.976,0.908,0.794,0.814
1,Ridge,0.976,0.907,0.794,0.813
2,Lasso,1.701,1.393,1.042,0.941
3,ElasticNet,1.967,1.6,1.128,1.017
4,Huber Regressor,0.981,0.886,0.792,0.805
5,SGD Regressor,0.985,0.892,0.794,0.803


#### Modele Lasso oraz Elasicnet są najgorzej dopasowane do zbioru. W każdym z nich było użyto regularyzacji l1. Modele z brakiem regularyzacji lub obecością l2 dobrze dopasowały się do zbioru. Regressor SGD też zawiarał regularyzację l1, natomiast dzięki stałej aktualizacji wag prawidłowo dopasował się do próbek.

### Dane liniowe, zbiór do uczenia zawiera outliery

In [225]:
X_train, X_test, y_train, y_test = generate_dataset(300, True, True)

rows = []
fig = go.Figure()
fig = fig.add_trace(go.Scatter(x=np.ravel(X_train), y=np.ravel(y_train), mode='markers', name="Train subset", marker=dict(size=9)))
fig = fig.add_trace(go.Scatter(x=np.ravel(X_test), y=np.ravel(y_test), mode='markers', name="Test subset", marker=dict(size=9)))
for model_name in models.keys():
  if model_name in ["Huber Regressor", "SGD Regressor"]:
    row, y_pred = mse_mae(models[model_name], X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  else:
    row, y_pred = mse_mae(models[model_name], X_train, X_test, y_train, y_test)
  rows.append(np.concatenate([[model_name], row]))
  fig = fig.add_trace(go.Scatter(x=np.ravel(X_test), y=np.ravel(y_pred), name=model_name, marker=dict(size=9)))

fig.update_layout(xaxis_title="X", yaxis_title="y", showlegend=True, title="Dane liniowe, zbiór do uczenia zawiera outliery", autosize=False, width=1200, height=700)
fig.show()

df_lin_noise = pd.DataFrame(rows, columns=columns)
df_lin_noise

Unnamed: 0,model,mse on train subset,mse on test subset,mae on train subset,mae on test subset
0,Linear Regression,4.337,1.61,1.079,0.987
1,Ridge,4.337,1.619,1.081,0.991
2,Lasso,5.141,3.335,1.529,1.501
3,ElasticNet,5.336,3.64,1.597,1.579
4,Huber Regressor,4.625,1.245,0.988,0.879
5,SGD Regressor,4.345,1.512,1.053,0.953


#### Modele Lasso oraz Elasticnet są podatne na outliery, natomiast model Hubert Regression jest odporny na wartości odstające ze względu na możliwość doboru funkcji straty. Jako kryterium wyboru funkcji straty ustawiono epsilon na 1, czyli najczęściej będzie wybierana mae, która jest odporna na wartości odstające.

### Dane nieliniowe bez outlierów, stopień wielomianu = 2

In [110]:
X_train, X_test, y_train, y_test = generate_dataset(300, False, False)

columns = ["model", "mse on train subset", "mse on test subset", "mae on train subset", "mae on test subset"]
models = {"Linear Regression": LinearRegression(), 
          "Ridge": Ridge(alpha=0.7, solver="svd"), 
          "Lasso": Lasso(), 
          "ElasticNet": ElasticNet(l1_ratio=0.9), 
          "Huber Regressor": HuberRegressor(epsilon=1), 
          "SGD Regressor": SGDRegressor(penalty='elasticnet', learning_rate='constant'),
          "MLP Regressor": "none",
}

deg = 2
rows = []
fig = go.Figure()
fig = fig.add_trace(go.Scatter(x=np.ravel(X_train), y=np.ravel(y_train), mode='markers', name="Train subset", marker=dict(size=9)))
fig = fig.add_trace(go.Scatter(x=np.ravel(X_test), y=np.ravel(y_test), mode='markers', name="Test subset", marker=dict(size=9)))
for model_name in models.keys():
  if model_name in ["Huber Regressor", "SGD Regressor"]:
    row, y_pred = mse_mae(make_pipeline(PolynomialFeatures(deg), models[model_name]), X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  elif model_name == "MLP Regressor":
    row, y_pred = mse_mae(make_pipeline(PolynomialFeatures(deg), MLPRegressor(hidden_layer_sizes = (deg), max_iter=500)), X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  else:
    row, y_pred = mse_mae(make_pipeline(PolynomialFeatures(deg), models[model_name]), X_train, X_test, y_train, y_test)
  rows.append(np.concatenate([[model_name], row]))
  xy = np.c_[np.ravel(X_test), y_pred]
  xy = xy[xy[:, 0].argsort()]
  fig = fig.add_trace(go.Scatter(x=xy[:, 0], y=xy[:, 1], mode='lines', name=model_name, marker=dict(size=9)))

fig.update_layout(xaxis_title="X", yaxis_title="y", showlegend=True, title="Dane nieliniowe bez outlierów, stopień wielomianu = 2", autosize=False, width=1200, height=700)
fig.show()

df_nonlin_nonoise = pd.DataFrame(rows, columns=columns)
df_nonlin_nonoise

Unnamed: 0,model,mse on train subset,mse on test subset,mae on train subset,mae on test subset
0,Linear Regression,4.045,4.327,1.654,1.675
1,Ridge,4.046,4.329,1.653,1.675
2,Lasso,4.792,5.01,1.68,1.662
3,ElasticNet,4.929,5.135,1.691,1.665
4,Huber Regressor,4.179,4.424,1.637,1.657
5,SGD Regressor,4.134,4.493,1.686,1.695
6,MLP Regressor,8.227,6.986,2.076,1.875


#### Najlepiej do próbek testowych dopasowała się zwykla regresja logistyczna oraz z regularyzacją l2 (Ridge).

### Dane nieliniowe bez outlierów, stopień wielomianu = 3

In [96]:
deg = 3
rows = []
fig = go.Figure()
fig = fig.add_trace(go.Scatter(x=np.ravel(X_train), y=np.ravel(y_train), mode='markers', name="Train subset", marker=dict(size=9)))
fig = fig.add_trace(go.Scatter(x=np.ravel(X_test), y=np.ravel(y_test), mode='markers', name="Test subset", marker=dict(size=9)))
for model_name in models.keys():
  if model_name in ["Huber Regressor", "SGD Regressor"]:
    row, y_pred = mse_mae(make_pipeline(PolynomialFeatures(deg), models[model_name]), X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  elif model_name == "MLP Regressor":
    row, y_pred = mse_mae(make_pipeline(PolynomialFeatures(deg), MLPRegressor(hidden_layer_sizes = (deg), max_iter=500)), X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  else:
    row, y_pred = mse_mae(make_pipeline(PolynomialFeatures(deg), models[model_name]), X_train, X_test, y_train, y_test)
  rows.append(np.concatenate([[model_name], row]))
  xy = np.c_[np.ravel(X_test), y_pred]
  xy = xy[xy[:, 0].argsort()]
  fig = fig.add_trace(go.Scatter(x=xy[:, 0], y=xy[:, 1], mode='lines', name=model_name, marker=dict(size=9)))

fig.update_layout(xaxis_title="X", yaxis_title="y", showlegend=True, title="Dane nieliniowe bez outlierów, stopień wielomianu = 3", autosize=False, width=1200, height=700)
fig.show()

df_nonlin_nonoise = pd.DataFrame(rows, columns=columns)
df_nonlin_nonoise

Unnamed: 0,model,mse on train subset,mse on test subset,mae on train subset,mae on test subset
0,Linear Regression,3.035,1.603,1.052,0.964
1,Ridge,3.035,1.605,1.053,0.964
2,Lasso,3.336,1.912,1.186,1.058
3,ElasticNet,3.334,1.906,1.185,1.057
4,Huber Regressor,3.474,0.936,0.969,0.761
5,SGD Regressor,3.915,1.299,1.112,0.898
6,MLP Regressor,2.856,1.968,1.102,1.056


#### Regressory z obecością funkcji straty mae dobrze radzą sobie z próbkami odstającymi. Regularyzacja l2 nieco lepiej pomogła dopasować się modelom niż l1. 


### Dane nieliniowe, zbiór do uczenia z outlierami, stopień wielomianu = 2

In [85]:
X_train, X_test, y_train, y_test = generate_dataset(300, False, True)

deg = 2
rows = []
fig = go.Figure()
fig = fig.add_trace(go.Scatter(x=np.ravel(X_train), y=np.ravel(y_train), mode='markers', name="Train subset", marker=dict(size=9)))
fig = fig.add_trace(go.Scatter(x=np.ravel(X_test), y=np.ravel(y_test), mode='markers', name="Test subset", marker=dict(size=9)))
for model_name in models.keys():
  if model_name in ["Huber Regressor", "SGD Regressor"]:
    row, y_pred = mse_mae(make_pipeline(PolynomialFeatures(deg), models[model_name]), X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  elif model_name == "MLP Regressor":
    row, y_pred = mse_mae(make_pipeline(PolynomialFeatures(deg), MLPRegressor(hidden_layer_sizes = (deg), max_iter=500)), X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  else:
    row, y_pred = mse_mae(make_pipeline(PolynomialFeatures(deg), models[model_name]), X_train, X_test, y_train, y_test)
  rows.append(np.concatenate([[model_name], row]))
  xy = np.c_[np.ravel(X_test), y_pred]
  xy = xy[xy[:, 0].argsort()]
  fig = fig.add_trace(go.Scatter(x=xy[:, 0], y=xy[:, 1], mode='lines', name=model_name, marker=dict(size=9)))

fig.update_layout(xaxis_title="X", yaxis_title="y", showlegend=True, title="Dane nieliniowe, zbiór do uczenia z outlierami, stopień wielomianu = 2", autosize=False, width=1200, height=700)
fig.show()

df_nonlin_nonoise = pd.DataFrame(rows, columns=columns)
df_nonlin_nonoise

Unnamed: 0,model,mse on train subset,mse on test subset,mae on train subset,mae on test subset
0,Linear Regression,4.083,4.319,1.597,1.643
1,Ridge,4.083,4.323,1.596,1.642
2,Lasso,5.298,5.704,1.707,1.828
3,ElasticNet,5.33,5.815,1.712,1.842
4,Huber Regressor,4.417,5.192,1.571,1.767
5,SGD Regressor,4.135,4.244,1.605,1.652
6,MLP Regressor,9.629,11.155,2.171,2.435


#### Na podstawie mse dla danych testowych najlepiej do danych dopasował się model ze stochastycznym spadkiem gradientu.

### Dane nieliniowe, zbiór do uczenia z outlierami, stopień wielomianu = 3

In [90]:
X_train, X_test, y_train, y_test = generate_dataset(300, False, True)
deg = 3
rows = []
fig = go.Figure()
fig = fig.add_trace(go.Scatter(x=np.ravel(X_train), y=np.ravel(y_train), mode='markers', name="Train subset", marker=dict(size=9)))
fig = fig.add_trace(go.Scatter(x=np.ravel(X_test), y=np.ravel(y_test), mode='markers', name="Test subset", marker=dict(size=9)))
for model_name in models.keys():
  if model_name in ["Huber Regressor", "SGD Regressor"]:
    row, y_pred = mse_mae(make_pipeline(PolynomialFeatures(deg), models[model_name]), X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  elif model_name == "MLP Regressor":
    row, y_pred = mse_mae(make_pipeline(PolynomialFeatures(deg), MLPRegressor(hidden_layer_sizes = (deg), max_iter=500)), X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  else:
    row, y_pred = mse_mae(make_pipeline(PolynomialFeatures(deg), models[model_name]), X_train, X_test, y_train, y_test)
  rows.append(np.concatenate([[model_name], row]))
  xy = np.c_[np.ravel(X_test), y_pred]
  xy = xy[xy[:, 0].argsort()]
  fig = fig.add_trace(go.Scatter(x=xy[:, 0], y=xy[:, 1], mode='lines', name=model_name, marker=dict(size=9)))

fig.update_layout(xaxis_title="X", yaxis_title="y", showlegend=True, title="Dane nieliniowe, zbiór do uczenia z outlierami, stopień wielomianu = 3", autosize=False, width=1200, height=700)
fig.show()

df_nonlin_nonoise = pd.DataFrame(rows, columns=columns)
df_nonlin_nonoise

Unnamed: 0,model,mse on train subset,mse on test subset,mae on train subset,mae on test subset
0,Linear Regression,3.035,1.603,1.052,0.964
1,Ridge,3.035,1.605,1.053,0.964
2,Lasso,3.336,1.912,1.186,1.058
3,ElasticNet,3.334,1.906,1.185,1.057
4,Huber Regressor,3.474,0.936,0.969,0.761
5,SGD Regressor,3.167,1.349,1.048,0.916
6,MLP Regressor,6.071,4.481,1.608,1.509


### Dane nieliniowe, zbiór do uczenia z outlierami, stopień wielomianu = 5

In [93]:
deg = 5
rows = []
fig = go.Figure()
fig = fig.add_trace(go.Scatter(x=np.ravel(X_train), y=np.ravel(y_train), mode='markers', name="Train subset", marker=dict(size=9)))
fig = fig.add_trace(go.Scatter(x=np.ravel(X_test), y=np.ravel(y_test), mode='markers', name="Test subset", marker=dict(size=9)))
for model_name in models.keys():
  if model_name == "SGD Regressor":
    continue
  if model_name in ["Huber Regressor", "SGD Regressor"]:
    row, y_pred = mse_mae(make_pipeline(PolynomialFeatures(deg), models[model_name]), X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  elif model_name == "MLP Regressor":
    row, y_pred = mse_mae(make_pipeline(PolynomialFeatures(deg), MLPRegressor(hidden_layer_sizes = (deg), max_iter=500)), X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  else:
    row, y_pred = mse_mae(make_pipeline(PolynomialFeatures(deg), models[model_name]), X_train, X_test, y_train, y_test)
  rows.append(np.concatenate([[model_name], row]))
  xy = np.c_[np.ravel(X_test), y_pred]
  xy = xy[xy[:, 0].argsort()]
  fig = fig.add_trace(go.Scatter(x=xy[:, 0], y=xy[:, 1], mode='lines', name=model_name, marker=dict(size=9)))

fig.update_layout(xaxis_title="X", yaxis_title="y", showlegend=True, title="Dane nieliniowe, zbiór do uczenia z outlierami, stopień wielomianu = 5", autosize=False, width=1200, height=700)
fig.show()

df_nonlin_nonoise = pd.DataFrame(rows, columns=columns)
df_nonlin_nonoise

Unnamed: 0,model,mse on train subset,mse on test subset,mae on train subset,mae on test subset
0,Linear Regression,2.221,1.644,1.024,0.966
1,Ridge,2.225,1.621,1.019,0.953
2,Lasso,3.764,2.414,1.22,1.199
3,ElasticNet,3.644,2.327,1.199,1.172
4,Huber Regressor,3.276,0.95,0.959,0.76
5,MLP Regressor,3.397,1.708,1.126,1.064


### Dane nieliniowe, zbiór do uczenia z outlierami, stopień wielomianu = 8

In [54]:
deg = 8
rows = []
fig = go.Figure()
fig = fig.add_trace(go.Scatter(x=np.ravel(X_train), y=np.ravel(y_train), mode='markers', name="Train subset", marker=dict(size=9)))
fig = fig.add_trace(go.Scatter(x=np.ravel(X_test), y=np.ravel(y_test), mode='markers', name="Test subset", marker=dict(size=9)))
for model_name in models.keys():
  if model_name in ["Huber Regressor", "SGD Regressor"]:
    continue
    row, y_pred = mse_mae(make_pipeline(PolynomialFeatures(deg), models[model_name]), X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  elif model_name == "MLP Regressor":
    row, y_pred = mse_mae(make_pipeline(PolynomialFeatures(deg), MLPRegressor(hidden_layer_sizes = (deg), max_iter=500)), X_train, X_test, np.ravel(y_train), np.ravel(y_test))
  else:
    row, y_pred = mse_mae(make_pipeline(PolynomialFeatures(deg), models[model_name]), X_train, X_test, y_train, y_test)
  rows.append(np.concatenate([[model_name], row]))
  xy = np.c_[np.ravel(X_test), y_pred]
  xy = xy[xy[:, 0].argsort()]
  fig = fig.add_trace(go.Scatter(x=xy[:, 0], y=xy[:, 1], mode='lines', name=model_name, marker=dict(size=9)))

fig.update_layout(xaxis_title="X", yaxis_title="y", showlegend=True, title="Dane nieliniowe, zbiór do uczenia z outlierami, stopień wielomianu = 8", autosize=False, width=1200, height=700)
fig.show()

df_nonlin_nonoise = pd.DataFrame(rows, columns=columns)
df_nonlin_nonoise

Unnamed: 0,model,mse on train subset,mse on test subset,mae on train subset,mae on test subset
0,Linear Regression,1.571,0.916,0.88,0.725
1,Ridge,1.572,0.919,0.878,0.728
2,Lasso,1.908,1.337,0.981,0.896
3,ElasticNet,1.904,1.333,0.979,0.898
4,MLP Regressor,1.855,0.907,0.91,0.732


### Wnioski:
- Funkcja straty MAE jest bardziej odporna na dane odstające (Hubber Regressor, SGD Regressor).
- Regularyzacja l2 (Ridge) lepiej pomaga zminimaliziwać funkcję straty niż l1 (Lasso).
