# Linear regression with iterative optimisation (`SGDRegressor`)

It gives control over optimisation through a number of hyperparameters.

In [None]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import SGDRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import validation_curve

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Seed NumPy's global RNG so that estimators created without an explicit
# random_state still behave reproducibly when the notebook is run top to bottom.
np.random.seed(306)

# Shared cross-validation splitter: 10 random shuffles, each holding out 20%.
shuffle_split_cv = ShuffleSplit(
    n_splits=10,
    test_size=0.2,
    random_state=0,
)

Split into test and train

In [None]:
# Load the California housing data as pandas objects (features, target).
features, labels = fetch_california_housing(return_X_y=True, as_frame=True)

# Hold out a test set; what remains is the combined train+dev pool.
(
    com_train_features,
    test_features,
    com_train_labels,
    test_labels,
) = train_test_split(features, labels, random_state=42)

Split into train and dev

In [None]:
# Split the combined pool once more into the actual training set and a
# development (validation) set used for model comparison.
(
    train_features,
    dev_features,
    train_labels,
    dev_labels,
) = train_test_split(com_train_features, com_train_labels, random_state=42)

## Baseline `SGDRegressor`

* STEP 1: Instantiate baseline `SGDRegressor` with default parameters.
* STEP 2: Train the model with training feature matrix and labels.
* STEP 3: Obtain the score on the train and dev data

In [None]:
# Baseline: SGD with default hyperparameters, trained on the raw
# (unscaled) features. `fit` returns the estimator itself.
sgd = SGDRegressor(random_state=42).fit(train_features, train_labels)

# Mean absolute error on the data the model saw and on the held-out dev set.
train_mae = mean_absolute_error(y_true=train_labels,
                                y_pred=sgd.predict(train_features))
dev_mae = mean_absolute_error(y_true=dev_labels,
                              y_pred=sgd.predict(dev_features))

print("Mean Absolute Error on Training set: ", train_mae)
print("Mean Absolute Error on development set: ", dev_mae)

Errors are too large!!!

## Adding a feature scaling step

SGD is sensitive to feature scaling.

In [None]:
# Standardise the features before SGD; the pipeline ensures the scaler is
# fitted on the training data only and re-applied at prediction time.
sgd_pipeline = Pipeline(
    steps=[
        ("feature_scaling", StandardScaler()),
        ("sgd", SGDRegressor()),
    ]
).fit(train_features, train_labels)

# Mean absolute error on the train and dev splits.
train_mae = mean_absolute_error(y_true=train_labels,
                                y_pred=sgd_pipeline.predict(train_features))
dev_mae = mean_absolute_error(y_true=dev_labels,
                              y_pred=sgd_pipeline.predict(dev_features))

print("Mean Absolute Error on Training set: ", train_mae)
print("Mean Absolute Error on development set: ", dev_mae)

## Step-wise training of SGDRegressor

* STEP 1: Instantiate `SGDRegressor` with `warm_start=True` and `tol=-np.infty`
* STEP 2: Train SGD step by step and record regression loss in each step.
* STEP 3: Plot learning curve and see if there are any issues in training.

In [None]:
# Step-wise training: every `fit` call runs exactly one epoch (max_iter=1) and,
# because warm_start=True, continues from the weights of the previous call.
# This lets us record the training loss after each epoch.
eta0 = 1e-2
sgd_pipeline = Pipeline([("feature_scaling", StandardScaler()),
                        ("sgd", SGDRegressor(max_iter=1,
                                             # A tolerance of -inf disables the tol-based
                                             # stopping criterion. `np.inf` is used because the
                                             # `np.infty` alias was removed in NumPy 2.0.
                                             tol=-np.inf,
                                             warm_start=True,
                                             # Pass the rate explicitly so the value shown in
                                             # the plot title is the one actually used.
                                             eta0=eta0,
                                             random_state=42))])

# Training MSE after each epoch.
loss = []

for epoch in range(100):
    sgd_pipeline.fit(train_features, train_labels)
    loss.append(mean_squared_error(train_labels, sgd_pipeline.predict(train_features)))

# Learning curve: training MSE against epoch index.
plt.plot(np.arange(len(loss)), loss, 'b-')
plt.xlabel('Iteration #')
plt.ylabel('MSE')
plt.title(f'Learning curve: eta0={eta0: .4f}')

The loss reduced initially and then increased. This could be due to a learning rate that is too large. We will reduce the learning rate by a factor of 10 and repeat the process.

In [None]:
# Repeat the step-wise training with a 10x smaller initial learning rate.
# Each `fit` call runs one epoch (max_iter=1) and resumes from the previous
# weights (warm_start=True).
eta0 = 1e-3
sgd_pipeline = Pipeline([("feature_scaling", StandardScaler()),
                        ("sgd", SGDRegressor(max_iter=1,
                                             # A tolerance of -inf disables the tol-based
                                             # stopping criterion. `np.inf` is used because the
                                             # `np.infty` alias was removed in NumPy 2.0.
                                             tol=-np.inf,
                                             warm_start=True, eta0=eta0,
                                             random_state=42))])

# Training MSE after each epoch.
loss = []

for epoch in range(100):
    sgd_pipeline.fit(train_features, train_labels)
    loss.append(mean_squared_error(train_labels, sgd_pipeline.predict(train_features)))

# Learning curve: training MSE against epoch index.
plt.plot(np.arange(len(loss)), loss, 'b-')
plt.xlabel('Iteration #')
plt.ylabel('MSE')
plt.title(f'Learning curve: eta0={eta0: .4f}')

This is an ideal learning curve, where the training loss reduces monotonically as training progresses.


In [None]:
# Inspect the fitted SGD step of the pipeline: epochs run by the last `fit`
# call (n_iter_) and the weight-update counter (t_).
print('# iteration before reaching convergence criteria: ',
      sgd_pipeline.named_steps["sgd"].n_iter_)

print("#Weight updated:", sgd_pipeline.named_steps["sgd"].t_)

In [None]:
# Mean absolute error of the step-wise trained pipeline on train and dev splits.
train_mae = mean_absolute_error(y_true=train_labels,
                                y_pred=sgd_pipeline.predict(train_features))
dev_mae = mean_absolute_error(y_true=dev_labels,
                              y_pred=sgd_pipeline.predict(dev_features))

print("Mean Absolute Error on Training set: ", train_mae)
print("Mean Absolute Error on development set: ", dev_mae)

## Fixing learning rate through validation curves

* STEP 1: Provide the list of values to be tried for a hyperparameter.
* STEP 2: Call `validation_curve` with the estimator, training features and labels. Set the `scoring` parameter to a relevant score.
* STEP 3: Convert scores to errors.
* STEP 4: Fix the hyperparameter value where the test error is the least.

In [None]:
%%time

# Candidate initial learning rates, one per order of magnitude.
eta0 = [1e-5, 1e-4, 1e-3, 1e-2]

# NOTE(review): `sgd_pipeline` here is whatever the preceding cells left
# behind (the one-epoch, warm-start pipeline) -- confirm that is intended.
train_scores, test_scores = validation_curve(
    sgd_pipeline,
    com_train_features,
    com_train_labels,
    param_name="sgd__eta0",
    param_range=eta0,
    cv=shuffle_split_cv,
    scoring="neg_mean_squared_error",
    n_jobs=2,
)

# The scorer returns negated MSE; flip the sign to obtain errors.
train_errors = -train_scores
test_errors = -test_scores

In [None]:
# Validation curve: error averaged over the CV splits for each eta0 value.
plt.plot(eta0, train_errors.mean(axis=1), 'b-x', label='Training error')
plt.plot(eta0, test_errors.mean(axis=1), 'r-x', label='Test error')
plt.legend()
plt.xlabel("eta0")
# The errors come from `neg_mean_squared_error` scores, so the axis shows
# mean squared error (not mean absolute error).
plt.ylabel("Mean squared error")
_ = plt.title("Validation curve for SGD")

## `SGDRegressor`

In [None]:
# SGD with early stopping, using the default learning-rate schedule.
sgd_pipeline = Pipeline(
    steps=[
        ("feature_scaling", StandardScaler()),
        (
            "sgd",
            SGDRegressor(
                max_iter=500,
                tol=1e-3,
                eta0=1e-3,
                early_stopping=True,
                validation_fraction=0.2,
                n_iter_no_change=5,
                average=10,
                random_state=42,
            ),
        ),
    ]
).fit(train_features, train_labels)

# Mean absolute error on the train and dev splits.
train_mae = mean_absolute_error(y_true=train_labels,
                                y_pred=sgd_pipeline.predict(train_features))
dev_mae = mean_absolute_error(y_true=dev_labels,
                              y_pred=sgd_pipeline.predict(dev_features))

print("Mean Absolute Error on Training set: ", train_mae)
print("Mean Absolute Error on development set: ", dev_mae)

In [None]:
# Inspect the fitted SGD step: epochs actually run (n_iter_) and the
# weight-update counter (t_).
print('# iteration before reaching convergence criteria: ',
      sgd_pipeline.named_steps["sgd"].n_iter_)

print("#Weight updated:", sgd_pipeline.named_steps["sgd"].t_)

In [None]:
#learning rate changed to constant from inverse scaling(default)
# Same setup as before, but with a constant learning rate instead of the
# default inverse-scaling schedule.
sgd_pipeline = Pipeline(
    steps=[
        ("feature_scaling", StandardScaler()),
        (
            "sgd",
            SGDRegressor(
                learning_rate='constant',
                max_iter=500,
                tol=1e-3,
                eta0=1e-3,
                early_stopping=True,
                validation_fraction=0.2,
                n_iter_no_change=5,
                average=10,
                random_state=42,
            ),
        ),
    ]
).fit(train_features, train_labels)

# Mean absolute error on the train and dev splits.
train_mae = mean_absolute_error(y_true=train_labels,
                                y_pred=sgd_pipeline.predict(train_features))
dev_mae = mean_absolute_error(y_true=dev_labels,
                              y_pred=sgd_pipeline.predict(dev_features))

print("Mean Absolute Error on Training set: ", train_mae)
print("Mean Absolute Error on development set: ", dev_mae)

# Epochs actually run and the weight-update counter of the fitted SGD step.
print('\n# iteration before reaching convergence criteria: ',
      sgd_pipeline.named_steps["sgd"].n_iter_)

print("#Weight updated:", sgd_pipeline.named_steps["sgd"].t_)

In [None]:
#learning rate changed to adaptive from inverse scaling(default)
# Same setup as before, but with the adaptive learning-rate schedule instead
# of the default inverse-scaling schedule.
sgd_pipeline = Pipeline(
    steps=[
        ("feature_scaling", StandardScaler()),
        (
            "sgd",
            SGDRegressor(
                learning_rate='adaptive',
                max_iter=500,
                tol=1e-3,
                eta0=1e-3,
                early_stopping=True,
                validation_fraction=0.2,
                n_iter_no_change=5,
                average=10,
                random_state=42,
            ),
        ),
    ]
).fit(train_features, train_labels)

# Mean absolute error on the train and dev splits.
train_mae = mean_absolute_error(y_true=train_labels,
                                y_pred=sgd_pipeline.predict(train_features))
dev_mae = mean_absolute_error(y_true=dev_labels,
                              y_pred=sgd_pipeline.predict(dev_features))

print("Mean Absolute Error on Training set: ", train_mae)
print("Mean Absolute Error on development set: ", dev_mae)

# Epochs actually run and the weight-update counter of the fitted SGD step.
print('\n# iteration before reaching convergence criteria: ',
      sgd_pipeline.named_steps["sgd"].n_iter_)

print("#Weight updated:", sgd_pipeline.named_steps["sgd"].t_)

## Setting `max_iter`

In [None]:
# Number of epochs needed for roughly 1e6 weight updates over the combined
# training pool. `np.ceil` returns a float, but `SGDRegressor(max_iter=...)`
# requires an integer, so convert explicitly.
max_iter = int(np.ceil(1e6 / com_train_features.shape[0]))
max_iter

In [None]:
# Adaptive-learning-rate pipeline again, now capped at the `max_iter`
# computed in the previous cell instead of a fixed 500 epochs.
sgd_pipeline = Pipeline(
    steps=[
        ("feature_scaling", StandardScaler()),
        (
            "sgd",
            SGDRegressor(
                learning_rate='adaptive',
                max_iter=max_iter,
                tol=1e-3,
                eta0=1e-3,
                early_stopping=True,
                validation_fraction=0.2,
                n_iter_no_change=5,
                average=10,
                random_state=42,
            ),
        ),
    ]
).fit(train_features, train_labels)

# Mean absolute error on the train and dev splits.
train_mae = mean_absolute_error(y_true=train_labels,
                                y_pred=sgd_pipeline.predict(train_features))
dev_mae = mean_absolute_error(y_true=dev_labels,
                              y_pred=sgd_pipeline.predict(dev_features))

print("Mean Absolute Error on Training set: ", train_mae)
print("Mean Absolute Error on development set: ", dev_mae)

# Epochs actually run and the weight-update counter of the fitted SGD step.
print('\n# iteration before reaching convergence criteria: ',
      sgd_pipeline.named_steps["sgd"].n_iter_)

print("#Weight updated:", sgd_pipeline.named_steps["sgd"].t_)