# Linear Regression with Numpy Implementation

## 1. Introduction

## 2. Linear Regression Model

## 3. Numpy Implementation of Linear Regression

In [67]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import random

import numpy as np

np.random.seed(71)

In [81]:
class LinearRegression(object):
    """Numpy implementation of Linear Regression.

    The model is ``y_hat = X w + b``. It is trained with mini-batch
    gradient descent on the mean squared error.

    Typical usage:
        model = LinearRegression(batch_size=64, lr=0.1, n_epochs=1000)
        model.get_dataset(X_train, y_train)
        model.fit()
        y_hat = model.predict(X_test)

    Args:
        batch_size: number of examples used for each gradient step.
        lr: learning rate (step size of each update).
        n_epochs: number of full passes over the training data.
    """
    def __init__(self, batch_size=64, lr=0.01, n_epochs=1000):
        self.batch_size = batch_size
        self.lr = lr
        self.n_epochs = n_epochs

    @staticmethod
    def _as_column(y):
        """Return y with shape (n_examples, -1), i.e. a column for 1-D input.

        Predictions have shape (n_examples, 1). Subtracting a 1-D target
        of shape (n_examples,) from them would broadcast to an
        (n_examples, n_examples) matrix, so targets are always coerced to
        a column before any arithmetic.
        """
        y = np.asarray(y)
        return y.reshape((y.shape[0], -1))

    def get_dataset(self, X_train, y_train, shuffle=True):
        """Get dataset and information.

        Args:
            X_train: 2-D array of shape (n_examples, n_inputs).
            y_train: array of targets with n_examples entries.
            shuffle: if True, reorder the examples once with a random
                permutation drawn from numpy's global random state, so
                that ``np.random.seed`` makes the order reproducible.

        Raises:
            ValueError: if X_train and y_train have different lengths.
        """
        self.X_train = X_train
        self.y_train = y_train

        # Get the numbers of examples and inputs.
        self.n_examples, self.n_inputs = self.X_train.shape

        if len(self.y_train) != self.n_examples:
            raise ValueError(
                'X_train has {0} examples but y_train has {1}'.format(
                    self.n_examples, len(self.y_train)))

        if shuffle:
            # Use numpy's RNG (not the stdlib `random` module) so the seed
            # set through np.random.seed controls the shuffle.
            idx = np.random.permutation(self.n_examples)
            self.X_train = self.X_train[idx]
            self.y_train = self.y_train[idx]

    def _create_weights(self):
        """Create model weights and bias, both initialised to zero.

        w has shape (n_inputs, 1) and b has shape (1, 1).
        """
        self.w = np.zeros((self.n_inputs, 1))
        self.b = np.zeros((1, 1))

    def _model(self, X):
        """Linear regression model: returns X w + b, shape (n_examples, 1)."""
        return np.matmul(X, self.w) + self.b

    def _loss(self, y, y_hat):
        """Mean squared error loss.

        squared_error_loss(y, y_hat)
            = 1/n * sum_{i=1}^n (y_i - y_hat_i)^2

        The element-wise squared errors are also stored in
        ``self.squared_error``.
        """
        y = self._as_column(y)
        y_hat = self._as_column(y_hat)
        self.squared_error = np.square(y - y_hat)
        return np.mean(self.squared_error)

    def _optimize(self, X, y):
        """Take one mini-batch gradient descent step on (X, y).

        The gradients used are those of the half mean squared error,
        1/(2m) * sum (y_hat - y)^2; the factor 2 of the plain MSE gradient
        is absorbed into the learning rate. w and b are updated in place.
        """
        m = X.shape[0]
        y = self._as_column(y)

        residual = self._model(X) - y
        dw = np.matmul(X.T, residual) / m
        db = np.mean(residual)

        self.w -= self.lr * dw
        self.b -= self.lr * db

    def _fetch_batch(self):
        """Yield consecutive (X, y) mini-batches of at most batch_size rows."""
        for start in range(0, self.n_examples, self.batch_size):
            # Slicing clamps at the end of the array, so the last batch is
            # simply shorter when n_examples is not a multiple of batch_size.
            stop = start + self.batch_size
            yield self.X_train[start:stop], self.y_train[start:stop]

    def fit(self):
        """Fit model on the data given to get_dataset().

        Prints the example-weighted average training loss every 100 epochs.
        The loss of each batch is measured right after the update made on
        that batch.

        Returns:
            self, so calls can be chained.
        """
        self._create_weights()

        for epoch in range(self.n_epochs):
            total_loss = 0
            for X_train_b, y_train_b in self._fetch_batch():
                y_train_b = self._as_column(y_train_b)
                self._optimize(X_train_b, y_train_b)
                train_loss = self._loss(y_train_b, self._model(X_train_b))
                # Weight by batch size: the last batch may be smaller.
                total_loss += train_loss * X_train_b.shape[0]

            if epoch % 100 == 0:
                print('epoch {0}: training loss {1}'.format(epoch, total_loss / self.n_examples))

        return self

    def get_coeff(self):
        """Return (bias, weights): bias has shape (1, 1), weights (n_inputs,)."""
        return self.b, self.w.reshape((-1,))

    def predict(self, X):
        """Return predictions for X as an array of shape (n_examples, 1)."""
        return self._model(X)

## 4. Data Preparation and Preprocessing

In [103]:
import sklearn
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression as LinearRegressionSklearn

# https://github.com/bowen0701/machine-learning/blob/master/metrics.py
from metrics import mean_squared_error

In [83]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [84]:
# Read California housing data: X holds the feature matrix and y the
# target vector returned by sklearn's fetch_california_housing().
housing = fetch_california_housing()
X = housing.data
y = housing.target

In [85]:
X.shape, y.shape

((20640, 8), (20640,))

In [86]:
X[:3]

array([[ 8.32520000e+00,  4.10000000e+01,  6.98412698e+00,
         1.02380952e+00,  3.22000000e+02,  2.55555556e+00,
         3.78800000e+01, -1.22230000e+02],
       [ 8.30140000e+00,  2.10000000e+01,  6.23813708e+00,
         9.71880492e-01,  2.40100000e+03,  2.10984183e+00,
         3.78600000e+01, -1.22220000e+02],
       [ 7.25740000e+00,  5.20000000e+01,  8.28813559e+00,
         1.07344633e+00,  4.96000000e+02,  2.80225989e+00,
         3.78500000e+01, -1.22240000e+02]])

In [87]:
y[:3]

array([4.526, 3.585, 3.521])

In [88]:
# Split data into training and test datasets.
# test_size=0.25 holds out a quarter of the rows for testing, and the fixed
# random_state makes the shuffled split reproducible across runs.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=71, shuffle=True)

In [89]:
print(X_train_raw.shape, y_train.shape)
print(X_test_raw.shape, y_test.shape)

(15480, 8) (15480,)
(5160, 8) (5160,)


In [90]:
# Rescale features with a min-max scaler (rescaling, not standardization).
# The scaler is fitted on the training data only and then applied unchanged
# to the test data, so no test-set information leaks into preprocessing.
min_max_scaler = MinMaxScaler()

X_train = min_max_scaler.fit_transform(X_train_raw)
X_test = min_max_scaler.transform(X_test_raw)

## 4. Fitting Linear Regression

In [112]:
# Instantiate our Linear Regression with its training hyperparameters
# (the actual training happens later in fit()).
linreg = LinearRegression(batch_size=64, lr=0.1, n_epochs=1000)

In [113]:
# Attach the training data to the model; shuffle=True reorders the
# examples once before training.
linreg.get_dataset(X_train, y_train, shuffle=True)

In [114]:
linreg.fit()

epoch 0: training loss 1.110757794169854
epoch 100: training loss 0.5296216268873367
epoch 200: training loss 0.5278473827772808
epoch 300: training loss 0.5267872344394765
epoch 400: training loss 0.5259265363110449
epoch 500: training loss 0.5251790545618769
epoch 600: training loss 0.524516683361849
epoch 700: training loss 0.523923481722729
epoch 800: training loss 0.523388285612947
epoch 900: training loss 0.5229027549547198


<__main__.LinearRegression at 0x7fb8455e4080>

In [115]:
# Get coefficient.
linreg.get_coeff()

(array([[3.89356922]]),
 array([ 5.73123657,  0.53439701, -3.33453912,  8.44685202, -0.03089555,
        -4.00668228, -4.15283968, -4.56849703]))

In [116]:
# Predicted response for training data.
y_train_hat = linreg.predict(X_train)
y_train_hat[:3]

array([[1.59413648],
       [1.44672811],
       [2.19580714]])

In [117]:
# Prediction squared error for training data.
mean_squared_error(y_train, y_train_hat)

0.5249569073413634

In [118]:
# Predicted response for test data.
y_test_hat = linreg.predict(X_test)
y_test_hat[:3]

array([[1.71611425],
       [2.87312538],
       [2.32329362]])

In [119]:
# Prediction squared error for test data.
mean_squared_error(y_test, y_test_hat)

0.5457436339086752

## 5. Fitting Sklearn's Linear Regression as Benchmark

In [104]:
# Fit sklearn's Linear Regression on the same scaled training data.
linreg_sk = LinearRegressionSklearn()

linreg_sk.fit(X_train, y_train)

LinearRegression()

In [106]:
# Get coefficients.
linreg_sk.intercept_, linreg_sk.coef_

(3.6417826978434746,
 array([  6.34849599,   0.51442258, -14.45592149,  21.59547904,
         -0.04895467,  -4.96569765,  -3.91622294,  -4.31329812]))

In [108]:
# Predicted response for training data.
y_train_hat = linreg_sk.predict(X_train)
y_train_hat[:3]

array([1.53790266, 1.47207536, 2.21329706])

In [109]:
# Prediction squared error for training data.
mean_squared_error(y_train, y_train_hat)

0.5195327861846003

In [110]:
# Predicted response for test data.
y_test_hat = linreg_sk.predict(X_test)
y_test_hat[:3]

array([1.7578634 , 2.80313448, 2.30474671])

In [111]:
# Prediction squared error for test data.
mean_squared_error(y_test, y_test_hat)

0.5393499137492421