# Linear Regression with Implementation

## Introduction

## Linear Regression Model

[To be continued.]

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import random
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import sys
sys.path.append('../numpy/')

from metrics import mean_squared_error

np.random.seed(71)

In [2]:
%load_ext autoreload
%autoreload 2

## California Housing Dataset and Preprocessing

In [3]:
import sklearn
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Read California housing data.
housing = fetch_california_housing()
X = housing.data
y = housing.target

In [5]:
X.shape, y.shape

((20640, 8), (20640,))

In [6]:
print(housing.feature_names)
X[:3]

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


array([[ 8.32520000e+00,  4.10000000e+01,  6.98412698e+00,
         1.02380952e+00,  3.22000000e+02,  2.55555556e+00,
         3.78800000e+01, -1.22230000e+02],
       [ 8.30140000e+00,  2.10000000e+01,  6.23813708e+00,
         9.71880492e-01,  2.40100000e+03,  2.10984183e+00,
         3.78600000e+01, -1.22220000e+02],
       [ 7.25740000e+00,  5.20000000e+01,  8.28813559e+00,
         1.07344633e+00,  4.96000000e+02,  2.80225989e+00,
         3.78500000e+01, -1.22240000e+02]])

In [7]:
y[:3]

array([4.526, 3.585, 3.521])

In [8]:
# Split data into training and test datasets.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=71, shuffle=True)

In [9]:
print(X_train_raw.shape, y_train.shape)
print(X_test_raw.shape, y_test.shape)

(15480, 8) (15480,)
(5160, 8) (5160,)


In [10]:
# Feature engineering for standardizing features by min-max scaler.
min_max_scaler = MinMaxScaler()

X_train = min_max_scaler.fit_transform(X_train_raw)
X_test = min_max_scaler.transform(X_test_raw)

In [11]:
# Convert arrays to float32.
X_train, X_test, y_train, y_test = (
    np.float32(X_train), np.float32(X_test), np.float32(y_train), np.float32(y_test))

In [12]:
X_train.dtype, y_train.dtype

(dtype('float32'), dtype('float32'))

In [None]:
def get_data(self, X_train, y_train, shuffle=True):
    """Get dataset and information."""
    self.X_train = X_train
    self.y_train = y_train

    # Get the numbers of examples and inputs.
    self.n_examples, self.input_size = self.X_train.shape

    if shuffle:
        idx = list(range(self.n_examples))
        random.shuffle(idx)
        self.X_train = self.X_train[idx]
        self.y_train = self.y_train[idx]

def _fetch_batch(self):
    """Fetch batch dataset."""
    idx = list(range(self.n_examples))
    for i in range(0, self.n_examples, self.batch_size):
        idx_batch = idx[i:min(i + self.batch_size, self.n_examples)]
        yield (self.X_train.take(idx_batch, axis=0), 
                self.y_train.take(idx_batch, axis=0))

## PyTorch Implementation of Linear Regression

In [None]:
class LinearRegression(nn.Module):
    """PyTorch implementation of Linear Regression."""

    def __init__(self, input_size, lr):
        super(LinearRegression, self).__init__()

        self.input_size = input_size
        self.lr = lr

        self.fc = nn.Linear(self.input_size, 1)

    def forward(self, x):
        """Foward to output model."""
        x = self.fc(x)
        return x

    def set_loss_optimizer(self):
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.RMSprop(self.parameters(), lr=self.lr)

    def get_coeff(self):
        """Get model coefficients."""
        # Detach var which require grad.
        return (self.fc.bias.detach().numpy(),
                self.fc.weight.detach().numpy())

In [None]:
def train(self):
    """Train model."""
    self.train()

    for epoch in range(1, self.n_epochs + 1):
        total_loss = 0
        for X_train_b, y_train_b in self._fetch_batch():
            # Convert to Tensor from NumPy array and reshape ys.
            X_train_b, y_train_b = (
                torch.from_numpy(X_train_b), 
                torch.from_numpy(y_train_b).view(-1, 1))

            y_pred_b = self.forward(X_train_b)
            loss = self.criterion(y_pred_b, y_train_b)
            total_loss += loss * X_train_b.shape[0]

            # Zero grads, performs backward pass, and update weights.
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        if epoch % 100 == 0:
            print('Epoch {0}: training loss: {1}'
                    .format(epoch, total_loss / self.n_examples))


## Fitting Linear Regression in PyTorch

In [None]:
# Fit our Linear Regression.
linreg = LinearRegression(batch_size=64, lr=0.1, n_epochs=1000)

NameError: ignored

In [None]:
linreg_torch.get_data(X_train, y_train, shuffle=True)

NameError: ignored

In [None]:
linreg_torch.build()

In [None]:
linreg_torch.fit()

In [None]:
# Get coefficient.
linreg_torch.get_coeff()

In [None]:
# Predicted response for training data.
y_train_ = linreg_torch.predict(X_train)
y_train_[:10]

In [None]:
# Prediction squared error for training data.
mean_squared_error(y_train_, y_train)

In [None]:
# Predicted response for test data.
y_test_ = linreg_torch.predict(X_test)
y_test_[:10]

In [None]:
# Prediction accuracy for test data.
mean_squared_error(y_test_, y_test)

## Benchmark with Sklearn's Linear Regression

In [None]:
# Fit sklearn's Logistic Regression.
from sklearn.linear_model import LinearRegression as LinearRegressionSklearn

linreg_sk = LinearRegressionSklearn()

linreg_sk.fit(X_train, y_train.reshape(y_train.shape[0]))

LinearRegression()

In [None]:
# Get coefficients.
linreg_sk.intercept_, linreg_sk.coef_

(3.6417923,
 array([  6.348496  ,   0.5144263 , -14.455919  ,  21.595474  ,
         -0.04895439,  -4.965696  ,  -3.9162228 ,  -4.3132935 ],
       dtype=float32))

In [None]:
# Predicted labels for training data.
y_train_ = linreg_sk.predict(X_train)
y_train_[:10]

array([1.5379176, 1.472091 , 2.2133121, 3.8295603, 3.0244732, 1.9933348,
       2.263915 , 1.0535035, 1.0954115, 1.9086264], dtype=float32)

In [None]:
# Prediction squared error for training data.
mean_squared_error(y_train_, y_train)

0.51953274

In [None]:
# Predicted labels for test data.
y_test_ = linreg_sk.predict(X_test)
y_test_[:10]

array([1.75787538, 2.8031482 , 2.30476246, 2.80146927, 2.87024621,
       1.75832087, 2.11390826, 2.71989601, 2.10377988, 1.68258784])

In [None]:
# # Prediction accuracy for test data.
mean_squared_error(y_test_, y_test)

0.5393498488643094