# ML Lab (CS360)
## Assignment 3

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
class Model():
  """Base class for linear models that predict ``X . weights + bias``.

  Sub-classes are expected to provide a ``fit`` method that sets
  ``_weights`` and ``_bias``; this class only implements prediction and
  scoring.
  """

  def __init__(self, p):
    """Create an unfitted model for samples with ``p`` features."""
    # Number of features every sample must have.
    self._features = p
    # Intercept term; starts at 1 until a sub-class fits it.
    self._bias = 1
    # One coefficient per feature, stored as a (p, 1) column and drawn from
    # numpy's default uniform distribution.
    self._weights = np.random.uniform(size=(p, 1))

  def predict(self, X):
    """Return the model's predictions for ``X`` as a 1-D array.

    Accepted inputs:
      * a scalar            -> one sample (single-feature models only);
      * a 1-D sequence      -> ``n`` samples when the model has one feature,
                               otherwise one sample with ``p`` features;
      * a 2-D array (n, p)  -> ``n`` samples.

    Raises:
      ValueError: if the input's shape does not match the number of features.
    """
    X = np.asarray(X)
    # Sub-classes store the weights either as (p,) or (p, 1); flatten so the
    # dot product below always yields one value per sample.
    weights = np.reshape(self._weights, -1)

    if X.ndim == 0:
      if self._features > 1:
        raise ValueError(f"Number of features of sample must be {self._features}")
      samples = X.reshape(1, 1)

    elif X.ndim == 1:
      if self._features == 1:
        # Single-feature model: each entry is its own sample.
        samples = X.reshape(-1, 1)
      elif X.shape[0] == self._features:
        # Multi-feature model: the vector is exactly one sample.
        samples = X.reshape(1, -1)
      else:
        raise ValueError(f"Number of features of sample must be {self._features}")

    elif X.ndim == 2:
      if X.shape[1] != self._features:
        raise ValueError(f"Shape of input array must be (Number of samples, {self._features})")
      samples = X

    else:
      raise ValueError(f"Too many dimensions in input array: {X.ndim}")

    return np.reshape(np.dot(samples, weights) + self._bias, -1)

  def score(self, X, y):
    """Return the mean absolute error between ``predict(X)`` and ``y``.

    Note that this is an error measure, so lower values are better.
    """
    y_preds = self.predict(X)
    y = np.asarray(y)
    assert y_preds.shape == y.shape, "y must be 1-D with one target per sample"
    return np.mean(np.absolute(np.subtract(y, y_preds)))

class OLS_LinearRegression(Model):
  """Linear regression fitted in closed form (ordinary least squares)."""

  def __init__(self, features):
    """Create an unfitted model for samples with ``features`` features."""
    Model.__init__(self, features)

  def fit(self, X, y):
    """Fit the bias and weights with the normal equations.

    Args:
      X: array-like of shape (n_samples, n_features); a 1-D sequence is
         treated as ``n_samples`` values of a single feature.
      y: array-like holding one target per sample.

    Raises:
      AssertionError: if the shapes of ``X`` and ``y`` are inconsistent with
        each other or with the number of features, or if there are not more
        samples than features.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
      # Only a flat sequence is a single-feature column; nested lists are
      # already (n_samples, n_features) and must keep their shape.
      X = X.reshape(-1, 1)
    # Flatten so the solution is a 1-D vector and the bias a plain scalar.
    y = np.asarray(y, dtype=float).reshape(-1)

    assert X.ndim == 2, "X must be 1-D or 2-D"
    assert X.shape[0] > self._features, "need more samples than features"
    assert X.shape[1] == self._features, "X has the wrong number of features"
    assert X.shape[0] == y.shape[0], "X and y must have the same number of samples"

    # Prepend a column of ones so the first coefficient is the intercept.
    design = np.c_[np.ones(X.shape[0]), X]

    # pinv rather than inv so collinear features do not raise.
    gram_pinv = np.linalg.pinv(np.dot(design.T, design))
    coefficients = np.dot(np.dot(gram_pinv, design.T), y)

    self._bias = float(coefficients[0])
    # Keep the (p, 1) column layout that Model.__init__ uses.
    self._weights = coefficients[1:].reshape(-1, 1)


class GD_LinearRegression(Model):
  """Linear regression fitted with batch gradient descent on half the MSE."""

  def __init__(self, features, learning_rate=1e-4, tolerance=1e-2, max_iter=None):
    """Create an unfitted model.

    Args:
      features: number of features per sample.
      learning_rate: step size applied to each gradient update.
      tolerance: training stops once the cost changes by less than this
        amount between two consecutive updates.
      max_iter: optional cap on the number of updates; ``None`` (default)
        means run until the tolerance criterion is met.
    """
    self._learning_rate = learning_rate
    self._tolerance = tolerance
    self._max_iter = max_iter
    Model.__init__(self, features)

  def fit(self, X, y):
    """Run gradient descent until the cost stops changing.

    Args:
      X: array-like of shape (n_samples, n_features); a 1-D sequence is
         treated as ``n_samples`` values of a single feature.
      y: array-like holding one target per sample.

    Raises:
      ValueError: if ``X`` is empty or its shape does not match ``y`` or the
        number of features.
      FloatingPointError: if the cost becomes non-finite (divergence).
    """
    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
      X = X.reshape(-1, 1)
    y = np.asarray(y, dtype=float).reshape(-1)

    if X.ndim != 2 or X.shape[1] != self._features:
      raise ValueError(f"Shape of input array must be (Number of samples, {self._features})")
    if X.shape[0] == 0 or X.shape[0] != y.shape[0]:
      raise ValueError("X and y must contain the same, non-zero number of samples")

    # Work on float copies: the initial bias is the int 1 and the weights may
    # have been stored flat, both of which would break the updates below.
    self._weights = np.array(self._weights, dtype=float).reshape(-1, 1)
    self._bias = float(np.reshape(self._bias, -1)[0])

    iteration = 0
    while self._max_iter is None or iteration < self._max_iter:
      db, dw = self.__derivatives(X, y)
      if not self.__update(db, dw, X, y):
        break
      iteration += 1

  def __cost_function(self, X, y):
    """Return half the mean squared error of the current parameters."""
    m = X.shape[0] # number of samples
    squared_error = np.sum(np.square(self.predict(X) - y)) / 2
    return squared_error / m

  def __derivatives(self, X, y):
    """Return the cost gradient w.r.t. the bias (scalar) and weights (p, 1)."""
    m = X.shape[0] # number of samples

    error = np.reshape(np.subtract(self.predict(X), y), (-1, 1))

    # d(cost)/d(bias) is the mean residual; it must not be scaled by the
    # current bias, otherwise the bias can never move away from zero.
    del_b = np.sum(error) / m
    del_w = np.reshape(np.sum(error * X, axis=0) / m, (-1, 1))

    return del_b, del_w

  def __update(self, db, dw, X, y):
    """Apply one gradient step; return True while training should continue."""
    J = self.__cost_function(X, y)

    self._bias = self._bias - self._learning_rate * db
    self._weights = self._weights - self._learning_rate * dw

    J_new = self.__cost_function(X, y)

    # A NaN/inf cost never satisfies the tolerance test, so without this
    # check a diverging run would loop forever.
    if not np.isfinite(J_new):
      raise FloatingPointError("Gradient descent diverged; try a smaller learning rate")

    return not abs(J - J_new) < self._tolerance
  

In [None]:
def split_and_scale(X, y):
  """Split ``X``/``y`` 60/20/20 into train/validation/test and standardise X.

  Both splits use fixed random seeds, so the partition is reproducible.
  The scaler is fitted on the training features only and then applied to
  all three feature sets; the targets are returned unscaled.

  Returns:
    (X_train, y_train, X_validation, y_validation, X_test, y_test)
  """
  # First carve off 60% for training, then halve the remaining 40%.
  X_train, X_holdout, y_train, y_holdout = train_test_split(
      X, y, train_size=0.6, random_state=44)
  X_validation, X_test, y_validation, y_test = train_test_split(
      X_holdout, y_holdout, test_size=0.5, random_state=43)

  # Standardise using statistics learned from the training split alone.
  scaler = StandardScaler()
  scaler.fit(X_train)
  X_train, X_validation, X_test = [
      scaler.transform(features) for features in (X_train, X_validation, X_test)
  ]

  return X_train, y_train, X_validation, y_validation, X_test, y_test

def print_scores(regressor, X_train, y_train, X_validation, y_validation, X_test, y_test):
  """Print ``regressor.score`` for the training, validation and test splits.

  The value printed is whatever ``regressor.score(X, y)`` returns for each
  split, evaluated in the order train, validation, test.
  """
  splits = ((X_train, y_train), (X_validation, y_validation), (X_test, y_test))
  train_accuracy, validation_accuracy, test_accuracy = [
      regressor.score(features, targets) for features, targets in splits
  ]
  print(f"Training Set Accuracy: {train_accuracy}\nCross-Validaton Set Accuracy: {validation_accuracy}\nTest Set Accuracy: {test_accuracy}")

### Simple Linear Regression (One feature)

1. Implement linear regression using (a) the Ordinary Least Squares method and (b) the Gradient Descent algorithm.
Dataset: Swedish Auto Insurance dataset.
i) You need to split the dataset into train (60%), validation (20%), and test (20%).
ii) Print the train, validation, and test accuracy

In [None]:
# Read the insurance CSV: column 'X' is the single feature, column 'Y' the target.
dataset = pd.read_csv('insurance_dataset.csv')

# The models expect a 2-D (n_samples, 1) feature matrix and a 1-D target vector.
X = dataset['X'].to_numpy()[:, None]
y = dataset['Y'].to_numpy()

# Split into training (60%), validation (20%) and test (20%) sets and scale the features.
(X_train, y_train,
 X_validation, y_validation,
 X_test, y_test) = split_and_scale(X, y)

In [None]:
# Fit the closed-form (ordinary least squares) regressor on the training split.
ols_insurance = OLS_LinearRegression(X_train.shape[1])
ols_insurance.fit(X_train, y_train)

# Report the score on each of the three splits.
print_scores(
    ols_insurance,
    X_train, y_train,
    X_validation, y_validation,
    X_test, y_test,
)

Training Set Accuracy: 26.309574984859502
Cross-Validaton Set Accuracy: 26.037363528237435
Test Set Accuracy: 37.1354947272296


In [None]:
# Fit the gradient-descent regressor on the training split.
gd_insurance = GD_LinearRegression(X_train.shape[1], learning_rate=0.001)
gd_insurance.fit(X_train, y_train)

# Report the score on each of the three splits.
print_scores(
    gd_insurance,
    X_train, y_train,
    X_validation, y_validation,
    X_test, y_test,
)

Training Set Accuracy: 26.099019142030517
Cross-Validaton Set Accuracy: 26.65294991852139
Test Set Accuracy: 37.3997186268862


### Multiple Linear Regression

2. Implement linear regression using the Gradient Descent algorithm.
Dataset: Boston house pricing dataset.
i) You need to split the dataset into train (60%), validation (20%), and test (20%).
ii) Print the train, validation, and test accuracy

In [None]:
from sklearn import datasets

# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so on
# current versions the call fails. Fall back to rebuilding the same feature
# matrix and target from the original source, as the scikit-learn deprecation
# notice recommends.
try:
  boston = datasets.load_boston(return_X_y=False)
  X, y = boston.data, boston.target
except (ImportError, AttributeError):
  # NOTE(review): `boston` is not defined on this path — confirm no later
  # cell relies on it. This path also needs network access.
  data_url = "http://lib.stat.cmu.edu/datasets/boston"
  raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
  # Each record spans two physical rows in the raw file: the even rows plus
  # the first two values of the odd rows are the features, and the third
  # value of the odd rows is the target.
  X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
  y = raw_df.values[1::2, 2]

# Split into training (60%), validation (20%) and test (20%) sets and scale the features.
X_train, y_train, X_validation, y_validation, X_test, y_test = split_and_scale(X, y)

# The unsplit arrays are no longer needed.
del X, y

In [None]:
# Fit the gradient-descent regressor on the multi-feature training split.
gd_bostonhousing = GD_LinearRegression(X_train.shape[1], learning_rate=1e-3)
gd_bostonhousing.fit(X_train, y_train)

# Report the score on each of the three splits.
print_scores(
    gd_bostonhousing,
    X_train, y_train,
    X_validation, y_validation,
    X_test, y_test,
)

Training Set Accuracy: 3.6234561106003276
Cross-Validaton Set Accuracy: 3.606224847445148
Test Set Accuracy: 4.15061633539572
