# Implementation of various ML regression classes:

## - Penalised Least Squares

## - K Nearest Neighbours Regression

## - Regression Forest

## - Gaussian Process Regression

In [1]:
import numpy as np
import random
import pandas as pd
import os
import gc
import tensorflow.compat.v1 as tf
from matplotlib              import pyplot as plt
from tqdm.notebook           import tqdm
from pathos.multiprocessing  import Pool
from scipy.spatial.distance  import cdist
from scipy.spatial           import cKDTree
from numpy.linalg            import inv
from scipy.linalg            import cho_solve

tf.disable_eager_execution()

In [2]:
import sys

# Add the parent directory to the module search path (presumably where the
# ML_implementations package lives -- confirm against the repository layout).
# Guarded so that re-running this cell does not append duplicate entries.
if '../' not in sys.path:
    sys.path.append('../')

In [3]:
# NOTE(review): the wildcard import is presumably what provides Regression,
# K_Neighbours_Regressor, Random_Forest, Gaussian_Process and tfGP used in the
# cells below -- consider importing those names explicitly once confirmed.
from ML_implementations.regression import *

We will work with the Heating Load dataset:

In [4]:
# Read the raw splits. pd.read_csv opens and closes the files itself, so no
# surrounding open() is needed (the handle was previously opened but never used).
train = pd.read_csv("ee-train.csv", low_memory=False)
test = pd.read_csv("ee-test.csv", low_memory=False)

# Column statistics come from the training split only and are reused for the
# test split. np.std defaults to ddof=0 (population standard deviation).
train_means = np.mean(train, axis=0)
train_std = np.std(train, axis=0)

# Standardise every column, then restore the target to its original scale so
# that errors are reported in the units of 'Heating Load'.
train_norm = (train - train_means) / train_std
train_norm.loc[:, 'Heating Load'] = train.loc[:, 'Heating Load']
test_norm = (test - train_means) / train_std
test_norm.loc[:, 'Heating Load'] = test.loc[:, 'Heating Load']

# n_features counts every column of the frame, i.e. the inputs plus the target.
n_features = train_norm.shape[1]

# The last column is used as the target; all preceding columns are inputs.
x_train = train_norm.iloc[:, :n_features - 1].values
y_train = train_norm.iloc[:, n_features - 1].values
x_test = test_norm.iloc[:, :n_features - 1].values
y_test = test_norm.iloc[:, n_features - 1].values

# Only the test targets are turned into a column vector here; y_train stays 1-D.
y_test = y_test.reshape(-1, 1)

### Penalised Least Squares

Generic PLS; a more involved implementation can be found in the sampling notebook.

In [5]:
# Fit penalised least squares with an intercept, then score it on the test set.
# The third positional fit argument (0.01) is presumably the penalty strength
# -- TODO confirm against Regression.fit.
pls_fitted = Regression(intercept=True).fit(x_train, y_train, 0.01)
regression_error = pls_fitted.rmse(x_test, y_test)

In [6]:
# Report the test-set RMSE obtained by the penalised least squares model.
print('PLS RMSE:', regression_error)

PLS RMSE: 2.8435523716042383


### KNN Regression

In [7]:
# Fit a 10-nearest-neighbour regressor (p=2, weight='distance') and score it
# on the test set.
knn_fitted = K_Neighbours_Regressor(k=10, p=2, weight='distance').fit(x_train, y_train)
KNN_error = knn_fitted.rmse(x_test, y_test)

In [8]:
# Display the test-set RMSE of the KNN regressor.
KNN_error

2.388251366284861

### Regression Forest

In [9]:
# Reload and re-standardise the data so this section starts from fresh arrays.
# pd.read_csv opens and closes the files itself, so no surrounding open() is
# needed (the handle was previously opened but never used).
train = pd.read_csv("ee-train.csv", low_memory=False)
test = pd.read_csv("ee-test.csv", low_memory=False)

# Column statistics come from the training split only and are reused for the
# test split. np.std defaults to ddof=0 (population standard deviation).
train_means = np.mean(train, axis=0)
train_std = np.std(train, axis=0)

# Standardise every column, then restore the target to its original scale so
# that errors are reported in the units of 'Heating Load'.
train_norm = (train - train_means) / train_std
train_norm.loc[:, 'Heating Load'] = train.loc[:, 'Heating Load']
test_norm = (test - train_means) / train_std
test_norm.loc[:, 'Heating Load'] = test.loc[:, 'Heating Load']

# n_features counts every column of the frame, i.e. the inputs plus the target.
n_features = train_norm.shape[1]

# The last column is used as the target; all preceding columns are inputs.
x_train = train_norm.iloc[:, :n_features - 1].values
y_train = train_norm.iloc[:, n_features - 1].values
x_test = test_norm.iloc[:, :n_features - 1].values
y_test = test_norm.iloc[:, n_features - 1].values

# Only the test targets are turned into a column vector here; y_train stays 1-D.
y_test = y_test.reshape(-1, 1)

In [10]:
# Forest hyper-parameters gathered in one place so they are easy to read and
# tweak; they are forwarded to Random_Forest unchanged and in the same order.
rf_settings = dict(
    n_trees=160,
    max_depth=30,
    min_node_count=3,
    bag_ratio=1,
    feature_ratio=0.8,
    bins=200,
    step=2,
    weight='OOB',
    threads=16,
)

# Fit the forest on the training split and score it on the test split.
rf_fitted = Random_Forest(**rf_settings).fit(x_train, y_train)
RF_error = rf_fitted.rmse(x_test, y_test)

In [11]:
# Display the test-set RMSE of the regression forest.
RF_error

1.6445340492313083

### Gaussian Process - Numpy

In [12]:
# Reload and re-standardise the data so this section starts from fresh arrays.
# pd.read_csv opens and closes the files itself, so no surrounding open() is
# needed (the handle was previously opened but never used).
train = pd.read_csv("ee-train.csv", low_memory=False)
test = pd.read_csv("ee-test.csv", low_memory=False)

# Column statistics come from the training split only and are reused for the
# test split. np.std defaults to ddof=0 (population standard deviation).
train_means = np.mean(train, axis=0)
train_std = np.std(train, axis=0)

# Standardise every column, then restore the target to its original scale so
# that errors are reported in the units of 'Heating Load'.
train_norm = (train - train_means) / train_std
train_norm.loc[:, 'Heating Load'] = train.loc[:, 'Heating Load']
test_norm = (test - train_means) / train_std
test_norm.loc[:, 'Heating Load'] = test.loc[:, 'Heating Load']

# n_features counts every column of the frame, i.e. the inputs plus the target.
n_features = train_norm.shape[1]

# The last column is used as the target; all preceding columns are inputs.
x_train = train_norm.iloc[:, :n_features - 1].values
y_train = train_norm.iloc[:, n_features - 1].values
x_test = test_norm.iloc[:, :n_features - 1].values
y_test = test_norm.iloc[:, n_features - 1].values

# Unlike the earlier sections, both target vectors become (n, 1) columns here.
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

In [13]:
# Optimiser settings forwarded to Gaussian_Process.fit as keyword arguments.
# NOTE(review): alpha is presumably a step size and prior_c a prior constant
# -- confirm against the Gaussian_Process implementation.
gp_fit_options = dict(alpha=0.9, epochs=100, momentum=0.4, prior_c=1)

# Fit the numpy Gaussian process and score it on the test split.
gp_fitted = Gaussian_Process().fit(x_train, y_train, **gp_fit_options)
GP_error = gp_fitted.rmse(x_test, y_test)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

In [14]:
# Display the test-set RMSE of the numpy Gaussian process.
GP_error

0.645451981697637

### Gaussian Process - Tensorflow

In [15]:
# Build the TensorFlow Gaussian process. NOTE(review): variable_l=True
# presumably enables one lengthscale per input feature (the fit output lists
# eight lengthscales) -- confirm against tfGP.
gp = tfGP(variable_l = True)

In [16]:
# Fit the TensorFlow GP on the training split. NOTE(review): [1,1,1] are
# presumably the initial noise variance, signal variance and lengthscale
# (the quantities reported after optimisation) -- confirm against tfGP.fit.
# Kept as the cell's last expression, so the returned object's repr is shown.
gp.fit(x_train, y_train, [1,1,1])

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))

  opt iter     0: objective = 1419.9030516529642
  opt iter   200: objective = 264.10011235455283
  opt iter   400: objective = 92.37950501095204
  opt iter   600: objective = 55.71067397509729
  opt iter   800: objective = 34.46311454839803
  opt iter  1000: objective = 23.198379732614075
  opt iter  1200: objective = 16.8541893325557
  opt iter  1400: objective = 13.235618694157104
  opt iter  1600: objective = 11.223911401499564
  opt iter  1800: objective = 10.129902796455344
Noise Variance: 0.02419063503910851
Signal Variance: 19.831821770378685
Lengthscale: [ 2.33440356  3.77649718  0.98039842  3.30398894  2.59707607 66.05116942
  2.19121134 44.64825631]


<ML_implementations.regression.regression.tfGP at 0x7fa9c071d5b0>

In [17]:
# Predict on the test inputs; the call returns two values, unpacked here as
# `mu` and `var` (presumably the predictive mean and variance -- confirm).
mu, var = gp.predict(x_test)

In [18]:
# RMSE of the TensorFlow GP predictions on the test set. Both arrays are
# flattened first: y_test is an (n, 1) column, so a 1-D `mu` would otherwise
# broadcast the difference to an (n, n) matrix and silently distort the error.
# When the shapes already match, the value is unchanged.
tfGP_error = np.sqrt(np.mean(np.square(np.ravel(mu) - np.ravel(y_test))))
tfGP_error

0.4640013406404691

In [20]:
# Release the resources held by the GP's `sess` attribute (presumably its
# TensorFlow session -- confirm against tfGP).
gp.sess.close()