# Regression on Friedman Data

In [1]:
from sklearn.ensemble import IsolationForest
import pandas as pd
import numpy as np
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import chart_studio.plotly as py
import matplotlib.pyplot as plt
from matplotlib import pyplot
import plotly.graph_objs as go
import warnings
from scipy import stats
from sklearn.datasets import make_friedman1
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error
import seaborn as sns
import math
import sklearn.gaussian_process as gp
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

In [2]:
# function that accepts n by 5 matrix of inputs and computes friedman output for all n rows of inputs

def make_friedman(X):
    n = len(X[:,0])
    toReturn = np.zeros((n, 1))
    
    for i in range(0, n):
        toReturn[i, 0] += 10 * math.sin(math.pi * X[i,0] * X[i,1]) + 20 * (X[i,2] - 0.5) ** 2 + 10 * X[i,3] + 5 * X[i,4]
    
    return toReturn

In [3]:
# create training data - restrict x0 to 0.0 to 0.5

x0 = np.random.uniform(0.0, 0.5, 1000)
x1 = np.random.uniform(0.0, 1.0, 1000)
x2 = np.random.uniform(0.0, 1.0, 1000)
x3 = np.random.uniform(0.0, 1.0, 1000)
x4 = np.random.uniform(0.0, 1.0, 1000)

x_train = [x0, x1, x2, x3, x4]
x_train = np.transpose(x_train)

y_train = make_friedman(x_train)

x_train

array([[0.35680929, 0.81620163, 0.98047689, 0.84088021, 0.635122  ],
       [0.21051139, 0.07630007, 0.33126067, 0.32723243, 0.55877422],
       [0.18337613, 0.11182609, 0.1688338 , 0.2820492 , 0.33987094],
       ...,
       [0.10131272, 0.24813646, 0.39334723, 0.41538346, 0.30573371],
       [0.23413541, 0.89204488, 0.1278217 , 0.84279465, 0.36568543],
       [0.01168544, 0.21812812, 0.4627226 , 0.26562261, 0.55155462]])

In [4]:
# create testing data - restrict x0 to 0.5 to 1.0

x0 = np.random.uniform(0.5, 1.0, 1000)
x1 = np.random.uniform(0.0, 1.0, 1000)
x2 = np.random.uniform(0.0, 1.0, 1000)
x3 = np.random.uniform(0.0, 1.0, 1000)
x4 = np.random.uniform(0.0, 1.0, 1000)

x_test = [x0, x1, x2, x3, x4]
x_test = np.transpose(x_test)

df_test = pd.DataFrame(x_test, columns = ['x0','x1','x2','x3','x4'])

x_test

array([[0.82917831, 0.63794681, 0.19706139, 0.03654425, 0.60044076],
       [0.54705393, 0.55345232, 0.99216565, 0.41283255, 0.34212711],
       [0.83085405, 0.04183026, 0.08188682, 0.97472818, 0.18437896],
       ...,
       [0.99081762, 0.70614616, 0.83853906, 0.3101949 , 0.97401982],
       [0.6552142 , 0.83183145, 0.78520989, 0.50393883, 0.44391691],
       [0.68501379, 0.50893665, 0.9899373 , 0.89776693, 0.22054401]])

In [5]:
# train Support Vector Regression model to predict output with inputs

SVR_model = SVR(kernel='rbf')
SVR_model.fit(x_train,y_train.ravel())

SVR()

In [6]:
# train MLP model to predict output with inputs

MLP_model = MLPRegressor(hidden_layer_sizes=(64,64,64),activation="relu" ,random_state=1, max_iter=2000)
MLP_model.fit(x_train, y_train.ravel())

MLPRegressor(hidden_layer_sizes=(64, 64, 64), max_iter=2000, random_state=1)

In [7]:
# train Gaussian Process Regressor model to predict output with inputs

kernel = gp.kernels.ConstantKernel(1.0, (1e-1, 1e3)) * gp.kernels.RBF(10.0, (1e-3, 1e3))
GPR_model = gp.GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, alpha=0.1, normalize_y=True)
GPR_model.fit(x_train, y_train)

GaussianProcessRegressor(alpha=0.1, kernel=1**2 * RBF(length_scale=10),
                         n_restarts_optimizer=10, normalize_y=True)

In [8]:
# train RF model to predict output with inputs

RF_model = RandomForestRegressor(n_estimators = 500)
RF_model.fit(x_train,y_train.ravel())

RandomForestRegressor(n_estimators=500)

In [9]:
# predict using models and and evalute prediction using root mean squared error

SVR_rmse = np.sqrt(mean_squared_error(make_friedman(x_test), SVR_model.predict(x_test)))
print("SVR model root mean squared error: " + str(SVR_rmse))

MLP_rmse = np.sqrt(mean_squared_error(make_friedman(x_test), MLP_model.predict(x_test)))
print("MLP model root mean squared error: " + str(MLP_rmse))

GPR_rmse = np.sqrt(mean_squared_error(make_friedman(x_test), GPR_model.predict(x_test)))
print("GPR model root mean squared error: " + str(GPR_rmse))

RF_rmse = np.sqrt(mean_squared_error(make_friedman(x_test), RF_model.predict(x_test)))
print("RF model root mean squared error: " + str(RF_rmse))

SVR model root mean squared error: 2.28576805289214
MLP model root mean squared error: 3.321749909137672
GPR model root mean squared error: 3.854105023178912
RF model root mean squared error: 2.6949087267028005


## Resources Used
* https://arxiv.org/pdf/1610.02995.pdf