## Initialization

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

In [3]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [4]:
mses = {'LinReg' : 0, 'SVR': 0, 'DTR':0, 'RFR':0, 'KNNR':0}
r2s = {'LinReg' : 0, 'SVR': 0, 'DTR':0, 'RFR':0, 'KNNR':0}

## Importing the Data

In [5]:
dataset = pd.read_excel('Folds5x2_pp.xlsx')
dataset.shape

(9568, 5)

In [6]:
dataset.head()

Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.4,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.5,1009.23,96.62,473.9


## Create X and Y

In [7]:
X = dataset.iloc[:, 0:4].values
Y = dataset.iloc[:, 4].values

In [8]:
X.shape

(9568, 4)

In [9]:
Y.shape

(9568,)

In [10]:
X

array([[   14.96,    41.76,  1024.07,    73.17],
       [   25.18,    62.96,  1020.04,    59.08],
       [    5.11,    39.4 ,  1012.16,    92.14],
       ..., 
       [   31.32,    74.33,  1012.92,    36.48],
       [   24.48,    69.45,  1013.86,    62.39],
       [   21.6 ,    62.52,  1017.23,    67.87]])

In [11]:
Y

array([ 463.26,  444.37,  488.56, ...,  429.57,  435.74,  453.28])

## Preprocess the Data

In [12]:
sc_X = StandardScaler()

In [13]:
X = sc_X.fit_transform(X)

In [14]:
X

array([[-0.6620227 , -1.14640293,  1.79929926,  1.31293526, -0.64480461],
       [-0.59856135, -1.14640293,  1.79929926,  1.31293526, -0.64480461],
       [-0.51923465, -1.14640293,  1.79929926,  1.31293526, -0.64480461],
       ..., 
       [ 0.353359  ,  1.49044302, -0.37373954, -0.72334483,  3.17277251],
       [ 0.67066578,  1.49044302, -0.37373954, -0.72334483,  3.17277251],
       [ 1.08316459,  1.49044302, -0.37373954, -0.72334483,  3.17277251]])

## Create Train and Test Data

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 4)

In [13]:
X_train.shape

(7654, 4)

In [14]:
X_test.shape

(1914, 4)

In [15]:
Y_train.shape

(7654,)

In [16]:
Y_test.shape

(1914,)

## Linear Regression

In [17]:
reg_lin = LinearRegression()

In [18]:
reg_lin.fit(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [19]:
Y_pred_lin = reg_lin.predict(X_test)

## SVR

In [21]:
reg_lsvr = SVR(kernel = 'linear')

In [22]:
reg_lsvr.fit(X_train, Y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [23]:
Y_pred_lsvr = reg_lsvr.predict(X_test)

## Decision Tree Regressor

In [24]:
reg_dtr = DecisionTreeRegressor()

In [25]:
reg_dtr.fit(X_train, Y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [26]:
Y_pred_dtr = reg_dtr.predict(X_test)

## Random Forest Regressor

In [27]:
reg_rfr = RandomForestRegressor(n_estimators=200)

In [28]:
reg_rfr.fit(X_train, Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [29]:
Y_pred_rfr = reg_rfr.predict(X_test)

## KNN Regressor

In [30]:
reg_knnr = KNeighborsRegressor(n_neighbors = 2)

In [31]:
reg_knnr.fit(X_train, Y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=2, p=2,
          weights='uniform')

In [32]:
Y_pred_knnr = reg_knnr.predict(X_test)

## Metrics

In [33]:
mses['LinReg'] = mean_squared_error(Y_pred_lin, Y_test)
mses['SVR'] = mean_squared_error(Y_pred_lsvr, Y_test)
mses['DTR'] = mean_squared_error(Y_pred_dtr, Y_test)
mses['RFR'] = mean_squared_error(Y_pred_rfr, Y_test)
mses['KNNR'] = mean_squared_error(Y_pred_knnr, Y_test)
mses

{'DTR': 20.241854806687559,
 'KNNR': 17.262467972831768,
 'LinReg': 21.569811681279333,
 'RFR': 11.319187647284467,
 'SVR': 21.797784262707001}

In [34]:
r2s['LinReg'] = r2_score(Y_pred_lin, Y_test)
r2s['SVR'] = r2_score(Y_pred_lsvr, Y_test)
r2s['DTR'] = r2_score(Y_pred_dtr, Y_test)
r2s['RFR'] = r2_score(Y_pred_rfr, Y_test)
r2s['KNNR'] = r2_score(Y_pred_knnr, Y_test)
r2s

{'DTR': 0.93077056432509675,
 'KNNR': 0.93841586185440073,
 'LinReg': 0.91987573484773588,
 'RFR': 0.95908236689846738,
 'SVR': 0.92198483424593902}