##Upload the Data set from machine

In [1]:
from google.colab import files
uploaded = files.upload()

Saving Data.csv to Data.csv


##Import the libraries needed

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


##Preview the dataset

In [3]:
dataset = pd.read_csv('Data.csv')
dataset

Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.40,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.50,1009.23,96.62,473.90
...,...,...,...,...,...
9563,16.65,49.69,1014.01,91.00,460.03
9564,13.19,39.18,1023.67,66.78,469.62
9565,31.32,74.33,1012.92,36.48,429.57
9566,24.48,69.45,1013.86,62.39,435.74


##We preprocess our data 

Since there is no missing data and no categorical data. We have a clean data

##Split the dataset into X(Features) and y(dependent variable)

In [4]:
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values
#The .values prints it out in numpy array.

In [5]:
X

array([[  14.96,   41.76, 1024.07,   73.17],
       [  25.18,   62.96, 1020.04,   59.08],
       [   5.11,   39.4 , 1012.16,   92.14],
       ...,
       [  31.32,   74.33, 1012.92,   36.48],
       [  24.48,   69.45, 1013.86,   62.39],
       [  21.6 ,   62.52, 1017.23,   67.87]])

In [6]:
y

array([463.26, 444.37, 488.56, ..., 429.57, 435.74, 453.28])

##Split the data into Training and Test Set

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
#test_size of 0.2 is good, since it will use 0.8 to train the rest
#random test of 1 means entropy factor remains constant

In [8]:
X_train

array([[   6.61,   38.91, 1015.77,   92.31],
       [  22.72,   65.61, 1014.64,   70.53],
       [  10.06,   39.61, 1018.22,   70.22],
       ...,
       [  18.59,   39.54, 1008.56,   68.61],
       [  22.96,   45.01, 1012.21,   50.81],
       [  20.5 ,   49.69, 1009.6 ,   70.81]])

In [9]:
y_train

array([484.32, 449.04, 471.86, ..., 462.56, 450.37, 452.94])

In [10]:
X_test

array([[  17.93,   44.63, 1003.54,   87.41],
       [  16.34,   42.28, 1008.08,   52.8 ],
       [  25.97,   70.32, 1007.48,   57.01],
       ...,
       [  10.68,   38.38, 1020.79,   72.33],
       [  34.03,   73.56, 1006.49,   51.07],
       [  15.28,   44.58, 1016.45,   83.07]])

In [11]:
y_test

array([458.96, 463.29, 435.27, ..., 476.22, 440.29, 467.92])

#Multiple Linear Regression 

##We train the train set using multiple linear regression model

In [12]:
from sklearn.linear_model import LinearRegression
regressor_mlr = LinearRegression()
regressor_mlr.fit(X_train, y_train)



LinearRegression()

##We predict the test result

In [13]:
y_pred_mlr = regressor_mlr.predict(X_test)
y_pred_mlr

array([457.25522108, 466.71927366, 440.36694911, ..., 476.40502919,
       424.61609708, 463.91141143])

##We compare to prediction to the y_test

In [14]:
np.set_printoptions(precision=2)
print(np.concatenate((y_pred_mlr.reshape(len(y_pred_mlr),1), y_test.reshape(len(y_test),1)),1))

[[457.26 458.96]
 [466.72 463.29]
 [440.37 435.27]
 ...
 [476.41 476.22]
 [424.62 440.29]
 [463.91 467.92]]


##We check the performance using r squared

The closer to 1 the better

In [15]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred_mlr)

0.9321860060402447

In [63]:
r2_score_mlr = r2_score(y_test, y_pred_mlr)
r2_score_mlr

0.9321860060402447

#Polynomial Regression 

#We tune our X first

In [16]:
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 4)#n=4 is the best tuning 
X_poly = poly_reg.fit_transform(X_train)


##We train the train set using polynomial regression model using the tuned X

In [17]:
from sklearn.linear_model import LinearRegression
regressor_pr = LinearRegression()
regressor_pr.fit(X_poly, y_train)

LinearRegression()

##We predict the test result

We have to tune X_test too

In [18]:
y_pred_pr = regressor_pr.predict(poly_reg.transform(X_test))
y_pred_pr

array([456.08, 462.8 , 438.33, ..., 476.32, 432.68, 464.56])

##We compare to prediction to the y_test

In [19]:
np.set_printoptions(precision=2)
print(np.concatenate((y_pred_pr.reshape(len(y_pred_pr),1), y_test.reshape(len(y_test),1)),1))

[[456.08 458.96]
 [462.8  463.29]
 [438.33 435.27]
 ...
 [476.32 476.22]
 [432.68 440.29]
 [464.56 467.92]]


##We check the performance using r squared

In [20]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred_pr)

0.9447339528200642

In [64]:
r2_score_pr = r2_score(y_test, y_pred_pr)
r2_score_pr

0.9447339528200642

#Decision Tree Regression 

#We train the train set using decision tree regression model

In [21]:
from sklearn.tree import DecisionTreeRegressor
regressor_dtr = DecisionTreeRegressor(random_state = 0)
regressor_dtr.fit(X_train, y_train)

DecisionTreeRegressor(random_state=0)

##We predict the test result

In [22]:
y_pred_dtr = regressor_dtr.predict(X_test)
y_pred_dtr

array([459.65, 462.26, 436.03, ..., 477.18, 432.78, 468.23])

##We compare to prediction to the y_test

In [23]:
np.set_printoptions(precision=2)
print(np.concatenate((y_pred_dtr.reshape(len(y_pred_dtr),1), y_test.reshape(len(y_test),1)),1))

[[459.65 458.96]
 [462.26 463.29]
 [436.03 435.27]
 ...
 [477.18 476.22]
 [432.78 440.29]
 [468.23 467.92]]


##We check the performance using r squared

In [24]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred_dtr)

0.9342783714449767

In [65]:
r2_score_dtr = r2_score(y_test, y_pred_dtr)
r2_score_dtr

0.9342783714449767

#Random Forest Regression 

##We train the train set using random forest regression model

In [25]:
from sklearn.ensemble import RandomForestRegressor
regressor_rfr = RandomForestRegressor(n_estimators = 10, random_state = 0)#n=10
regressor_rfr.fit(X_train, y_train)

RandomForestRegressor(n_estimators=10, random_state=0)

##We predict the test result

In [26]:
y_pred_rfr = regressor_dtr.predict(X_test)
y_pred_rfr

array([459.65, 462.26, 436.03, ..., 477.18, 432.78, 468.23])

##We compare to prediction to the y_test

In [27]:
np.set_printoptions(precision=2)
print(np.concatenate((y_pred_rfr.reshape(len(y_pred_rfr),1), y_test.reshape(len(y_test),1)),1))

[[459.65 458.96]
 [462.26 463.29]
 [436.03 435.27]
 ...
 [477.18 476.22]
 [432.78 440.29]
 [468.23 467.92]]


##We check the performance using r squared

In [28]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred_rfr)

0.9342783714449767

In [66]:
r2_score_rfr = r2_score(y_test, y_pred_rfr)
r2_score_rfr

0.9342783714449767

#Support Vector Regression 

##Feature Scaling (since it is an SVR model)

Spliting must be done before scaling !!!

In [36]:
#For X_train
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler() #Object for X
X_train_svr = sc_X.fit_transform(X_train)
#If there is categorical data in the feature, you must not include it the the scaling 

In [39]:
X_train_svr.shape

(7654, 4)

In [37]:
#We need to reshape y first before we can scale it
y_train_svr_reshape = y_train.reshape(-1,1)
y_train_svr_reshape

array([[484.32],
       [449.04],
       [471.86],
       ...,
       [462.56],
       [450.37],
       [452.94]])

In [38]:
y_train_svr_reshape.shape

(7654, 1)

In [40]:
#For y_train
from sklearn.preprocessing import StandardScaler
sc_y = StandardScaler() #Object for y
y_train_svr = sc_y.fit_transform(y_train_svr_reshape)

In [41]:
y_train_svr.shape

(7654, 1)

In [42]:
X_train_svr

array([[-1.75, -1.21,  0.42,  1.3 ],
       [ 0.41,  0.89,  0.23, -0.19],
       [-1.29, -1.16,  0.83, -0.22],
       ...,
       [-0.14, -1.16, -0.78, -0.33],
       [ 0.45, -0.73, -0.17, -1.55],
       [ 0.12, -0.36, -0.61, -0.18]])

In [43]:
y_train_svr

array([[ 1.76],
       [-0.31],
       [ 1.03],
       ...,
       [ 0.48],
       [-0.23],
       [-0.08]])

##We train the train set using Support Vector Model






In [44]:
from sklearn.svm import SVR
regressor_svr = SVR(kernel='rbf')
regressor_svr.fit(X_train_svr,y_train_svr)

  y = column_or_1d(y, warn=True)


SVR()

##We scale X_test before we can use the predict object

In [46]:
X_test_svr_scaled =  sc_X.transform(X_test)
X_test_svr_scaled


array([[-0.23, -0.76, -1.62,  0.97],
       [-0.44, -0.95, -0.86, -1.41],
       [ 0.85,  1.26, -0.96, -1.12],
       ...,
       [-1.21, -1.25,  1.26, -0.07],
       [ 1.94,  1.51, -1.13, -1.53],
       [-0.59, -0.77,  0.53,  0.67]])

##We predict the test result

We predict the result but the result will be in a scaled format

In [48]:
y_pred_svr_scaled = regressor_svr.predict(X_test_svr_scaled)
y_pred_svr_scaled

array([ 0.15,  0.55, -0.99, ...,  1.29, -1.26,  0.58])

We need to get the original numerical scale which can make sense to us, so we get the inverse and also remember to reshape it

In [52]:
y_pred_svr = sc_y.inverse_transform(y_pred_svr_scaled.reshape(-1,1))
y_pred_svr

array([[456.98],
       [463.67],
       [437.52],
       ...,
       [476.33],
       [432.91],
       [464.26]])

In [54]:
y_pred_svr.shape

(1914, 1)

Since we want our y_test of this svr model to be of the same shape as y_pred_svr, we must also reshape y_test

In [55]:
y_test_svr = y_test.reshape(-1,1)
y_test_svr

array([[458.96],
       [463.29],
       [435.27],
       ...,
       [476.22],
       [440.29],
       [467.92]])

In [57]:
y_test_svr.shape

(1914, 1)

##We compare to prediction to the y_test

In [58]:
np.set_printoptions(precision=2)
print(np.concatenate((y_pred_svr,y_test_svr),1))

[[456.98 458.96]
 [463.67 463.29]
 [437.52 435.27]
 ...
 [476.33 476.22]
 [432.91 440.29]
 [464.26 467.92]]


##We check the performance using r squared

In [59]:
from sklearn.metrics import r2_score
r2_score(y_test_svr, y_pred_svr)

0.9479978713795594

In [67]:
r2_score_svr = r2_score(y_test_svr, y_pred_svr)
r2_score_svr

0.9479978713795594

##Now we choose the best model we should use!

In [80]:
#The two list must have a 1-1 correspondence 
list_r2_score = [r2_score_svr,r2_score_dtr,r2_score_mlr,r2_score_pr,r2_score_rfr]
list_r2_score_string = ['r2_score_svr','r2_score_dtr','r2_score_mlr','r2_score_pr','r2_score_rfr']



In [81]:
index_max = list_r2_score.index(max(list_r2_score))
index_max

0

In [82]:
print(list_r2_score_string[index_max],list_r2_score[index_max])

r2_score_svr 0.9479978713795594
