#### Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


%matplotlib inline

### Import Dataset

In [2]:
data = pd.read_csv('Data.csv')
data

Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.40,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.50,1009.23,96.62,473.90
...,...,...,...,...,...
9563,16.65,49.69,1014.01,91.00,460.03
9564,13.19,39.18,1023.67,66.78,469.62
9565,31.32,74.33,1012.92,36.48,429.57
9566,24.48,69.45,1013.86,62.39,435.74


#### Split into X and y

In [3]:
X = data.iloc[ : , : -1].values
y = data.iloc[: , -1].values

In [4]:
X

array([[  14.96,   41.76, 1024.07,   73.17],
       [  25.18,   62.96, 1020.04,   59.08],
       [   5.11,   39.4 , 1012.16,   92.14],
       ...,
       [  31.32,   74.33, 1012.92,   36.48],
       [  24.48,   69.45, 1013.86,   62.39],
       [  21.6 ,   62.52, 1017.23,   67.87]])

In [5]:
y

array([463.26, 444.37, 488.56, ..., 429.57, 435.74, 453.28])

### Split into train and test sets

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # random state make sure it has the same outcome everytime you run it


Simple linear regression is not needed since it has more than one independent variable, but for learning purpose we are going to use it here

#### Simple Linear Regression

In [12]:
from sklearn.linear_model import LinearRegression
simple_regressor = LinearRegression()
simple_regressor.fit(X_train, y_train)

LinearRegression()

Predict for test set

In [14]:
simple_pred = simple_regressor.predict(X_test)
simple_pred

array([457.26, 466.72, 440.37, ..., 476.41, 424.62, 463.91])

Compare y_test to the prediction

In [17]:
np.set_printoptions(precision=2)
print(np.concatenate((y_test.reshape(-1,1), simple_pred.reshape(-1,1)),1))

[[458.96 457.26]
 [463.29 466.72]
 [435.27 440.37]
 ...
 [476.22 476.41]
 [440.29 424.62]
 [467.92 463.91]]


Evaluate using R-square

In [19]:
from sklearn.metrics import r2_score
simple_r2_score = r2_score(y_test,  simple_pred)
simple_r2_score

0.9321860060402447

#### Multiple Linear Regression

In [7]:
from sklearn.linear_model import LinearRegression
multiple_regressor = LinearRegression()
multiple_regressor.fit(X_train, y_train) 

LinearRegression()

Predict for test set

In [8]:
multiple_pred = multiple_regressor.predict(X_test)
multiple_pred

array([457.25522108, 466.71927366, 440.36694911, ..., 476.40502919,
       424.61609708, 463.91141143])

Compare y_test to the prediction

In [10]:
np.set_printoptions(precision=2)
print(np.concatenate((y_test.reshape(-1,1), multiple_pred.reshape(-1,1)),1))

[[458.96 457.26]
 [463.29 466.72]
 [435.27 440.37]
 ...
 [476.22 476.41]
 [440.29 424.62]
 [467.92 463.91]]


Evaluate using R-Square

In [16]:
from sklearn.metrics import r2_score
multiple_r2_score = r2_score(y_test,  multiple_pred)
multiple_r2_score


0.9321860060402447

#### Polynomial Regression

In [24]:
from sklearn.preprocessing import PolynomialFeatures 
poly_converter= PolynomialFeatures(degree=4)  # convert the X_train to degree 4 
X_train_poly = poly_converter.fit_transform(X_train) # convert the X_test to degree 4 
X_test_poly = poly_converter.fit_transform(X_test)


In [23]:
from sklearn.linear_model import LinearRegression
poly_regressor = LinearRegression()
poly_regressor.fit(X_train_poly, y_train)

LinearRegression()

Predict for X_test

In [26]:
poly_pred = poly_regressor.predict(X_test_poly)
poly_pred

array([456.08, 462.8 , 438.33, ..., 476.32, 432.68, 464.56])

Compare y_test to the prediction

In [27]:
np.set_printoptions(precision=2)
print(np.concatenate((y_test.reshape(-1,1), poly_pred.reshape(-1,1)),1))

[[458.96 456.08]
 [463.29 462.8 ]
 [435.27 438.33]
 ...
 [476.22 476.32]
 [440.29 432.68]
 [467.92 464.56]]


Evaluate using R-squared

In [28]:
from sklearn.metrics import r2_score
poly_r2_score = r2_score(y_test,  poly_pred)
poly_r2_score

0.9447339094083262

#### Support Vector Regression

In [29]:
# We do the feature scaling for both X and y sets
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train_scaled = sc_X.fit_transform(X_train)
X_test_scaled = sc_X.fit_transform(X_test)



In [30]:
# Reshape y before scaling
sc_y = StandardScaler()
y_train_trans = y_train.reshape(-1, 1)
y_train_scaled = sc_y.fit_transform(y_train_trans)


In [32]:
from sklearn.svm import SVR
svr_regressor = SVR(kernel='rbf') # always use rbf
svr_regressor.fit(X_train_scaled, y_train_scaled) # train on the scaled model
# ignore the 1-d error

  y = column_or_1d(y, warn=True)


SVR()

Predict for X_test

In [33]:
svr_pred_scaled = svr_regressor.predict(X_test_scaled)
svr_pred_scaled = svr_pred_scaled.reshape(-1, 1) # so it can be easily inverted


In [34]:
# Ger the inverse
svr_pred = sc_y.inverse_transform(svr_pred_scaled)
svr_pred = svr_pred.flatten()
svr_pred

array([457.15, 463.42, 437.47, ..., 476.08, 432.78, 464.15])

Compare y_test to the prediction

In [35]:
np.set_printoptions(precision=2)
print(np.concatenate((y_test.reshape(-1,1), svr_pred.reshape(-1,1)),1))

[[458.96 457.15]
 [463.29 463.42]
 [435.27 437.47]
 ...
 [476.22 476.08]
 [440.29 432.78]
 [467.92 464.15]]


Evaluate using R-squared

In [41]:
from sklearn.metrics import r2_score
svr_r2_score = r2_score(y_test,  svr_pred)
svr_r2_score

0.9480489459580308

#### Decision Tree Regression

In [38]:
from sklearn.tree import DecisionTreeRegressor
tree_regressor = DecisionTreeRegressor(random_state=1)
tree_regressor.fit(X_train, y_train)

DecisionTreeRegressor(random_state=1)

Predict for X_test

In [39]:
tree_pred = tree_regressor.predict(X_test)
tree_pred

array([459.65, 462.26, 436.03, ..., 473.77, 432.76, 468.23])

Compare y_test to the prediction

In [40]:
np.set_printoptions(precision=2)
print(np.concatenate((y_test.reshape(-1,1), tree_pred.reshape(-1,1)),1))

[[458.96 459.65]
 [463.29 462.26]
 [435.27 436.03]
 ...
 [476.22 473.77]
 [440.29 432.76]
 [467.92 468.23]]


Evaluate using R-squared

In [42]:
from sklearn.metrics import r2_score
tree_r2_score = r2_score(y_test,  tree_pred)
tree_r2_score

0.9314913712563007

#### Random Forest Regression

In [43]:
from sklearn.ensemble import RandomForestRegressor
forest_regressor = RandomForestRegressor(n_estimators=10, random_state=0) # numbers of trees to be used
forest_regressor.fit(X_train, y_train)

RandomForestRegressor(n_estimators=10, random_state=0)

Predict for X_test

In [45]:
forest_pred = forest_regressor.predict(X_test)
forest_pred

array([457.9 , 464.07, 439.72, ..., 476.18, 432.9 , 466.11])

Compare y_test to the prediction

In [46]:
np.set_printoptions(precision=2)
print(np.concatenate((y_test.reshape(-1,1), forest_pred.reshape(-1,1)),1))


[[458.96 457.9 ]
 [463.29 464.07]
 [435.27 439.72]
 ...
 [476.22 476.18]
 [440.29 432.9 ]
 [467.92 466.11]]


Evaluate using R-squared

In [47]:
from sklearn.metrics import r2_score
forest_r2_score = r2_score(y_test,  forest_pred)
forest_r2_score

0.9628673278135129

#### Choosing the best model

In [53]:
performance_dict = {
    'simple linear regression': round(simple_r2_score, 4),
    'multiple linear regression': round(multiple_r2_score, 4),
    'polynomial regression': round(poly_r2_score, 4),
    'support vector regression': round(svr_r2_score, 4),
    'decision tree regression': round(tree_r2_score, 4),
    'random forest regression': round(forest_r2_score, 4),
}

score_list = []

for key,value in performance_dict.items():
    score_list.append(value)
    max_score = max(score_list)

for key,value in performance_dict.items():
    if value == max_score:
        print(f'The best performing model is {key}')



The best performing model is random forest regression
