# Regressor

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# 1 - Data Pre-Processing

## Importing the dataset

In [2]:
from google.colab import files
uploaded = files.upload()

Saving Power_Plant.csv to Power_Plant.csv


In [3]:
import io
dataset = pd.read_csv(io.BytesIO(uploaded['Power_Plant.csv']), delimiter=",")

#dataset = pd.read_csv('Power_Plant.csv') # use it in VS Code

In [4]:
# list the dataset
dataset.head(10)

Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.4,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.5,1009.23,96.62,473.9
5,26.27,59.44,1012.23,58.77,443.67
6,15.89,43.96,1014.02,75.24,467.35
7,9.48,44.71,1019.12,66.43,478.42
8,14.64,45.0,1021.78,41.25,475.98
9,11.74,43.56,1015.14,70.72,477.5


## Spliting the data set into trainining and test sets

In [5]:
# First, divide the data set into independent (X) and dependent (y) variables
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [6]:
# Import the train_test_split function from the sklearn.model_selection module
from sklearn.model_selection import train_test_split

# Use train_test_split to split the data into training and testing sets
# X: Features, y: Target variable, test_size: Proportion of the dataset to include in the test split
# random_state: Seed for reproducibility, ensures the same split every time the code is run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [7]:
print(X_train)

[[  11.22   43.13 1017.24   80.9 ]
 [  13.67   54.3  1015.92   75.42]
 [  32.84   77.95 1014.68   45.8 ]
 ...
 [  16.81   38.52 1018.26   75.21]
 [  12.8    41.16 1022.43   86.19]
 [  32.32   67.9  1006.08   37.93]]


In [8]:
print(X_test)

[[  28.66   77.95 1009.56   69.07]
 [  17.48   49.39 1021.51   84.53]
 [  14.86   43.14 1019.21   99.14]
 ...
 [  12.24   44.92 1023.74   88.21]
 [  27.28   47.93 1003.46   59.22]
 [  17.28   39.99 1007.09   74.25]]


In [9]:
print(y_train)

[473.93 467.87 431.97 ... 459.01 462.72 428.12]


In [10]:
print(y_test)

[431.23 460.01 461.14 ... 473.26 438.   463.28]


## Feature Scaling

3 scaling methods are added below. The appropriate one for each data set can be selected by trying it out.

In [None]:
# Standard Scaling

from sklearn.preprocessing import StandardScaler
standard_scaler = StandardScaler()
X_train_scaled = standard_scaler.fit_transform(X_train)
X_test_scaled = standard_scaler.transform(X_test)


In [11]:
# Min-Max Scaling

from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
X_train_scaled = min_max_scaler.fit_transform(X_train)
X_test_scaled = min_max_scaler.transform(X_test)

In [None]:
# Robust Scaling

from sklearn.preprocessing import RobustScaler
robust_scaler = RobustScaler()
X_train_scaled = robust_scaler.fit_transform(X_train)
X_test_scaled = robust_scaler.transform(X_test)

# 2 - Modelling

## Training the MultiLinear Regression model on the test dataset

In [12]:
# Import the LinearRegression model from the sklearn.linear_model module
from sklearn.linear_model import LinearRegression

In [13]:
# A MODEL IS CREATED with parameters specific to the algorithm to be used.
# Here the name of the model is named "regressor".
regressor = LinearRegression()

## TRAIN the model with training data sets **(X_train_scaled and y_train)**.

In [14]:
# The model created above is TRAINED with the test dataset (X_train_scaled).
regressor.fit(X_train_scaled, y_train)

## Predicting the result
Get PREDICTIONS of the model using data from the test set **(X_test_scaled)**

In [15]:
# PREDICT the output using test data (X_test_scaled)
y_pred = regressor.predict(X_test_scaled)

In [16]:
# y_pred contains the results predicted by the model.
print(y_pred)

[431.42761597 458.56124622 462.75264705 ... 469.51835895 442.41759454
 461.88279939]


# 3 - Evaluation

## Evaluating the model; **y_test ve y_pred**

Among the five metrics provided, the chosen metric should yield a minimum value for optimal model performance, except for R². In the case of R², a value closer to 1 indicates better model performance.

In [17]:
# Mean Absolute Error - MAE

# Import the mean_absolute_error function from the sklearn.metrics module
from sklearn.metrics import mean_absolute_error

# Calculate the Mean Absolute Error (MAE) by comparing the true values (y_test) with the predicted values (y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Print the calculated Mean Absolute Error
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 3.566564655203823


In [18]:
# Mean Squared Error - MSE

# Import the mean_squared_error function from the sklearn.metrics module
from sklearn.metrics import mean_squared_error

# Calculate the Mean Squared Error (MSE) by comparing the true values (y_test) with the predicted values (y_pred)
mse = mean_squared_error(y_test, y_pred)

# Print the calculated Mean Squared Error
print("Mean Squared Error (MSE):", mse)

Mean Squared Error (MSE): 19.73369930349765


In [19]:
# Root Mean Squared Error - RMSE

# Calculate the Root Mean Squared Error (RMSE) by taking the square root of the Mean Squared Error (mse)
rmse = np.sqrt(mse)

# Print the calculated Root Mean Squared Error
print("Root Mean Squared Error (RMSE):", rmse)

Root Mean Squared Error (RMSE): 4.442262858442491


In [20]:
# R Squared - R²

# Import the r2_score function from the sklearn.metrics module
from sklearn.metrics import r2_score

# Calculate the R-squared (R²) score by comparing the true values (y_test) with the predicted values (y_pred)
r2 = r2_score(y_test, y_pred)

# Print the calculated R-squared (R²) score
print("R-squared (R²):", r2)

R-squared (R²): 0.9325315554761302


In [21]:
# Adjusted R²

# Calculate the number of samples (n) in the test set
n = len(y_test)

# Specify the number of features (p) in this example (set to 4 as an example)
p = 4

# Calculate the Adjusted R-squared using the formula
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

# Print the calculated Adjusted R-squared
print("Adjusted R-squared:", adjusted_r2)

Adjusted R-squared: 0.9323901862890713


# **Customer Use of the Model with "Unseen Data"**

At this stage, the AI model produced with a certain performance is delivered to the customer.

And the customer gets the outputs by using the model with the **new data (Unseen Data)** produced. Unseen Data set has inputs but no output!

Based on this example, our AI model will predict the PE value that the power plant will produce.

In [22]:
# Unseen Data is imported.
from google.colab import files

uploaded = files.upload()

Saving Power_Plant_Unseen.csv to Power_Plant_Unseen.csv


In [23]:
unseen_dataset_df = pd.read_csv(io.BytesIO(uploaded['Power_Plant_Unseen.csv']), delimiter=";")

#dataset = pd.read_csv('Power_Plant_Unseen.csv') # use it in VS Code

unseen_dataset_df.head(10)

Unnamed: 0,AT,V,AP,RH
0,12.45,40.56,1017.84,66.52
1,16.7,40.56,1019.04,50.9
2,19.88,44.6,1015.52,37.85
3,19.64,46.33,1013.33,98.99
4,20.01,59.87,1019.0,84.12
5,16.27,58.2,1018.47,80.03
6,10.8,41.39,1019.74,74.08
7,9.64,39.35,1015.1,91.76
8,11.53,41.14,1025.63,88.54
9,21.83,63.07,1011.57,87.02


In [24]:
# Extract values from the unseen_dataset_df DataFrame to create a NumPy array
# unseen_dataset
unseen_dataset = unseen_dataset_df.iloc[:, :].values

unseen_dataset

array([[  12.45,   40.56, 1017.84,   66.52],
       [  16.7 ,   40.56, 1019.04,   50.9 ],
       [  19.88,   44.6 , 1015.52,   37.85],
       [  19.64,   46.33, 1013.33,   98.99],
       [  20.01,   59.87, 1019.  ,   84.12],
       [  16.27,   58.2 , 1018.47,   80.03],
       [  10.8 ,   41.39, 1019.74,   74.08],
       [   9.64,   39.35, 1015.1 ,   91.76],
       [  11.53,   41.14, 1025.63,   88.54],
       [  21.83,   63.07, 1011.57,   87.02],
       [   9.51,   40.46, 1018.84,   70.12],
       [  28.84,   75.6 , 1018.41,   53.96],
       [  25.82,   72.39, 1003.4 ,   86.33],
       [  28.47,   69.23, 1013.18,   40.73],
       [  19.87,   49.69, 1012.23,   68.57],
       [  14.38,   44.84, 1024.59,   81.68],
       [  27.37,   65.06, 1013.09,   50.92],
       [  28.05,   62.6 , 1017.01,   46.46],
       [   8.66,   36.25, 1028.22,   86.96],
       [  16.61,   45.87, 1009.34,   97.93],
       [  11.28,   42.44, 1014.62,   99.78],
       [  21.1 ,   62.66, 1011.19,   83.49],
       [  

In [None]:
# Standard Scaling

# from sklearn.preprocessing import StandardScaler
# standard_scaler = StandardScaler()
unseen_dataset_scaled = standard_scaler.fit_transform(unseen_dataset)

In [25]:
# Min-Max Scaling

# from sklearn.preprocessing import MinMaxScaler
# min_max_scaler = MinMaxScaler()
unseen_dataset_scaled = min_max_scaler.fit_transform(unseen_dataset)

In [None]:
# Robust Scaling

# from sklearn.preprocessing import RobustScaler
# robust_scaler = RobustScaler()
unseen_dataset_scaled = robust_scaler.fit_transform(unseen_dataset)

In [26]:
# The trained AI-ML model predicts the outputs of the customer's new data (i.e. how much the Power Plant will produce)
y_pred_unseen = regressor.predict(unseen_dataset_scaled)

y_pred_unseen

array([483.22983519, 474.9079093 , 467.29432975, 456.82907089,
       454.56771572, 465.48440893, 486.13055443, 486.43065241,
       482.31574212, 447.74768342, 490.38387883, 431.69316111,
       433.90111407, 436.49856542, 460.24742841, 474.81219982,
       438.9400848 , 439.00769169, 491.79947158, 464.73120017,
       479.78885566, 450.34681861, 468.16038745, 426.54211639,
       493.1244935 , 434.63685672, 458.38751567, 457.68001365,
       442.36664072, 433.73163908, 467.13645706, 427.83226477,
       447.08976937, 448.73437799, 437.38131663, 479.8801462 ,
       476.72641445, 421.93239642, 454.87963395, 432.72299508,
       446.23144034, 455.56496423, 476.68954927, 449.52339585,
       418.6293079 , 433.88790307, 425.90016797, 448.21077851,
       471.96659826, 480.14419266, 432.14089197, 442.38850605,
       438.67104705, 433.30442961, 488.95336501, 482.8028216 ,
       440.43192215, 478.30523469, 446.91983144, 474.30366049,
       479.72591246, 446.58050127, 439.2791932 , 493.24

In [27]:
# Create a DataFrame (y_pred_unseen_df) to store the predicted outputs from the regression model
y_pred_unseen_df = pd.DataFrame(y_pred_unseen, columns=["Predicted_Output"])

# Display the first 10 rows of the DataFrame
y_pred_unseen_df.head(10)

Unnamed: 0,Predicted_Output
0,483.229835
1,474.907909
2,467.29433
3,456.829071
4,454.567716
5,465.484409
6,486.130554
7,486.430652
8,482.315742
9,447.747683


In [28]:
# Merge the original unseen dataset with the DataFrame containing predicted outputs based on the indices
all_df = unseen_dataset_df.merge(y_pred_unseen_df, how="left", left_index=True, right_index=True)

# Display the merged DataFrame
all_df

Unnamed: 0,AT,V,AP,RH,Predicted_Output
0,12.45,40.56,1017.84,66.52,483.229835
1,16.70,40.56,1019.04,50.90,474.907909
2,19.88,44.60,1015.52,37.85,467.294330
3,19.64,46.33,1013.33,98.99,456.829071
4,20.01,59.87,1019.00,84.12,454.567716
...,...,...,...,...,...
95,20.97,47.43,1007.64,71.18,457.293166
96,14.10,41.16,1021.26,73.87,477.765001
97,17.83,66.86,1011.65,77.31,458.622978
98,8.46,40.80,1023.57,81.27,491.488885
