# Regressor

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# 1 - Data Pre-Processing

## Importing the dataset

In [2]:
# Colab-only helper module used to upload a local file into the notebook session.
from google.colab import files
# `uploaded` is indexed by file name in the next cell to obtain the uploaded file's contents.
uploaded = files.upload()

Saving Power_Plant.csv to Power_Plant.csv


In [3]:
import io
# Wrap the uploaded file contents in an in-memory buffer so pandas can parse them like a file on disk.
dataset = pd.read_csv(io.BytesIO(uploaded['Power_Plant.csv']), delimiter=",")

# Outside Colab (e.g. VS Code) there is no `uploaded` object; read the file from disk instead:
#dataset = pd.read_csv('Power_Plant.csv') # use it in VS Code

In [4]:
# Preview the first 10 rows to confirm the columns were parsed as expected
dataset.head(10)

Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.4,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.5,1009.23,96.62,473.9
5,26.27,59.44,1012.23,58.77,443.67
6,15.89,43.96,1014.02,75.24,467.35
7,9.48,44.71,1019.12,66.43,478.42
8,14.64,45.0,1021.78,41.25,475.98
9,11.74,43.56,1015.14,70.72,477.5


## Splitting the dataset into training and test sets

In [5]:
# Separate predictors from the target: every column except the last is a feature (X),
# and the last column is the value to predict (y). Both are extracted as NumPy arrays.
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

In [6]:
# train_test_split lives in scikit-learn's model_selection module
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows (test_size=0.2) as a test set and keep the rest for training.
# Fixing random_state makes the split identical on every run, so results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [7]:
# Inspect the training features (the printed array is abbreviated with "..." in the middle)
print(X_train)

[[  11.22   43.13 1017.24   80.9 ]
 [  13.67   54.3  1015.92   75.42]
 [  32.84   77.95 1014.68   45.8 ]
 ...
 [  16.81   38.52 1018.26   75.21]
 [  12.8    41.16 1022.43   86.19]
 [  32.32   67.9  1006.08   37.93]]


In [8]:
# Inspect the held-out test features (the printed array is abbreviated with "..." in the middle)
print(X_test)

[[  28.66   77.95 1009.56   69.07]
 [  17.48   49.39 1021.51   84.53]
 [  14.86   43.14 1019.21   99.14]
 ...
 [  12.24   44.92 1023.74   88.21]
 [  27.28   47.93 1003.46   59.22]
 [  17.28   39.99 1007.09   74.25]]


In [9]:
# Inspect the training targets, i.e. the last column of the dataset for the training rows
print(y_train)

[473.93 467.87 431.97 ... 459.01 462.72 428.12]


In [10]:
# Inspect the true targets of the test rows; predictions are compared against these later
print(y_test)

[431.23 460.01 461.14 ... 473.26 438.   463.28]


# 2 - Modelling

## Training the Multiple Linear Regression model on the training dataset

In [11]:
# Import the LinearRegression model from the sklearn.linear_model module
from sklearn.linear_model import LinearRegression

In [12]:
# Create the model object. LinearRegression() is called with no arguments, so it uses
# the estimator's default settings. At this point the model is NOT yet trained.
# The model is bound to the name "regressor".
regressor = LinearRegression()

## TRAIN the model with training data sets **(X_train and y_train)**.

In [13]:
# The model created above is TRAINED with the training dataset (X_train, y_train).
# The test split (X_test, y_test) is not used here; it is kept aside for prediction and evaluation.
regressor.fit(X_train, y_train)

## Predicting the result
Get PREDICTIONS of the model using data from the test set **(X_test)**

In [14]:
# PREDICT the output for the test features (X_test), which were not passed to fit()
y_pred = regressor.predict(X_test)

In [15]:
# y_pred contains the results predicted by the model for X_test, one value per test row.
print(y_pred)

[431.42761597 458.56124622 462.75264705 ... 469.51835895 442.41759454
 461.88279939]


# 3 - Evaluation

## Evaluating the model: **y_test and y_pred**

In [16]:
# Compare the model's predictions (y_pred) with the real values (y_test).
from sklearn.metrics import r2_score

# R^2 score of the predictions; kept as the cell's last expression so the notebook displays it.
r2_score(y_true=y_test, y_pred=y_pred)

0.9325315554761303

# **Customer Use of the Model with "Unseen Data"**

At this stage, the AI model produced with a certain performance is delivered to the customer.

The customer then obtains predictions by running the model on **new data (Unseen Data)**. The unseen data set contains only the inputs — it has no output column!

Based on this example, our AI model will predict the PE value that the power plant will produce.

In [17]:
# Unseen Data is imported.
# (`files` was already imported in the first upload cell; repeating the import is harmless.)
from google.colab import files

# NOTE: this rebinds `uploaded`, replacing the upload result of the training file.
uploaded = files.upload()

Saving Power_Plant_Unseen.csv to Power_Plant_Unseen.csv


In [18]:
# Parse the uploaded unseen-data file. Note that this file is read with ";" as the delimiter,
# whereas the training file was read with ",".
unseen_dataset_df = pd.read_csv(io.BytesIO(uploaded['Power_Plant_Unseen.csv']), delimiter=";")

# Outside Colab (e.g. VS Code), read the file from disk instead. Keep the same target name and
# delimiter so that the training `dataset` is not overwritten:
#unseen_dataset_df = pd.read_csv('Power_Plant_Unseen.csv', delimiter=";") # use it in VS Code

# Preview the first 10 rows of the unseen data
unseen_dataset_df.head(10)

Unnamed: 0,AT,V,AP,RH
0,12.45,40.56,1017.84,66.52
1,16.7,40.56,1019.04,50.9
2,19.88,44.6,1015.52,37.85
3,19.64,46.33,1013.33,98.99
4,20.01,59.87,1019.0,84.12
5,16.27,58.2,1018.47,80.03
6,10.8,41.39,1019.74,74.08
7,9.64,39.35,1015.1,91.76
8,11.53,41.14,1025.63,88.54
9,21.83,63.07,1011.57,87.02


In [19]:
# Convert every row and column of the unseen DataFrame into a 2-D NumPy array for the model
unseen_dataset = unseen_dataset_df.to_numpy()

# Display the resulting array
unseen_dataset

array([[  12.45,   40.56, 1017.84,   66.52],
       [  16.7 ,   40.56, 1019.04,   50.9 ],
       [  19.88,   44.6 , 1015.52,   37.85],
       [  19.64,   46.33, 1013.33,   98.99],
       [  20.01,   59.87, 1019.  ,   84.12],
       [  16.27,   58.2 , 1018.47,   80.03],
       [  10.8 ,   41.39, 1019.74,   74.08],
       [   9.64,   39.35, 1015.1 ,   91.76],
       [  11.53,   41.14, 1025.63,   88.54],
       [  21.83,   63.07, 1011.57,   87.02],
       [   9.51,   40.46, 1018.84,   70.12],
       [  28.84,   75.6 , 1018.41,   53.96],
       [  25.82,   72.39, 1003.4 ,   86.33],
       [  28.47,   69.23, 1013.18,   40.73],
       [  19.87,   49.69, 1012.23,   68.57],
       [  14.38,   44.84, 1024.59,   81.68],
       [  27.37,   65.06, 1013.09,   50.92],
       [  28.05,   62.6 , 1017.01,   46.46],
       [   8.66,   36.25, 1028.22,   86.96],
       [  16.61,   45.87, 1009.34,   97.93],
       [  11.28,   42.44, 1014.62,   99.78],
       [  21.1 ,   62.66, 1011.19,   83.49],
       [  

In [20]:
# Use the already-fitted regression model (regressor) to predict the target for the unseen dataset.
# Only predict() is called here, so the model is not refitted.
y_pred_unseen = regressor.predict(unseen_dataset)

# Display the predictions
y_pred_unseen

array([473.1867979 , 467.34670219, 461.95464811, 452.21474871,
       450.9951783 , 459.38229431, 475.17252842, 474.8527412 ,
       471.88181965, 445.71425631, 478.50627924, 434.58199606,
       435.2244621 , 438.57574093, 455.70451708, 466.40127402,
       440.11590542, 440.31136452, 479.1163875 , 458.21480312,
       469.58763683, 447.78532067, 461.28400399, 430.3362834 ,
       480.24416143, 436.98457897, 453.80019875, 453.49647931,
       442.42073936, 435.23506049, 461.41634371, 431.59628662,
       445.02251504, 447.05284993, 437.86547232, 470.03283638,
       467.68795362, 427.04161933, 451.29960656, 434.75837823,
       445.65747302, 451.7351373 , 467.83402098, 448.28666724,
       424.90244796, 436.60201356, 430.70060851, 447.38347937,
       464.12433764, 471.20079681, 434.24669653, 442.17389489,
       439.57651811, 435.19126027, 477.37080948, 472.28601026,
       441.16747455, 469.21949231, 446.00067376, 465.92616534,
       470.21507796, 445.42597125, 440.08580083, 480.24

In [21]:
# Wrap the model's predictions in a single-column DataFrame named "Predicted_Output"
y_pred_unseen_df = pd.DataFrame({"Predicted_Output": y_pred_unseen})

# Preview the first 10 predictions
y_pred_unseen_df.head(10)

Unnamed: 0,Predicted_Output
0,473.186798
1,467.346702
2,461.954648
3,452.214749
4,450.995178
5,459.382294
6,475.172528
7,474.852741
8,471.88182
9,445.714256


In [22]:
# Attach the predictions to the unseen inputs by aligning both frames on their row index.
# A left join keeps every row of the unseen inputs.
all_df = unseen_dataset_df.join(y_pred_unseen_df, how="left")

# Show the combined table
all_df

Unnamed: 0,AT,V,AP,RH,Predicted_Output
0,12.45,40.56,1017.84,66.52,473.186798
1,16.70,40.56,1019.04,50.90,467.346702
2,19.88,44.60,1015.52,37.85,461.954648
3,19.64,46.33,1013.33,98.99,452.214749
4,20.01,59.87,1019.00,84.12,450.995178
...,...,...,...,...,...
95,20.97,47.43,1007.64,71.18,453.362795
96,14.10,41.16,1021.26,73.87,468.845882
97,17.83,66.86,1011.65,77.31,454.250417
98,8.46,40.80,1023.57,81.27,479.037321
