In [1]:
#  Importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Data Preprocessing & Splitting
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
# Machine Learning Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
# Model Evaluation Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
df = pd.read_excel("Engr_Chika_Data_Refined.xlsx")

In [3]:
df.head()

Unnamed: 0,Time,S1,S2,S3,ST,P1,P2,P3,PT
0,0,6.2389,7.4459,8.2823,21.9671,274.644,276.426,251.1,802.17
1,1,5.4219,4.8077,5.1969,15.4265,234.846,281.286,193.752,709.884
2,2,5.0748,3.6868,3.1097,11.8713,263.682,270.216,150.39,684.288
3,3,3.412,3.6288,4.3667,11.4075,200.826,181.926,169.506,552.258
4,4,6.846,3.9673,4.22,15.0333,153.036,158.544,135.216,446.796


In [4]:
df.tail()

Unnamed: 0,Time,S1,S2,S3,ST,P1,P2,P3,PT
228,235,2.2065,2.333,2.771,7.3105,175.338,187.866,174.096,537.3
229,236,3.2437,3.6176,4.838,11.6993,175.554,186.84,175.662,538.056
230,237,3.792,3.8828,5.3536,13.0284,177.714,187.326,175.392,540.432
231,238,3.6214,4.1689,5.4164,13.2067,178.74,189.648,178.47,546.858
232,239,3.6632,4.3163,4.4102,12.3897,177.282,189.216,176.904,543.402


In [5]:
# To display information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233 entries, 0 to 232
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    233 non-null    int64  
 1   S1      233 non-null    float64
 2   S2      233 non-null    float64
 3   S3      233 non-null    float64
 4   ST      233 non-null    float64
 5   P1      221 non-null    float64
 6   P2      221 non-null    float64
 7   P3      221 non-null    float64
 8   PT      227 non-null    float64
dtypes: float64(8), int64(1)
memory usage: 16.5 KB


In [6]:
# To show the number of rows and columns in the dataset (no of rows, no of columns)
df.shape

(233, 9)

In [7]:
# To display the summary statistics of numerical columns in the dataset
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Time,233.0,119.064378,70.418577,0.0,58.0,116.0,181.0,239.0
S1,233.0,4.927389,1.97268,0.0315,3.052,4.9342,6.8342,8.1269
S2,233.0,5.310852,2.06643,0.019,3.6288,4.3896,7.3792,8.8232
S3,233.0,5.501664,1.960384,0.0434,4.2294,5.0433,7.1692,10.2075
ST,233.0,15.739906,5.641158,0.0939,11.2601,13.0284,21.5055,25.4088
P1,221.0,222.409416,47.748699,149.202,176.364,203.111,272.7,308.718
P2,221.0,231.494258,46.787348,156.438,186.462,211.356,279.558,322.596
P3,221.0,201.202018,36.567245,135.216,171.396,177.444,240.354,278.478
PT,227.0,637.790123,162.072935,0.0,533.817,582.66,788.967,909.792


In [8]:
df.isna().sum()

Time     0
S1       0
S2       0
S3       0
ST       0
P1      12
P2      12
P3      12
PT       6
dtype: int64

In [9]:
# Handle missing values (Mean Imputation)
#df.fillna(df.mean(), inplace=True)
df = df.dropna()
df.isna().sum()

Time    0
S1      0
S2      0
S3      0
ST      0
P1      0
P2      0
P3      0
PT      0
dtype: int64

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 221 entries, 0 to 232
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    221 non-null    int64  
 1   S1      221 non-null    float64
 2   S2      221 non-null    float64
 3   S3      221 non-null    float64
 4   ST      221 non-null    float64
 5   P1      221 non-null    float64
 6   P2      221 non-null    float64
 7   P3      221 non-null    float64
 8   PT      221 non-null    float64
dtypes: float64(8), int64(1)
memory usage: 17.3 KB


In [11]:
df.isna().sum()

Time    0
S1      0
S2      0
S3      0
ST      0
P1      0
P2      0
P3      0
PT      0
dtype: int64

In [12]:
df.head()

Unnamed: 0,Time,S1,S2,S3,ST,P1,P2,P3,PT
0,0,6.2389,7.4459,8.2823,21.9671,274.644,276.426,251.1,802.17
1,1,5.4219,4.8077,5.1969,15.4265,234.846,281.286,193.752,709.884
2,2,5.0748,3.6868,3.1097,11.8713,263.682,270.216,150.39,684.288
3,3,3.412,3.6288,4.3667,11.4075,200.826,181.926,169.506,552.258
4,4,6.846,3.9673,4.22,15.0333,153.036,158.544,135.216,446.796


In [13]:
# Define Features (X) and Target (y)
#X = df[['S1', 'S2', 'S3']]  # Traffic data as features
#y = df['PT']  # Power consumption as target

# Split the dataset (80% train, 20% test)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
X_s1 = df[['S1']]
y_s1 = df['P1']
X_s2 = df[['S2']]
y_s2 = df['P2']
X_s3 = df[['S3']]
y_s3 = df['P3']

In [15]:
X_s1train, X_s1test, y_s1train, y_s1test = train_test_split(X_s1, y_s1, test_size=0.2, random_state=42)
X_s2train, X_s2test, y_s2train, y_s2test = train_test_split(X_s2, y_s2, test_size=0.2, random_state=42)
X_s3train, X_s3test, y_s3train, y_s3test = train_test_split(X_s3, y_s3, test_size=0.2, random_state=42)

In [16]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import GridSearchCV

# # Define model
# rf = RandomForestRegressor()

# # Define parameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# # Perform GridSearchCV
# grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='r2', n_jobs=-1)
# grid_search.fit(X_s1train, y_s1train)

# # Best parameters
# print(grid_search.best_params_)


In [17]:
# Train Rqndom Forest Model
rf_model1 = RandomForestRegressor(n_estimators=200, max_depth=20, min_samples_leaf=4, min_samples_split = 10, random_state=42)
#rf_model2 = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model2 = LinearRegression()
#rf_model3 = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model3 = SVR()
rf_model1.fit(X_s1train, y_s1train)
rf_model2.fit(X_s2train, y_s2train)
rf_model3.fit(X_s3train, y_s3train)


# Predictions
y_pred_lrs1 = rf_model1.predict(X_s1test)
y_pred_lrs2 = rf_model2.predict(X_s2test)
y_pred_lrs3 = rf_model3.predict(X_s3test)

# Evaluation Metrics
mae_lrs1 = mean_absolute_error(y_s1test, y_pred_lrs1)
mae_lrs2 = mean_absolute_error(y_s2test, y_pred_lrs2)
mae_lrs3 = mean_absolute_error(y_s3test, y_pred_lrs3)
rmse_lrs1 = np.sqrt(mean_squared_error(y_s1test, y_pred_lrs1))
rmse_lrs2 = np.sqrt(mean_squared_error(y_s2test, y_pred_lrs2))
rmse_lrs3 = np.sqrt(mean_squared_error(y_s3test, y_pred_lrs3))
r2_lrs1 = r2_score(y_s1test, y_pred_lrs1)
r2_lrs2 = r2_score(y_s2test, y_pred_lrs2)
r2_lrs3 = r2_score(y_s3test, y_pred_lrs3)

mae = [mae_lrs1, mae_lrs2, mae_lrs3]
rmse = [rmse_lrs1, rmse_lrs2, rmse_lrs3]
r2 = [r2_lrs1, r2_lrs2, r2_lrs3]

In [18]:
print(mae)

[8.294292045128113, 16.323138206892978, 16.498856340568334]


In [19]:
print(rmse)

[16.622143480409193, 22.667230391560047, 22.34023804712231]


In [20]:
print(r2)

[0.8736927433610581, 0.7556162626352431, 0.6309700309589247]


In [21]:
import pickle
with open("model1.pkl", "wb") as file:
    pickle.dump(rf_model1, file)
with open("model2.pkl", "wb") as file:
    pickle.dump(rf_model2, file)
with open("model3.pkl", "wb") as file:
    pickle.dump(rf_model3, file)

In [22]:
# Train Random Forest Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_s1train, y_s1train)

# Predictions
y_pred_rf = rf_model.predict(X_s1test)

# Evaluation Metrics
mae_rf = mean_absolute_error(y_s1test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_s1test, y_pred_rf))
r2_rf = r2_score(y_s1test, y_pred_rf)

# Print Results
print("\nRandom Forest Performance:")
print(f"Mean Absolute Error (MAE): {mae_rf:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_rf:.4f}")
print(f"R² Score: {r2_rf:.4f}")



Random Forest Performance:
Mean Absolute Error (MAE): 8.7241
Root Mean Squared Error (RMSE): 16.3771
R² Score: 0.8774


In [23]:
# Train Random Forest Model
#svr_model = RandomForestRegressor(n_estimators=100, random_state=42)
svr = SVR()
svr.fit(X_s1train, y_s1train)

# Predictions
y_pred_rf = svr.predict(X_s1test)

# Evaluation Metrics
mae_rf = mean_absolute_error(y_s1test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_s1test, y_pred_rf))
r2_rf = r2_score(y_s1test, y_pred_rf)

# Print Results
print("\nRandom Forest Performance:")
print(f"Mean Absolute Error (MAE): {mae_rf:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_rf:.4f}")
print(f"R² Score: {r2_rf:.4f}")



Random Forest Performance:
Mean Absolute Error (MAE): 10.6962
Root Mean Squared Error (RMSE): 19.3476
R² Score: 0.8289


In [24]:
X

NameError: name 'X' is not defined

In [15]:
y

0      802.170
1      709.884
2      684.288
3      552.258
4      446.796
        ...   
228    537.300
229    538.056
230    540.432
231    546.858
232    543.402
Name: PT, Length: 233, dtype: float64

In [16]:
pd.DataFrame(X_train)

Unnamed: 0,S1,S2,S3
144,3.4370,3.7961,4.5600
69,7.6967,8.4391,8.9900
230,3.7920,3.8828,5.3536
196,3.3050,3.6174,4.5296
223,2.8090,3.5867,3.0957
...,...,...,...
106,6.9558,8.3331,6.2292
14,2.5301,2.7592,2.0416
92,5.9833,7.0845,8.6351
179,2.9772,3.6708,4.6906


In [17]:
y

0      802.170
1      709.884
2      684.288
3      552.258
4      446.796
        ...   
228    537.300
229    538.056
230    540.432
231    546.858
232    543.402
Name: PT, Length: 233, dtype: float64

In [18]:
# Train Linear Regression Model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predictions
y_pred_lr = lr_model.predict(X_test)

# Evaluation Metrics
mae_lr = mean_absolute_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr = r2_score(y_test, y_pred_lr)


In [19]:
lr_model.coef_

array([10.26370456, 16.94926369, 23.698129  ])

In [20]:
# Print Results
print("\nLinear Regression Performance:")
print(f"Mean Absolute Error (MAE): {mae_lr:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_lr:.4f}")
print(f"R² Score: {r2_lr:.4f}")


Linear Regression Performance:
Mean Absolute Error (MAE): 53.9770
Root Mean Squared Error (RMSE): 65.0829
R² Score: 0.7069


In [21]:
# Train Random Forest Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluation Metrics
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

# Print Results
print("\nRandom Forest Performance:")
print(f"Mean Absolute Error (MAE): {mae_rf:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_rf:.4f}")
print(f"R² Score: {r2_rf:.4f}")



Random Forest Performance:
Mean Absolute Error (MAE): 41.8956
Root Mean Squared Error (RMSE): 94.5709
R² Score: 0.3811
