In [None]:
import numpy as np
import pandas as pd
import sklearn as sk
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
df = pd.read_csv("new_main_table.csv")
df.head()

Unnamed: 0,ID,SensorName,SensorReading,TimeEntered,Location,SmartObject,DateEntered
0,1,Temperature,27.2,11:49:44,Library,ESP32WOFA,12/04/2024
1,2,Humidity,54.1,11:49:44,Library,ESP32WOFA,12/04/2024
2,3,Humidity,54.1,11:49:44,Library,ESP32WOFA,12/04/2024
3,4,LDR,25.61,11:49:44,Library,ESP32WOFA,12/04/2024
4,5,Temperature,27.2,11:49:46,Library,ESP32WOFA,12/04/2024


In [None]:
df.sample()

Unnamed: 0,ID,SensorName,SensorReading,TimeEntered,Location,SmartObject,DateEntered
1449,1450,Temperature,36.6,12:13:54,HangOut,ESP32ALLOY,12/04/2024


## Clean the data

In [None]:
mask = df['SensorReading'] <= 500000
df = df[mask]
df.info()                 # Confirm the drop occured

<class 'pandas.core.frame.DataFrame'>
Index: 2785 entries, 0 to 3058
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             2785 non-null   int64  
 1   SensorName     2785 non-null   object 
 2   SensorReading  2785 non-null   float64
 3   TimeEntered    2785 non-null   object 
 4   Location       2785 non-null   object 
 5   SmartObject    2785 non-null   object 
 6   DateEntered    2785 non-null   object 
dtypes: float64(1), int64(1), object(5)
memory usage: 174.1+ KB


## Converting time to minutes

In [None]:
# Convert timestamp strings to datetime objects for extraction
df['timestamp'] = pd.to_datetime(df['TimeEntered'], format='%H:%M:%S')
df['Time'] = df['timestamp'].dt.time

# Drop specific feature columns
df = df.drop(columns=['timestamp'])
df = df.drop(columns=['TimeEntered'])
df = df.drop(columns=['DateEntered'])
df = df.drop(columns=['ID'])
df = df.drop(columns=['SmartObject'])

df.reset_index(inplace = True, drop = True)
df.head()

Unnamed: 0,SensorName,SensorReading,Location,Time
0,Temperature,27.2,Library,11:49:44
1,Humidity,54.1,Library,11:49:44
2,Humidity,54.1,Library,11:49:44
3,LDR,25.61,Library,11:49:44
4,Temperature,27.2,Library,11:49:46


In [None]:
# Convert Time objects into total seconds
time_list = list()
for entry in df['Time']:
    seconds = 0
    time = str(entry)
    time_components = time.split(':')
    seconds += int(time_components[0]) * 3600
    seconds += int(time_components[1]) * 60
    seconds += int(time_components[2])
    time_list.append(seconds)
df['Time'] = time_list
df

Unnamed: 0,SensorName,SensorReading,Location,Time
0,Temperature,27.20,Library,42584
1,Humidity,54.10,Library,42584
2,Humidity,54.10,Library,42584
3,LDR,25.61,Library,42584
4,Temperature,27.20,Library,42586
...,...,...,...,...
2780,LDR,12.06,Library,45616
2781,Humidity,57.10,Library,45618
2782,Temperature,26.20,Library,45622
2783,LDR,12.16,Library,45622


In [None]:
# Filter for 'Library' location and pivot the data to group Temperature, Humidity, and LDR readings by unique timestamp
val = [0,0,0]
actual = {}

for i in range(len(df)):
    time = df['Time'][i]
    actual[time] = [0,0,0]

for i in range(len(df)):
    time = df['Time'][i]
    sensor = df['SensorName'][i]
    location = df['Location'][i]
    if location == "Library":
        if sensor == "Temperature":
            tlist = actual[time]
            tlist[0] = df['SensorReading'][i]
            actual[time] = tlist
        elif sensor == "Humidity":
            tlist = actual[time]
            tlist[1] = df['SensorReading'][i]
            actual[time] = tlist
        elif sensor == "LDR":
            tlist = actual[time]
            tlist[2] = df['SensorReading'][i]
            actual[time] = tlist

actual

{42584: [27.2, 54.1, 25.61],
 42586: [27.2, 0, 0],
 42587: [0, 54.3, 25.81],
 42589: [0, 54.6, 0],
 42592: [27.2, 0, 0],
 42593: [0, 55.0, 15.28],
 42596: [0, 55.4, 0],
 42598: [27.2, 0, 0],
 42599: [0, 55.5, 19.97],
 42602: [0, 55.7, 0],
 42605: [27.2, 55.8, 18.52],
 42607: [0, 56.0, 0],
 42610: [27.2, 56.1, 21.12],
 42613: [0, 56.2, 0],
 42616: [27.2, 0, 0],
 42617: [0, 56.2, 18.23],
 42621: [0, 55.7, 0],
 42622: [27.2, 0, 18.02],
 42623: [0, 55.7, 0],
 42625: [0, 55.9, 0],
 42628: [27.2, 0, 0],
 42629: [0, 55.9, 21.53],
 42632: [0, 55.8, 0],
 42634: [27.2, 0, 0],
 42635: [0, 55.7, 22.75],
 42638: [0, 55.6, 0],
 42640: [27.2, 0, 0],
 42641: [0, 55.7, 22.75],
 42643: [0, 55.7, 0],
 42646: [27.2, 55.8, 23.57],
 42649: [0, 55.8, 0],
 42652: [27.2, 0, 0],
 42653: [0, 55.8, 22.75],
 42656: [0, 55.5, 0],
 42658: [27.2, 55.2, 22.75],
 42662: [0, 55.0, 0],
 42664: [27.2, 0, 0],
 42671: [27.2, 54.9, 23.88],
 42674: [0, 54.9, 0],
 42676: [27.2, 0, 0],
 42677: [0, 54.9, 21.78],
 42679: [0, 55.0

In [None]:
# Transform the aggregated dictionary data into a structured DataFrame with columns for each sensor
actual_df = pd.DataFrame()
values = list(actual.values())
temp = [0] * len(values)
hum = [0] * len(values)
ldr = [0] * len(values)
for i in range(len(values)):
    temp[i] = values[i][0]
    hum[i] = values[i][1]
    ldr[i] = values[i][2]

actual_df['Time'] = actual.keys()
actual_df['Temperature'] = temp
actual_df['Humidity'] = hum
actual_df['Light Intensity'] = ldr

actual_df

Unnamed: 0,Time,Temperature,Humidity,Light Intensity
0,42584,27.2,54.1,25.61
1,42586,27.2,0.0,0.00
2,42587,0.0,54.3,25.81
3,42589,0.0,54.6,0.00
4,42592,27.2,0.0,0.00
...,...,...,...,...
1409,45612,0.0,57.7,0.00
1410,45615,26.2,0.0,0.00
1411,45616,0.0,57.3,12.06
1412,45618,0.0,57.1,0.00


In [None]:
# Filter out incomplete Temperature sensor reading rows
mask = actual_df['Temperature'] != 0
df_mod = actual_df[mask]
df_mod.reset_index(inplace = True, drop = True)
df_mod

Unnamed: 0,Time,Temperature,Humidity,Light Intensity
0,42584,27.2,54.1,25.61
1,42586,27.2,0.0,0.00
2,42592,27.2,0.0,0.00
3,42598,27.2,0.0,0.00
4,42605,27.2,55.8,18.52
...,...,...,...,...
378,44876,26.4,0.0,0.00
379,44883,26.4,0.0,0.00
380,45610,26.2,56.8,12.30
381,45615,26.2,0.0,0.00


In [None]:
# Filter out incomplete Humidity sensor reading rows
mask = df_mod['Humidity'] != 0
df_mod = df_mod[mask]
df_mod.reset_index(inplace = True, drop = True)
df_mod

Unnamed: 0,Time,Temperature,Humidity,Light Intensity
0,42584,27.2,54.1,25.61
1,42605,27.2,55.8,18.52
2,42610,27.2,56.1,21.12
3,42646,27.2,55.8,23.57
4,42658,27.2,55.2,22.75
...,...,...,...,...
197,44859,26.4,58.7,16.59
198,44865,26.5,58.7,16.45
199,44871,26.4,58.2,15.64
200,45610,26.2,56.8,12.30


In [None]:
# Filter out incomplete LDR sensor reading rows
mask = df_mod['Light Intensity'] != 0
df_mod = df_mod[mask]
df_mod.reset_index(inplace = True, drop = True)
df_mod

Unnamed: 0,Time,Temperature,Humidity,Light Intensity
0,42584,27.2,54.1,25.61
1,42605,27.2,55.8,18.52
2,42610,27.2,56.1,21.12
3,42646,27.2,55.8,23.57
4,42658,27.2,55.2,22.75
...,...,...,...,...
190,44859,26.4,58.7,16.59
191,44865,26.5,58.7,16.45
192,44871,26.4,58.2,15.64
193,45610,26.2,56.8,12.30


### Checking correlation

In [None]:
# Exploring the data
df_mod.corr()

Unnamed: 0,Time,Temperature,Humidity,Light Intensity
Time,1.0,-0.743567,0.46184,-0.824945
Temperature,-0.743567,1.0,-0.751214,0.62537
Humidity,0.46184,-0.751214,1.0,-0.401704
Light Intensity,-0.824945,0.62537,-0.401704,1.0


# Train the model

### Creating test and training datasets

In [293]:
# Create output set
y = df_mod['Temperature'].values
y

array([27.2, 27.2, 27.2, 27.2, 27.2, 27.2, 27.2, 27.2, 27.2, 27.1, 27.1,
       27. , 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 27. , 27.1,
       27.1, 27.1, 27.1, 27.1, 27.1, 27.1, 27.1, 27. , 26.9, 26.9, 26.9,
       26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9,
       26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9,
       26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 27. ,
       27. , 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 27. , 27. , 27. ,
       26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9,
       26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 26.9, 27. ,
       27. , 27. , 27. , 27. , 27. , 27.1, 27.1, 27.1, 27.1, 27.1, 27.1,
       27.1, 27.1, 27.1, 27.1, 27. , 27.1, 27. , 27. , 27. , 27. , 26.9,
       26.9, 26.9, 26.9, 26.9, 26.9, 27. , 27. , 27. , 27. , 27. , 27. ,
       27. , 27. , 27. , 27. , 27. , 26.9, 26.9, 26.9, 26.9, 26.9, 26.9,
       26.9, 26.9, 26.9, 26.9, 26.9, 26.8, 26.8, 26

In [None]:
# Create input set
df_modx = df_mod.drop(columns=['Temperature'])
X = df_modx.values
X

array([[4.2584e+04, 5.4100e+01, 2.5610e+01],
       [4.2605e+04, 5.5800e+01, 1.8520e+01],
       [4.2610e+04, 5.6100e+01, 2.1120e+01],
       [4.2646e+04, 5.5800e+01, 2.3570e+01],
       [4.2658e+04, 5.5200e+01, 2.2750e+01],
       [4.2671e+04, 5.4900e+01, 2.3880e+01],
       [4.2682e+04, 5.5100e+01, 2.3780e+01],
       [4.2718e+04, 5.5100e+01, 2.9460e+01],
       [4.2730e+04, 5.5100e+01, 2.9370e+01],
       [4.2766e+04, 5.5400e+01, 2.4460e+01],
       [4.2790e+04, 5.4900e+01, 2.5770e+01],
       [4.2857e+04, 5.7000e+01, 2.5310e+01],
       [4.2910e+04, 5.6300e+01, 2.7620e+01],
       [4.2940e+04, 5.5900e+01, 2.4200e+01],
       [4.2952e+04, 5.5800e+01, 2.3740e+01],
       [4.2961e+04, 5.6100e+01, 2.3670e+01],
       [4.3018e+04, 5.7200e+01, 2.5610e+01],
       [4.3072e+04, 5.6800e+01, 2.0700e+01],
       [4.3078e+04, 5.6700e+01, 1.9270e+01],
       [4.3127e+04, 5.7300e+01, 2.2750e+01],
       [4.3180e+04, 5.6300e+01, 2.4600e+01],
       [4.3211e+04, 5.5800e+01, 2.2360e+01],
       [4.

In [None]:
# Splitting dataset into training set & test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) # 20% test size

In [None]:
# Fitting a Linear Regression model to the training set
regr = LinearRegression()
model = regr.fit(X_train, y_train)

In [None]:
# Evaluating the model using MSE criterion
y_pred = regr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE): ", mse)

Mean Squared Error (MSE):  0.011722136310445774


### Mean Absolute Percentage Error

In [None]:
# Mean absolute percentage error
actual, pred = np.array(y_test), np.array(y_pred)
mape= np.mean(np.abs((actual - pred) / actual)) * 100
print(f"mean absolute percentage error = {mape*100:.2f}%")

mean absolute percentage error = 32.54%


In [None]:
# Other measures: rmse, r-squre, mae
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Mean absolute error: {mae:.2f}')    # absolute deviation of predicted value from true value
print(f'Mean squared error: {mse:.2f}')
print(f'Root mean squared error: {rmse:.2f}')

Mean absolute error: 0.09
Mean squared error: 0.01
Root mean squared error: 0.11


In [None]:
# Regressor score
r2 = regr.score(X_test, y_test)
print(f"The R2 score for test data is {r2:.2f}")

# Regressor score for trained data is
r2_training = regr.score(X_train, y_train)
print(f"The R2 score for test data is {r2_training:.2f}")

The R2 score for test data is 0.75
The R2 score for test data is 0.77
