In [72]:
# Libraries
import pandas as pd
import plotly.express as px
from datetime import datetime
from darts import TimeSeries
import plotly.graph_objs as go
from darts.models import ExponentialSmoothing

In [73]:
df=pd.read_csv('sensor_data.csv')
df.head()

Unnamed: 0,time,SensorA,SensorB,SensorC
0,00:00,1.41625,4.21993,3.139646
1,00:01,3.534439,,3.064088
2,00:02,5.659733,,2.925565
3,00:03,5.640167,3.07365,5.38121
4,00:04,4.454474,1.554044,1.055965


In [74]:
df[['SensorA','SensorB','SensorC']]=df[['SensorA','SensorB','SensorC']].round(2)

In [75]:
df

Unnamed: 0,time,SensorA,SensorB,SensorC
0,00:00,1.42,4.22,3.14
1,00:01,3.53,,3.06
2,00:02,5.66,,2.93
3,00:03,5.64,3.07,5.38
4,00:04,4.45,1.55,1.06
...,...,...,...,...
1435,23:55,8.42,4.69,9.01
1436,23:56,12.41,5.71,7.01
1437,23:57,11.40,7.06,7.52
1438,23:58,9.71,7.90,


We see this start at midnight and ends 24h later.

In [76]:
df.tail()

Unnamed: 0,time,SensorA,SensorB,SensorC
1435,23:55,8.42,4.69,9.01
1436,23:56,12.41,5.71,7.01
1437,23:57,11.4,7.06,7.52
1438,23:58,9.71,7.9,
1439,23:59,11.22,8.27,6.61


In [77]:
#Overview
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1440 entries, 0 to 1439
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   time     1440 non-null   object 
 1   SensorA  1303 non-null   float64
 2   SensorB  1205 non-null   float64
 3   SensorC  1392 non-null   float64
dtypes: float64(3), object(1)
memory usage: 45.1+ KB


We can see some missing values in each sensor columns.
We also have a time column.
We can interpret this dataset in different ways. We do not have so much information so we can either say all the sensors are on the same spot and we can do the average of the 3. Or we can treat them individually assuming they are placed on different spots.

From the graph we can see the diffrence in measurements for the same time. So We hope those sensors are not based on the same spot...

So We will treat only the sensorA.

In [78]:
df.describe()

Unnamed: 0,SensorA,SensorB,SensorC
count,1303.0,1205.0,1392.0
mean,9.79353,3.215942,8.013534
std,2.038107,2.843564,2.138722
min,1.42,-2.74,1.06
25%,8.47,1.11,6.525
50%,9.81,3.21,8.17
75%,11.24,5.46,9.62
max,18.11,9.56,14.31


To deal with missing values, replacing by the median or mean doest not seem to be a good option. moving average or mean between all the sensors for the same time...maybe..


In [79]:
sensorA= df.iloc[:,0:2]
sensorA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1440 entries, 0 to 1439
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   time     1440 non-null   object 
 1   SensorA  1303 non-null   float64
dtypes: float64(1), object(1)
memory usage: 22.6+ KB


In [80]:
sensorA['time']=pd.to_datetime(sensorA['time'])
sensorA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1440 entries, 0 to 1439
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   time     1440 non-null   datetime64[ns]
 1   SensorA  1303 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 22.6 KB



Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



In [81]:
sensorA

Unnamed: 0,time,SensorA
0,2024-05-28 00:00:00,1.42
1,2024-05-28 00:01:00,3.53
2,2024-05-28 00:02:00,5.66
3,2024-05-28 00:03:00,5.64
4,2024-05-28 00:04:00,4.45
...,...,...
1435,2024-05-28 23:55:00,8.42
1436,2024-05-28 23:56:00,12.41
1437,2024-05-28 23:57:00,11.40
1438,2024-05-28 23:58:00,9.71


In [82]:
sensorA=sensorA.set_index(pd.DatetimeIndex(sensorA['time'])).drop(columns='time')
sensorA.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1440 entries, 2024-05-28 00:00:00 to 2024-05-28 23:59:00
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   SensorA  1303 non-null   float64
dtypes: float64(1)
memory usage: 22.5 KB


In [83]:
sensorA

Unnamed: 0_level_0,SensorA
time,Unnamed: 1_level_1
2024-05-28 00:00:00,1.42
2024-05-28 00:01:00,3.53
2024-05-28 00:02:00,5.66
2024-05-28 00:03:00,5.64
2024-05-28 00:04:00,4.45
...,...
2024-05-28 23:55:00,8.42
2024-05-28 23:56:00,12.41
2024-05-28 23:57:00,11.40
2024-05-28 23:58:00,9.71


# Resample

We are resampling the datas doing the mean of the 5 min before the label including the label in the mean.

In [84]:
sensorA_res=sensorA.resample('5min',label='right',closed='right').mean()
sensorA_res

Unnamed: 0_level_0,SensorA
time,Unnamed: 1_level_1
2024-05-28 00:00:00,1.4200
2024-05-28 00:05:00,5.0320
2024-05-28 00:10:00,5.3400
2024-05-28 00:15:00,4.6475
2024-05-28 00:20:00,7.6280
...,...
2024-05-28 23:40:00,11.3220
2024-05-28 23:45:00,9.9860
2024-05-28 23:50:00,11.0575
2024-05-28 23:55:00,10.5720


In [85]:
sensorA_res.info()
#No missing values.

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 289 entries, 2024-05-28 00:00:00 to 2024-05-29 00:00:00
Freq: 5min
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   SensorA  289 non-null    float64
dtypes: float64(1)
memory usage: 4.5 KB


In [86]:
fig1=px.line(sensorA_res, y='SensorA')
fig1.show()

We can see an initial increase from 0.00 to 0.50. Then it stay more or less constant and we see another big increase at 17.15.

Let's resample by hour.


In [87]:
sensorA_60res=sensorA.resample('60min',label='right',closed='right').mean()
sensorA_60res

Unnamed: 0_level_0,SensorA
time,Unnamed: 1_level_1
2024-05-28 00:00:00,1.42
2024-05-28 01:00:00,7.832857
2024-05-28 02:00:00,9.582
2024-05-28 03:00:00,10.34
2024-05-28 04:00:00,10.362407
2024-05-28 05:00:00,9.909811
2024-05-28 06:00:00,10.179811
2024-05-28 07:00:00,9.771429
2024-05-28 08:00:00,9.684545
2024-05-28 09:00:00,9.645636


In [88]:
fig2=px.line(sensorA_60res,y='SensorA')
fig2.show()

# Three sensors together

In [89]:
df

Unnamed: 0,time,SensorA,SensorB,SensorC
0,00:00,1.42,4.22,3.14
1,00:01,3.53,,3.06
2,00:02,5.66,,2.93
3,00:03,5.64,3.07,5.38
4,00:04,4.45,1.55,1.06
...,...,...,...,...
1435,23:55,8.42,4.69,9.01
1436,23:56,12.41,5.71,7.01
1437,23:57,11.40,7.06,7.52
1438,23:58,9.71,7.90,


In [90]:
df=df.set_index(pd.DatetimeIndex(df['time'])).drop(columns='time')
df_res=df.resample('5min',label='right',closed='right').mean()
df_60res=df.resample('60min',label='right',closed='right').mean()
fig3=px.line(df_60res)
fig3.show()

In [91]:

fig4=px.line(df_res)
fig4.show()

In [92]:
fig4=px.box(df)
fig4.show()

# Normality of data

In [93]:
fig5 = px.histogram(df)
fig5.show()

In [94]:
df_res=df_res.round(2)
df_res

Unnamed: 0_level_0,SensorA,SensorB,SensorC
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-05-28 00:00:00,1.42,4.22,3.14
2024-05-28 00:05:00,5.03,1.57,3.45
2024-05-28 00:10:00,5.34,1.65,5.31
2024-05-28 00:15:00,4.65,3.74,3.39
2024-05-28 00:20:00,7.63,0.64,4.32
...,...,...,...
2024-05-28 23:40:00,11.32,8.04,9.25
2024-05-28 23:45:00,9.99,6.09,6.98
2024-05-28 23:50:00,11.06,8.51,8.29
2024-05-28 23:55:00,10.57,5.38,8.94


# Mean by time windows(Night, Morning, afternoon, Evning)

In [95]:
df_res['hour']= df_res.index.hour
df_res

Unnamed: 0_level_0,SensorA,SensorB,SensorC,hour
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-05-28 00:00:00,1.42,4.22,3.14,0
2024-05-28 00:05:00,5.03,1.57,3.45,0
2024-05-28 00:10:00,5.34,1.65,5.31,0
2024-05-28 00:15:00,4.65,3.74,3.39,0
2024-05-28 00:20:00,7.63,0.64,4.32,0
...,...,...,...,...
2024-05-28 23:40:00,11.32,8.04,9.25,23
2024-05-28 23:45:00,9.99,6.09,6.98,23
2024-05-28 23:50:00,11.06,8.51,8.29,23
2024-05-28 23:55:00,10.57,5.38,8.94,23


In [96]:
def time_day(data_frame):
    if data_frame['hour'] <= 6 or data_frame['hour']>=21:
        val='Night'
    elif data_frame['hour']<=12:
        val='Morning'
    elif data_frame['hour']<=17:
        val='afternoon'
    else:
        val='Evening'
    return val


df_res['Time_day']=df_res.apply(time_day,axis=1)

df_res=df_res.drop(columns='hour')
df_res


Unnamed: 0_level_0,SensorA,SensorB,SensorC,Time_day
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-05-28 00:00:00,1.42,4.22,3.14,Night
2024-05-28 00:05:00,5.03,1.57,3.45,Night
2024-05-28 00:10:00,5.34,1.65,5.31,Night
2024-05-28 00:15:00,4.65,3.74,3.39,Night
2024-05-28 00:20:00,7.63,0.64,4.32,Night
...,...,...,...,...
2024-05-28 23:40:00,11.32,8.04,9.25,Night
2024-05-28 23:45:00,9.99,6.09,6.98,Night
2024-05-28 23:50:00,11.06,8.51,8.29,Night
2024-05-28 23:55:00,10.57,5.38,8.94,Night


In [97]:
#plot
fig_time=px.box(df_res,color='Time_day')
fig_time.show()

In [98]:
hist_data= df_res.reset_index().drop(labels='time',axis=1)
hist_data

Unnamed: 0,SensorA,SensorB,SensorC,Time_day
0,1.42,4.22,3.14,Night
1,5.03,1.57,3.45,Night
2,5.34,1.65,5.31,Night
3,4.65,3.74,3.39,Night
4,7.63,0.64,4.32,Night
...,...,...,...,...
284,11.32,8.04,9.25,Night
285,9.99,6.09,6.98,Night
286,11.06,8.51,8.29,Night
287,10.57,5.38,8.94,Night


In [99]:
data= [hist_data['SensorA'],hist_data['SensorB'],hist_data['SensorC']]

In [100]:
import plotly.figure_factory as ff
fig6= ff.create_distplot(hist_data=data,group_labels=['A','B','C'])
fig6.show()

We can see that Sensor A seems to be following data normally distributed whereas the other 2 no.

# Kruskal test 

In [101]:
from scipy import stats
data1= list(hist_data['SensorA'])
data2=list(hist_data['SensorB'])
data3=list(hist_data['SensorC'])
stats.kruskal(data1,data2,data3)


KruskalResult(statistic=536.0446190933018, pvalue=3.975483331928521e-117)

p_value<0.05 >>>> We reject H0 so the median of the 3 groups are different.
We will perform the Dunn test as a post hoc test.


In [102]:
import scikit_posthocs as sp
glob_data=[data1,data2,data3]
sp.posthoc_dunn(glob_data,p_adjust='hommel')

Unnamed: 0,1,2,3
1,1.0,1.658549e-114,3.55274e-15
2,1.658549e-114,1.0,4.738978999999999e-50
3,3.55274e-15,4.738978999999999e-50,1.0


# Conclusion

In this small project we have used:
- venv
- git
- datetime series


The function resample() helped us to deal with the missing values averaging the measurements over different time windows.



