In [7]:
# Libraries
import pandas as pd
import plotly.express as px
from datetime import datetime
from darts import TimeSeries
import plotly.graph_objs as go
from darts.models import ExponentialSmoothing

In [8]:
df=pd.read_csv('sensor_data.csv')
df.head()

Unnamed: 0,time,SensorA,SensorB,SensorC
0,00:00,1.41625,4.21993,3.139646
1,00:01,3.534439,,3.064088
2,00:02,5.659733,,2.925565
3,00:03,5.640167,3.07365,5.38121
4,00:04,4.454474,1.554044,1.055965


We see this start at midnight and ends 24h later.

In [9]:
df.tail()

Unnamed: 0,time,SensorA,SensorB,SensorC
1435,23:55,8.417096,4.686364,9.014305
1436,23:56,12.413407,5.709658,7.005713
1437,23:57,11.400624,7.063344,7.522029
1438,23:58,9.707463,7.903916,
1439,23:59,11.220124,8.270642,6.609032


In [10]:
#Overview
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1440 entries, 0 to 1439
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   time     1440 non-null   object 
 1   SensorA  1303 non-null   float64
 2   SensorB  1205 non-null   float64
 3   SensorC  1392 non-null   float64
dtypes: float64(3), object(1)
memory usage: 45.1+ KB


We can see some missing values in each sensor columns.
We also have a time column.
We can interpret this dataset in different ways. We do not have so much information so we can either say all the sensors are on the same spot and we can do the average of the 3. Or we can treat them individually assuming they are placed on different spots.

From the graph we can see the diffrence in measurements for the same time. So We hope those sensors are not based on the same spot...

So We will treat only the sensorA.

In [11]:
df.describe()

Unnamed: 0,SensorA,SensorB,SensorC
count,1303.0,1205.0,1392.0
mean,9.793641,3.215945,8.013604
std,2.038163,2.843696,2.138642
min,1.41625,-2.74,1.055965
25%,8.466171,1.114624,6.524029
50%,9.809673,3.210922,8.170571
75%,11.238712,5.464741,9.618772
max,18.107463,9.564,14.307993


To deal with missing values, replacing by the median or mean doest not seem to be a good option. moving average or mean between all the sensors for the same time...maybe..


In [12]:
sensorA= df.iloc[:,0:2]
sensorA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1440 entries, 0 to 1439
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   time     1440 non-null   object 
 1   SensorA  1303 non-null   float64
dtypes: float64(1), object(1)
memory usage: 22.6+ KB


In [13]:
sensorA['time']=pd.to_datetime(sensorA['time'])
sensorA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1440 entries, 0 to 1439
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   time     1440 non-null   datetime64[ns]
 1   SensorA  1303 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 22.6 KB


  sensorA['time']=pd.to_datetime(sensorA['time'])


In [14]:
sensorA

Unnamed: 0,time,SensorA
0,2024-05-27 00:00:00,1.416250
1,2024-05-27 00:01:00,3.534439
2,2024-05-27 00:02:00,5.659733
3,2024-05-27 00:03:00,5.640167
4,2024-05-27 00:04:00,4.454474
...,...,...
1435,2024-05-27 23:55:00,8.417096
1436,2024-05-27 23:56:00,12.413407
1437,2024-05-27 23:57:00,11.400624
1438,2024-05-27 23:58:00,9.707463


In [15]:
sensorA=sensorA.set_index(pd.DatetimeIndex(sensorA['time'])).drop(columns='time')
sensorA.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1440 entries, 2024-05-27 00:00:00 to 2024-05-27 23:59:00
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   SensorA  1303 non-null   float64
dtypes: float64(1)
memory usage: 22.5 KB


In [16]:
sensorA

Unnamed: 0_level_0,SensorA
time,Unnamed: 1_level_1
2024-05-27 00:00:00,1.416250
2024-05-27 00:01:00,3.534439
2024-05-27 00:02:00,5.659733
2024-05-27 00:03:00,5.640167
2024-05-27 00:04:00,4.454474
...,...
2024-05-27 23:55:00,8.417096
2024-05-27 23:56:00,12.413407
2024-05-27 23:57:00,11.400624
2024-05-27 23:58:00,9.707463


# Resample

We are resampling the datas doing the mean of the 5 min before the label including the label in the mean.

In [17]:
sensorA_res=sensorA.resample('5min',label='right',closed='right').mean()
sensorA_res

Unnamed: 0_level_0,SensorA
time,Unnamed: 1_level_1
2024-05-27 00:00:00,1.416250
2024-05-27 00:05:00,5.033728
2024-05-27 00:10:00,5.341291
2024-05-27 00:15:00,4.644082
2024-05-27 00:20:00,7.629764
...,...
2024-05-27 23:40:00,11.322034
2024-05-27 23:45:00,9.987526
2024-05-27 23:50:00,11.059341
2024-05-27 23:55:00,10.571256


In [18]:
sensorA_res.info()
#No missing values.

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 289 entries, 2024-05-27 00:00:00 to 2024-05-28 00:00:00
Freq: 5min
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   SensorA  289 non-null    float64
dtypes: float64(1)
memory usage: 4.5 KB


In [19]:
fig1=px.line(sensorA_res, y='SensorA')
fig1.show()

We can see an initial increase from 0.00 to 0.50. Then it stay more or less constant and we see another big increase at 17.15.

Let's resample by hour.


In [20]:
sensorA_60res=sensorA.resample('60min',label='right',closed='right').mean()
sensorA_60res

Unnamed: 0_level_0,SensorA
time,Unnamed: 1_level_1
2024-05-27 00:00:00,1.41625
2024-05-27 01:00:00,7.83274
2024-05-27 02:00:00,9.582315
2024-05-27 03:00:00,10.339462
2024-05-27 04:00:00,10.362713
2024-05-27 05:00:00,9.910676
2024-05-27 06:00:00,10.179804
2024-05-27 07:00:00,9.771181
2024-05-27 08:00:00,9.684664
2024-05-27 09:00:00,9.646544


In [21]:
fig2=px.line(sensorA_60res,y='SensorA')
fig2.show()

# Three sensors together

In [22]:
df

Unnamed: 0,time,SensorA,SensorB,SensorC
0,00:00,1.416250,4.219930,3.139646
1,00:01,3.534439,,3.064088
2,00:02,5.659733,,2.925565
3,00:03,5.640167,3.073650,5.381210
4,00:04,4.454474,1.554044,1.055965
...,...,...,...,...
1435,23:55,8.417096,4.686364,9.014305
1436,23:56,12.413407,5.709658,7.005713
1437,23:57,11.400624,7.063344,7.522029
1438,23:58,9.707463,7.903916,


In [23]:
df=df.set_index(pd.DatetimeIndex(df['time'])).drop(columns='time')
df_res=df.resample('5min',label='right',closed='right').mean()
df_60res=df.resample('60min',label='right',closed='right').mean()
fig3=px.line(df_60res)
fig3.show()

In [24]:

fig4=px.line(df_res)
fig4.show()

In [25]:
fig4=px.box(df)
fig4.show()

# Normality of data

Unnamed: 0_level_0,SensorA,SensorB,SensorC
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-05-27 00:00:00,1.416250,4.219930,3.139646
2024-05-27 00:01:00,3.534439,,3.064088
2024-05-27 00:02:00,5.659733,,2.925565
2024-05-27 00:03:00,5.640167,3.073650,5.381210
2024-05-27 00:04:00,4.454474,1.554044,1.055965
...,...,...,...
2024-05-27 23:55:00,8.417096,4.686364,9.014305
2024-05-27 23:56:00,12.413407,5.709658,7.005713
2024-05-27 23:57:00,11.400624,7.063344,7.522029
2024-05-27 23:58:00,9.707463,7.903916,


In [28]:
fig5 = px.histogram(df)
fig5.show()

In [38]:
df_res

Unnamed: 0_level_0,SensorA,SensorB,SensorC
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-05-27 00:00:00,1.416250,4.219930,3.139646
2024-05-27 00:05:00,5.033728,1.576916,3.445751
2024-05-27 00:10:00,5.341291,1.645510,5.313207
2024-05-27 00:15:00,4.644082,3.738421,3.391507
2024-05-27 00:20:00,7.629764,0.643676,4.323393
...,...,...,...
2024-05-27 23:40:00,11.322034,8.038819,9.251055
2024-05-27 23:45:00,9.987526,6.086590,6.975805
2024-05-27 23:50:00,11.059341,8.505761,8.286427
2024-05-27 23:55:00,10.571256,5.378183,8.937488


In [39]:
hist_data= df_res.reset_index().drop(labels='time',axis=1)
hist_data

Unnamed: 0,SensorA,SensorB,SensorC
0,1.416250,4.219930,3.139646
1,5.033728,1.576916,3.445751
2,5.341291,1.645510,5.313207
3,4.644082,3.738421,3.391507
4,7.629764,0.643676,4.323393
...,...,...,...
284,11.322034,8.038819,9.251055
285,9.987526,6.086590,6.975805
286,11.059341,8.505761,8.286427
287,10.571256,5.378183,8.937488


In [40]:
data= [hist_data['SensorA'],hist_data['SensorB'],hist_data['SensorC']]

In [41]:
import plotly.figure_factory as ff
fig6= ff.create_distplot(hist_data=data,group_labels=['A','B','C'])
fig6.show()

We can see that Sensor A seems to be following data normally distributed whereas the other 2 no.

# Kruskal test 

In [45]:
from scipy import stats
data1= list(hist_data['SensorA'])
data2=list(hist_data['SensorB'])
data3=list(hist_data['SensorC'])
stats.kruskal(data1,data2,data3)


KruskalResult(statistic=536.0784669871142, pvalue=3.908768593797289e-117)

p_value<0.05 >>>> We reject H0 so the median of the 3 groups are different.
We will perform the Dunn test as a post hoc test.


In [48]:
import scikit_posthocs as sp
glob_data=[data1,data2,data3]
sp.posthoc_dunn(glob_data,p_adjust='hommel')

Unnamed: 0,1,2,3
1,1.0,1.622972e-114,3.510995e-15
2,1.622972e-114,1.0,4.776706999999999e-50
3,3.510995e-15,4.776706999999999e-50,1.0


# Conclusion

In this small project we have used:
- venv
- git
- datetime series


The function resample() helped us to deal with the missing values averaging the measurements over different time windows.



