In [None]:
import pandas as pd 
import plotly.express as px
import plotly.io as pio

# Make 'svg' the default plotly renderer used by fig.show(), with the svg
# renderer's engine set to Kaleido.
# NOTE(review): presumably chosen so figures are stored as static images in the
# saved notebook — confirm; the kaleido package must be installed for this to work.
pio.renderers['svg'].engine = 'kaleido'
pio.renderers.default = 'svg'

In [None]:
# Load the three small per-datatype exports into memory.
# NOTE(review): none of these frames is referenced by the cells visible in this
# notebook section — confirm they are still needed.
homie_bool = pd.read_csv('data/homie_boolean.csv')
homie_color = pd.read_csv('data/homie_color.csv')
homie_enum = pd.read_csv('data/homie_enum.csv')

In [None]:
# Warning: don't load these files; they are huge!
# homie_float = pd.read_csv('data/homie_float.csv')
# homie_integer = pd.read_csv('data/homie_integer.csv')

**Useful information about the dataset:** <br>

There are between 4 and 10 data points per sensor per minute, depending on how often the sensor gets polled (roughly 10K data points per sensor in a 24-hour period).

The CSV files are split by data type:
- homie_boolean
- homie_color
- homie_enum
- homie_float: contains all metrics stored as floats (temperature)
- homie_integer: contains all metrics stored as integers (humidity %, battery level %)

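Dataset columns:
- time: timestamp measured from the Unix epoch (1970); pandas handles the conversion for us (`pd.to_datetime`).
- device_id
- device_name: only use rows whose device name contains "raspberry pi" or "cottage pi"
- node_id: MAC address of the sensor
- node_name: name of the sensor (e.g. "Living room shelves")
- node_type: `node_type == "Mijia sensor"` -> useful filter?
- property_name: name of the measured property (e.g. "Humidity" in homie_integer)
- value: the recorded reading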




In [None]:
def downsample_mijia_temp_data(csv_path='data/homie_float.csv', freq='1min'):
    """Load float readings and average Mijia temperatures per sensor and time bin.

    Parameters
    ----------
    csv_path : str or path-like, default 'data/homie_float.csv'
        CSV file with at least the columns ``time``, ``node_type``,
        ``node_name`` and ``value``.
    freq : str, default '1min'
        Bin width handed to ``resample``. The rest of this notebook (merge on
        ``time``, row-based differential) is written for the default.

    Returns
    -------
    pandas.DataFrame
        One row per (``node_name``, time bin) with columns ``node_name``,
        ``time`` and ``temperature`` (mean of ``value`` in the bin). Bins
        without readings inside a sensor's time range appear with NaN.
    """
    readings = pd.read_csv(csv_path)
    # NOTE(review): if `time` holds raw epoch integers, to_datetime interprets
    # them as nanoseconds by default — confirm the unit of the export.
    readings['time'] = pd.to_datetime(readings['time'])
    mijia = readings.loc[readings['node_type'] == 'Mijia sensor']
    # resample() on a time index yields a regular grid of bins for each sensor.
    per_bin_mean = (
        mijia.set_index('time')
        .groupby(['node_name'])
        .resample(freq)['value']
        .mean()
        .reset_index()
    )
    # The float csv file only contains temperature data
    return per_bin_mean.rename(columns={'value': 'temperature'})

In [None]:
def downsample_mijia_humidity_data(csv_path='data/homie_integer.csv', freq='1min'):
    """Load integer readings and take the median Mijia humidity per sensor and time bin.

    Parameters
    ----------
    csv_path : str or path-like, default 'data/homie_integer.csv'
        CSV file with at least the columns ``time``, ``node_type``,
        ``node_name``, ``property_name`` and ``value``.
    freq : str, default '1min'
        Bin width handed to ``resample``. The rest of this notebook (merge on
        ``time``, row-based differential) is written for the default.

    Returns
    -------
    pandas.DataFrame
        One row per (``node_name``, time bin) with columns ``node_name``,
        ``time`` and ``humidity`` (median of ``value`` in the bin). Bins
        without readings inside a sensor's time range appear with NaN.
    """
    readings = pd.read_csv(csv_path)
    # NOTE(review): if `time` holds raw epoch integers, to_datetime interprets
    # them as nanoseconds by default — confirm the unit of the export.
    readings['time'] = pd.to_datetime(readings['time'])
    # The integer file mixes several properties, so keep only humidity rows
    # reported by Mijia sensors.
    is_mijia_humidity = (
        (readings['node_type'] == 'Mijia sensor')
        & (readings['property_name'] == 'Humidity')
    )
    humidity_rows = readings.loc[is_mijia_humidity]
    # NOTE(review): median here, whereas the temperature counterpart uses the
    # mean — confirm the difference is intentional.
    per_bin_median = (
        humidity_rows.set_index('time')
        .groupby(['node_name'])
        .resample(freq)['value']
        .median()
        .reset_index()
    )
    return per_bin_median.rename(columns={'value': 'humidity'})


In [None]:
def build_dataset():
    """Combine the downsampled temperature and humidity frames into one table.

    Returns
    -------
    pandas.DataFrame
        Rows matched on ``node_name`` and ``time``, carrying the columns of
        both downsampled frames.
    """
    join_keys = ['node_name', 'time']
    temperature = downsample_mijia_temp_data()
    humidity = downsample_mijia_humidity_data()
    # No `how` is given, so this is pandas' default inner join: a
    # (sensor, time) pair present in only one frame is dropped.
    return temperature.merge(humidity, on=join_keys)

In [None]:
import cache_magic

In [None]:
# NOTE(review): presumably cache_magic persists `dataset` so that re-running
# this cell skips the expensive build_dataset() call — confirm how and when the
# cached value is invalidated.
%cache dataset = build_dataset()

In [None]:
def calculate_differential_inplace(df, number_minutes):
    """Add a temperature rate-of-change column to ``df``, modifying it in place.

    For every row the temperature found ``number_minutes`` rows earlier for the
    same ``node_name`` is subtracted from the current temperature and the
    difference is divided by ``number_minutes``. The result is written to the
    column ``f'{number_minutes}min_differential'``; rows without an earlier
    value get NaN. Nothing is returned.

    The shift is counted in rows, not in time, so the result is only a
    per-minute rate when each sensor has exactly one row per minute in
    chronological order.
    """
    column = f'{number_minutes}min_differential'
    earlier = df.groupby('node_name')['temperature'].shift(number_minutes)
    change = df['temperature'].sub(earlier)
    df[column] = change.div(number_minutes)

In [None]:
# Adds the '10min_differential' column to `dataset` in place; plot_temp_variations
# plots it on the x axis.
calculate_differential_inplace(dataset, 10)

In [None]:
def plot_temp_variations(data, node_name='Living room shelves',
                         start='2020-12-01 00:00:00.001',
                         end='2020-12-02 00:00:00.001',
                         differential_col='10min_differential'):
    """Scatter temperature against its differential for one sensor and time window.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``time``, ``node_name``, ``temperature`` and the column
        named by ``differential_col``.
    node_name : str, default 'Living room shelves'
        Sensor whose rows are plotted.
    start, end : str or datetime-like
        Exclusive bounds compared against ``data['time']``.
    differential_col : str, default '10min_differential'
        Column plotted on the x axis.

    Returns
    -------
    Whatever ``fig.show()`` returns; the figure is displayed as a side effect.
    """
    in_window = (
        (data['time'] > start)
        & (data['time'] < end)
        & (data['node_name'] == node_name)
    )
    df = data.loc[in_window].copy()
    # Timestamp.value is the integer nanosecond count since the epoch; it is
    # used as a numeric colour scale so points are shaded by time.
    df['time_elapsed'] = df['time'].apply(lambda t: t.value)

    fig = px.scatter(df, x=differential_col, y="temperature", color='time_elapsed',
                     title='temperature vs temperature differential', hover_name='time')
    return fig.show()

# Draw the temperature-vs-differential scatter for `dataset`.
plot_temp_variations(dataset)