In [None]:
import pandas as pd 
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Load the three small per-data-type exports in one go; the unpacking order
# matches the order of the paths.
homie_bool, homie_color, homie_enum = map(
    pd.read_csv,
    ('data/homie_boolean.csv', 'data/homie_color.csv', 'data/homie_enum.csv'),
)

In [None]:
# Warning: don't load these files here - they are huge!
# homie_float = pd.read_csv('data/homie_float.csv')
# homie_integer = pd.read_csv('data/homie_integer.csv')

**Useful Information about dataset:** <br>

There are between 4 and 10 data points per sensor per minute, depending on how often the sensor is polled (roughly 10K data points per sensor in a 24-hour period).

The CSV files are split by data type:
- homie_boolean
- homie_enum
- homie_float: contains all metrics stored as floats (temperature)
- homie_integer: contains all metrics stored as integers (humidity %, battery level %)

Dataset columns:
- time: timestamp measured from the Unix epoch (1970-01-01); pandas handles the conversion for us.
- device_id 
- device_name: only use rows whose device name contains "raspberry pi" or "cottage pi"
- node_id: MAC address of the sensor
- node_type: `node_type == "Mijia sensor"` -> possibly a useful filter?




In [None]:
def downsample_mijia_temp_data(csv_path='data/homie_float.csv', freq='1min'):
    """Load Mijia temperature readings and average them per sensor and time bin.

    Parameters
    ----------
    csv_path : str or path-like, default 'data/homie_float.csv'
        Location of the float-valued export.
    freq : str, default '1min'
        Pandas offset alias giving the width of each resampling bin.

    Returns
    -------
    pandas.DataFrame
        One row per (node_name, time bin) with the columns ``node_name``,
        ``time`` and ``temperature``. ``temperature`` is the mean of ``value``
        inside the bin and is NaN for bins without any reading.
    """
    # The file is huge, so only parse the columns this function actually uses.
    needed_columns = ['time', 'node_type', 'node_name', 'value']
    homie_float = pd.read_csv(csv_path, usecols=needed_columns)
    # NOTE(review): pd.to_datetime reads bare integers as nanoseconds since the
    # Unix epoch - confirm that this matches the unit stored in the CSV.
    homie_float['time'] = pd.to_datetime(homie_float['time'])
    sensor_data = homie_float.loc[homie_float['node_type'] == 'Mijia sensor']
    downsampled = (
        sensor_data
        .set_index('time')
        .groupby(['node_name'])
        .resample(freq)['value']
        .mean()
        .reset_index()
    )
    # The float csv file only contains temperature data
    return downsampled.rename(columns={'value': 'temperature'})

In [None]:
def downsample_mijia_humidity_data(csv_path='data/homie_integer.csv', freq='1min'):
    """Load Mijia humidity readings and take the median per sensor and time bin.

    Parameters
    ----------
    csv_path : str or path-like, default 'data/homie_integer.csv'
        Location of the integer-valued export.
    freq : str, default '1min'
        Pandas offset alias giving the width of each resampling bin.

    Returns
    -------
    pandas.DataFrame
        One row per (node_name, time bin) with the columns ``node_name``,
        ``time`` and ``humidity``. ``humidity`` is the median of ``value``
        inside the bin and is NaN for bins without any reading.
    """
    # The file is huge, so only parse the columns this function actually uses.
    needed_columns = ['time', 'node_type', 'property_name', 'node_name', 'value']
    homie_integer = pd.read_csv(csv_path, usecols=needed_columns)
    # NOTE(review): pd.to_datetime reads bare integers as nanoseconds since the
    # Unix epoch - confirm that this matches the unit stored in the CSV.
    homie_integer['time'] = pd.to_datetime(homie_integer['time'])
    # The integer file mixes several properties, so keep only the humidity
    # rows reported by Mijia sensors.
    is_mijia = homie_integer['node_type'] == 'Mijia sensor'
    is_humidity = homie_integer['property_name'] == 'Humidity'
    sensor_data = homie_integer.loc[is_mijia & is_humidity]
    downsampled = (
        sensor_data
        .set_index('time')
        .groupby(['node_name'])
        .resample(freq)['value']
        .median()
        .reset_index()
    )
    return downsampled.rename(columns={'value': 'humidity'})


In [None]:
def build_dataset():
    """Join the downsampled temperature and humidity tables.

    Returns
    -------
    pandas.DataFrame
        Rows that share the same ``node_name`` and ``time`` in both tables
        (the merge uses pandas' default join type).
    """
    temperature_by_minute = downsample_mijia_temp_data()
    humidity_by_minute = downsample_mijia_humidity_data()
    return temperature_by_minute.merge(humidity_by_minute, on=['node_name', 'time'])

In [None]:
# Build the merged temperature/humidity table; this reads both large CSV files.
dataset = build_dataset()

In [None]:
def calculate_differential_inplace(df, number_minutes):
    """Add a per-sensor temperature rate-of-change column to ``df`` in place.

    The new column is named ``f'{number_minutes}min_differential'`` and holds
    ``(temperature - temperature number_minutes rows earlier) / number_minutes``,
    computed separately for every ``node_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``node_name`` and ``temperature`` columns; it is mutated.
    number_minutes : int
        Number of rows to look back within each sensor's series.

    Returns
    -------
    None
    """
    column_name = f'{number_minutes}min_differential'
    # shift() moves by rows, not by clock time, so the result is a per-minute
    # rate only when each sensor has one row per minute in time order.
    # NOTE(review): confirm that this holds for the frame passed in.
    earlier_temperature = df.groupby('node_name')['temperature'].shift(number_minutes)
    temperature_change = df['temperature'].sub(earlier_temperature)
    df[column_name] = temperature_change.div(number_minutes)

In [None]:
# Adds the '10min_differential' column to `dataset` in place.
calculate_differential_inplace(dataset, 10)

In [None]:
def plot_temp_variations(data, node_name='Living room shelves',
                         start='2020-12-01 00:00:00.001',
                         end='2020-12-02 00:00:00.001',
                         differential_column='10min_differential'):
    """Scatter one sensor's temperature against its temperature differential.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``time``, ``node_name``, ``temperature`` and the column
        named by ``differential_column``.
    node_name : str, default 'Living room shelves'
        Sensor whose readings are plotted.
    start, end : str or datetime-like
        Exclusive bounds of the time window to plot.
    differential_column : str, default '10min_differential'
        Column used for the x axis.

    Returns
    -------
    None
        The figure is rendered through ``fig.show()``.
    """
    in_window = (data['time'] > start) & (data['time'] < end)
    df = data.loc[in_window & (data['node_name'] == node_name)].copy()
    # Timestamp.value is the integer nanosecond count; the numeric column is
    # what the points are coloured by.
    df['time_elapsed'] = df['time'].apply(lambda t: t.value)

    fig = px.scatter(df, x=differential_column, y="temperature", color='time_elapsed',
                     title='temperature vs temperature differential', hover_name='time')
    return fig.show()

# Render the scatter plot from the merged dataset.
plot_temp_variations(dataset)

In [None]:
def plot_boxplots_per_sensor(df, exclude=("Table dangly", "Outside chair", "Fridge drawer",
                                          "Fridge door", "2AA3D2", "392F3E",
                                          "Tree top", "Tree bottom")):
    """Box-plot the first principal component of temperature/humidity per sensor.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``node_name``, ``temperature`` and ``humidity`` columns.
    exclude : iterable of str
        Sensor names to leave out of the plot.

    Returns
    -------
    None
        The figure is rendered through ``fig.show()``.
    """
    features = ['temperature', 'humidity']
    # Only drop rows that miss one of the PCA inputs. A bare dropna() would
    # also discard rows because of NaNs in unrelated columns (for example a
    # differential column added to the frame by an earlier cell).
    data = df[~df['node_name'].isin(list(exclude))].dropna(subset=features).copy()

    # Standardise the features so that both contribute on the same scale.
    scaled_features = StandardScaler().fit_transform(data[features].values)

    pca = PCA(n_components=1)
    # fit_transform returns an (n_rows, 1) array; take the single column so a
    # flat vector is assigned to the new column.
    data['PCA'] = pca.fit_transform(scaled_features)[:, 0]

    fig = px.box(data, y="PCA", x='node_name')
    return fig.show()

#plot_boxplots_per_sensor(dataset)

In [None]:
def plot_week_month_comparison(df, months=(1, 4),
                               exclude=("Table dangly", "Outside chair", "Fridge drawer",
                                        "Fridge door", "2AA3D2", "392F3E",
                                        "Tree top", "Tree bottom")):
    """Plot the average daily temperature profile per weekday, one facet per month.

    Temperatures of all remaining sensors are averaged into 30-minute bins,
    then averaged again per (time of day, weekday, month).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``time`` (datetime), ``node_name`` and ``temperature``.
    months : iterable of int, default (1, 4)
        Calendar month numbers to show, one facet row each.
    exclude : iterable of str
        Sensor names to leave out of the averages.

    Returns
    -------
    None
        The figure is rendered through ``fig.show()``.
    """
    # Only temperature is used below, so only a missing temperature should
    # disqualify a row; a bare dropna() would make the result depend on
    # unrelated columns of the frame.
    readings = df[~df['node_name'].isin(list(exclude))].dropna(subset=['temperature']).copy()
    readings['day_name'] = readings['time'].dt.day_name()

    half_hourly = (
        readings
        .set_index('time')
        .groupby(['day_name'])
        .resample('30min')['temperature']
        .mean()
        .reset_index()
    )
    # These are derived after resampling: the resample keeps only day_name,
    # time and temperature, so anything computed earlier would be discarded.
    half_hourly['time_of_day'] = half_hourly['time'].dt.time
    half_hourly['month_number'] = half_hourly['time'].dt.month

    profile = (
        half_hourly
        .groupby(['time_of_day', 'day_name', 'month_number'])['temperature']
        .mean()
        .reset_index()
    )
    profile = profile.loc[profile['month_number'].isin(list(months))]

    weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    fig = px.line(profile, x="time_of_day", y='temperature', color='day_name',
                  facet_row="month_number", width=700, height=700,
                  category_orders={"day_name": weekday_order})
    # One tick every two hours, labelled without the seconds.
    even_hours = range(0, 24, 2)
    fig.update_layout(
        xaxis=dict(
            tickmode='array',
            tickvals=[f"{hour:02d}:00:00" for hour in even_hours],
            ticktext=[f"{hour:02d}:00" for hour in even_hours],
        )
    )
    fig.update_xaxes(tickangle=45)
    return fig.show()

# Render the per-month weekday comparison from the merged dataset.
plot_week_month_comparison(dataset)

In [None]:
def compare_days_of_the_week(df, exclude=("Table dangly", "Outside chair", "Fridge drawer",
                                          "Fridge door", "2AA3D2", "392F3E",
                                          "Tree top", "Tree bottom")):
    """Plot the average daily temperature profile for each day of the week.

    Temperatures of all remaining sensors are averaged into 30-minute bins,
    then averaged again per (time of day, weekday).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``time`` (datetime), ``node_name`` and ``temperature``.
    exclude : iterable of str
        Sensor names to leave out of the averages.

    Returns
    -------
    None
        The figure is rendered through ``fig.show()``.
    """
    # Only temperature is used below, so only a missing temperature should
    # disqualify a row; a bare dropna() would make the result depend on
    # unrelated columns of the frame.
    readings = df[~df['node_name'].isin(list(exclude))].dropna(subset=['temperature']).copy()
    readings['day_name'] = readings['time'].dt.day_name()

    half_hourly = (
        readings
        .set_index('time')
        .groupby(['day_name'])
        .resample('30min')['temperature']
        .mean()
        .reset_index()
    )
    # Derived after resampling: the resample keeps only day_name, time and
    # temperature, so anything computed earlier would be discarded.
    half_hourly['time_of_day'] = half_hourly['time'].dt.time

    profile = (
        half_hourly
        .groupby(['time_of_day', 'day_name'])['temperature']
        .mean()
        .reset_index()
    )

    weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    fig = px.line(profile, x="time_of_day", y='temperature', color='day_name',
                  labels=dict(time_of_day="Time of Day", temperature="Temperature (°C)",
                              day_name="Day of the Week"),
                  category_orders={"day_name": weekday_order})
    # One tick every two hours, labelled without the seconds.
    even_hours = range(0, 24, 2)
    fig.update_layout(
        xaxis=dict(
            tickmode='array',
            tickvals=[f"{hour:02d}:00:00" for hour in even_hours],
            ticktext=[f"{hour:02d}:00" for hour in even_hours],
        )
    )
    fig.update_xaxes(tickangle=45)
    return fig.show()

# Render the weekday comparison from the merged dataset.
compare_days_of_the_week(dataset)