In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
data = pd.read_csv('big_data.csv', sep=',', index_col='Unnamed: 0')

  mask |= (ar1 == a)


In [3]:
data.head()

Unnamed: 0,state_id,entity_id,state,last_changed,last_updated
0,1406062,sensor.room_temperature,21.7,2020-05-01 00:01:22,2020-05-01 00:01:22
1,1406063,sensor.room_humidity,52.6,2020-05-01 00:01:22,2020-05-01 00:01:22
2,1406064,sensor.bathroom_temperature,22.7,2020-05-01 00:01:22,2020-05-01 00:01:22
3,1406065,sensor.bathroom_humidity,56.0,2020-05-01 00:01:22,2020-05-01 00:01:22
4,1406066,sensor.fittonia_temperature,21.1,2020-05-01 00:01:22,2020-05-01 00:01:22


In [4]:
data.shape

(2412216, 5)

In [9]:
data['entity_id'].unique()

array(['sensor.room_temperature', 'sensor.room_humidity',
       'sensor.bathroom_temperature', 'sensor.bathroom_humidity',
       'sensor.fittonia_temperature', 'sensor.fittonia_moisture',
       'sensor.fittonia_fertility', 'sensor.fittonia_lux',
       'sensor.kitchen_temperature', 'sensor.kitchen_humidity',
       'plant.fittonia', 'sensor.kitchen_thermostat_temperature',
       'climate.kitchen_thermostat', 'binary_sensor.kitchen_window',
       'sensor.equipment_temperature', 'binary_sensor.entrance_door',
       'person.one', 'sensor.weather_temperature', 'light.balcony',
       'sensor.weather_humidity', 'light.kitchen_rgb',
       'sensor.bathroom_thermostat_temperature',
       'climate.bathroom_thermostat',
       'sensor.room_thermostat_temperature', 'climate.room_thermostat',
       'light.hallway_rgb', 'binary_sensor.critical', 'light.bathroom',
       'switch.raspberry_pi_fan', 'binary_sensor.balcony_door',
       'light.bedside_lamp', 'light.hue_go', 'media_player.tv',


### 1) Dropping sensors we don't need and column 'state_id'

In [8]:
sensors_to_drop = ['sensor.calathea_temperature',
                   'plant.calathea',
                   'sensor.fittonia_temperature',
                   'plant.fittonia',
                   'sensor.room_illuminance_lux',
                   'sensor.room_illuminance_lux_2',
                   'sensor.calathea_lux',
                   'sensor.calathea_sunlight_accumulated',
                   'sensor.calathea_fertility',
                   'sensor.fittonia_fertility',
                   'sensor.fittonia_lux',
                   'sensor.fittonia_moisture',
                   'sensor.fittonia_sunlight_accumulated',
                   'binary_sensor.warning',
                   'switch.raspberry_pi_fan',
                   'media_player.tv',
                   'binary_sensor.laptop_work',
                   'binary_sensor.critical',
                   'binary_sensor.laptop',
                   'binary_sensor.printer',
                   'vacuum.roborock',
                   'switch.phone_charger',
                   'sensor.calathea_moisture',
                   'light.office_ceiling',
                   'light.kitchen_rgb',
                   'light.balcony',
                   'light.hue_go',
                   'light.kitchen_ceiling',
                   'light.room_ceiling',
                   'light.bathroom',
                   'light.bedside_lamp',
                   'light.desk_lamp',
                   'light.hallway_rgb',
                   'light.hallway_ceiling',
                   'light.blaulicht',
                   'binary_sensor.entrance_door',
                   'light.room_corner',
                   'light.storage',
                   'sensor.washing_machine_vibration',
                   'sensor.washing_machine_vibration_strength',
                   'sensor.kitchen_thermostat_temperature',
                   'climate.kitchen_thermostat',
                   'sensor.bathroom_thermostat_temperature',
                   'climate.bathroom_thermostat',
                   'sensor.room_thermostat_temperature',
                   'climate.room_thermostat',
                   'sensor.kitchen_temperature_3',
                   'sensor.kitchen_temperature_2',
                   'sensor.kitchen_illuminance_lux',
                   'sensor.hallway_temperature',
                   'sensor.sunlight_lux',
                   'sensor.sysmon_cpu_use',
                   'sensor.sysmon_cpu_temperature',
                   'fan.fan',
                   'sensor.bathroom_temperature_2',
                   'sensor.room_temperature_2']

In [9]:
%%time
for item in sensors_to_drop:
    data.drop(data[data['entity_id'] == item].index, axis=0, inplace=True)

Wall time: 21.2 s


In [10]:
data.shape

(928315, 5)

In [11]:
data.drop(['state_id'], axis=1, inplace=True)

In [12]:
data.shape

(928315, 4)

### 2) Dropping NaNs and duplicates

In [13]:
data.dropna(inplace=True)
data.shape

(928287, 4)

In [14]:
data.duplicated().sum()

21075

In [15]:
data.drop_duplicates(keep=False,inplace=True)
data.shape

(891427, 4)

### 3) Dropping '1-second-artefacts' as well as 'unavaliable' and 'unknown' values

In [16]:
data['updated_shift'] = data['last_updated'].shift()
data.drop(data[data['last_changed'] == data['updated_shift']].index, axis=0, inplace = True)
data.shape

(480662, 5)

In [17]:
data.drop(data[data['state'] == 'unavailable'].index, axis=0, inplace=True)
data.drop(data[data['state'] == 'unknown'].index, axis=0, inplace=True)
data.drop(data[data['state'] == '\\N'].index, axis=0, inplace=True)
data.drop(data[data['state'] == 'None'].index, axis=0, inplace=True)
data.drop(data[data['state'] == 'ok'].index, axis=0, inplace=True)

data.shape

(477838, 5)

In [18]:
grouped_data = data[['entity_id', 'state', 'last_updated']].groupby(['entity_id', 'last_updated']).count()
grouped_data['state'].unique()

array([1], dtype=int64)

Perfect! No more dupicates! Now we can pivot!

### 4) But first we have to deal with time columns!

In [19]:
data.drop(['last_changed', 'updated_shift'], axis=1, inplace=True)
data.shape

(477838, 3)

In [20]:
data['timestamp'] = pd.to_datetime(data['last_updated'])
data.drop(['last_updated'], axis=1, inplace=True)
data.shape

(477838, 3)

### 5) Let's pivot the table!

In [21]:
df = data.pivot(index='timestamp', columns='entity_id', values='state')

In [22]:
df.shape

(443739, 21)

In [240]:
df.head(3)

entity_id,binary_sensor.balcony_door,binary_sensor.kitchen_window,binary_sensor.room_window,climate.air_conditioner,climate.air_conditioner_old,cover.balcony,cover.windows,person.one,person.two,sensor.ac_power,...,sensor.bathroom_humidity,sensor.bathroom_temperature,sensor.equipment_temperature,sensor.kitchen_humidity,sensor.kitchen_temperature,sensor.office_co2,sensor.room_humidity,sensor.room_temperature,sensor.weather_humidity,sensor.weather_temperature
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-05-01 00:01:22,,,,,,,,,,,...,,,,,,,,21.7,,
2020-05-01 00:03:23,,,,,,,,,,,...,,,,,,,,21.7,,
2020-05-01 00:03:41,,off,,,,,,,,,...,,,,,,,,,,


### 6) Let's divide numeric and categorical columns

In [23]:
numeric_columns = ['sensor.ac_power',
       'sensor.atmospheric_pressure', 'sensor.bathroom_humidity',
       'sensor.bathroom_temperature',
       'sensor.equipment_temperature', 'sensor.kitchen_humidity',
       'sensor.kitchen_temperature',
       'sensor.office_co2', 'sensor.room_humidity', 'sensor.room_temperature', 'sensor.weather_humidity',
       'sensor.weather_temperature']

In [24]:
categorical_columns = ['binary_sensor.balcony_door', 'binary_sensor.kitchen_window',
       'binary_sensor.room_window', 'climate.air_conditioner', 'climate.air_conditioner_old', 'cover.balcony',
       'cover.windows', 'person.one', 'person.two']

### 7) Working with numeric data 

In [25]:
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column]) 

In [26]:
numeric_df = df[numeric_columns].groupby([df.index.month, df.index.day, df.index.hour, df.index.minute]).mean()

In [27]:
numeric_df.shape

(179068, 12)

In [34]:
numeric_df['check_time'] = numeric_df.index
numeric_df['check_time_2'] = numeric_df['check_time'].transform(lambda x: f'2020-{x[0]}-{x[1]} {x[2]}:{x[3]}:00')
numeric_df['time'] = pd.to_datetime(numeric_df['check_time_2'])

In [37]:
del numeric_df['check_time']
del numeric_df['check_time_2']

In [38]:
numeric_df.shape

(179068, 13)

In [39]:
numeric_df['time'].head(3)

timestamp  timestamp  timestamp  timestamp
5          1          0          1           2020-05-01 00:01:00
                                 3           2020-05-01 00:03:00
                                 5           2020-05-01 00:05:00
Name: time, dtype: datetime64[ns]

In [40]:
numeric_df['time'].tail(3)

timestamp  timestamp  timestamp  timestamp
9          30         23         57          2020-09-30 23:57:00
                                 58          2020-09-30 23:58:00
                                 59          2020-09-30 23:59:00
Name: time, dtype: datetime64[ns]

### 8) Working with categorical data 

In [41]:
categorical_df = df[categorical_columns].groupby([df.index.month, df.index.day, df.index.hour, df.index.minute]).agg(lambda x: stats.mode(x)[0][0])

In [42]:
categorical_df['check_time'] = categorical_df.index
categorical_df['check_time_2'] = categorical_df['check_time'].transform(lambda x: f'2020-{x[0]}-{x[1]} {x[2]}:{x[3]}:00')
categorical_df['time'] = pd.to_datetime(categorical_df['check_time_2'])
del categorical_df['check_time']
del categorical_df['check_time_2']

In [43]:
categorical_df.shape

(179068, 10)

In [44]:
categorical_df['time'].head(3)

timestamp  timestamp  timestamp  timestamp
5          1          0          1           2020-05-01 00:01:00
                                 3           2020-05-01 00:03:00
                                 5           2020-05-01 00:05:00
Name: time, dtype: datetime64[ns]

In [45]:
categorical_df['time'].tail(3)

timestamp  timestamp  timestamp  timestamp
9          30         23         57          2020-09-30 23:57:00
                                 58          2020-09-30 23:58:00
                                 59          2020-09-30 23:59:00
Name: time, dtype: datetime64[ns]

### 9) Creating continuous time dataframe

In [49]:
times = pd.date_range('2020-05-01', '2020-10-01', freq='1min')

In [51]:
times

DatetimeIndex(['2020-05-01 00:00:00', '2020-05-01 00:01:00',
               '2020-05-01 00:02:00', '2020-05-01 00:03:00',
               '2020-05-01 00:04:00', '2020-05-01 00:05:00',
               '2020-05-01 00:06:00', '2020-05-01 00:07:00',
               '2020-05-01 00:08:00', '2020-05-01 00:09:00',
               ...
               '2020-09-30 23:51:00', '2020-09-30 23:52:00',
               '2020-09-30 23:53:00', '2020-09-30 23:54:00',
               '2020-09-30 23:55:00', '2020-09-30 23:56:00',
               '2020-09-30 23:57:00', '2020-09-30 23:58:00',
               '2020-09-30 23:59:00', '2020-10-01 00:00:00'],
              dtype='datetime64[ns]', length=220321, freq='T')

In [50]:
dataframe = pd.DataFrame(index=times)
dataframe.shape

(220321, 0)

In [52]:
dataframe = dataframe.join(numeric_df.set_index('time')) #joining numeric data
dataframe.shape

(220321, 12)

In [53]:
dataframe = dataframe.join(categorical_df.set_index('time')) #joining categorical data
dataframe.shape

(220321, 21)

## Let's fill NaN values

In [54]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 220321 entries, 2020-05-01 00:00:00 to 2020-10-01 00:00:00
Freq: T
Data columns (total 21 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   sensor.ac_power               92133 non-null   float64
 1   sensor.atmospheric_pressure   3585 non-null    float64
 2   sensor.bathroom_humidity      690 non-null     float64
 3   sensor.bathroom_temperature   18489 non-null   float64
 4   sensor.equipment_temperature  9973 non-null    float64
 5   sensor.kitchen_humidity       736 non-null     float64
 6   sensor.kitchen_temperature    18058 non-null   float64
 7   sensor.office_co2             81117 non-null   float64
 8   sensor.room_humidity          635 non-null     float64
 9   sensor.room_temperature       50882 non-null   float64
 10  sensor.weather_humidity       449 non-null     float64
 11  sensor.weather_temperature    15455 non-null   float64
 12  bi

In [55]:
dataframe.head()

Unnamed: 0,sensor.ac_power,sensor.atmospheric_pressure,sensor.bathroom_humidity,sensor.bathroom_temperature,sensor.equipment_temperature,sensor.kitchen_humidity,sensor.kitchen_temperature,sensor.office_co2,sensor.room_humidity,sensor.room_temperature,...,sensor.weather_temperature,binary_sensor.balcony_door,binary_sensor.kitchen_window,binary_sensor.room_window,climate.air_conditioner,climate.air_conditioner_old,cover.balcony,cover.windows,person.one,person.two
2020-05-01 00:00:00,,,,,,,,,,,...,,,,,,,,,,
2020-05-01 00:01:00,,,,,,,,,,21.7,...,,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-05-01 00:02:00,,,,,,,,,,,...,,,,,,,,,,
2020-05-01 00:03:00,,,,,,,,,,21.7,...,,0.0,off,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-05-01 00:04:00,,,,,,,,,,,...,,,,,,,,,,


In [56]:
dataframe.sample(10)

Unnamed: 0,sensor.ac_power,sensor.atmospheric_pressure,sensor.bathroom_humidity,sensor.bathroom_temperature,sensor.equipment_temperature,sensor.kitchen_humidity,sensor.kitchen_temperature,sensor.office_co2,sensor.room_humidity,sensor.room_temperature,...,sensor.weather_temperature,binary_sensor.balcony_door,binary_sensor.kitchen_window,binary_sensor.room_window,climate.air_conditioner,climate.air_conditioner_old,cover.balcony,cover.windows,person.one,person.two
2020-09-14 06:46:00,2.15,,,,,,,464.0,,,...,,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
2020-05-24 05:15:00,,,,,,,,,,,...,,,,,,,,,,
2020-06-16 22:37:00,,,,24.8,,,,,,,...,,0.0,0.0,0.0,off,0.0,0.0,0.0,0.0,0.0
2020-06-13 04:05:00,,,,,,,,,,,...,,,,,,,,,,
2020-07-23 17:48:00,779.5,,,,,,,402.0,,22.1,...,,0.0,0.0,0.0,cool,0.0,0.0,0.0,0.0,0.0
2020-07-27 08:46:00,740.666667,,,,,,,428.0,,,...,,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
2020-09-14 15:14:00,1062.1,,,,,,,,,,...,,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
2020-05-22 05:53:00,,,,,,,,,,,...,,,,,,,,,,
2020-05-08 19:58:00,,,,,,,,,,,...,,,,,,,,,,
2020-08-23 14:24:00,1057.0,,,,,,,400.0,,,...,,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0


In [57]:
dataframe.replace(0, np.NaN, inplace=True) #"0" values a actually created from 'NaN' values while using groupby method. Let's reverse it.

In [58]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 220321 entries, 2020-05-01 00:00:00 to 2020-10-01 00:00:00
Freq: T
Data columns (total 21 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   sensor.ac_power               91911 non-null  float64
 1   sensor.atmospheric_pressure   3585 non-null   float64
 2   sensor.bathroom_humidity      690 non-null    float64
 3   sensor.bathroom_temperature   18489 non-null  float64
 4   sensor.equipment_temperature  9973 non-null   float64
 5   sensor.kitchen_humidity       736 non-null    float64
 6   sensor.kitchen_temperature    18058 non-null  float64
 7   sensor.office_co2             81117 non-null  float64
 8   sensor.room_humidity          635 non-null    float64
 9   sensor.room_temperature       50882 non-null  float64
 10  sensor.weather_humidity       449 non-null    float64
 11  sensor.weather_temperature    15455 non-null  float64
 12  binary_sensor.ba

In [59]:
dataframe.fillna(method='ffill', inplace = True)

In [60]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 220321 entries, 2020-05-01 00:00:00 to 2020-10-01 00:00:00
Freq: T
Data columns (total 21 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   sensor.ac_power               187995 non-null  float64
 1   sensor.atmospheric_pressure   135133 non-null  float64
 2   sensor.bathroom_humidity      217174 non-null  float64
 3   sensor.bathroom_temperature   220310 non-null  float64
 4   sensor.equipment_temperature  220315 non-null  float64
 5   sensor.kitchen_humidity       177707 non-null  float64
 6   sensor.kitchen_temperature    220278 non-null  float64
 7   sensor.office_co2             131289 non-null  float64
 8   sensor.room_humidity          177310 non-null  float64
 9   sensor.room_temperature       220320 non-null  float64
 10  sensor.weather_humidity       219735 non-null  float64
 11  sensor.weather_temperature    220302 non-null  float64
 12  bi

In [65]:
dataframe[132475:132480]

Unnamed: 0,sensor.ac_power,sensor.atmospheric_pressure,sensor.bathroom_humidity,sensor.bathroom_temperature,sensor.equipment_temperature,sensor.kitchen_humidity,sensor.kitchen_temperature,sensor.office_co2,sensor.room_humidity,sensor.room_temperature,...,sensor.weather_temperature,binary_sensor.balcony_door,binary_sensor.kitchen_window,binary_sensor.room_window,climate.air_conditioner,climate.air_conditioner_old,cover.balcony,cover.windows,person.one,person.two
2020-07-31 23:55:00,67.0,1015.0,43.0,23.9,29.1,50.3,24.4,400.0,41.9,23.7,...,17.1,on,off,off,off,,open,open,not_home,not_home
2020-07-31 23:56:00,67.0,1015.0,43.0,23.9,29.1,50.3,24.4,400.0,41.9,23.7,...,17.0,on,off,off,off,,open,open,not_home,not_home
2020-07-31 23:57:00,67.0,1015.0,43.0,23.9,29.3,50.3,24.4,400.0,41.9,23.7,...,17.0,on,off,off,off,,open,open,not_home,not_home
2020-07-31 23:58:00,67.0,1015.0,43.0,23.9,29.3,50.3,24.4,400.0,41.9,23.7,...,17.0,on,off,off,off,,open,open,not_home,not_home
2020-07-31 23:59:00,67.0,1015.0,43.0,23.9,29.3,50.3,24.4,400.0,41.9,23.7,...,17.0,on,off,off,off,,open,open,not_home,not_home


In [63]:
dataframe.to_csv('wide_data.csv')

In [66]:
dataframe[87840:132480].to_csv('july.csv')