In [1]:
import pandas as pd
import numpy as np

In [2]:
# Output markdown to cells
from IPython.display import display, Markdown

---
## Home Area Network Plug Readings

In [3]:
# Read HANPR data
url_path = "https://data.gov.au/dataset/4e21dea3-9b87-4610-94c7-15a8a77907ef/resource/63d2b1cd-f453-4440-8bb7-ed083326f5ae/download/sgsc-cthanplug-readings.csv"
HAN_readings_df = pd.read_csv(url_path)

In [4]:
# Trim leading and trailing whitespaces in column names
HAN_readings_df.columns = HAN_readings_df.columns.str.strip()

In [5]:
HAN_readings_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10828120 entries, 0 to 10828119
Data columns (total 6 columns):
 #   Column         Dtype  
---  ------         -----  
 0   CUSTOMER_ID    int64  
 1   READING_TIME   object 
 2   PLUG_NAME      object 
 3   READING_VALUE  float64
 4   CALENDAR_KEY   int64  
 5   RECORD_COUNT   int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 495.7+ MB


In [6]:
# Number of unique households with HANPR data
HAN_readings_df.CUSTOMER_ID.unique().shape

(808,)

---
### Dataframe structure plot

The **unique** values for each column are:

In [7]:
HAN_readings_df.drop(['CUSTOMER_ID'], axis='columns').phd.unique_cols()

{'READING_TIME': array(['2013-08-19 14:48:40', '2013-08-19 14:48:41',
        '2013-08-19 14:48:42', ..., '2013-12-11 00:00:11',
        '2014-02-07 14:07:16', '2014-02-18 16:57:53'], dtype=object),
 'PLUG_NAME': array(['Microwave', 'TV', 'Dishwasher', 'Kettle', 'WashingMachine',
        'AirCon', 'Computer', 'Freezer', 'Fridge', 'B1', 'Washing',
        'Dryer', 'Lights', 'TV2', 'W1', 'Oven', 'Pool Pump', 'B2',
        'Lights2', 'Aircon', 'Plug 1', 'Plug 2', 'Lights Upstairs',
        'Lights Downstairs', 'Stove', 'Bathroom', 'Sound System', 'Radio',
        'Kitchen', 'Aircon A', 'Aircon B', 'Aircon C', 'Toaster', 'TV3',
        'AirCon2', 'Hot Water System', 'Fridge2', 'Garage', 'W2',
        'Rachels computer', 'Bedroom TV', 'Computer2', 'Laptop',
        'Bedroom 1', 'Ent System', 'Fridge 2', 'Washing 2', 'Home Theatre',
        'Coffee Machine', 'Aquarium', 'Kitchen2', 'PanelMeter', 'Lamp1',
        'Other1', 'Fan', 'B3', 'Computer3', 'Hot Plates', 'Microwave 2',
        'Heater

In [8]:
# Get the devices and bedrooms per consumer
customer_devices_df = HAN_readings_df.groupby('CUSTOMER_ID').apply(lambda x: x['PLUG_NAME'].unique())

In [9]:
customer_devices_df.rename("device_list", inplace=True)

CUSTOMER_ID
8145135     [Kitchen, Microwave, TV, Washing, Aircon, Dish...
8145501                                                  [B1]
8147703     [Kettle, Washing, Computer, Dryer, Lights, TV,...
8149711     [Aircon, Hot Water System, Oven, Microwave, Fr...
8151717                                                  [B2]
                                  ...                        
11450557                                                 [B1]
11452683    [Washing, Dishwasher, TV, Kitchen2, Bathroom, ...
11462018    [Dishwasher, TV, Washing, Aircon, Computer, Ho...
11466569                                                 [B2]
11590454                                                 [W1]
Name: device_list, Length: 808, dtype: object

---
### *Potential features/Feature engineering*

First we try to see if we can extract features from the *Home network plug readings* dataset. We can create features per household, like number of TVs, number of ACs, etc., which can later be used to predict how the households have responded.

In [10]:
# Sort and print the various devices used
np.sort(HAN_readings_df.PLUG_NAME.unique())

array(['AirCon', 'AirCon2', 'AirCon3', 'Aircon', 'Aircon A', 'Aircon B',
       'Aircon C', 'Aquarium', 'Aquarium 1', 'Aquarium 2', 'Av room',
       'B1', 'B2', 'B3', 'BackRoom', 'Bathroom', 'Bedroom 1', 'Bedroom 2',
       'Bedroom 3', 'Bedroom TV', 'Bottle fridge', 'Coffee Machine',
       'Computer', 'Computer Z', 'Computer2', 'Computer3', 'D3',
       'Dishwasher', 'Drainage Pump', 'Dryer', 'Ent System', 'Fan',
       'Freezer', 'Fridge', 'Fridge 2', 'Fridge 3', 'Fridge2',
       'Games Console', 'Garage', 'H1', 'Heat pump A', 'Heat pump B',
       'Heat pump C', 'Heater', 'Home Theatre', 'Hot Plates',
       'Hot Water System', 'Iron', 'Kettle', 'Kitchen', 'Kitchen2', 'L1',
       'L2', 'L3', 'LAPTOP1', 'LAPTOP2', 'LAPTOP3', 'LAUNGE 1',
       'LAUNGE 2', 'LOUNGE 1', 'LOUNGE 2', 'Lamp1', 'Lamp2', 'Lamp3',
       'Laptop', 'Lights', 'Lights Downstairs', 'Lights Upstairs',
       'Lights2', 'MW', 'Massage chair', 'Microwave', 'Microwave 2',
       'Office', 'Other1', 'Oven', 'POWER

In [11]:
devices_df = customer_devices_df.explode().reset_index()

In [12]:
display(Markdown("The total number of households for which we have household device data is **{}**.".format(devices_df.CUSTOMER_ID.unique().shape[0])))

The total number of households for which we have household device data is **808**.

In [13]:
# Number of AC units
AC_mask = devices_df['device_list'].str.contains('aircon*', case=False, regex=True)

# Number of households that we data about AC  
display(Markdown("The total number of households for which we have AC data is **{}**.".format(devices_df[AC_mask].CUSTOMER_ID.unique().shape[0])))

The total number of households for which we have AC data is **119**.

In [14]:
# Number of heat pumps
HP_mask = devices_df['device_list'].str.contains('heat\s*pump*', case=False, regex=True)

# Number of households that we data about heat pumps
display(Markdown("The total number of households for which we have heat pump data is **{}**.".format(devices_df[HP_mask].CUSTOMER_ID.unique().shape[0])))

The total number of households for which we have heat pump data is **1**.

In [15]:
# Number of fridges
fridge_freezer_mask = devices_df['device_list'].str.contains('fridge*', case=False, regex=True) | devices_df['device_list'].str.contains('Freezer')

# Number of households that we data about freezer/fridges
display(Markdown("The total number of households for which we have freezer/fridge data is **{}**.".format(devices_df[fridge_freezer_mask].CUSTOMER_ID.unique().shape[0])))

The total number of households for which we have freezer/fridge data is **94**.

In [16]:
# Number of computers 
PC_laptop_mask = devices_df['device_list'].str.contains('comp*', case=False, regex=True) | devices_df['device_list'].str.contains('lapt*', case=False, regex=True)

# Number of households that we data about PCs/laptops
display(Markdown("The total number of households for which we have PC/laptop data is **{}**.".format(devices_df[PC_laptop_mask].CUSTOMER_ID.unique().shape[0])))

The total number of households for which we have PC/laptop data is **121**.

In [17]:
# Proportion of households that have AC or fridge or laptop
devices_df[AC_mask | fridge_freezer_mask | PC_laptop_mask].shape[0]/devices_df.CUSTOMER_ID.unique().shape[0]

0.49876237623762376

We can see that the various features only constitute a small subset of the households, and so they will **not be really informative** to be used for the modelling of residential response behaviour.