# Feature extraction

Here we will add manually extracted features to the cleaned, resampled and merged table.

This code will get abstracted into `src/house_climate/data/extract_features.py`

**Prerequisites:**

Execute the below to prepare dataset 

```bash
make data_interim
```

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from pathlib import Path
from dotenv import find_dotenv

In [3]:
REPO_ROOT = Path(find_dotenv()).parent
CLEANED_DATA = REPO_ROOT / "data" / "interim" / "01_preprocess.parquet"

___

## Load in preprocessed parquet file

In [4]:
merged = pd.read_parquet(CLEANED_DATA)
merged

Unnamed: 0_level_0,humidity,temperature,zone_id,value,weather,exterior_temp,zone_type,zone_name,hours_in_day
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-12-31 23:45:00+00:00,0.500,20.00,1,NONE,UNKNOWN,,HEATING,Living room,24.0
2021-12-31 23:45:00+00:00,0.500,20.00,2,NONE,UNKNOWN,,HEATING,Bedroom,24.0
2021-12-31 23:45:00+00:00,0.500,20.00,6,NONE,UNKNOWN,,HEATING,Haadiyah Room,24.0
2021-12-31 23:45:00+00:00,0.500,20.00,7,NONE,UNKNOWN,,HEATING,Ensuite,24.0
2021-12-31 23:45:00+00:00,0.500,20.00,9,NONE,UNKNOWN,,HEATING,Office,24.0
...,...,...,...,...,...,...,...,...,...
2023-01-02 00:15:00+00:00,0.557,19.87,9,HIGH,NIGHT_CLOUDY,0.4,HEATING,Upstairs bathroom,24.0
2023-01-02 00:15:00+00:00,0.585,20.06,10,NONE,NIGHT_CLOUDY,0.4,HEATING,Upstairs bathroom,24.0
2023-01-02 00:15:00+00:00,0.579,20.35,12,NONE,NIGHT_CLOUDY,0.4,HEATING,Upstairs bathroom,24.0
2023-01-02 00:15:00+00:00,0.595,21.16,16,LOW,NIGHT_CLOUDY,0.4,HEATING,Upstairs bathroom,24.0


In [5]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4190470 entries, 2021-12-31 23:45:00+00:00 to 2023-01-02 00:15:00+00:00
Data columns (total 9 columns):
 #   Column         Dtype  
---  ------         -----  
 0   humidity       float64
 1   temperature    float64
 2   zone_id        Int8   
 3   value          object 
 4   weather        object 
 5   exterior_temp  float64
 6   zone_type      object 
 7   zone_name      object 
 8   hours_in_day   float64
dtypes: Int8(1), float64(4), object(4)
memory usage: 295.7+ MB


In [6]:
merged.describe()

Unnamed: 0,humidity,temperature,zone_id,exterior_temp,hours_in_day
count,4190470.0,4190470.0,4190470.0,13101.0,4190470.0
mean,0.5905803,21.38312,7.884678,1.989455,24.00069
std,0.07457257,1.615118,4.755862,0.631705,0.0741466
min,0.276,12.92,1.0,0.4,23.0
25%,0.532,20.29,2.0,1.62,24.0
50%,0.581,21.44,7.0,1.9,24.0
75%,0.6406667,22.39933,10.0,2.47,24.0
max,0.901,29.43,17.0,3.33,25.0


___

## Add features

In [7]:
def add_features(df):
    df = (
        df
        .assign(hour_of_day=lambda x: x.index.hour,
                day_of_week=lambda x: x.index.weekday,
                day_of_year=lambda x: x.index.dayofyear,
                )
        .assign(day_night=lambda x: pd.cut(x.hour_of_day, bins=[0,8,20], labels=['night', 'day']),
                is_weekend=lambda x: x.day_of_week.isin([5, 6]),
                )
    )
    return df

with_features = merged.pipe(add_features)
with_features

Unnamed: 0_level_0,humidity,temperature,zone_id,value,weather,exterior_temp,zone_type,zone_name,hours_in_day,hour_of_day,day_of_week,day_of_year,day_night,is_weekend
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2021-12-31 23:45:00+00:00,0.500,20.00,1,NONE,UNKNOWN,,HEATING,Living room,24.0,23,4,365,,False
2021-12-31 23:45:00+00:00,0.500,20.00,2,NONE,UNKNOWN,,HEATING,Bedroom,24.0,23,4,365,,False
2021-12-31 23:45:00+00:00,0.500,20.00,6,NONE,UNKNOWN,,HEATING,Haadiyah Room,24.0,23,4,365,,False
2021-12-31 23:45:00+00:00,0.500,20.00,7,NONE,UNKNOWN,,HEATING,Ensuite,24.0,23,4,365,,False
2021-12-31 23:45:00+00:00,0.500,20.00,9,NONE,UNKNOWN,,HEATING,Office,24.0,23,4,365,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-01-02 00:15:00+00:00,0.557,19.87,9,HIGH,NIGHT_CLOUDY,0.4,HEATING,Upstairs bathroom,24.0,0,0,2,,False
2023-01-02 00:15:00+00:00,0.585,20.06,10,NONE,NIGHT_CLOUDY,0.4,HEATING,Upstairs bathroom,24.0,0,0,2,,False
2023-01-02 00:15:00+00:00,0.579,20.35,12,NONE,NIGHT_CLOUDY,0.4,HEATING,Upstairs bathroom,24.0,0,0,2,,False
2023-01-02 00:15:00+00:00,0.595,21.16,16,LOW,NIGHT_CLOUDY,0.4,HEATING,Upstairs bathroom,24.0,0,0,2,,False


In [8]:
with_features.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4190470 entries, 2021-12-31 23:45:00+00:00 to 2023-01-02 00:15:00+00:00
Data columns (total 14 columns):
 #   Column         Dtype   
---  ------         -----   
 0   humidity       float64 
 1   temperature    float64 
 2   zone_id        Int8    
 3   value          object  
 4   weather        object  
 5   exterior_temp  float64 
 6   zone_type      object  
 7   zone_name      object  
 8   hours_in_day   float64 
 9   hour_of_day    int64   
 10  day_of_week    int64   
 11  day_of_year    int64   
 12  day_night      category
 13  is_weekend     bool    
dtypes: Int8(1), bool(1), category(1), float64(4), int64(3), object(4)
memory usage: 399.6+ MB


In [9]:
with_features.describe()

Unnamed: 0,humidity,temperature,zone_id,exterior_temp,hours_in_day,hour_of_day,day_of_week,day_of_year
count,4190470.0,4190470.0,4190470.0,13101.0,4190470.0,4190470.0,4190470.0,4190470.0
mean,0.5905803,21.38312,7.884678,1.989455,24.00069,11.50023,3.013901,193.856
std,0.07457257,1.615118,4.755862,0.631705,0.0741466,6.922289,2.00211,105.4759
min,0.276,12.92,1.0,0.4,23.0,0.0,0.0,1.0
25%,0.532,20.29,2.0,1.62,24.0,6.0,1.0,103.0
50%,0.581,21.44,7.0,1.9,24.0,12.0,3.0,204.0
75%,0.6406667,22.39933,10.0,2.47,24.0,18.0,5.0,285.0
max,0.901,29.43,17.0,3.33,25.0,23.0,6.0,365.0


Dataset saved to disk by the python module version of this feature extraction pipeline