# Load in the uncompressed pickle file

In [4]:
import pandas as pd
# Read the full, uncompressed merged dataset from its pickle file.
raw_pickle_path = 'tz_aware_merged_data_yh_v2.pkl'
train_df = pd.read_pickle(raw_pickle_path)

# Compress the pickle files

From https://www.kaggle.com/caesarlupum/ashrae-ligthgbm-simple-fe

In [5]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/int32/int64 and float16/float32/float64 columns are
        candidates for downcasting. Columns of any other dtype are left alone.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.
    use_float16 : bool, default True
        When true (the historical behaviour), float columns whose range fits
        in float16 are stored as float16. float16 has an 11-bit significand
        (about 3 significant decimal digits), so pass False to keep at least
        float32 precision.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    # Local import: this cell is executed before the notebook's main import
    # cell, so `np` may not be defined at module level yet.
    import numpy as np

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a value equal to a dtype's min/max still fits.
            # If nothing matches (e.g. an empty column whose min/max is NaN)
            # the column keeps its current dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when the range fits no narrower type
            # (this also covers all-NaN and infinite ranges, where the
            # comparisons below are false).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [6]:
## Reducing memory
# Downcast the numeric columns and report the saving.
train_df = reduce_mem_usage(train_df, verbose=True)

Mem. usage decreased to 1265.39 Mb (52.6% reduction)


In [7]:
## Save to compressed pickle files
# Persist the downcast frame under a separate file name.
compressed_pickle_path = "tz_aware_merged_data_yh_v2_cmp.pkl"
train_df.to_pickle(compressed_pickle_path)

# --------------------------------------------------------------------------------------------------------
# Load in compressed pickle file

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.graph_objects as go
from plotly.colors import n_colors
import plotly.figure_factory as ff
import datetime
from astral import Astral
import chart_studio
# Read the Chart Studio credentials from the environment so that no secret is
# stored in the notebook. The API key that used to be hard-coded here has been
# exposed and should be revoked and regenerated.
import os
chart_studio.tools.set_credentials_file(
    username=os.environ.get('CHART_STUDIO_USERNAME', 'sskowronski'),
    api_key=os.environ['CHART_STUDIO_API_KEY'],
)

In [2]:
# Read back the downcast (memory-reduced) dataset.
compressed_pickle_path = 'tz_aware_merged_data_yh_v2_cmp.pkl'
train_df = pd.read_pickle(compressed_pickle_path)

# Initial survey of train_df

In [30]:
# Preview the first five rows.
train_df.head(n=5)

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,...,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year,hour,day,week,primary_use_num
0,0,0,2016-01-01,220.046478,0,Education,7432,2008,8,19.40625,...,0,4,1,1,53,1,0,0,0,0
1,1,0,2016-01-01,101.917961,0,Education,2720,2004,5,19.40625,...,0,4,1,1,53,1,0,0,0,0
2,2,0,2016-01-01,5.634698,0,Education,5376,1991,4,19.40625,...,0,4,1,1,53,1,0,0,0,0
3,3,0,2016-01-01,366.496399,0,Education,23685,2002,10,19.40625,...,0,4,1,1,53,1,0,0,0,0
4,4,0,2016-01-01,1568.406494,0,Education,116607,1975,1,19.40625,...,0,4,1,1,53,1,0,0,0,0


In [29]:
# List every column name in the frame.
train_df.columns

Index(['building_id', 'meter', 'timestamp', 'meter_reading', 'site_id',
       'primary_use', 'square_feet', 'year_built', 'floor_count',
       'air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'timestamp_utc',
       'wind_direction', 'wind_speed', 'timezone', 'country_code', 'location',
       'dst', 'local_time', 'timedelta', 'hour_of_day', 'day_of_week',
       'day_of_month', 'day_of_year', 'week_of_year', 'month_of_year', 'hour',
       'day', 'week', 'primary_use_num'],
      dtype='object')

# Update and add time variables

In [5]:
# I won't be using these in my plotting.
# `drop(..., errors='ignore')` skips columns that are already absent, so this
# cell can be re-run (or run against an already-cleaned pickle) without the
# KeyError that `del train_df[...]` raises for a missing column.
unused_columns = ["day_of_week", "hour_of_day", "time_index", "avg", "std", "outlier", "index"]
train_df = train_df.drop(columns=unused_columns, errors='ignore')

In [10]:
for df in [train_df]:
    # Hours elapsed since the earliest timestamp in the frame.
    df['timedelta'] = ((pd.to_timedelta(df['timestamp'] - df['timestamp'].min()).dt.total_seconds().astype('int64')) / 3600)

    # Categorical date and time features
    ts = df['timestamp'].dt
    df['hour_of_day'] = ts.hour.values.astype(np.uint8)
    df['day_of_week'] = ts.dayofweek.values.astype(np.uint8)
    df['day_of_month'] = ts.day.values.astype(np.uint8)
    df['day_of_year'] = ts.dayofyear.values.astype(np.uint16)
    # `.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0.
    # Prefer the ISO-calendar accessor when it exists and fall back to the
    # old attribute on older pandas versions.
    if hasattr(ts, 'isocalendar'):
        week_of_year = ts.isocalendar().week
    else:
        week_of_year = ts.weekofyear
    df['week_of_year'] = week_of_year.astype(np.uint8).values
    df['month_of_year'] = ts.month.values.astype(np.uint8)

    # Continuous date and time features (whole hours / days / weeks elapsed
    # since the earliest timestamp).
    df['hour'] = df['timedelta'].astype(np.uint16)
    df['day'] = (df['hour'] / 24).astype(np.uint16)
    df['week'] = (df['day'] / 7).astype(np.uint8)

In [11]:
# Show the rows that fall in week 53.
train_df.loc[train_df['week_of_year'] == 53]

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,...,timedelta,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year,hour,day,week
0,0,0,2016-01-01 00:00:00,220.046478,0,Education,7432,2008.0,8.0,19.40625,...,0.0,0,4,1,1,53,1,0,0,0
1,1,0,2016-01-01 00:00:00,101.917961,0,Education,2720,2004.0,5.0,19.40625,...,0.0,0,4,1,1,53,1,0,0,0
2,2,0,2016-01-01 00:00:00,5.634698,0,Education,5376,1991.0,4.0,19.40625,...,0.0,0,4,1,1,53,1,0,0,0
3,3,0,2016-01-01 00:00:00,366.496399,0,Education,23685,2002.0,10.0,19.40625,...,0.0,0,4,1,1,53,1,0,0,0
4,4,0,2016-01-01 00:00:00,1568.406494,0,Education,116607,1975.0,1.0,19.40625,...,0.0,0,4,1,1,53,1,0,0,0
5,5,0,2016-01-01 00:00:00,11.006485,0,Education,8000,2000.0,2.0,19.40625,...,0.0,0,4,1,1,53,1,0,0,0
6,6,0,2016-01-01 00:00:00,86.074173,0,Lodging/residential,27926,1981.0,5.0,19.40625,...,0.0,0,4,1,1,53,1,0,0,0
7,7,0,2016-01-01 00:00:00,550.792114,0,Education,121074,1989.0,10.0,19.40625,...,0.0,0,4,1,1,53,1,0,0,0
8,8,0,2016-01-01 00:00:00,400.311279,0,Education,60809,2003.0,10.0,19.40625,...,0.0,0,4,1,1,53,1,0,0,0
9,9,0,2016-01-01 00:00:00,100.309166,0,Office,27000,2010.0,6.0,19.40625,...,0.0,0,4,1,1,53,1,0,0,0


# Make `year_built` and `floor_count` display as integers

In [12]:
def make_int(x):
    """Render a number as a string with no decimal places."""
    return '{:.0f}'.format(x)


# NOTE(review): after this step both columns hold strings, not integers, and a
# NaN value is rendered as the string 'nan'.
for int_like_col in ("year_built", "floor_count"):
    train_df[int_like_col] = train_df[int_like_col].apply(make_int)

In [13]:
# Preview the first five rows after reformatting.
train_df.head(n=5)

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,...,timedelta,hour_of_day,day_of_week,day_of_month,day_of_year,week_of_year,month_of_year,hour,day,week
0,0,0,2016-01-01,220.046478,0,Education,7432,2008,8,19.40625,...,0.0,0,4,1,1,53,1,0,0,0
1,1,0,2016-01-01,101.917961,0,Education,2720,2004,5,19.40625,...,0.0,0,4,1,1,53,1,0,0,0
2,2,0,2016-01-01,5.634698,0,Education,5376,1991,4,19.40625,...,0.0,0,4,1,1,53,1,0,0,0
3,3,0,2016-01-01,366.496399,0,Education,23685,2002,10,19.40625,...,0.0,0,4,1,1,53,1,0,0,0
4,4,0,2016-01-01,1568.406494,0,Education,116607,1975,1,19.40625,...,0.0,0,4,1,1,53,1,0,0,0


# Create a dictionary that maps each `primary_use` value to an integer code (0 through 15) and add the codes as a new numeric column

In [26]:
# Map each distinct `primary_use` label to an integer code, assigned in
# alphabetical order of the labels.
primary_use_list = sorted(train_df.primary_use.unique().tolist())
# Size the code range from the data rather than hard-coding 16: with a fixed
# range, `zip` would silently drop any label beyond the 16th and the lookup
# below would then raise KeyError for it.
primary_use_values = list(range(len(primary_use_list)))
primary_use_dict = dict(zip(primary_use_list, primary_use_values))
primary_use_lambda = lambda x: primary_use_dict[x]

In [28]:
# Add the integer code of each row's `primary_use` label as a new column.
train_df["primary_use_num"] = train_df["primary_use"].apply(primary_use_lambda)

In [27]:
# Display the label -> integer code mapping.
primary_use_dict

{'Education': 0,
 'Entertainment/public assembly': 1,
 'Food sales and service': 2,
 'Healthcare': 3,
 'Lodging/residential': 4,
 'Manufacturing/industrial': 5,
 'Office': 6,
 'Other': 7,
 'Parking': 8,
 'Public services': 9,
 'Religious worship': 10,
 'Retail': 11,
 'Services': 12,
 'Technology/science': 13,
 'Utility': 14,
 'Warehouse/storage': 15}

# --------------------------------------------------------------------------------------------------------
# Save to compressed, cleaned up pickle file

In [31]:
# Persist the cleaned-up frame under its own file name.
clean_pickle_path = "tz_aware_merged_data_yh_v2_cmp_clean.pkl"
train_df.to_pickle(clean_pickle_path)

# --------------------------------------------------------------------------------------------------------

# Other thoughts

# Lag + window variables

From https://www.kaggle.com/rohanrao/ashrae-divide-and-conquer

# Correction to units for site 0 electric meter

From https://www.kaggle.com/c/ashrae-energy-prediction/discussion/119261#latest-685633

Not necessary if we do modelling building-wise or site-wise

Multiply the site 0 electric meter readings by 0.2931 to convert them to kWh (the unit used by the other sites) before modelling, and multiply the predictions by 3.4118 to convert them back to kBTU for scoring.

# Add leaked site location latitude + longitude

From https://www.kaggle.com/c/ashrae-energy-prediction/discussion/112841#latest-649772

Will use this for sunrise/sunset calculations