In [6]:
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import numpy as np
from scipy.interpolate import interp1d
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [7]:
def reduce_memory_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type)[:5] == 'float':
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.finfo('f2').min and c_max < np.finfo('f2').max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo('f4').min and c_max < np.finfo('f4').max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        elif str(col_type)[:3] == 'int':
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.iinfo('i1').min and c_max < np.iinfo('i1').max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo('i2').min and c_max < np.iinfo('i2').max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo('i4').min and c_max < np.iinfo('i4').max:
                df[col] = df[col].astype(np.int32)
            else:
                df[col] = df[col].astype(np.int64)
        elif col == 'timestamp':
            df[col] = pd.to_datetime(df[col])
        elif str(col_type)[:8] != 'datetime':
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print("Data usage is less about", round(start_mem - end_mem, 2), "Mb (-", round(100* (start_mem - end_mem) / start_mem, 1), "%)")
    return df

In [8]:
buildings = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz")
weather = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz")
weather = weather[weather["site_id"] == 0]
energy = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/train.0.csv.gz")
energy = energy[energy["building_id"]<20]
energy = pd.merge(left=energy, right=buildings, how="left",
                 left_on="building_id", right_on="building_id")
del buildings
print(energy.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175680 entries, 0 to 175679
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   building_id    175680 non-null  int64  
 1   meter          175680 non-null  int64  
 2   timestamp      175680 non-null  object 
 3   meter_reading  175680 non-null  float64
 4   site_id        175680 non-null  int64  
 5   primary_use    175680 non-null  object 
 6   square_feet    175680 non-null  int64  
 7   year_built     175680 non-null  float64
 8   floor_count    0 non-null       float64
dtypes: float64(3), int64(4), object(2)
memory usage: 12.1+ MB
None


In [10]:
weather["precip_depth_1_hr"] = weather["precip_depth_1_hr"].apply(lambda x: 0 if x < 0 else x)
interpolate_columns = ["air_temperature", "dew_temperature", 
                       "cloud_coverage", "wind_speed", "precip_depth_1_hr", 
                       "sea_level_pressure"]
for col in interpolate_columns:
    weather[col] = weather[col].interpolate(limit_direction='both',
                                          kind='cubic')

In [11]:
weather["wind_direction_rad"] = weather["wind_direction"] / np.pi
weather["wind_direction_sin"] = np.sin(weather["wind_direction"])
weather["wind_direction_cos"] = np.cos(weather["wind_direction"])
weather["air_temperature_diff1"] = weather["air_temperature"].diff()
weather.at[0, "air_temperature_diff1"] = weather.at[1, "air_temperature"]
weather["air_temperature_diff2"] = weather["air_temperature_diff1"].diff()
weather.at[0, "air_temperature_diff2"] = weather.at[1, "air_temperature_diff1"]

In [12]:
energy = energy.set_index(["timestamp", "site_id"])
weather = weather.set_index(["timestamp", "site_id"])
energy = pd.merge(left=energy, right=weather, how="left",
                 left_index=True, right_index=True)
energy.reset_index(inplace=True)
energy = energy.drop(columns=["meter", "site_id", "year_built",
                             "square_feet", "floor_count"], axis=1)
del weather
energy = reduce_memory_usage(energy)

Data usage is less about 15.41 Mb (- 71.9 %)


In [16]:
energy["hour"] = energy["timestamp"].dt.hour.astype("int8")
energy["weekday"] = energy["timestamp"].dt.weekday.astype("int8")
# energy["week"] = energy["timestamp"].dt.week.astype("int8")
energy["month"] = energy["timestamp"].dt.month.astype("int8")
energy["date"] = pd.to_datetime(energy["timestamp"].dt.date)
dates_range = pd.date_range(start='2015-12-31', end='2017-01-01')
us_holidays = calendar().holidays(start=dates_range.min(),
                                 end=dates_range.max())
energy["is_holiday"] = energy["date"].isin(us_holidays).astype("int8")
for weekday in range(0, 7):
    energy["is_wday" + str(weekday)] = energy["weekday"].isin([weekday]).astype("int8")
# for week in range(1, 54):
#     energy["is_w" + str(week)] = energy["week"].isin([week]).astype("int8")
for month in range(1, 13):
    energy["is_m" + str(month)] = energy["month"].isin([month]).astype("int8")

In [17]:
energy["meter_reading_log"] = np.log(energy["meter_reading"] + 1)

In [19]:
energy_train, energy_test = train_test_split(energy[energy["meter_reading"]>0],
                                            test_size=0.2)
print(energy_train.head())

                 timestamp  ...  meter_reading_log
87559  2016-07-01 09:00:00  ...           5.496094
113310 2016-08-24 01:00:00  ...           7.226562
152387 2016-11-13 11:00:00  ...           6.312500
78794  2016-06-13 03:00:00  ...           5.519531
91879  2016-07-10 09:00:00  ...           5.328125

[5 rows x 41 columns]


  has_large_values = (abs_vals > 1e6).any()
  has_large_values = (abs_vals > 1e6).any()


In [22]:
hours = range(0, 24)
buildings = range(0, energy_train["building_id"].max() + 1)
lr_columns = ["meter_reading_log", "hour", "building_id",
             "air_temperature", "dew_temperature",
             "sea_level_pressure", "wind_speed", "cloud_coverage",
             "air_temperature_diff1", "air_temperature_diff2",
             "is_holiday"]
for wday in range(0, 7):
    lr_columns.append("is_wday" + str(wday))
# for week in range(1, 54):
#     lr_columns.append("is_w" + str(week))
for month in range(1, 13):
    lr_columns.append("is_m" + str(month))
energy_train_lr = pd.DataFrame(energy_train, columns=lr_columns)
energy_lr = [[]] * len(buildings)
for building in buildings:
    energy_lr[building] = [[]] * len(hours)
    energy_train_b = energy_train_lr[energy_train_lr["building_id"] == building]
    for hour in hours:
        energy_train_bh = energy_train_b[energy_train_b["hour"] == hour]
        y = energy_train_bh["meter_reading_log"]
        x = energy_train_bh.drop(labels=["meter_reading_log",
                                        "hour", "building_id"], axis=1)
        model = LinearRegression(fit_intercept=False).fit(x, y)
        energy_lr[building][hour] = model.coef_
        energy_lr[building][hour] = np.append(energy_lr[building][hour], model.intercept_)
print(energy_lr)

[[array([ 3.38191097e-03,  3.95461963e-03, -8.57075583e-03, -1.05252676e-02,
       -9.26515274e-03, -1.04451468e-02,  3.41816689e-03, -3.48623879e-02,
        7.49582434e+00,  7.51744318e+00,  7.54334784e+00,  7.50748348e+00,
        7.51539230e+00,  7.47412682e+00,  7.48098993e+00,  1.90734863e-06,
       -1.85966492e-05,  1.41121447e-04, -2.53804028e-05,  6.44024324e+00,
        6.64801311e+00,  6.72842979e+00,  6.73160887e+00,  6.61245060e+00,
        6.60032368e+00,  6.44431448e+00,  6.32795048e+00,  0.00000000e+00]), array([ 1.70534141e-02,  8.45292583e-04, -7.86584802e-04, -9.30731371e-03,
       -6.61493046e-03, -7.88371917e-03,  1.31731285e-02,  5.43103926e-02,
        3.09538364e+00,  3.13715005e+00,  3.17212534e+00,  3.13154674e+00,
        3.12638569e+00,  3.13580084e+00,  3.09065628e+00,  3.33786011e-05,
        1.99079514e-05,  7.36117363e-05, -4.47630882e-05,  2.57478619e+00,
        2.81421447e+00,  2.88389659e+00,  2.87851596e+00,  2.76864028e+00,
        2.78779602e+0

In [23]:
def calculate_model(x):
    lr = -1
    model = energy_lr[x.building_id][x.hour]
    if len(model) > 0:
        lr = np.sum([x[col]*model[i] for i, col in enumerate(lr_columns[3:])])
        lr += model[len(lr_columns)-3]
        lr = np.exp(lr)
    else:
        lr = 0
    x["meter_reading_lr_q"] = (np.log(x.meter_reading + 1) - 
                               np.log(1 + lr))**2
    return x

energy_test = energy_test.apply(calculate_model,
                               axis=1, result_type="expand")
energy_test_lr_rmsle = np.sqrt(energy_test["meter_reading_lr_q"].sum() / 
                              len(energy_test))
print("Linear regression quality, 20 buildings:", energy_test_lr_rmsle)

Linear regression quality, 20 buildings: 0.1928742338248597
