In [1]:
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import numpy as np
from scipy.interpolate import interp1d
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, BayesianRidge

In [2]:
def reduce_memory_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type)[:5] == 'float':
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.finfo('f2').min and c_max < np.finfo('f2').max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo('f4').min and c_max < np.finfo('f4').max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        elif str(col_type)[:3] == 'int':
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.iinfo('i1').min and c_max < np.iinfo('i1').max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo('i2').min and c_max < np.iinfo('i2').max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo('i4').min and c_max < np.iinfo('i4').max:
                df[col] = df[col].astype(np.int32)
            else:
                df[col] = df[col].astype(np.int64)
        elif col == 'timestamp':
            df[col] = pd.to_datetime(df[col])
        elif str(col_type)[:8] != 'datetime':
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print("Data usage is less about", round(start_mem - end_mem, 2), "Mb (-", round(100* (start_mem - end_mem) / start_mem, 1), "%)")
    return df

In [3]:
buildings = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz")
weather = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz")
energy = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/train.0.csv.gz")
energy = energy[(energy["building_id"]<20)]
energy = pd.merge(left=energy, right=buildings, how="left",
                   left_on="building_id", right_on="building_id")
energy = energy.set_index(["timestamp", "site_id"])
weather = weather.set_index(["timestamp", "site_id"])
energy = pd.merge(left=energy, right=weather, how="left",
                  left_index=True, right_index=True)
energy.reset_index(inplace=True)
energy = energy.drop(columns=["meter", "year_built",
                              "square_feet", "floor_count"], axis=1)
del buildings
del weather
energy = reduce_memory_usage(energy)
print (energy.info())

Data usage is less about 11.56 Mb (- 71.9 %)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175680 entries, 0 to 175679
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   timestamp           175680 non-null  datetime64[ns]
 1   site_id             175680 non-null  int8          
 2   building_id         175680 non-null  int8          
 3   meter_reading       175680 non-null  float16       
 4   primary_use         175680 non-null  category      
 5   air_temperature     175620 non-null  float16       
 6   cloud_coverage      99080 non-null   float16       
 7   dew_temperature     175620 non-null  float16       
 8   precip_depth_1_hr   175660 non-null  float16       
 9   sea_level_pressure  173980 non-null  float16       
 10  wind_direction      170680 non-null  float16       
 11  wind_speed          175680 non-null  float16       
dtypes: category(1), datetime64[ns](1), float1

In [4]:
energy["hour"] = energy["timestamp"].dt.hour.astype("int8")
energy["weekday"] = energy["timestamp"].dt.weekday.astype("int8")
for weekday in range(0,7):
    energy['is_wday' + str(weekday)] = energy['weekday'].isin([weekday]).astype("int8")
energy["date"] = pd.to_datetime(energy["timestamp"].dt.date)
dates_range = pd.date_range(start='2015-12-31', end='2017-01-01')
us_holidays = calendar().holidays(start=dates_range.min(),
                                  end=dates_range.max())
energy['is_holiday'] = energy['date'].isin(us_holidays).astype("int8")
energy["meter_reading_log"] = np.log(energy["meter_reading"] + 1)

In [5]:
energy_train, energy_test = train_test_split(energy[(energy["meter_reading"]>0)], test_size=0.2)
print (energy_train.info())

<class 'pandas.core.frame.DataFrame'>
Index: 86858 entries, 68880 to 80119
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   timestamp           86858 non-null  datetime64[ns]
 1   site_id             86858 non-null  int8          
 2   building_id         86858 non-null  int8          
 3   meter_reading       86858 non-null  float16       
 4   primary_use         86858 non-null  category      
 5   air_temperature     86858 non-null  float16       
 6   cloud_coverage      50554 non-null  float16       
 7   dew_temperature     86858 non-null  float16       
 8   precip_depth_1_hr   86858 non-null  float16       
 9   sea_level_pressure  86417 non-null  float16       
 10  wind_direction      84066 non-null  float16       
 11  wind_speed          86858 non-null  float16       
 12  hour                86858 non-null  int8          
 13  weekday             86858 non-null  int8       

In [6]:
from sklearn.metrics import r2_score

In [7]:
hours = range(0, 24)
buildings = range(0, energy_train["building_id"].max() + 1)
lr_columns = ["meter_reading_log", "hour", "building_id", "is_holiday"]
for wday in range(0, 7):
    lr_columns.append("is_wday" + str(wday))

In [8]:
lr_models = {
    "LinearRegression": LinearRegression,
    "Lasso-0.01": Lasso,
    "Lasso-0.1": Lasso,
    "Lasso-1.0": Lasso,
    "Ridge-0.01": Ridge,
    "Ridge-0.1": Ridge,
    "Ridge-1.0": Ridge,
    "ElasticNet-1-1": ElasticNet,
    "ElasticNet-0.1-1": ElasticNet,
    "ElasticNet-1-0.1": ElasticNet,
    "ElasticNet-0.1-0.1": ElasticNet,
    "BayesianRidge": BayesianRidge
}
energy_train_lr = pd.DataFrame(energy_train, columns=lr_columns)

In [14]:
lr_models_scores = {}
for _ in lr_models:
    lr_model = lr_models[_]
    energy_lr_scores = [[]]*len(buildings)
    for building in buildings:
        energy_lr_scores[building] = [0]*len(hours)
        energy_train_b = energy_train_lr[energy_train_lr["building_id"] == building]
        for hour in hours:
            energy_train_bh = energy_train_b[energy_train_b["hour"] == hour]
            y = energy_train_bh["meter_reading_log"]
            x = energy_train_bh.drop(labels=["meter_reading_log", 
                                    "hour", "building_id"], axis=1)
            if _ in ["Ridge-0.1", "Lasso-0.1"]:
                model = lr_model(alpha=.1, fit_intercept=False).fit(x, y)
            elif _ in ["Ridge-0.01", "Lasso-0.01"]:
                model = lr_model(alpha=.01, fit_intercept=False).fit(x, y)
            elif _ == "ElasticNet-1-1":
                model = lr_model(alpha=1, l1_ratio=1, fit_intercept=False).fit(x, y)
            elif _ == "ElasticNet-1-0.1":
                model = lr_model(alpha=1, l1_ratio=.1, fit_intercept=False).fit(x, y)
            elif _ == "ElasticNet-0.1-1":
                model = lr_model(alpha=.1, l1_ratio=1, fit_intercept=False).fit(x, y)
            elif _ == "ElasticNet-0.1-0.1":
                model = lr_model(alpha=.1, l1_ratio=.1, fit_intercept=False).fit(x, y)
            else:
                model = lr_model(fit_intercept=False).fit(x, y)
            energy_lr_scores[building][hour] = r2_score(y, model.predict(x))
    lr_models_scores[_] = np.mean(energy_lr_scores)
print(lr_models_scores)

{'LinearRegression': np.float64(0.13212598553174684), 'Lasso-0.01': np.float64(-0.1889712145494823), 'Lasso-0.1': np.float64(-30.987507611291154), 'Lasso-1.0': np.float64(-2415.3274100496233), 'Ridge-0.01': np.float64(0.13171559427374263), 'Ridge-0.1': np.float64(0.09144253842049516), 'Ridge-1.0': np.float64(-3.616500969413656), 'ElasticNet-1-1': np.float64(-2415.3274100496233), 'ElasticNet-0.1-1': np.float64(-30.987507611291154), 'ElasticNet-1-0.1': np.float64(-1998.5796497317942), 'ElasticNet-0.1-0.1': np.float64(-399.76709211983325), 'BayesianRidge': np.float64(0.13211608242999465)}


In [18]:
# Перевірка моделей LR, Ridge, BayesianRidge
energy_lr = []
energy_ridge = []
energy_br = []
for building in buildings:
    energy_lr.append([])
    energy_ridge.append([])
    energy_br.append([])
    energy_train_b = energy_train_lr[energy_train_lr["building_id"] == building]
    for hour in hours:
        energy_lr[building].append([0]*(len(lr_columns) - 3))
        energy_ridge[building].append([0]*(len(lr_columns) - 3))
        energy_br[building].append([0]*(len(lr_columns) - 3))
        energy_train_bh = energy_train_b[energy_train_b["hour"] == hour]
        y = energy_train_bh["meter_reading_log"]
        if len(y) > 0:
            x = energy_train_bh.drop(labels=["meter_reading_log", 
                        "hour", "building_id"], axis=1)
            model = LinearRegression(fit_intercept=False).fit(x, y)
            energy_lr[building][hour] = model.coef_
            model = Ridge(alpha=.01, fit_intercept=False).fit(x, y)
            energy_ridge[building][hour] = model.coef_
            model = BayesianRidge(fit_intercept=False).fit(x, y)
            energy_br[building][hour] = model.coef_
print(energy_lr[0][0])  
print(energy_ridge[0][0])  
print(energy_br[0][0])  

[-0.07673821  5.43768403  5.4458912   5.49464699  5.45166328  5.4244278
  5.44155093  5.43708147]
[-0.07415801  5.43509742  5.44387495  5.49261269  5.44938032  5.42232397
  5.43953628  5.43514035]
[-0.07609063  5.43703548  5.44538582  5.49413709  5.45109099  5.42390043
  5.44104595  5.43659493]
