In [1]:
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
buildings = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz")
weather = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz")
energy = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/train.0.csv.gz")

In [3]:
energy = energy[energy["building_id"]==0]

In [4]:
energy = pd.merge(left=energy, right=buildings, how="left",
                  left_on="building_id", right_on="building_id")
energy = energy.set_index(["timestamp", "site_id"])
weather = weather.set_index(["timestamp", "site_id"])
energy = pd.merge(left=energy, right=weather, how="left",
                  left_index=True, right_index=True)
energy.reset_index(inplace=True)
energy = energy.drop(columns=["meter", "site_id", "floor_count"], axis=1)

del buildings
del weather
print(energy.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   timestamp           8784 non-null   object 
 1   building_id         8784 non-null   int64  
 2   meter_reading       8784 non-null   float64
 3   primary_use         8784 non-null   object 
 4   square_feet         8784 non-null   int64  
 5   year_built          8784 non-null   float64
 6   air_temperature     8781 non-null   float64
 7   cloud_coverage      4954 non-null   float64
 8   dew_temperature     8781 non-null   float64
 9   precip_depth_1_hr   8783 non-null   float64
 10  sea_level_pressure  8699 non-null   float64
 11  wind_direction      8534 non-null   float64
 12  wind_speed          8784 non-null   float64
dtypes: float64(9), int64(2), object(2)
memory usage: 892.3+ KB
None


In [5]:
def reduce_memory_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type)[:5] == 'float':
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.finfo('f2').min and c_max < np.finfo('f2').max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo('f4').min and c_max < np.finfo('f4').max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        elif str(col_type)[:3] == 'int':
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.iinfo('i1').min and c_max < np.iinfo('i1').max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo('i2').min and c_max < np.iinfo('i2').max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo('i4').min and c_max < np.iinfo('i4').max:
                df[col] = df[col].astype(np.int32)
            else:
                df[col] = df[col].astype(np.int64)
        elif col == 'timestamp':
            df[col] = pd.to_datetime(df[col])
        elif str(col_type)[:8] != 'datetime':
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print("Data usage is less about", round(start_mem - end_mem, 2), "Mb (-", round(100* (start_mem - end_mem) / start_mem, 1), "%)")
    return df

In [6]:
energy = reduce_memory_usage(energy)

Data usage is less about 0.62 Mb (- 71.1 %)


In [7]:
energy["precip_depth_1_hr"] = energy["precip_depth_1_hr"].apply(lambda x: 0 if x < 0 else x)
interpolate_columns = ["air_temperature", "dew_temperature", 
                       "cloud_coverage", "wind_speed", "precip_depth_1_hr", 
                       "sea_level_pressure"]
for col in interpolate_columns:
    energy[col] = energy[col].interpolate(limit_direction='both',
                                          kind='cubic')

In [8]:
pd.set_option("use_inf_as_na", True)
for col in interpolate_columns:
    print(col, "Inf+NaN:", energy[col].isnull().sum())

air_temperature Inf+NaN: 0
dew_temperature Inf+NaN: 0
cloud_coverage Inf+NaN: 0
wind_speed Inf+NaN: 0
precip_depth_1_hr Inf+NaN: 0
sea_level_pressure Inf+NaN: 0


  pd.set_option("use_inf_as_na", True)


In [9]:
energy_train, energy_test = train_test_split(energy[energy["meter_reading"]>0], test_size=0.2)
print(energy_train.head())

               timestamp  building_id  meter_reading primary_use  square_feet  \
6698 2016-10-06 02:00:00            0        243.625   Education         7432   
5916 2016-09-03 12:00:00            0        232.125   Education         7432   
7665 2016-11-15 09:00:00            0        217.000   Education         7432   
4421 2016-07-03 05:00:00            0        245.750   Education         7432   
3993 2016-06-15 09:00:00            0        285.250   Education         7432   

      year_built  air_temperature  cloud_coverage  dew_temperature  \
6698      2008.0        26.093750        4.000000        23.296875   
5916      2008.0        23.906250        6.000000        23.296875   
7665      2008.0        15.601562        7.000000        12.796875   
4421      2008.0        23.906250        4.332031        22.796875   
3993      2008.0        25.000000        0.000000        23.296875   

      precip_depth_1_hr  sea_level_pressure  wind_direction  wind_speed  
6698              

  has_large_values = (abs_vals > 1e6).any()


In [10]:
regression_columns = ["meter_reading", "air_temperature", "dew_temperature", 
                       "cloud_coverage", "wind_speed", "precip_depth_1_hr", 
                       "sea_level_pressure"]
energy_train_lr = pd.DataFrame(energy_train, columns=regression_columns)
y = energy_train_lr["meter_reading"]
x = energy_train_lr.drop(labels=["meter_reading"], axis=1)
model = LinearRegression().fit(x, y)
print(model.coef_, model.intercept_)

[ 2.68365936  3.61521833 -2.23550463 -1.97167689  0.14664558 -0.98427396] 1117.7261423710365


In [11]:
def calculate_model (x):
    lr = np.sum([x[col] * model.coef_[i] for i,col in enumerate(regression_columns[1:])])
    lr += model.intercept_
    x["meter_reading_lr_q"] = (np.log(1 + x.meter_reading) -
                               np.log(1 + lr))**2
    return x

energy_test = energy_test.apply(calculate_model,
                                    axis=1, result_type="expand")
energy_test_lr_rmsle = np.sqrt(energy_test["meter_reading_lr_q"].sum() / len(energy_test))
print ("Linear regression quality:", energy_test_lr_rmsle, round(energy_test_lr_rmsle, 1))

Linear regression quality: 0.22334807000314832 0.2
