# Intro

## Initial settings

In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os,random, math, psutil, pickle 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)
import pandas as pd
import seaborn as sns
import gc
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import missingno as msno
from scipy import stats
from scipy.stats import skew
import math

sns.set()
%matplotlib inline

# to make this notebook's output stable across runs
np.random.seed(42)

from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
pd.set_option('max_columns', 150)

py.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

# Import modelling libs
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm
import lightgbm as lgb

from keras.models import Model, load_model, Sequential
from keras.layers import Input, Dropout, Dense, Embedding, SpatialDropout1D, concatenate, BatchNormalization, Flatten, LSTM
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras import backend as K
from keras.losses import mean_squared_error as mse_loss
from keras import optimizers
from keras.optimizers import RMSprop, Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau


from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.tsa.arima_model import ARIMA

Using TensorFlow backend.


### Reducing Memory Size Function

In [2]:
## Function to reduce the DF size

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the smallest dtype that can hold their range.

    The frame is modified in place and the same object is returned, so
    ``df = reduce_mem_usage(df)`` and ``reduce_mem_usage(df)`` are equivalent.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/int32/int64 and float16/float32/float64 columns
        are candidates for downcasting. Columns of any other dtype
        (including int8, unsigned ints, objects, datetimes) are left as is.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after downcasting.

    Notes
    -----
    Float columns are narrowed by value *range* only, so a column stored as
    float16 keeps roughly three significant digits. Skip this helper for
    columns where that loss of precision matters.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column spanning exactly -128..127 fits int8.
            for target in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            # No match (e.g. an empty column whose min/max are NaN): keep the dtype.
        else:
            for target in (np.float16, np.float32):
                limits = np.finfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Out of float32 range, or all-NaN so no comparison succeeds.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

### Description Table

In [None]:
# Folder that is searched for the raw CSV files.
# NOTE(review): this is an absolute, machine-specific path; the same string is
# repeated inside both mergeCSV definitions and the later single-file read, so
# all of them have to be changed together when the data moves.
path = 'C:/Users/al146/OneDrive - Heriot-Watt University/Data/Findhorn/test/'

# Creates a list of every file under `path` whose name contains '.csv'.
files = []
# os.walk yields (root, directories, files) for each folder it visits.
# The names r, d, f and file are deleted explicitly in a later cell, so they
# must keep these exact names.
for r, d, f in os.walk(path):
    for file in f:
        if '.csv' in file:
            files.append(os.path.join(r, file))

# Builds a per-file quality summary (row count, zeros, missing readings) and saves it.
def mergeCSV(fileslist):
    """Summarise each CSV in ``fileslist`` and write the table to 'csv/zero_na_no.csv'.

    For every file one row is produced with:

    * ``file_id``  - the path, with the known folder prefix and date suffix removed
    * ``total_no`` - number of text lines in the file
    * ``zero_no``  - number of cells equal to 0 across all three parsed columns
    * ``na_no``    - number of missing values in the ``meter`` column
    * ``zero%`` / ``na%`` - the two counts as a percentage of ``total_no``

    Parameters
    ----------
    fileslist : iterable of str
        Paths of headerless CSV files with the columns date, time, meter.
        An empty iterable produces a header-only output file.

    Returns
    -------
    None
        The result is only written to disk; the 'csv' folder must already exist.

    Notes
    -----
    A function with the same name is defined again further down in this
    notebook; whichever definition ran last is the one a call resolves to.
    """
    columns = ['file_id', 'total_no', 'zero_no', 'na_no', 'zero%', 'na%']
    rows = []
    for file in fileslist:
        # Count raw lines; the with-block guarantees the handle is closed.
        with open(file) as filecsv:
            total_no = sum(1 for _ in filecsv)
        # read_csv raises on an empty file, so total_no is never 0 below.
        df = pd.read_csv(file, names=['date', 'time', 'meter'])
        zero_no = int((df == 0).sum(axis=1).sum())
        na_no = int(pd.isnull(df['meter']).sum())
        rows.append([file, total_no, zero_no, na_no,
                     (zero_no / total_no) * 100, (na_no / total_no) * 100])

    # One DataFrame construction instead of appending inside the loop.
    result = pd.DataFrame(rows, columns=columns)
    if not result.empty:
        # Strip the folder prefix and the date-range suffix from the ids.
        # Both keys are treated as regular expressions, as before.
        for pattern in ('C:/Users/al146/OneDrive - Heriot-Watt University/Data/Findhorn/test/',
                        '_15Feb2015_28Mar2015.csv'):
            result['file_id'] = result['file_id'].replace({pattern: ''}, regex=True)
    result.to_csv('csv/zero_na_no.csv')

# Write the per-file summary to disk ...
mergeCSV(files)

# ... then load it back and discard the first column, which holds the row
# index that to_csv wrote alongside the data.
zero_na_no = pd.read_csv('csv/zero_na_no.csv')
zero_na_no = zero_na_no.drop(columns=zero_na_no.columns[0])

### Merging the input files

In [None]:
# A function for converting an Excel serial date into a Python datetime.
def read_date(date):
    """Return ``xlrd.xldate.xldate_as_datetime(date, 0)`` for an Excel date value.

    The second argument is xlrd's datemode; 0 is presumably the 1900-based
    workbook epoch - confirm against the xlrd documentation.

    NOTE(review): ``xlrd`` is not among the imports at the top of this
    notebook, so calling this function raises NameError unless the module is
    imported elsewhere. Nothing in the visible cells calls it.
    """
    return xlrd.xldate.xldate_as_datetime(date, 0)

# Merges the per-site CSV files into one long table and saves the clean result.
def mergeCSV(fileslist):
    """Concatenate the meter CSVs in ``fileslist`` and write 'csv/merge_data.csv'.

    Each file is read with its first line skipped, its date and time columns
    are combined into a single ``timestamp`` column, and the originating path
    is stored as ``site_id``. After concatenation the known folder prefix,
    date-range suffix and the 'D_1Ph_' / '_mf' markers are stripped from
    ``site_id``.

    Parameters
    ----------
    fileslist : iterable of str
        Paths of CSV files with the columns date, time, meter reading.
        An empty iterable produces a header-only output file.

    Returns
    -------
    None
        The merged table (columns meter_reading, timestamp, site_id) is only
        written to disk; the 'csv' folder must already exist.

    Raises
    ------
    Whatever ``pd.to_datetime(..., errors='raise')`` raises when a
    date/time pair cannot be parsed.

    Notes
    -----
    This definition replaces the earlier function of the same name in this
    notebook once its cell has run.
    """
    frames = []
    for file in fileslist:
        table = pd.read_csv(file, names=['date', 'time', 'meter_reading'], skiprows=1)
        # The format is left for pandas to work out; the speed-up flag
        # infer_datetime_format is deprecated since pandas 2.0 and is not passed.
        table['timestamp'] = pd.to_datetime(table['date'] + ' ' + table['time'].astype(str),
                                            errors='raise')
        table = table.drop(['time', 'date'], axis=1)
        table['site_id'] = file
        frames.append(table)

    # Concatenate once; appending inside the loop copies the growing frame
    # on every iteration.
    if frames:
        result = pd.concat(frames, ignore_index=True)
    else:
        result = pd.DataFrame(columns=['meter_reading', 'timestamp', 'site_id'])

    if not result.empty:
        # Applied in this order; each key is treated as a regular expression.
        for pattern in ('_15Feb2015_28Mar2015.csv',
                        'C:/Users/al146/OneDrive - Heriot-Watt University/Data/Findhorn/test/',
                        'D_1Ph_',
                        '_mf'):
            result['site_id'] = result['site_id'].replace({pattern: ''}, regex=True)
    result.to_csv('csv/merge_data.csv')
mergeCSV(files)

In [None]:
del r, f, d, file, files, path

### Creating one house data

In [None]:
# Load the raw readings of one house (C32); the first line of the file is skipped.
data_c32 = pd.read_csv(
    'C:/Users/al146/OneDrive - Heriot-Watt University/Data/Findhorn/test/C32_15Feb2015_28Mar2015.csv',
    names=['date', 'time', 'meter_reading'],
    skiprows=1,
)
# Combine the date and time text into one parsed timestamp column.
data_c32['timestamp'] = pd.to_datetime(
    data_c32['date'] + " " + data_c32['time'].astype(str),
    errors='raise',
    infer_datetime_format=True,
)
data_c32 = data_c32.drop(columns=['time', 'date'])
# Note: lower-case id, unlike the upper-case ids used for the merged data.
data_c32['site_id'] = 'c32'
data_c32.head()

### Weather data

In [None]:
weatherdata = pd.read_csv('csv/weather20092019.csv')
weatherdata.columns = weatherdata.columns.str.replace(' ', '')

In [None]:
# filtering to station 132
weatherdata = weatherdata[weatherdata['src_id']==132]

In [None]:
drop_cols = ['Unnamed:0', 'min_grss_temp', 'min_conc_temp', 'min_grss_temp_q', 'min_conc_temp_q', 'meto_stmp_time', 'midas_stmp_etime', 'min_grss_temp_j', 'min_conc_temp_j', 'max_air_temp_q',
            'min_air_temp_q', 'max_air_temp_j', 'min_air_temp_j', 'id_type', 'id', 'ob_hour_count', 'version_num', 'met_domain_name', 'src_id', 'rec_st_ind']
weatherdata = weatherdata.drop(columns=drop_cols)

In [None]:
# Parse the observation end time into a proper datetime column.
weatherdata['date'] = pd.to_datetime(weatherdata['ob_end_time'])

# The temperature columns may contain non-numeric text; those entries become NaN.
weatherdata['max_air_temp'] = pd.to_numeric(weatherdata['max_air_temp'], errors='coerce')
weatherdata['min_air_temp'] = pd.to_numeric(weatherdata['min_air_temp'], errors='coerce')

# Adding a mean column: the average of the min and max temperature only.
# Averaging the whole frame row-wise would pull in any other numeric column
# that happens to remain, and raises on the datetime/text columns in
# pandas >= 2.0.
weatherdata['mean_air_temp'] = weatherdata[['max_air_temp', 'min_air_temp']].mean(axis=1)

In [None]:
weather_df = weatherdata[['date','max_air_temp', 'min_air_temp', 'mean_air_temp']].set_index('date').resample('D').mean().reset_index()

del weatherdata

In [None]:
weather_df.to_pickle('pkl/weather_df.pkl')
del weather_df, drop_cols
gc.collect()

## Loading data

### Reading and cleaning the files

In [None]:
# Load the three inputs, downcasting numeric columns on the way in.
data_df = reduce_mem_usage(pd.read_csv('csv/merge_data.csv'))
holidays_df = reduce_mem_usage(pd.read_csv('csv/UKholidays.csv'))
weather_df = reduce_mem_usage(pd.read_pickle('pkl/weather_df.pkl'))

# The first column is the row index that was written by to_csv.
data_df.drop(data_df.columns[0], axis=1, inplace=True)

data_df['site_id'] = data_df['site_id'].astype('str')
data_df['timestamp'] = pd.to_datetime(data_df['timestamp'], errors='raise')
holidays_df['date'] = pd.to_datetime(holidays_df['date'], errors='raise', infer_datetime_format=True)

# Replace negative readings with the placeholder value 0.09.
# NOTE(review): the earlier comment said "zero" but the code has always
# written 0.09 - confirm which value is intended.
# The result is assigned back to the column: calling mask(..., inplace=True)
# on a column selection does not update the frame under pandas Copy-on-Write.
# data_c32 is the single-house frame created in an earlier cell.
data_df['meter_reading'] = data_df['meter_reading'].mask(data_df['meter_reading'] < 0, 0.09)
data_c32['meter_reading'] = data_c32['meter_reading'].mask(data_c32['meter_reading'] < 0, 0.09)

### Aggregating the Data

In [None]:
# Daily totals per site. Each site's frame is collected in a list and the
# list is concatenated once, instead of appending to a growing frame.
# The loop variables keep the names i and id because the next cell deletes
# them by name (together with data_agg_int and data_agg, so the aggregate
# built here is not kept beyond that cell).
daily_frames = []
for i, id in enumerate(data_df['site_id'].value_counts().index.to_list()):
    data_agg = (data_df.loc[data_df['site_id'] == id, ['timestamp', 'meter_reading']]
                .set_index('timestamp')
                .resample('D')
                .sum()
                .reset_index())
    # Built-in str: the np.str alias was removed in NumPy 1.24.
    data_agg['site_id'] = str(id)
    daily_frames.append(data_agg)

data_agg_int = pd.concat(daily_frames, ignore_index=True) if daily_frames else pd.DataFrame()
del daily_frames

data_agg = data_agg_int.copy()

In [None]:
del i, id, data_agg_int, data_agg

### Adding datetime features

In [None]:
data_df['month'] = data_df['timestamp'].dt.month.astype(np.int8)
data_df['week_of_year'] = data_df['timestamp'].dt.weekofyear.astype(np.int8)
data_df['day_of_year'] = data_df['timestamp'].dt.dayofyear.astype(np.int16)
data_df['hour_of_day'] = data_df['timestamp'].dt.hour.astype(np.int8)  
data_df['day_of_week'] = data_df['timestamp'].dt.dayofweek.astype(np.int8)
data_df['day_of_month'] = data_df['timestamp'].dt.day.astype(np.int8)
data_df['week_of_month'] = data_df['timestamp'].dt.day/7
data_df['week_of_month'] = data_df['week_of_month'].apply(lambda x: math.ceil(x)).astype(np.int8)

In [None]:
data_c32['month'] = data_c32['timestamp'].dt.month.astype(np.int8)
data_c32['week_of_year'] = data_c32['timestamp'].dt.weekofyear.astype(np.int8)
data_c32['day_of_year'] = data_c32['timestamp'].dt.dayofyear.astype(np.int16)
data_c32['hour_of_day'] = data_c32['timestamp'].dt.hour.astype(np.int8)  
data_c32['day_of_week'] = data_c32['timestamp'].dt.dayofweek.astype(np.int8)
data_c32['day_of_month'] = data_c32['timestamp'].dt.day.astype(np.int8)
data_c32['week_of_month'] = data_c32['timestamp'].dt.day/7
data_c32['week_of_month'] = data_c32['week_of_month'].apply(lambda x: math.ceil(x)).astype(np.int8)

### Convert data type

In [None]:
# list of site_ids
data_df.site_id.unique()

In [None]:
# Target dtype for every column of data_df: the id as text, the calendar
# features as plain integers and the reading as a float.
convert_dict = dict(
    site_id=str,
    month=int,
    week_of_year=int,
    day_of_year=int,
    hour_of_day=int,
    day_of_week=int,
    day_of_month=int,
    week_of_month=int,
    meter_reading=float,
)

data_df = data_df.astype(convert_dict)
# Show how many columns ended up with each dtype.
data_df.dtypes.value_counts()

## Data description

In [None]:
data_df.info()
holidays_df.info()
weather_df.info()

In [None]:
# Each label names the frame whose shape is printed.
print('Size of data_df data', data_df.shape)
print('Size of holidays_df data', holidays_df.shape)
print('Size of weather_df data', weather_df.shape)

In [None]:
data_df.head()

In [None]:
data_df.describe()

## Saving the file

In [None]:
# Persist the cleaned frames so later sections can start from the pickles.
# weather_df is not written here; its pickle was created in the weather
# section, before the downcast applied when it was loaded above.
data_df.to_pickle('pkl/data_df.pkl')
holidays_df.to_pickle('pkl/holidays_df.pkl')
data_c32.to_pickle('pkl/data_c32.pkl')

# Drop the in-memory copies; the EDA section reloads what it needs.
del data_df
del holidays_df
del weather_df
del data_c32

gc.collect()

# EDA

In [None]:
# Loading the pickle file
eda_df = pd.read_pickle('pkl/data_df.pkl')
#eda_df.drop(columns=['month', 'week_of_year', 'day_of_year', 'hour_of_day', 'day_of_week', 'day_of_month', 'week_of_month'], inplace=True)

## Plots for non-aggregated data

### Daily and hourly demand for each building

In [None]:
# One panel per site: hourly means (faint blue) with daily means (black) on top.
# The 4x4 grid holds at most 16 sites; more would raise an IndexError.
fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(20, 20))

site_ids = eda_df['site_id'].value_counts().index.to_list()
for i, id in enumerate(site_ids):
    # Panels are filled column by column (i % 4 is the row, i // 4 the column).
    panel = axes[i%4][i//4]
    # Select the site's readings once and reuse them for both resolutions.
    readings = eda_df.loc[eda_df['site_id'] == id, ['timestamp', 'meter_reading']].set_index('timestamp')['meter_reading']
    readings.resample('H').mean().plot(ax=panel, alpha=0.3, label='By Hour', color='blue').set_ylabel('Mean meter reading', fontsize=10);
    readings.resample('D').mean().plot(ax=panel, alpha=1, label='By day', color='black').set_xlabel('');
    panel.grid(color='black', alpha=0.5, linestyle='dashed', linewidth=0.5)
    panel.patch.set_facecolor('white')
    panel.legend();
    panel.set_title(id, fontsize=12);

# Hide panels that received no site (same column-major order as above).
for unused_ax in axes.T.ravel()[len(site_ids):]:
    unused_ax.set_visible(False)

# Vertical spacing is a figure-level setting, so it is applied once.
plt.subplots_adjust(hspace=1)

### Missing data and zeros visualized

In [None]:
# Work on a timestamp-indexed copy of the EDA frame.
train_sns = eda_df.set_index(['timestamp'])

# Replace each site code by its row number in the heat map.
train_sns['site_id'] = train_sns.site_id.replace(['A01', 'A03', 'A18', 'B02', 'B08', 'C02', 'C14', 'C19',
       'C24', 'C32', 'D08', 'D15', 'D20', 'D25', 'F27'], [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14])

f,ax=plt.subplots(1,1,figsize=(10,20))
df = train_sns.copy().reset_index()
# Turn each timestamp into whole hours, counted from the earliest hour present.
df['timestamp'] = df["timestamp"].values.astype('timedelta64[ns]')
df['timestamp'] = pd.to_timedelta(df.timestamp).dt.total_seconds() / 3600
df['timestamp'] = df.timestamp.astype(int)
df['timestamp'] -= df['timestamp'].min()

# Flag rows with no energy reading (1 = reading is missing).
df['nodata'] = df['meter_reading'].isnull().astype(int)

# Sum per hour and site. Only the two columns the map needs are aggregated,
# so they can be addressed by name below; picking them by position would hit
# the calendar feature columns that eda_df also carries.
df = df.groupby(['timestamp', 'site_id'])[['meter_reading', 'nodata']].sum()
df = df.reset_index()

# Rows are the 15 sites of the mapping above, columns are elapsed hours;
# cells without any record stay NaN.
missmap = np.full((15, df.timestamp.max()+1), np.nan)
site_rows = df['site_id'].astype(int).values
hour_cols = df['timestamp'].astype(int).values
is_zero = (df['meter_reading'] == 0).values
is_missing = (df['nodata'] >= 1).values
# 0   -> no value was recorded in that hour
# 0.5 -> readings exist and add up to zero
# 1   -> readings exist and are non-zero
missmap[site_rows, hour_cols] = np.where(is_zero & is_missing, 0, np.where(is_zero, 0.5, 1))

# Define colors
from matplotlib.colors import LinearSegmentedColormap
colors = ('gainsboro', 'gray', 'royalblue')
cmap = LinearSegmentedColormap.from_list('Custom', colors, len(colors))

sns.heatmap(missmap, cmap=cmap, ax=ax, cbar=True, cbar_kws={"shrink": .5})

# Set the colorbar labels
colorbar = ax.collections[0].colorbar
colorbar.set_ticks([0.15, 0.5, 0.85])
colorbar.set_ticklabels(['meter reading with no value', 'meter reading available with zero value', 'meter reading available with non-zero value'])

# Set axis labels
ax.set_ylabel('site_id')    
ax.set_xlabel('hours elapsed since first reading')
ax.set_title('Missing data and zeros visualized')

### Average meter_reading by hour

In [None]:
del train_sns

train_data = eda_df['hour_of_day'].value_counts(dropna=False, normalize=True).sort_index().values
ind = np.arange(len(train_data))
width = 0.5

fig, axes = plt.subplots(1,1,figsize=(14, 6), dpi=100)
tr = axes.bar(ind, train_data, width, color='royalblue')

axes.set_ylabel('Normalized number of observations');
axes.set_xlabel('Hour');
axes.set_xticks(ind + width / 2)
axes.set_xticklabels(eda_df['hour_of_day'].value_counts().sort_index().index, rotation=0)
axes2 = axes.twinx()
mr = axes2.plot(ind, eda_df[['hour_of_day', 'meter_reading']].groupby('hour_of_day')['meter_reading'].mean().sort_index().values, 'D-', color='tab:orange', label='Mean meter reading');
axes2.grid(False);
axes2.tick_params(axis='y', labelcolor='tab:orange');
axes2.set_ylabel('Mean meter reading by hour', color='tab:orange');
axes.legend([tr], ['Train'], facecolor='white');
axes2.legend(loc=2, facecolor='white');

### Average meter_reading by day

In [None]:
train_data = eda_df['day_of_month'].value_counts(dropna=False, normalize=True).sort_index().values
ind = np.arange(len(train_data))
width = 0.35

fig, axes = plt.subplots(1,1,figsize=(14, 6), dpi=100)
tr = axes.bar(ind, train_data, width, color='royalblue')

axes.set_ylabel('Normalized number of observations');
axes.set_xlabel('day');
axes.set_xticks(ind + width / 2)
axes.set_xticklabels(eda_df['day_of_month'].value_counts().sort_index().index, rotation=0)
axes2 = axes.twinx()
mr = axes2.plot(ind, eda_df[['day_of_month', 'meter_reading']].groupby('day_of_month')['meter_reading'].mean().sort_index().values, 'D-', color='tab:orange', label='Mean meter reading');
axes2.grid(False);
axes2.tick_params(axis='y', labelcolor='tab:orange');
axes2.set_ylabel('Mean meter reading by day of month', color='tab:orange');
axes.legend([tr], ['Train'], facecolor='white');
axes2.legend(loc=2, facecolor='white');

### Average meter_reading by weekday

In [None]:
train_data = eda_df['day_of_week'].value_counts(dropna=False, normalize=True).sort_index().values
ind = np.arange(len(train_data))
width = 0.4

fig, axes = plt.subplots(1,1,figsize=(14, 6), dpi=100)
tr = axes.bar(ind, train_data, width, color='royalblue')

axes.set_ylabel('Normalized number of observations');
axes.set_xlabel('weekday');
axes.set_xticks(ind + width / 2)
axes2 = axes.twinx()
mr = axes2.plot(ind, eda_df[['day_of_week', 'meter_reading']].groupby('day_of_week')['meter_reading'].mean().sort_index().values, 'D-', color='tab:orange', label='Mean meter reading');
axes2.grid(False);
axes2.tick_params(axis='y', labelcolor='tab:orange');
axes2.set_ylabel('Mean meter reading by day of week', color='tab:orange');
axes.legend([tr], ['Train'], facecolor='white');
axes2.legend(loc=2, facecolor='white');

axes.set_xticklabels(eda_df['day_of_week'].value_counts().sort_index().index.map({0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}), rotation=40);

### Average meter_reading by site_id

In [None]:
train_data = eda_df['site_id'].value_counts(dropna=False, normalize=True).sort_index().values
ind = np.arange(len(train_data))
width = 0.4

fig, axes = plt.subplots(1,1,figsize=(18, 6), dpi=100)
tr = axes.bar(ind, train_data, width, color='royalblue')

axes.set_ylabel('Normalized number of observations');
axes.set_xlabel('site_id');
axes.set_xticks(ind + width / 2)
axes.set_xticklabels(eda_df['site_id'].value_counts().sort_index().index, rotation=0)
axes2 = axes.twinx()
mr = axes2.plot(ind, eda_df[['site_id', 'meter_reading']].groupby('site_id')['meter_reading'].mean().sort_index().values, 'D-', color='tab:orange', label='Mean meter reading');
axes2.grid(False);
axes2.tick_params(axis='y', labelcolor='tab:orange');
axes2.set_ylabel('Mean meter reading by site_id', color='tab:orange');
axes.legend([tr], ['Train'], facecolor='white');
axes2.legend(loc=2, facecolor='white');

### Box plot of meter_reading by site_id

In [None]:
# Distribution of readings per site; outlier points are not drawn.
fig, axes = plt.subplots(1, 1, figsize=(18, 6))
sns.boxplot(data=eda_df, x='site_id', y='meter_reading', showfliers=False, ax=axes);

### Number of observations by day

In [None]:
fig, axes = plt.subplots(1, 1, figsize=(14, 6))
eda_df['timestamp'].dt.floor('d').value_counts().sort_index().plot(ax=axes).set_xlabel('Date', fontsize=14);
axes.set_title('Number of observations by day', fontsize=16);
axes.legend(['Train']);

### Amount of data and NaNs

In [None]:
# Null count per column, computed once and reused for both table columns.
null_counts = eda_df.isnull().sum()
total = null_counts.sort_values(ascending=False)
# Same counts as a percentage of the number of rows.
percent = (null_counts / eda_df.isnull().count() * 100).sort_values(ascending=False)
missing__train_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing__train_data.head(10)

In [None]:
# Share of non-null values per column (target excluded), sorted ascending.
# Computed once so the bar heights and the tick labels come from the same series.
availability = (eda_df.count() / len(eda_df)).drop('meter_reading').sort_values()
train_data = availability.values
ind = np.arange(len(train_data))
width = 0.35

fig, axes = plt.subplots(1,1,figsize=(14, 6), dpi=100)
tr = axes.bar(ind, train_data, width, color='royalblue')

axes.set_ylabel('Amount of data available');
# matplotlib (>= 2.0) centres bars on ind by default, so the ticks go on ind
# as well; ind + width / 2 would push each label to the bar's right edge.
axes.set_xticks(ind)
axes.set_xticklabels(availability.index, rotation=40)
axes.legend([tr], ['Train']);

In [None]:
msno.matrix(eda_df.head(200), figsize=(15,4))

In [None]:
msno.bar(eda_df.head(200), figsize=(15,4))

In [None]:
a = msno.heatmap(eda_df, sort='ascending', figsize=(15,4))

In [None]:
msno.dendrogram(eda_df, figsize=(15,4))

## Examine the Distribution of the Target Column

In [None]:
plt.figure(figsize = (15,5))
eda_df['meter_reading'].plot()

In [None]:
eda_df['meter_reading'].plot(kind='hist',
                            bins=10,
                            figsize=(10, 5),
                           title='Distribution of Target Variable (meter_reading)')
plt.show()

In [None]:
#Target's log-log histogram:

ax = np.log1p(eda_df['meter_reading']).hist()
ax.set_yscale('log')
eda_df.meter_reading.describe()

## Outlier analysis

In [None]:
plt.figure(figsize=(10,3))
_ = stats.probplot(eda_df['meter_reading'], fit=True, rvalue=True, plot=plt)
#plt.title("Probability plot for meter_reading shows extreme skewness")
plt.show()

In [None]:
# Kernel density estimate of the target, raw and after log1p.
# sns.kdeplot draws the same curve that the deprecated
# sns.distplot(..., hist=False) produced. Missing values are dropped first
# and the legend is switched off to match what distplot showed.
for values, title, xlabel in (
    (eda_df.meter_reading, "Energy demand", "Measured consumption"),
    (np.log1p(eda_df.meter_reading), "Log transform of energy demand", "log(1 + measured consumption)"),
):
    plt.figure(figsize=(10,3))
    sns.kdeplot(values.dropna(), legend=False)
    plt.title(title)
    # The curve is a density estimate, not a count of readings.
    plt.ylabel("Density")
    plt.xlabel(xlabel)
    plt.tight_layout()
    plt.show()

In [None]:
y_mean_time = eda_df.groupby('timestamp').meter_reading.mean()
y_mean_time.plot(figsize=(15, 3))

In [None]:
# Rolling standard deviation (window of 10 samples) of the per-timestamp mean reading.
y_mean_time.rolling(window=10).std().plot(figsize=(15, 3))
# Horizontal reference line at 0.1; the visible code does not say where this threshold comes from.
plt.axhline(y=0.1, color='red')
# NOTE(review): the x axis of this plot is time-based, so a span from 0 to 23
# is unlikely to cover "the first 24 samples" - confirm the intended units.
plt.axvspan(0, 23, color='green', alpha=0.1)

In [None]:
# Site codes in the order that defines their numeric id (position in the list).
site_codes = ['A01', 'A03', 'A18', 'B02', 'B08', 'C02', 'C14', 'C19',
              'C24', 'C32', 'D08', 'D15', 'D20', 'D25', 'F27']

# Copy of the EDA frame with the site codes replaced by those numeric ids.
train_df2 = eda_df.copy()
train_df2['site_id'] = train_df2.site_id.replace(site_codes, list(range(len(site_codes))))

# Daily total consumption for the first four sites, one figure each.
for bldg_id in range(4):
    plt.figure(figsize=(15,3))
    tmp_df = train_df2.loc[train_df2.site_id == bldg_id].set_index("timestamp")
    tmp_df.resample("D").meter_reading.sum().plot()
    plt.title(f"Meter readings for building #{bldg_id} ")
    plt.xlabel("time")
    plt.ylabel("meter_reading")
    plt.tight_layout()
    plt.show()

## Delete eda_df

In [None]:
del eda_df