In [2]:
import pandas as pd 

In [11]:
# Load the dataset from the zipped CSV; pandas infers the compression from
# the ".zip" extension, so no explicit unzip step is needed.
df = pd.read_csv('train_final.zip')

In [13]:
def _low_mem_mgmt(df):
    df[['meter','site_id','building_id','square_feet']] = df[['meter','site_id', 'building_id', 'square_feet']].astype('int32')

    # Change the dtypes (float64 -> float32)
    df[['meter_reading', 'year_built','floor_count', 'air_temperature', 'cloud_coverage',
        'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure',
        'wind_direction','wind_speed']] = df[['meter_reading', 'year_built',
        'floor_count', 'air_temperature', 'cloud_coverage',
        'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure',
        'wind_direction','wind_speed']].astype('float32')
    return df

In [14]:
df = _low_mem_mgmt(df)

In [34]:
# The plotting helpers further down set 'timestamp' as the index and call
# resample('D'), which requires a datetime index, so parse the column here.
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [15]:
df.head()

Unnamed: 0,meter,timestamp,meter_reading,site_id,building_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,0.0,0,0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.700012,0.0,0.0
1,0,2016-01-01 00:00:00,0.0,0,1,Education,2720,2004.0,,25.0,6.0,20.0,,1019.700012,0.0,0.0
2,0,2016-01-01 00:00:00,0.0,0,2,Education,5376,1991.0,,25.0,6.0,20.0,,1019.700012,0.0,0.0
3,0,2016-01-01 00:00:00,0.0,0,3,Education,23685,2002.0,,25.0,6.0,20.0,,1019.700012,0.0,0.0
4,0,2016-01-01 00:00:00,0.0,0,4,Education,116607,1975.0,,25.0,6.0,20.0,,1019.700012,0.0,0.0


# Site_ID Analysis

In [19]:
# Number of non-null 'meter' rows recorded for each (site, building) pair.
df.groupby(['site_id', 'building_id'])[['meter']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,meter
site_id,building_id,Unnamed: 2_level_1
0,0,8784
0,1,8784
0,2,8784
0,3,8784
0,4,8784
0,5,8784
0,6,8784
0,7,16051
0,8,8784
0,9,16049


# Site_Wise Meter_Reading
Meter type codes: {0: electricity, 1: chilledwater, 2: steam, 3: hotwater}

In [23]:
import numpy as np

In [26]:
# Total meter_reading per site (all meter types combined).
df.groupby(['site_id'])[['meter_reading']].sum()

Unnamed: 0_level_0,meter_reading
site_id,Unnamed: 1_level_1
0,592014200.0
1,82744260.0
2,465040800.0
3,298441000.0
4,135226600.0
5,24468690.0
6,662008300.0
7,446404700.0
8,37333490.0
9,704847900.0


# Missing Values

In [28]:
# Fraction of rows that are missing, per column.
df.isna().sum().div(len(df))

meter                 0.000000
timestamp             0.000000
meter_reading         0.000000
site_id               0.000000
building_id           0.000000
primary_use           0.000000
square_feet           0.000000
year_built            0.599900
floor_count           0.826528
air_temperature       0.004781
cloud_coverage        0.436551
dew_temperature       0.004953
precip_depth_1_hr     0.185447
sea_level_pressure    0.060925
wind_direction        0.071678
wind_speed            0.007107
dtype: float64

Dropping `year_built` and `floor_count`, which are missing in roughly 60% and 83% of rows respectively.

In [29]:
# Remove the two mostly-empty columns. Rebinding with errors='ignore'
# (instead of inplace=True) keeps this cell safe to re-run: a second
# execution is a no-op rather than a KeyError on the already-dropped columns.
df = df.drop(columns=['year_built', 'floor_count'], errors='ignore')

In [30]:
df.head()

Unnamed: 0,meter,timestamp,meter_reading,site_id,building_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,0.0,0,0,Education,7432,25.0,6.0,20.0,,1019.700012,0.0,0.0
1,0,2016-01-01 00:00:00,0.0,0,1,Education,2720,25.0,6.0,20.0,,1019.700012,0.0,0.0
2,0,2016-01-01 00:00:00,0.0,0,2,Education,5376,25.0,6.0,20.0,,1019.700012,0.0,0.0
3,0,2016-01-01 00:00:00,0.0,0,3,Education,23685,25.0,6.0,20.0,,1019.700012,0.0,0.0
4,0,2016-01-01 00:00:00,0.0,0,4,Education,116607,25.0,6.0,20.0,,1019.700012,0.0,0.0


# Visualization

In [94]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(rc={'figure.figsize':(11.7,8.27)})

# Precipitation Viz

In [130]:
def _create_precip_plot():
    """Save a grid of daily precipitation totals, one panel per site.

    For every distinct ``site_id`` in the notebook-level ``df``, the
    ``precip_depth_1_hr`` values are summed per calendar day and drawn on
    their own subplot. The figure is written to ``visual/precip.png``
    (the directory is created if it does not exist).

    ``df['timestamp']`` must already be datetime-typed, because
    ``resample('D')`` needs a datetime index.

    Returns
    -------
    str
        The literal ``'DONE'`` once the image has been written.
    """
    import os

    out_path = "visual/precip.png"
    n_cols = 4
    site_ids = sorted(df['site_id'].unique())
    # Keep the 4x4 layout, but add rows (ceil division) instead of failing
    # with an IndexError if the data ever holds more than 16 sites.
    n_rows = max(4, -(-len(site_ids) // n_cols))

    # Size this figure explicitly rather than changing the global seaborn /
    # matplotlib defaults or switching the backend, so later plots in the
    # notebook are unaffected.
    fig, axs = plt.subplots(ncols=n_cols, nrows=n_rows, figsize=(40, 10 * n_rows))
    try:
        # Panels are filled in ascending site order, so ids do not have to
        # be valid positions in the grid.
        for ax, site in zip(axs.flat, site_ids):
            daily_total = (
                df.loc[df['site_id'] == site, ['timestamp', 'precip_depth_1_hr']]
                .set_index('timestamp')['precip_depth_1_hr']
                .resample('D')
                .sum()  # NaN readings are skipped, so a day with no data plots as 0
            )
            ax.set_title(f'Site_ID_{site}', fontdict={'fontsize': 30, 'fontweight': 'bold'})
            ax.plot(daily_total)

        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        fig.savefig(out_path)
    finally:
        # Free the large figure; closing it also keeps the inline backend
        # from rendering it under the cell.
        plt.close(fig)
    return 'DONE'

In [131]:
_create_precip_plot()

'DONE'

# Air Temperature Viz

In [134]:
def _create_air_temp_plot():
    """Save a grid of daily mean air temperature, one panel per site.

    For every distinct ``site_id`` in the notebook-level ``df``, the
    ``air_temperature`` values are averaged per calendar day and drawn on
    their own subplot. The figure is written to ``visual/air_temp.png``
    (the directory is created if it does not exist).

    ``df['timestamp']`` must already be datetime-typed, because
    ``resample('D')`` needs a datetime index.

    Returns
    -------
    str
        The literal ``'DONE'`` once the image has been written.
    """
    import os

    out_path = "visual/air_temp.png"
    n_cols = 4
    site_ids = sorted(df['site_id'].unique())
    # Keep the 4x4 layout, but add rows (ceil division) instead of failing
    # with an IndexError if the data ever holds more than 16 sites.
    n_rows = max(4, -(-len(site_ids) // n_cols))

    # Size this figure explicitly rather than changing the global seaborn /
    # matplotlib defaults or switching the backend, so later plots in the
    # notebook are unaffected.
    fig, axs = plt.subplots(ncols=n_cols, nrows=n_rows, figsize=(40, 10 * n_rows))
    try:
        # Panels are filled in ascending site order, so ids do not have to
        # be valid positions in the grid.
        for ax, site in zip(axs.flat, site_ids):
            daily_mean = (
                df.loc[df['site_id'] == site, ['timestamp', 'air_temperature']]
                .set_index('timestamp')['air_temperature']
                .resample('D')
                .mean()
            )
            ax.set_title(f'Site_ID_{site}', fontdict={'fontsize': 30, 'fontweight': 'bold'})
            ax.plot(daily_mean)

        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        fig.savefig(out_path)
    finally:
        # Free the large figure; closing it also keeps the inline backend
        # from rendering it under the cell.
        plt.close(fig)
    return 'DONE'

In [135]:
_create_air_temp_plot()

'DONE'

# Meter and Meter_Reading Viz

In [136]:
def _create_meter_meterreading_plot():
    """Save a grid of daily mean meter readings, one panel per meter type.

    For every distinct ``meter`` code in the notebook-level ``df``, the
    ``meter_reading`` values are averaged per calendar day and drawn on
    their own subplot. The figure is written to
    ``visual/meter_reading.png`` (the directory is created if it does not
    exist).

    ``df['timestamp']`` must already be datetime-typed, because
    ``resample('D')`` needs a datetime index.

    Returns
    -------
    str
        The literal ``'DONE'`` once the image has been written.
    """
    import os

    out_path = "visual/meter_reading.png"
    n_cols = 2
    meter_ids = sorted(df['meter'].unique())
    # Keep the 2x2 layout, but add rows (ceil division) instead of failing
    # with an IndexError if the data ever holds more than 4 meter codes.
    n_rows = max(2, -(-len(meter_ids) // n_cols))

    # Size this figure explicitly rather than changing the global seaborn /
    # matplotlib defaults or switching the backend, so later plots in the
    # notebook are unaffected.
    fig, axs = plt.subplots(ncols=n_cols, nrows=n_rows, figsize=(40, 20 * n_rows))
    try:
        # Panels are filled in ascending meter order, so codes do not have
        # to be valid positions in the grid.
        for ax, meter in zip(axs.flat, meter_ids):
            daily_mean = (
                df.loc[df['meter'] == meter, ['timestamp', 'meter_reading']]
                .set_index('timestamp')['meter_reading']
                .resample('D')
                .mean()
            )
            ax.set_title(f'Meter_{meter}', fontdict={'fontsize': 30, 'fontweight': 'bold'})
            ax.plot(daily_mean)

        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        fig.savefig(out_path)
    finally:
        # Free the large figure; closing it also keeps the inline backend
        # from rendering it under the cell.
        plt.close(fig)
    return 'DONE'

In [137]:
_create_meter_meterreading_plot()

'DONE'