# Merging & imputing missing weather data

## Import libraries

In [None]:
''' importing basic data analysis packages'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os,random, math, psutil, pickle 
import missingno as msno
from datetime import timedelta 

''' For ML'''
from sklearn import metrics, svm
from sklearn.linear_model  import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn import preprocessing
from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer

## Reading in data

In [None]:
# Reading in weather_train_df
weather_train_df = pd.read_csv('weather_train.csv')
weather_test_df = pd.read_csv('weather_test.csv')
# Reading in timezone data
tz = pd.read_csv('time_zones.csv')

In [3]:
# adding one more day of data from weather test to weather train to help with timezone adjustment
weather_test_df['timestamp'] = pd.to_datetime(weather_test_df['timestamp'])
weather_test_df['date']  = weather_test_df['timestamp'].dt.date
weather_test_selection = weather_test_df.loc[weather_test_df.date == weather_test_df['date'][0]]
weather_test_selection.drop(columns = ['date'])
frames = [weather_train_df, weather_test_selection]
weather_train_extra = pd.concat(frames)
weather_train_extra= weather_train_extra.drop(columns = ['date'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


In [4]:
tz.head(5)

Unnamed: 0,site_id,timezone,country_code,location
0,0,US/Eastern,US,"Orlando, FL"
1,1,Europe/London,UK,"UK, Southampton"
2,2,US/Mountain,US,"Tempe, AZ"
3,3,US/Eastern,US,"Washington, WA"
4,4,US/Pacific,US,"San Francisco, CA"


In [5]:
# merging weather data with tz data
weather_train_tz = weather_train_extra.merge(tz, on=['site_id'], how ='left')

In [6]:
weather_train_tz['timestamp'] = pd.to_datetime(weather_train_tz['timestamp'])
# Adding in daylight saving data
weather_train_tz['dst'] = 0

# 2016

weather_train_tz.loc[((weather_train_tz['timezone'] == 'US/Eastern') & 
                 (weather_train_tz['timestamp'] >= '2016-03-13 02:00:00') & 
                 (weather_train_tz['timestamp'] < '2016-11-06 01:00:00')), 'dst'] = 1
weather_train_tz.loc[((weather_train_tz['timezone'] == 'US/Mountain') & 
                 (weather_train_tz['timestamp'] >= '2016-03-13 02:00:00') & 
                 (weather_train_tz['timestamp'] < '2016-11-06 01:00:00')), 'dst'] = 1
weather_train_tz.loc[((weather_train_tz['timezone'] == 'US/Pacific') & 
                 (weather_train_tz['timestamp'] >= '2016-03-13 02:00:00') & 
                 (weather_train_tz['timestamp'] < '2016-11-06 01:00:00')), 'dst'] = 1
weather_train_tz.loc[((weather_train_tz['timezone'] == 'US/Central') & 
                 (weather_train_tz['timestamp'] >= '2016-03-13 02:00:00') & 
                 (weather_train_tz['timestamp'] < '2016-11-06 01:00:00')), 'dst'] = 1
weather_train_tz.loc[((weather_train_tz['timezone'] == 'Canada/Eastern') & 
                 (weather_train_tz['timestamp'] >= '2016-03-13 02:00:00') & 
                 (weather_train_tz['timestamp'] < '2016-11-06 01:00:00')), 'dst'] = 1
weather_train_tz.loc[((weather_train_tz['timezone'] == 'Europe/London') & 
                 (weather_train_tz['timestamp'] >= '2016-03-27 01:00:00') & 
                 (weather_train_tz['timestamp'] < '2016-10-30 02:00:00')), 'dst'] = 1
weather_train_tz.loc[((weather_train_tz['timezone'] == 'Europe/Dublin') & 
                 (weather_train_tz['timestamp'] >= '2016-03-27 01:00:00') & 
                 (weather_train_tz['timestamp'] < '2016-10-30 02:00:00')), 'dst'] = 1

In [7]:
# changing name of timestamp column into UTC since these timestamp is UTC based
weather_train_tz= weather_train_tz.rename(columns = {'timestamp':'timestamp_utc'})

In [8]:
weather_train_tz.head(5)

Unnamed: 0,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,site_id,timestamp_utc,wind_direction,wind_speed,timezone,country_code,location,dst
0,25.0,6.0,20.0,,1019.7,0,2016-01-01 00:00:00,0.0,0.0,US/Eastern,US,"Orlando, FL",0
1,24.4,,21.1,-1.0,1020.2,0,2016-01-01 01:00:00,70.0,1.5,US/Eastern,US,"Orlando, FL",0
2,22.8,2.0,21.1,0.0,1020.2,0,2016-01-01 02:00:00,0.0,0.0,US/Eastern,US,"Orlando, FL",0
3,21.1,2.0,20.6,0.0,1020.1,0,2016-01-01 03:00:00,0.0,0.0,US/Eastern,US,"Orlando, FL",0
4,20.0,2.0,20.0,-1.0,1020.0,0,2016-01-01 04:00:00,250.0,2.6,US/Eastern,US,"Orlando, FL",0


In [9]:
timezones = list(tz.timezone.unique())
timezones_offset = [-5, 0, -7, -8, -5, -6, 1]
timezones_dict = dict(zip(timezones, timezones_offset))
timezones_dict

{'US/Eastern': -5,
 'Europe/London': 0,
 'US/Mountain': -7,
 'US/Pacific': -8,
 'Canada/Eastern': -5,
 'US/Central': -6,
 'Europe/Dublin': 1}

In [10]:
# Covert data to  datetime
weather_train_tz['timestamp_utc'] = pd.to_datetime(weather_train_tz['timestamp_utc'])

weather_train_tz['local_time'] = weather_train_tz['timestamp_utc']


from datetime import timedelta 
for zone in timezones_dict.keys():
    weather_train_tz.local_time[weather_train_tz.timezone==zone] += timedelta(hours = timezones_dict[zone])
    
weather_train_tz.loc[weather_train_tz['dst'] == 1, 'local_time'] += timedelta(hours = 1)  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [11]:
weather_train_tz.head(5)

Unnamed: 0,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,site_id,timestamp_utc,wind_direction,wind_speed,timezone,country_code,location,dst,local_time
0,25.0,6.0,20.0,,1019.7,0,2016-01-01 00:00:00,0.0,0.0,US/Eastern,US,"Orlando, FL",0,2015-12-31 19:00:00
1,24.4,,21.1,-1.0,1020.2,0,2016-01-01 01:00:00,70.0,1.5,US/Eastern,US,"Orlando, FL",0,2015-12-31 20:00:00
2,22.8,2.0,21.1,0.0,1020.2,0,2016-01-01 02:00:00,0.0,0.0,US/Eastern,US,"Orlando, FL",0,2015-12-31 21:00:00
3,21.1,2.0,20.6,0.0,1020.1,0,2016-01-01 03:00:00,0.0,0.0,US/Eastern,US,"Orlando, FL",0,2015-12-31 22:00:00
4,20.0,2.0,20.0,-1.0,1020.0,0,2016-01-01 04:00:00,250.0,2.6,US/Eastern,US,"Orlando, FL",0,2015-12-31 23:00:00


In [12]:
weather_train_tz.isnull().sum(axis=0)

air_temperature          55
cloud_coverage        69366
dew_temperature         113
precip_depth_1_hr     50413
sea_level_pressure    10645
site_id                   0
timestamp_utc             0
wind_direction         6277
wind_speed              304
timezone                  0
country_code              0
location                  0
dst                       0
local_time                0
dtype: int64

In [13]:
# merge with imputed meter and building data

meter_building = pd.read_pickle('train_df_imputed.pkl')


In [14]:
meter_building['timestamp'] = pd.to_datetime(meter_building['timestamp'])
weather_train_tz['local_time'] = pd.to_datetime(weather_train_tz['local_time'])

In [15]:
meter_building_weather = meter_building.merge(weather_train_tz, left_on = ['timestamp', 'site_id'], right_on = ['local_time', 'site_id'], how = 'left')

In [18]:
missing_data = pd.DataFrame(meter_building_weather.isnull().sum(axis=0))
missing_data = missing_data.rename(columns ={0:'missing_data_count'})
missing_data['missing_data_pc(%)'] = missing_data['missing_data_count']/len(meter_building_weather['building_id'])*100

In [19]:
missing_data

Unnamed: 0,missing_data_count,missing_data_pc(%)
building_id,0,0.0
meter,0,0.0
timestamp,0,0.0
meter_reading,0,0.0
site_id,0,0.0
primary_use,0,0.0
square_feet,0,0.0
year_built,0,0.0
floor_count,0,0.0
time_index,0,0.0


In [None]:
meter_building_weather.to_pickle ('tz_aware_merged_data_yh_v2.pkl')

## Imputing missing weather data 

We think air_temperature and dew_tempearature are the two features that matters the most  and have relatively few missing data (< 1%) .  We are going to impute them by using the average of the value before and after (smoothing out temperature).  We are not going to use ML (as we did with building data) because temperature data should be bounded by time and site and most of the missing data are sporatic. 

### Imputing air_temperature

In [21]:
# Filter out unique site_id and timestamp
air_temp_to_impute = meter_building_weather.loc[:,meter_building_weather.columns.isin(['air_temperature','site_id','timestamp'])]
air_temp_to_impute = air_temp_to_impute.drop_duplicates(subset=['timestamp','site_id'])

In [22]:
# First round - filling in NA by averaging values ahead and behind the missing data. this is bound by specific site and timestamp
air_temp_imputed= pd.DataFrame(columns = ['timestamp', 'air_temperature','site_id'])
grouped = air_temp_to_impute.groupby('site_id')
for key, group in grouped:
    df = pd.DataFrame(group[['timestamp','air_temperature']]).sort_values(by = 'timestamp', ascending = True)
    df.air_temperature = df.air_temperature.fillna((df.air_temperature.shift() + df.air_temperature.shift(-1))/2)
    df['site_id'] = key
    air_temp_imputed= air_temp_imputed.append(df)
    
# Second round - filling in NA by backfilling (most of the remaining missing data do not have data ahead to fill)
air_temp_imputed_v2= pd.DataFrame(columns = ['timestamp', 'air_temperature','site_id'])    
grouped = air_temp_imputed.groupby('site_id')
for key, group in grouped:
    df = pd.DataFrame(group[['timestamp','air_temperature']]).sort_values(by = 'timestamp', ascending = True)
    df.air_temperature = df.air_temperature.fillna(method = 'bfill')
    df['site_id'] = key
    air_temp_imputed_v2= air_temp_imputed_v2.append(df)

In [23]:
# merge imputed data to an index
index_air = meter_building_weather[['timestamp', 'site_id']]
index_air= index_air.merge(air_temp_imputed_v2, on = ['timestamp', 'site_id'], how= 'left')

In [24]:
# Checking that there is no missing air_temperature data
index_air.isna().sum(axis=0)

timestamp          0
site_id            0
air_temperature    0
dtype: int64

In [25]:
# merge index_air into the data. making a copy just in case
meter_building_weather_imputed = meter_building_weather.copy()
meter_building_weather_imputed['air_temperature']= meter_building_weather_imputed['air_temperature'].fillna(index_air['air_temperature'])

In [27]:
# Checking to see data merged correctly.  Also making sure the shape of df remains the same
meter_building_weather_imputed.isna().sum(axis=0)


building_id                 0
meter                       0
timestamp                   0
meter_reading               0
site_id                     0
primary_use                 0
square_feet                 0
year_built                  0
floor_count                 0
time_index                  0
day_of_week                 0
hour_of_day                 0
index                       0
avg                         0
std                         0
outlier                     0
air_temperature             0
cloud_coverage        5331707
dew_temperature         50328
precip_depth_1_hr     2514303
sea_level_pressure    1019098
timestamp_utc           44738
wind_direction         679653
wind_speed              68037
timezone                44738
country_code            44738
location                44738
dst                     44738
local_time              44738
dtype: int64

In [29]:
meter_building_weather_imputed.shape == meter_building_weather.shape

True

### Imputing dew_temperature

In [31]:
# Filter out unique site_id and timestamp
dew_temp_to_impute = meter_building_weather.loc[:,meter_building_weather_imputed.columns.isin(['dew_temperature','site_id','timestamp'])]
dew_temp_to_impute = dew_temp_to_impute.drop_duplicates(subset=['timestamp','site_id'])

In [32]:
# First round - filling in NA by averaging values ahead and behind the missing data. this is bound by specific site and timestamp
dew_temp_imputed= pd.DataFrame(columns = ['timestamp', 'dew_temperature','site_id'])
grouped = dew_temp_to_impute.groupby('site_id')
for key, group in grouped:
    df = pd.DataFrame(group[['timestamp','dew_temperature']]).sort_values(by = 'timestamp', ascending = True)
    df.dew_temperature = df.dew_temperature.fillna((df.dew_temperature.shift() + df.dew_temperature.shift(-1))/2)
    df['site_id'] = key
    dew_temp_imputed= dew_temp_imputed.append(df)
    
# Second round - filling in NA by backfilling (most of the remaining missing data do not have data ahead to fill)
dew_temp_imputed_v2= pd.DataFrame(columns = ['timestamp', 'dew_temperature','site_id'])
grouped = dew_temp_imputed.groupby('site_id')
for key, group in grouped:
    df = pd.DataFrame(group[['timestamp','dew_temperature']]).sort_values(by = 'timestamp', ascending = True)
    df.dew_temperature = df.dew_temperature.fillna(method = 'bfill')
    df['site_id'] = key
    dew_temp_imputed_v2= dew_temp_imputed_v2.append(df)

In [33]:
# merge imputed data to an index
index_dew = meter_building_weather_imputed[['timestamp', 'site_id']]
index_dew= index_air.merge(dew_temp_imputed_v2, on = ['timestamp', 'site_id'], how= 'left')

In [34]:
# Checking that there is no missing air_temperature data
index_dew.isna().sum(axis=0)

timestamp          0
site_id            0
air_temperature    0
dew_temperature    0
dtype: int64

In [35]:
# merge index_air into the data. making a copy just in case
meter_building_weather_imputed = meter_building_weather_imputed.copy()
meter_building_weather_imputed['dew_temperature']= meter_building_weather_imputed['dew_temperature'].fillna(index_dew['dew_temperature'])

In [36]:
# Checking to see data merged correctly.  Also making sure the shape of df remains the same
meter_building_weather_imputed.isna().sum(axis=0)


building_id                 0
meter                       0
timestamp                   0
meter_reading               0
site_id                     0
primary_use                 0
square_feet                 0
year_built                  0
floor_count                 0
time_index                  0
day_of_week                 0
hour_of_day                 0
index                       0
avg                         0
std                         0
outlier                     0
air_temperature             0
cloud_coverage        5331707
dew_temperature             0
precip_depth_1_hr     2514303
sea_level_pressure    1019098
timestamp_utc           44738
wind_direction         679653
wind_speed              68037
timezone                44738
country_code            44738
location                44738
dst                     44738
local_time              44738
dtype: int64

In [37]:
meter_building_weather_imputed.shape == meter_building_weather.shape

True

In [38]:
meter_building_weather_imputed.to_pickle ('tz_aware_merged_data_weather_imputed.pkl')