In [142]:
import pandas as pd
import pandas_profiling
import numpy as np 
import json
import datetime
import re
import ast
from fastparquet import ParquetFile, write

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 500)

### Data Folder Instructions

In [2]:
# Use this cell to specify the paths for the data folder in your local machines
# Use the variable 'datafolder' to specify the path
# Comment out all the data paths except your own
# Purple Air data ia assumed to be in a subfolder called 'purpleair' 
# NOAA data ia assumed to be in a subfolder called 'noaa' 
# For example, if the base data folder is '/users/data', purpleair data should be in '/users/data/purpleair'

# Angshuman's local path
datafolder = "/Users/apaul2/Documents/_Common/capstone/Project/data"

### Combine PurpleAir and NOAA data

In [122]:
def createHashKey(row):
    if np.isnan(row['lat']):
        str_lat = ''
    else:
        str_lat = str(row['lat'])
        
        
    if np.isnan(row['lon']):
        str_lon = ''
    else:
        str_lon = str(row['lon'])
        
    return hash(str_lat + str_lon)

In [123]:
# Read from noaa data that was stored earlier
sep2019_noaa_df = pd.read_parquet("{}/noaa/bay_Sep2019_withloc.parquet".format(datafolder))
Sep27_noaa_df = sep2019_noaa_df[sep2019_noaa_df.date == '09/27/19']

In [124]:
Sep27_noaa_df['datetime'] = tst[['year', 'month','day','hour','minute']].apply(lambda x: int(''.join(x)), axis=1)

In [125]:
Sep27_noaa_df.drop(['year', 'month','day','hour','minute','date','timestamp'], axis=1, inplace=True)

In [126]:
Sep27_noaa_df.head()

Unnamed: 0,wban_number,call_sign,call_sign2,rec_length,interval,call_sign3,zulu_time,report_modifier,wind_data,wind_direction,wind_speed,gusts,gust_speed,variable_winds,variable_wind_info,sys_maint_reqd,num_fields,lat,lon,datetime
6745,23234,KSFO,SFO,103,5-MIN,KSFO,270800Z,,True,250,6.0,False,,False,,False,18,37.62,-122.365,201909270950
6746,23234,KSFO,SFO,103,5-MIN,KSFO,270805Z,,True,250,7.0,False,,False,,False,18,37.62,-122.365,201909271000
6747,23234,KSFO,SFO,103,5-MIN,KSFO,270810Z,,True,240,8.0,False,,False,,False,18,37.62,-122.365,201909271010
6748,23234,KSFO,SFO,102,5-MIN,KSFO,270815Z,,True,250,5.0,False,,False,,False,18,37.62,-122.365,201909271020
6749,23234,KSFO,SFO,103,5-MIN,KSFO,270820Z,,True,250,6.0,False,,False,,False,18,37.62,-122.365,201909271030


In [76]:
Sep27_noaa_df.wban_number.count(), Sep27_noaa_df.lat.nunique(), Sep27_noaa_df.lon.nunique()

(1728, 6, 6)

In [77]:
# Read from station data that was stored earlier
unique_station_df = pd.read_parquet("{}/noaa/uniq_station_data.parquet".format(datafolder))
# Get unique station data for bounding box
bay_stations_df = unique_station_df[(unique_station_df.lat > 37) & (unique_station_df.lat < 38) 
                              & (unique_station_df.lon > -123) & (unique_station_df.lon < -122)]
bay_stations_df.reset_index(inplace=True, drop=True)
bay_stations_df = bay_stations_df.dropna()
bay_stations_df.drop(['wban_number'], axis=1, inplace=True)
bay_stations_df.drop_duplicates(inplace=True)
bay_stations_df.head()

Unnamed: 0,lat,lon
0,37.513,-122.501
1,37.721,-122.221
2,37.75,-122.217
3,37.654,-122.115
4,37.667,-122.117


In [78]:
len(bay_stations_df), bay_stations_df.lat.nunique(), bay_stations_df.lon.nunique()

(41, 34, 30)

In [79]:
bay_stations_df[['lat','lon']] = bay_stations_df[['lat','lon']].apply(pd.to_numeric)

In [127]:
bay_ts_df = pd.read_parquet("{}/purpleair/ts_0927_withaddress_final.parquet".format(datafolder))
bay_ts_df.head()

Unnamed: 0_level_0,0_3um,0_5um,1_0um,2_5um,5_0um,10_0um,pm1_0,pm10_0,created,pm1_0_atm,pm2_5_atm,pm10_0_atm,uptime,rssi,temperature,humidity,pm2_5_cf_1,a_h,device_loc_typ,high_reading_flag,hidden,sensor_id,sensor_name,lat,lon,parent_id,is_owner,city,county,zipcode,created_at,year,month,day,hour,minute
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
0,935.56,270.69,45.63,7.76,2.35,0.0,4.57,8.82,201909270000,4.34,7.07,9.19,913.0,-74.0,83.0,46.0,7.07,,outside,,False,16939,#SAFQ11,37.72244,-122.439302,,0,San Francisco,San Francisco County,94112,2019/09/27T00:00,2019,9,27,0,0
1,842.38,244.69,51.99,9.53,3.35,0.0,3.78,8.69,201909270010,4.24,6.82,9.66,923.0,-71.0,82.0,47.0,6.82,,outside,,False,16939,#SAFQ11,37.72244,-122.439302,,0,San Francisco,San Francisco County,94112,2019/09/27T00:10,2019,9,27,0,10
2,832.65,247.2,41.0,6.56,2.14,1.1,3.93,8.15,201909270020,4.07,7.6,9.65,932.0,-72.0,81.0,47.0,7.6,,outside,,False,16939,#SAFQ11,37.72244,-122.439302,,0,San Francisco,San Francisco County,94112,2019/09/27T00:20,2019,9,27,0,20
3,916.69,260.89,48.72,7.73,2.65,0.0,3.97,8.58,201909270030,3.97,6.94,8.58,942.0,-75.0,81.0,48.0,6.94,,outside,,False,16939,#SAFQ11,37.72244,-122.439302,,0,San Francisco,San Francisco County,94112,2019/09/27T00:30,2019,9,27,0,30
4,924.0,269.25,51.26,6.03,3.65,1.59,4.68,9.53,201909270040,4.68,6.85,9.53,952.0,-73.0,80.0,49.0,6.85,,outside,,False,16939,#SAFQ11,37.72244,-122.439302,,0,San Francisco,San Francisco County,94112,2019/09/27T00:40,2019,9,27,0,40


In [128]:
bay_ts_df.sensor_id.count(), bay_ts_df.sensor_id.nunique(), bay_ts_df.lat.nunique(), bay_ts_df.lon.nunique()

(73660, 520, 298, 298)

In [129]:
# Add lat-lon based hashes to noaa and purple air dataframes
# bay_stations_df['asoslatlonhash'] = bay_stations_df.apply (lambda row: createHashKey(row), axis=1)
bay_ts_df['tslatlonhash'] = bay_ts_df.apply (lambda row: createHashKey(row), axis=1)
Sep27_noaa_df['asoslatlonhash'] = Sep27_noaa_df.apply (lambda row: createHashKey(row), axis=1)

In [131]:
# Keep only the asos columns needed to determine the lat-lon mapping
Sep27_noaa_latlon_df = Sep27_noaa_df[['asoslatlonhash','lat','lon']]
Sep27_noaa_latlon_df.drop_duplicates(inplace=True)
Sep27_noaa_latlon_df.asoslatlonhash.count(), Sep27_noaa_latlon_df.asoslatlonhash.nunique()

(6, 6)

In [132]:
# Keep only the purple air columns needed to determine the lat-lon mapping
usa_purple_latlon_df = bay_ts_df[['tslatlonhash','lat','lon']]
usa_purple_latlon_df.drop_duplicates(inplace=True)
usa_purple_latlon_df.tslatlonhash.count(), usa_purple_latlon_df.tslatlonhash.nunique()

(298, 298)

In [133]:
Sep27_noaa_latlon_df.set_index('asoslatlonhash', inplace=True)
usa_purple_latlon_df.set_index('tslatlonhash', inplace=True)

In [134]:
# Find the closest asos lat-lon mapping corresponding to the purple air records
closest_points = {}
for name, point in usa_purple_latlon_df.iterrows():
#     print(name, point)
#     break
    distances = (((Sep27_noaa_latlon_df - point) ** 2).sum(axis=1)**.5)
    closest_points[name] = distances.sort_values().index[0]

In [135]:
# Create dataframe from lat-lon mapping
latlonmap_df = pd.DataFrame(list(closest_points.items()), columns=['tslatlonhash','asoslatlonhash'])
latlonmap_df.count()

tslatlonhash      298
asoslatlonhash    298
dtype: int64

In [136]:
# Merge purple air data to lat-lon mapping first and then 
# merge the resulting dataframe to asos dataframe
merged_df = pd.merge(bay_ts_df, latlonmap_df, on='tslatlonhash')

In [137]:
merged_df.sensor_id.count()

73660

In [138]:
merged_df.head()

Unnamed: 0,0_3um,0_5um,1_0um,2_5um,5_0um,10_0um,pm1_0,pm10_0,created,pm1_0_atm,pm2_5_atm,pm10_0_atm,uptime,rssi,temperature,humidity,pm2_5_cf_1,a_h,device_loc_typ,high_reading_flag,hidden,sensor_id,sensor_name,lat,lon,parent_id,is_owner,city,county,zipcode,created_at,year,month,day,hour,minute,tslatlonhash,asoslatlonhash
0,935.56,270.69,45.63,7.76,2.35,0.0,4.57,8.82,201909270000,4.34,7.07,9.19,913.0,-74.0,83.0,46.0,7.07,,outside,,False,16939,#SAFQ11,37.72244,-122.439302,,0,San Francisco,San Francisco County,94112,2019/09/27T00:00,2019,9,27,0,0,6124648403255080948,-4417701102135688327
1,842.38,244.69,51.99,9.53,3.35,0.0,3.78,8.69,201909270010,4.24,6.82,9.66,923.0,-71.0,82.0,47.0,6.82,,outside,,False,16939,#SAFQ11,37.72244,-122.439302,,0,San Francisco,San Francisco County,94112,2019/09/27T00:10,2019,9,27,0,10,6124648403255080948,-4417701102135688327
2,832.65,247.2,41.0,6.56,2.14,1.1,3.93,8.15,201909270020,4.07,7.6,9.65,932.0,-72.0,81.0,47.0,7.6,,outside,,False,16939,#SAFQ11,37.72244,-122.439302,,0,San Francisco,San Francisco County,94112,2019/09/27T00:20,2019,9,27,0,20,6124648403255080948,-4417701102135688327
3,916.69,260.89,48.72,7.73,2.65,0.0,3.97,8.58,201909270030,3.97,6.94,8.58,942.0,-75.0,81.0,48.0,6.94,,outside,,False,16939,#SAFQ11,37.72244,-122.439302,,0,San Francisco,San Francisco County,94112,2019/09/27T00:30,2019,9,27,0,30,6124648403255080948,-4417701102135688327
4,924.0,269.25,51.26,6.03,3.65,1.59,4.68,9.53,201909270040,4.68,6.85,9.53,952.0,-73.0,80.0,49.0,6.85,,outside,,False,16939,#SAFQ11,37.72244,-122.439302,,0,San Francisco,San Francisco County,94112,2019/09/27T00:40,2019,9,27,0,40,6124648403255080948,-4417701102135688327


In [139]:
Sep27_noaa_df.drop(['lat','lon'], axis=1, inplace=True)

In [149]:
combined_df = pd.merge(merged_df, Sep27_noaa_df,  how='left', left_on=['asoslatlonhash', 'created'], right_on=['asoslatlonhash', 'datetime'])
combined_df.drop(['tslatlonhash', 'asoslatlonhash','rec_length','num_fields', 'datetime'], axis=1, inplace=True)

In [150]:
combined_df.head()

Unnamed: 0,0_3um,0_5um,1_0um,2_5um,5_0um,10_0um,pm1_0,pm10_0,created,pm1_0_atm,pm2_5_atm,pm10_0_atm,uptime,rssi,temperature,humidity,pm2_5_cf_1,a_h,device_loc_typ,high_reading_flag,hidden,sensor_id,sensor_name,lat,lon,parent_id,is_owner,city,county,zipcode,created_at,year,month,day,hour,minute,wban_number,call_sign,call_sign2,interval,call_sign3,zulu_time,report_modifier,wind_data,wind_direction,wind_speed,gusts,gust_speed,variable_winds,variable_wind_info,sys_maint_reqd
0,935.56,270.69,45.63,7.76,2.35,0.0,4.57,8.82,201909270000,4.34,7.07,9.19,913.0,-74.0,83.0,46.0,7.07,,outside,,False,16939,#SAFQ11,37.72244,-122.439302,,0,San Francisco,San Francisco County,94112,2019/09/27T00:00,2019,9,27,0,0,23234,KSFO,SFO,5-MIN,KSFO,271505Z,,True,260,10.0,False,,False,,False
1,935.56,270.69,45.63,7.76,2.35,0.0,4.57,8.82,201909270000,4.34,7.07,9.19,913.0,-74.0,83.0,46.0,7.07,,outside,,False,16939,#SAFQ11,37.72244,-122.439302,,0,San Francisco,San Francisco County,94112,2019/09/27T00:00,2019,9,27,0,0,23234,KSFO,SFO,5-MIN,KSFO,280305Z,,True,250,10.0,False,,False,,False
2,842.38,244.69,51.99,9.53,3.35,0.0,3.78,8.69,201909270010,4.24,6.82,9.66,923.0,-71.0,82.0,47.0,6.82,,outside,,False,16939,#SAFQ11,37.72244,-122.439302,,0,San Francisco,San Francisco County,94112,2019/09/27T00:10,2019,9,27,0,10,23234,KSFO,SFO,5-MIN,KSFO,271510Z,,True,270,10.0,False,,False,,False
3,842.38,244.69,51.99,9.53,3.35,0.0,3.78,8.69,201909270010,4.24,6.82,9.66,923.0,-71.0,82.0,47.0,6.82,,outside,,False,16939,#SAFQ11,37.72244,-122.439302,,0,San Francisco,San Francisco County,94112,2019/09/27T00:10,2019,9,27,0,10,23234,KSFO,SFO,5-MIN,KSFO,280310Z,,True,250,13.0,True,17.0,False,,False
4,832.65,247.2,41.0,6.56,2.14,1.1,3.93,8.15,201909270020,4.07,7.6,9.65,932.0,-72.0,81.0,47.0,7.6,,outside,,False,16939,#SAFQ11,37.72244,-122.439302,,0,San Francisco,San Francisco County,94112,2019/09/27T00:20,2019,9,27,0,20,23234,KSFO,SFO,5-MIN,KSFO,271515Z,,True,260,10.0,False,,False,,False


In [152]:
# Write to file
parquet_file = "{}/20190927.parquet".format(datafolder)
write(parquet_file, combined_df,compression='GZIP')