In [1]:
import ast
import datetime
import json
import os
import re

import numpy as np
import pandas as pd
import pandas_profiling
from fastparquet import ParquetFile, write

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 500)

### Data Folder Instructions

In [2]:
# Use this cell to specify the paths for the data folder in your local machines
# Use the variable 'datafolder' to specify the path
# Comment out all the data paths except your own
# Purple Air data is assumed to be in a subfolder called 'purpleair'
# NOAA data is assumed to be in a subfolder called 'noaa'
# For example, if the base data folder is '/users/data', purpleair data should be in '/users/data/purpleair'

# Base data folder. Angshuman's local path is kept as the default; set the
# CAPSTONE_DATA_DIR environment variable to point at a different folder
# without editing this cell.
datafolder = os.environ.get("CAPSTONE_DATA_DIR", "/Users/apaul2/Documents/_Common/capstone/Project/data")

### Combine PurpleAir, NOAA and EPA data

In [3]:
def createHashKey(row):
    """Build an integer join key from a row's latitude and longitude.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Must provide 'lat' and 'lon' entries.

    Returns
    -------
    int
        ``hash`` of the string ``"<lat>,<lon>"``. A missing coordinate
        contributes an empty string on its side of the separator.

    Notes
    -----
    ``hash`` of a ``str`` is salted per interpreter run, so the key is only
    valid for joins inside the current kernel session; do not persist it.
    """
    # pd.isna also accepts None, whereas np.isnan raises TypeError on it.
    str_lat = '' if pd.isna(row['lat']) else str(row['lat'])
    str_lon = '' if pd.isna(row['lon']) else str(row['lon'])

    # The separator keeps distinct pairs distinct: without it (1.2, 34.5)
    # and (1.23, 4.5) would both collapse to the string "1.234.5".
    return hash(str_lat + ',' + str_lon)

In [4]:
def _closest_point_map(source_latlon_df, target_latlon_df):
    """Map every source location to the nearest target location.

    Parameters
    ----------
    source_latlon_df, target_latlon_df : pandas.DataFrame
        Frames indexed by a location hash, each holding 'lat' and 'lon' columns.

    Returns
    -------
    dict
        ``{source index label: index label of the closest target row}``.
    """
    closest_points = {}
    for name, point in source_latlon_df.iterrows():
        # Straight-line distance computed directly on the lat/lon values
        # (degree space), not a geodesic distance.
        distances = ((target_latlon_df - point) ** 2).sum(axis=1) ** .5
        # idxmin gives the label of the smallest distance without a full sort.
        closest_points[name] = distances.idxmin()
    return closest_points


# One iteration per day of September 2019 (files are named 201909DD).
for i in range(1, 31):
    # Read from noaa data that was stored earlier
    noaa_df = pd.read_parquet("{}/noaa/daily/asos_201909{:02}.parquet".format(datafolder, i))

    # Read epa data from file
    epa_df = pd.read_parquet("{}/ambient/daily/epa_201909{:02}.parquet".format(datafolder, i))
    epa_df['createdhr'] = epa_df['created'].apply(lambda x: int(str(x)[:-2]))  # date key at hour level as the data is hourly

    # Read purple air data from file
    bay_ts_df = pd.read_parquet("{}/pa_ts/201909{:02}.parquet".format(datafolder, i))
    bay_ts_df['createdhr'] = bay_ts_df['created'].apply(lambda x: int(str(x)[:-2]))  # date key at hour level to join with hourly epa data

    # Add lat-lon based hashes to the purple air, noaa and epa dataframes
    bay_ts_df['tslatlonhash'] = bay_ts_df.apply(createHashKey, axis=1)
    noaa_df['asoslatlonhash'] = noaa_df.apply(createHashKey, axis=1)
    epa_df['epalatlonhash'] = epa_df.apply(createHashKey, axis=1)

    # Unique locations per source, indexed by their hash. Chained calls are
    # used instead of inplace=True so no sliced copy is mutated.
    noaa_latlon_df = (
        noaa_df[['asoslatlonhash', 'lat', 'lon']]
        .drop_duplicates()
        .set_index('asoslatlonhash')
    )
    epa_latlon_df = (
        epa_df[['epalatlonhash', 'lat', 'lon']]
        .drop_duplicates()
        .set_index('epalatlonhash')
    )
    usa_purple_latlon_df = (
        bay_ts_df[['tslatlonhash', 'lat', 'lon']]
        .drop_duplicates()
        .set_index('tslatlonhash')
    )

    # Find the closest asos lat-lon mapping corresponding to the purple air records
    closest_asos_points = _closest_point_map(usa_purple_latlon_df, noaa_latlon_df)
    asoslatlonmap_df = pd.DataFrame(list(closest_asos_points.items()), columns=['tslatlonhash', 'asoslatlonhash'])

    # Find the closest epa lat-lon mapping corresponding to the purple air records
    closest_epa_points = _closest_point_map(usa_purple_latlon_df, epa_latlon_df)
    epalatlonmap_df = pd.DataFrame(list(closest_epa_points.items()), columns=['tslatlonhash', 'epalatlonhash'])

    # Merge purple air data to lat-lon mappings first and then
    # merge the resulting dataframe to asos and epa dataframes
    merged_df = pd.merge(bay_ts_df, asoslatlonmap_df, on='tslatlonhash')
    merged_df = pd.merge(merged_df, epalatlonmap_df, on='tslatlonhash')

    # Drop lat/lon from noaa and epa so only the purple air coordinates survive the merges
    noaa_df = noaa_df.drop(['lat', 'lon'], axis=1)
    epa_df = epa_df.drop(['lat', 'lon'], axis=1)

    # Combine asos data
    combined_df = pd.merge(merged_df, noaa_df, how='left', left_on=['asoslatlonhash', 'created'], right_on=['asoslatlonhash', 'datetime'])

    # Combine epa data
    combined_df = pd.merge(combined_df, epa_df, how='left', left_on=['epalatlonhash', 'createdhr'], right_on=['epalatlonhash', 'createdhr'])

    # Drop join keys and unwanted columns
    combined_df = combined_df.drop(['tslatlonhash', 'asoslatlonhash', 'epalatlonhash', 'rec_length','num_fields', 'datetime', 'utc', 'parameter', 'createdhr','created_y'], axis=1)

    # NOTE: names are assigned by position, so this list must stay in step
    # with the column order produced by the merges above.
    combined_df.columns = ['0_3um', '0_5um', '1_0um', '2_5um', '5_0um', '10_0um', 'pm1_0','pm10_0', 'created', 'pm1_0_atm', 'pm2_5_atm', 'pm10_0_atm', 'uptime','rssi', 
                       'temperature', 'humidity', 'pm2_5_cf_1', 'device_loc_typ', 'is_owner', 'sensor_id', 'sensor_name', 'parent_id','lat', 'lon',  'thingspeak_primary_id', 
                       'thingspeak_primary_id_read_key', 'thingspeak_secondary_id', 'thingspeak_secondary_id_read_key', 'a_h', 'high_reading_flag', 'hidden',
                       'city', 'county', 'zipcode', 'created_at', 'year', 'month', 'day', 'hour', 'minute', 'wban_number', 'call_sign', 'call_sign2', 'interval', 
                       'call_sign3', 'zulu_time', 'report_modifier', 'wind_data', 'wind_direction', 'wind_speed', 'gusts', 'gust_speed', 'variable_winds', 'variable_wind_info', 
                       'sys_maint_reqd', 'epa_pm25_unit', 'epa_pm25_value', 'raw_concentration', 'aqi', 'category', 'site_name', 'agency_name', 'full_aqs_code', 'intl_aqs_code']

    # Write to file
    parquet_file = "{}/combined/201909{:02}.parquet".format(datafolder, i)
    write(parquet_file, combined_df,compression='GZIP')

In [5]:
# Read back the combined parquet file for 2019-09-30 to inspect it
tst = pd.read_parquet("{}/combined/20190930.parquet".format(datafolder))

In [8]:
# Distinct values of the `created` column in the reloaded file
tst.created.unique()

array([201909300000, 201909300010, 201909300020, 201909300030,
       201909300040, 201909300050, 201909300100, 201909300110,
       201909300120, 201909300130, 201909300140, 201909300150,
       201909300200, 201909300210, 201909300220, 201909300230,
       201909300240, 201909300250, 201909300300, 201909300310,
       201909300320, 201909300330, 201909300340, 201909300350,
       201909300400, 201909300410, 201909300420, 201909300430,
       201909300440, 201909300450, 201909300500, 201909300510,
       201909300520, 201909300530, 201909300540, 201909300550,
       201909300600, 201909300610, 201909300620, 201909300630,
       201909300640, 201909300650, 201909300700, 201909300710,
       201909300720, 201909300730, 201909300740, 201909300750,
       201909300800, 201909300810, 201909300820, 201909300830,
       201909300840, 201909300850, 201909300900, 201909300910,
       201909300920, 201909300930, 201909300940, 201909300950,
       201909301000, 201909301010, 201909301020, 201909