In [1]:
import pandas as pd
import pandas_profiling
import numpy as np 
import json
import datetime
import re
import ast

import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# so pandas warnings raised by later cells will not be shown.
warnings.filterwarnings('ignore')

# Let pandas display up to 500 columns so wide frames are not truncated.
pd.set_option('display.max_columns', 500)

### Data Folder Instructions

In [2]:
# Use this cell to specify the path of the data folder on your local machine.
# Use the variable 'datafolder' to specify the path.
# Comment out all the data paths except your own.
# PurpleAir data is assumed to be in a subfolder called 'purpleair'.
# NOAA data is assumed to be in a subfolder called 'noaa'.
# For example, if the base data folder is '/users/data', purpleair data should be in '/users/data/purpleair'

# Angshuman's local path
# NOTE(review): absolute, machine-specific path; every other user has to edit it before running.
datafolder = "/Users/apaul2/Documents/_Common/capstone/Project/data"

### Combine PurpleAir and NOAA data

In [3]:
def createHashKey(row):
    """Build a coarse location key from a row's latitude and longitude.

    Each coordinate is truncated toward zero to a whole number (``int``), so
    every location inside the same whole-degree cell gets the same key. A
    missing coordinate (NaN, None or pd.NA) contributes an empty component.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(..., axis=1)``)
        Must provide numeric ``'lat'`` and ``'lon'`` entries.

    Returns
    -------
    int
        ``hash()`` of the string ``"<lat>,<lon>"``. Python salts string hashes
        per interpreter process by default, so keys are only comparable
        within one kernel session and should not be persisted.
    """
    def _whole_degrees(value):
        # pd.isna also accepts None / pd.NA, where np.isnan raises TypeError.
        return '' if pd.isna(value) else str(int(value))

    # The separator keeps the components unambiguous: without it
    # (lat=1, lon=23) and (lat=12, lon=3) would both produce "123", and a
    # missing latitude could not be told apart from a missing longitude.
    return hash(_whole_degrees(row['lat']) + ',' + _whole_degrees(row['lon']))

In [4]:
# Read the NOAA station list and the September 2019 observations from the
# parquet files that were stored earlier.
unique_station_df = pd.read_parquet("{}/noaa/uniq_station_data.parquet".format(datafolder))
sep2019_noaa_df = pd.read_parquet("{}/noaa/Sep2019_withloc.parquet".format(datafolder))
# Keep only the rows dated '09/27/19'. The explicit copy makes Sep27_df an
# independent frame, so the column added and the columns dropped in later
# cells never operate on a selection of sep2019_noaa_df.
Sep27_df = sep2019_noaa_df[sep2019_noaa_df.date == '09/27/19'].copy()

In [5]:
# Non-null value count per column, shown before any cleanup.
unique_station_df.count()

wban_number    26315
lat            26315
lon            26305
dtype: int64

In [6]:
# Remove rows with any missing value, discard the station id column and
# collapse repeated lat/lon pairs, then show the remaining counts.
unique_station_df = (
    unique_station_df
    .dropna()
    .drop(columns=['wban_number'])
    .drop_duplicates()
)
unique_station_df.count()

lat    25712
lon    25712
dtype: int64

In [7]:
# Convert both coordinate columns to numeric dtypes.
for coord in ('lat', 'lon'):
    unique_station_df[coord] = pd.to_numeric(unique_station_df[coord])

In [8]:
# Load the PurpleAir parquet file from the 'purpleair' subfolder.
purpleair_path = "{}/purpleair/0927Full_withaddress_usa.parquet".format(datafolder)
usa_purple_df = pd.read_parquet(purpleair_path)

In [9]:
# Number of non-null sensor_id values in the PurpleAir data.
usa_purple_df['sensor_id'].count()

2406866

In [10]:
# Tag every row of the NOAA and PurpleAir frames with the key that
# createHashKey derives from its lat/lon.
unique_station_df['asoslatlonhash'] = unique_station_df.apply(createHashKey, axis=1)
usa_purple_df['palatlonhash'] = usa_purple_df.apply(createHashKey, axis=1)
Sep27_df['asoslatlonhash'] = Sep27_df.apply(createHashKey, axis=1)

In [11]:
# Keep only the PurpleAir columns needed to determine the lat-lon mapping,
# reduced to their distinct (key, lat, lon) combinations. Chaining
# drop_duplicates() builds a new frame rather than mutating a column
# selection of usa_purple_df in place.
usa_purple_latlon_df = usa_purple_df[['palatlonhash', 'lat', 'lon']].drop_duplicates()
usa_purple_latlon_df.palatlonhash.count()

7062

In [12]:
# Index both lookup frames by their location key, leaving lat/lon as the
# only columns.
unique_station_df = unique_station_df.set_index('asoslatlonhash')
usa_purple_latlon_df = usa_purple_latlon_df.set_index('palatlonhash')

In [13]:
# For each PurpleAir location, find the NOAA station key with the smallest
# straight-line distance measured in (lat, lon) degrees.
# NOTE(review): distances are Euclidean in degree space, not great-circle
# distances - confirm this is accurate enough for the station matching.
# NOTE(review): the result is keyed by palatlonhash, which repeats across
# locations that share a key, so a later location overwrites an earlier one.
closest_points = {}
for name, point in usa_purple_latlon_df.iterrows():
    # Subtraction aligns the point's lat/lon labels with the station columns.
    distances = ((unique_station_df - point) ** 2).sum(axis=1) ** .5
    # idxmin() returns the index label of the smallest distance in a single
    # pass; sorting the whole distance vector is not needed for that.
    closest_points[name] = distances.idxmin()

In [14]:
# Turn the key-to-key mapping into a two-column frame: one row per
# PurpleAir key with its matched NOAA key.
latlonmap_df = pd.DataFrame({
    'palatlonhash': list(closest_points.keys()),
    'asoslatlonhash': list(closest_points.values()),
})

In [15]:
# Attach the matched NOAA key to every PurpleAir reading through its
# location key. Joining the result to the ASOS data is a separate, later step.
merged_df = usa_purple_df.merge(latlonmap_df, on='palatlonhash')

In [16]:
# Number of non-null sensor_id values after the merge, for comparison with
# the count taken before it.
merged_df['sensor_id'].count()

2406866

In [17]:
# Preview the first rows of the merged frame, including both key columns.
merged_df.head()

Unnamed: 0,a_h,device_loc_typ,high_reading_flag,hidden,sensor_id,sensor_name,last_seen,lat,lon,pm2_5val,parent_id,sensor_type,humidity,is_owner,pressure,temp_f,pm2_5val_10m_avg,pm2_5val_30m_avg,pm2_5val_1h_avg,pm2_5val_6h_avg,pm2_5val_24h_avg,pm2_5val_1wk_avg,city,country,county,state,zipcode,palatlonhash,asoslatlonhash
0,,inside,,False,24115,2nd South 12th East,1547065985,40.764907,-111.856653,0.15,,PMS5003+PMS5003+BME280,15.0,0,869.14,89.0,0.47,19.37,34.37,16.29,4.76,0.71,Salt Lake City,United States,Salt Lake County,Utah,84102,4732930680302527505,4732930680302527505
1,,,,False,24116,2nd South 12th East B,1547065988,40.764907,-111.856653,,24115.0,,15.0,0,869.16,89.0,,,,,,,Salt Lake City,United States,Salt Lake County,Utah,84102,4732930680302527505,4732930680302527505
2,,outside,,False,10808,1-800 Contacts,1569567528,40.507316,-111.899188,1.19,,PMS5003+PMS5003+BME280,26.0,0,859.36,72.0,3.7,5.25,5.39,3.66,3.48,4.9,Draper,United States,,Utah,84020,4732930680302527505,4732930680302527505
3,,,,False,10809,1-800 Contacts B,1569567528,40.507316,-111.899188,2.03,10808.0,,,0,,,4.16,5.86,6.05,4.14,3.91,5.28,Draper,United States,,Utah,84020,4732930680302527505,4732930680302527505
4,,outside,1.0,False,5460,1027 Hollywood,1569567516,40.72751,-111.861434,1.71,,PMS5003+PMS5003+BME280,28.0,0,861.21,75.0,1.59,1.73,1.89,1.74,1.64,3.81,Salt Lake City,United States,Salt Lake County,Utah,84105,4732930680302527505,4732930680302527505


In [17]:
# Remove the coordinate columns from the 27 Sep frame.
# NOTE(review): presumably so they do not clash with the PurpleAir lat/lon
# columns in the later join - confirm.
Sep27_df = Sep27_df.drop(columns=['lat', 'lon'])

In [None]:
# merged_df_new = merged_df.set_index(merged_df.asoslatlonhash).join(Sep27_df.set_index(Sep27_df.asoslatlonhash), how='left', lsuffix='pa', rsuffix='asos').reset_index(drop=True)