In [1]:
import pandas as pd
import pandas_profiling
import numpy as np 
import json
import datetime
import re
import ast

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 500)

### Data Folder Instructions

In [2]:
# Use this cell to specify the paths for the data folder in your local machines
# Use the variable 'datafolder' to specify the path
# Comment out all the data paths except your own
# Purple Air data ia assumed to be in a subfolder called 'purpleair' 
# NOAA data ia assumed to be in a subfolder called 'noaa' 
# For example, if the base data folder is '/users/data', purpleair data should be in '/users/data/purpleair'

# Angshuman's local path
datafolder = "/Users/apaul2/Documents/_Common/capstone/Project/data"

### Combine PurpleAir and NOAA data

In [3]:
purpleair_df = pd.read_csv("{}/purpleair/09141731_withaddress.csv".format(datafolder))
noaa_df = pd.read_parquet("{}/noaa/aug_asos_data_withloc.parquet".format(datafolder))

In [4]:
purpleair_df = purpleair_df.drop(['Unnamed: 0'], axis=1)
purpleair_df.head(2)

Unnamed: 0,age,a_h,device_loc_typ,high_reading_flag,hidden,sensor_id,sensor_name,last_seen,lat,lon,pm2_5val,parent_id,thingspeak_primary_id,thingspeak_primary_id_read_key,thingspeak_secondary_id,thingspeak_secondary_id_read_key,sensor_type,humidity,is_owner,pressure,temp_f,av_stat_last_modified,av_stat_time_since_last_modified,pm2_5val_10m_avg,pm2_5val_30m_avg,pm2_5val_1h_avg,pm2_5val_6h_avg,pm2_5val_24h_avg,pm2_5val_1wk_avg,city,country,county,state,zipcode
0,357358,,inside,,False,24115,2nd South 12th East,1547065985,40.764907,-111.856653,0.15,,672791,CLV9HLXOGIYQNYD2,672792,WAZLM3J4Q9OHKNGE,PMS5003+PMS5003+BME280,15.0,0,869.14,89.0,1547066000000.0,80079.0,0.47,19.37,34.37,16.29,4.76,0.71,Salt Lake City,United States,Salt Lake County,Utah,84102
1,357358,,,,False,24116,2nd South 12th East B,1547065988,40.764907,-111.856653,,24115.0,672793,UQJBDQ2XXPP73U45,672795,5G9B9E4XFL32S845,,15.0,0,869.16,89.0,,,,,,,,,Salt Lake City,United States,Salt Lake County,Utah,84102


In [5]:
noaa_df.head(2)

Unnamed: 0_level_0,wban_number,call_sign,call_sign2,year,month,day,hour,minute,rec_length,date,timestamp,interval,call_sign3,zulu_time,report_modifier,wind_data,wind_direction,wind_speed,gusts,gust_speed,variable_winds,variable_wind_info,sys_maint_reqd,num_fields,lat,lon
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
0,54779,K12N,12N,2019,8,1,0,0,99,08/01/19,00:00:31,5-MIN,K12N,010500Z,AUTO,True,0,0,False,,False,,False,17,41.009,-74.736
1,54779,K12N,12N,2019,8,1,0,5,99,08/01/19,00:05:31,5-MIN,K12N,010505Z,AUTO,True,0,0,False,,False,,False,17,41.009,-74.736


In [6]:
noaa_df.lat.nunique(), noaa_df.lon.nunique()

(933, 958)

In [7]:
purpleair_df.lat.nunique(), purpleair_df.lon.nunique()

(8507, 8506)

The next few cells are being used to test out the initial thoughts for combining the two dataframes. 
- Since we don't have the noaa data for September, the month data in noaa df will be replaced to mimic September. 
- The purple air data being used for this test was downloaded on 09/17 at 5:30 PM. Hence new columns with the corresponding timestamp will be added to the purple air df. 
- New columns will be added to both the dataframes to represent the integer portion of the lat and lon values as the weather station data is not available for all the lat and lon data in the purple air df. For this test, an exact match of these values will be used. However, we may need to use a range for the match. 
- Key columns will be created in both the dataframes by combining the date, time and lat-lon integer values and the same will be used to combine the datasets.

In [8]:
noaa_df_copy = noaa_df.copy()

In [9]:
noaa_df[['lat','lon']] = noaa_df[['lat','lon']].apply(pd.to_numeric)

In [10]:
# Add new lat-lon columns in with just the integer portions
# -- PURPLE AIR 
purpleair_df.loc[purpleair_df['lat'].notnull(), 'lat_int'] = purpleair_df.loc[purpleair_df['lat'].notnull(), 'lat'].apply(int)
purpleair_df.loc[purpleair_df['lon'].notnull(), 'lon_int'] = purpleair_df.loc[purpleair_df['lon'].notnull(), 'lon'].apply(int)
# -- NOAA DATA
noaa_df.loc[noaa_df['lat'].notnull(), 'lat_int'] = noaa_df.loc[noaa_df['lat'].notnull(), 'lat'].apply(int)
noaa_df.loc[noaa_df['lon'].notnull(), 'lon_int'] = noaa_df.loc[noaa_df['lon'].notnull(), 'lon'].apply(int)

In [139]:
# # Number of purple air records matching noaa records when considering just the integer portion of lat and lon
# purpleair_df[(purpleair_df.lat_int == 19) & (purpleair_df.lon_int == -155)].sensor_id.count()

105

In [11]:
noaa_df[(noaa_df.timestamp == '17:30:31') & (noaa_df.date == '08/17/19')].wban_number.count()

927

In [12]:
# Replace month values in noaa data - This should not be required once we have current data
noaa_df.month = '9'
noaa_df[['date']] = noaa_df[['date']].replace("08/", "09/", regex=True)

In [13]:
# Add date and time columns in purpleair df
purpleair_df['date'] = '09/17/19'
purpleair_df['timestamp'] = '17:30:31'

In [14]:
purpleair_df[['date','timestamp','lat_int','lon_int']].head()

Unnamed: 0,date,timestamp,lat_int,lon_int
0,09/17/19,17:30:31,40.0,-111.0
1,09/17/19,17:30:31,40.0,-111.0
2,09/17/19,17:30:31,37.0,-121.0
3,09/17/19,17:30:31,37.0,-121.0
4,09/17/19,17:30:31,18.0,-67.0


In [15]:
noaa_df[['date','timestamp','lat_int','lon_int']].head()

Unnamed: 0_level_0,date,timestamp,lat_int,lon_int
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,09/01/19,00:00:31,41,-74
1,09/01/19,00:05:31,41,-74
2,09/01/19,00:10:31,41,-74
3,09/01/19,00:15:31,41,-74
4,09/01/19,00:20:31,41,-74


In [94]:
# import hashlib
# hashlib.sha256(b"test").hexdigest()

In [16]:
def createKey(row):
    if np.isnan(row['lat_int']):
        str_lat = ''
    else:
        str_lat = str(int(row['lat_int']))
        
        
    if np.isnan(row['lon_int']):
        str_lon = ''
    else:
        str_lon = str(int(row['lon_int']))
        
    return hash(str(row['date']) + str(row['timestamp']) + str_lat + str_lon)

In [None]:
# Create hashed key columns

In [17]:
noaa_df['df_key'] = noaa_df.apply (lambda row: createKey(row), axis=1)

In [18]:
purpleair_df['df_key'] = purpleair_df.apply (lambda row: createKey(row), axis=1)

In [19]:
merged_df = pd.merge(purpleair_df, noaa_df, on='df_key')

In [20]:
merged_df.head()

Unnamed: 0,age,a_h,device_loc_typ,high_reading_flag,hidden,sensor_id,sensor_name,last_seen,lat_x,lon_x,pm2_5val,parent_id,thingspeak_primary_id,thingspeak_primary_id_read_key,thingspeak_secondary_id,thingspeak_secondary_id_read_key,sensor_type,humidity,is_owner,pressure,temp_f,av_stat_last_modified,av_stat_time_since_last_modified,pm2_5val_10m_avg,pm2_5val_30m_avg,pm2_5val_1h_avg,pm2_5val_6h_avg,pm2_5val_24h_avg,pm2_5val_1wk_avg,city,country,county,state,zipcode,lat_int_x,lon_int_x,date_x,timestamp_x,df_key,wban_number,call_sign,call_sign2,year,month,day,hour,minute,rec_length,date_y,timestamp_y,interval,call_sign3,zulu_time,report_modifier,wind_data,wind_direction,wind_speed,gusts,gust_speed,variable_winds,variable_wind_info,sys_maint_reqd,num_fields,lat_y,lon_y,lat_int_y,lon_int_y
0,357358,,inside,,False,24115,2nd South 12th East,1547065985,40.764907,-111.856653,0.15,,672791,CLV9HLXOGIYQNYD2,672792,WAZLM3J4Q9OHKNGE,PMS5003+PMS5003+BME280,15.0,0,869.14,89.0,1547066000000.0,80079.0,0.47,19.37,34.37,16.29,4.76,0.71,Salt Lake City,United States,Salt Lake County,Utah,84102,40.0,-111.0,09/17/19,17:30:31,-9068692025977670988,24127,KSLC,SLC,2019,9,17,17,30,106,09/17/19,17:30:31,5-MIN,KSLC,180030Z,,True,350,9,False,,False,,True,19,40.778,-111.969,40,-111
1,357358,,,,False,24116,2nd South 12th East B,1547065988,40.764907,-111.856653,,24115.0,672793,UQJBDQ2XXPP73U45,672795,5G9B9E4XFL32S845,,15.0,0,869.16,89.0,,,,,,,,,Salt Lake City,United States,Salt Lake County,Utah,84102,40.0,-111.0,09/17/19,17:30:31,-9068692025977670988,24127,KSLC,SLC,2019,9,17,17,30,106,09/17/19,17:30:31,5-MIN,KSLC,180030Z,,True,350,9,False,,False,,True,19,40.778,-111.969,40,-111
2,0,,outside,,False,10808,1-800 Contacts,1568507501,40.507316,-111.899188,2.46,,496465,NJQEPVB0BHDCQKXA,496466,68WPXUEO026F8BB9,PMS5003+PMS5003+BME280,5.0,0,866.64,98.0,1568508000000.0,120174.0,2.55,2.64,2.75,5.2,8.08,7.07,Draper,United States,,Utah,84020,40.0,-111.0,09/17/19,17:30:31,-9068692025977670988,24127,KSLC,SLC,2019,9,17,17,30,106,09/17/19,17:30:31,5-MIN,KSLC,180030Z,,True,350,9,False,,False,,True,19,40.778,-111.969,40,-111
3,0,,,,False,10809,1-800 Contacts B,1568507501,40.507316,-111.899188,2.19,10808.0,496467,U9BJQDBKB38741GM,496468,SICXKXJEM9WTQZ5O,,,0,,,1568508000000.0,120174.0,2.52,2.67,2.82,5.53,8.55,7.46,Draper,United States,,Utah,84020,40.0,-111.0,09/17/19,17:30:31,-9068692025977670988,24127,KSLC,SLC,2019,9,17,17,30,106,09/17/19,17:30:31,5-MIN,KSLC,180030Z,,True,350,9,False,,False,,True,19,40.778,-111.969,40,-111
4,1,,outside,1.0,False,5460,1027 Hollywood,1568507441,40.72751,-111.861434,3.44,,385885,5CPPE62979927J8N,385886,CY94WZ0PXRIG4FP9,PMS5003+PMS5003+BME280,10.0,0,869.12,97.0,1568507000000.0,119929.0,2.94,2.74,2.88,4.31,6.71,6.28,Salt Lake City,United States,Salt Lake County,Utah,84105,40.0,-111.0,09/17/19,17:30:31,-9068692025977670988,24127,KSLC,SLC,2019,9,17,17,30,106,09/17/19,17:30:31,5-MIN,KSLC,180030Z,,True,350,9,False,,False,,True,19,40.778,-111.969,40,-111


Some of the records are getting multiple matches with the current logic. May have to include some decimal precision to find more accurate matches

In [53]:
purpleair_df[purpleair_df.sensor_id == 18279]

Unnamed: 0,age,a_h,device_loc_typ,high_reading_flag,hidden,sensor_id,sensor_name,last_seen,lat,lon,pm2_5val,parent_id,thingspeak_primary_id,thingspeak_primary_id_read_key,thingspeak_secondary_id,thingspeak_secondary_id_read_key,sensor_type,humidity,is_owner,pressure,temp_f,av_stat_last_modified,av_stat_time_since_last_modified,pm2_5val_10m_avg,pm2_5val_30m_avg,pm2_5val_1h_avg,pm2_5val_6h_avg,pm2_5val_24h_avg,pm2_5val_1wk_avg,city,country,county,state,zipcode,lat_int,lon_int,date,timestamp,df_key
16914,0,,outside,,False,18279,"Wellington-Harrington, Cardinal Medeiros, One ...",1568507482,42.367905,-71.091116,3.73,,616598,NRI5KG0D337Q9YPI,616599,2HQS4UWA3M3YQG8W,PMS5003+PMS5003+BME280,61.0,0,1021.19,77.0,1568507000000.0,119886.0,4.85,5.13,4.63,2.72,3.93,5.58,Cambridge,United States,Middlesex County,Massachusetts,2141,42.0,-71.0,09/17/19,17:30:31,-4432959759473775883


In [52]:
merged_df[merged_df.sensor_id == 18279]

Unnamed: 0,age,a_h,device_loc_typ,high_reading_flag,hidden,sensor_id,sensor_name,last_seen,lat_x,lon_x,pm2_5val,parent_id,thingspeak_primary_id,thingspeak_primary_id_read_key,thingspeak_secondary_id,thingspeak_secondary_id_read_key,sensor_type,humidity,is_owner,pressure,temp_f,av_stat_last_modified,av_stat_time_since_last_modified,pm2_5val_10m_avg,pm2_5val_30m_avg,pm2_5val_1h_avg,pm2_5val_6h_avg,pm2_5val_24h_avg,pm2_5val_1wk_avg,city,country,county,state,zipcode,lat_int_x,lon_int_x,date_x,timestamp_x,df_key,wban_number,call_sign,call_sign2,year,month,day,hour,minute,rec_length,date_y,timestamp_y,interval,call_sign3,zulu_time,report_modifier,wind_data,wind_direction,wind_speed,gusts,gust_speed,variable_winds,variable_wind_info,sys_maint_reqd,num_fields,lat_y,lon_y,lat_int_y,lon_int_y
37967,0,,outside,,False,18279,"Wellington-Harrington, Cardinal Medeiros, One ...",1568507482,42.367905,-71.091116,3.73,,616598,NRI5KG0D337Q9YPI,616599,2HQS4UWA3M3YQG8W,PMS5003+PMS5003+BME280,61.0,0,1021.19,77.0,1568507000000.0,119886.0,4.85,5.13,4.63,2.72,3.93,5.58,Cambridge,United States,Middlesex County,Massachusetts,2141,42.0,-71.0,09/17/19,17:30:31,-4432959759473775883,14702,KBED,BED,2019,9,17,17,30,104,09/17/19,17:30:31,5-MIN,KBED,172230Z,,True,100,7,False,,False,,True,19,42.47,-71.289,42,-71
37968,0,,outside,,False,18279,"Wellington-Harrington, Cardinal Medeiros, One ...",1568507482,42.367905,-71.091116,3.73,,616598,NRI5KG0D337Q9YPI,616599,2HQS4UWA3M3YQG8W,PMS5003+PMS5003+BME280,61.0,0,1021.19,77.0,1568507000000.0,119886.0,4.85,5.13,4.63,2.72,3.93,5.58,Cambridge,United States,Middlesex County,Massachusetts,2141,42.0,-71.0,09/17/19,17:30:31,-4432959759473775883,14739,KBOS,BOS,2019,9,17,17,30,112,09/17/19,17:30:31,5-MIN,KBOS,172230Z,,True,100,11,False,,False,,False,19,42.361,-71.01,42,-71
37969,0,,outside,,False,18279,"Wellington-Harrington, Cardinal Medeiros, One ...",1568507482,42.367905,-71.091116,3.73,,616598,NRI5KG0D337Q9YPI,616599,2HQS4UWA3M3YQG8W,PMS5003+PMS5003+BME280,61.0,0,1021.19,77.0,1568507000000.0,119886.0,4.85,5.13,4.63,2.72,3.93,5.58,Cambridge,United States,Middlesex County,Massachusetts,2141,42.0,-71.0,09/17/19,17:30:31,-4432959759473775883,4780,KFIT,FIT,2019,9,17,17,30,111,09/17/19,17:30:31,5-MIN,KFIT,172230Z,AUTO,True,150,6,False,,False,,False,19,42.552,-71.756,42,-71
37970,0,,outside,,False,18279,"Wellington-Harrington, Cardinal Medeiros, One ...",1568507482,42.367905,-71.091116,3.73,,616598,NRI5KG0D337Q9YPI,616599,2HQS4UWA3M3YQG8W,PMS5003+PMS5003+BME280,61.0,0,1021.19,77.0,1568507000000.0,119886.0,4.85,5.13,4.63,2.72,3.93,5.58,Cambridge,United States,Middlesex County,Massachusetts,2141,42.0,-71.0,09/17/19,17:30:31,-4432959759473775883,14710,KMHT,MHT,2019,9,17,17,30,120,09/17/19,17:30:31,5-MIN,KMHT,172230Z,,True,120,6,False,,False,,False,20,42.93,-71.436,42,-71
37971,0,,outside,,False,18279,"Wellington-Harrington, Cardinal Medeiros, One ...",1568507482,42.367905,-71.091116,3.73,,616598,NRI5KG0D337Q9YPI,616599,2HQS4UWA3M3YQG8W,PMS5003+PMS5003+BME280,61.0,0,1021.19,77.0,1568507000000.0,119886.0,4.85,5.13,4.63,2.72,3.93,5.58,Cambridge,United States,Middlesex County,Massachusetts,2141,42.0,-71.0,09/17/19,17:30:31,-4432959759473775883,14710,KMHT,MHT,2019,9,17,17,30,120,09/17/19,17:30:31,5-MIN,KMHT,172230Z,,True,120,6,False,,False,,False,20,42.933,-71.438,42,-71
37972,0,,outside,,False,18279,"Wellington-Harrington, Cardinal Medeiros, One ...",1568507482,42.367905,-71.091116,3.73,,616598,NRI5KG0D337Q9YPI,616599,2HQS4UWA3M3YQG8W,PMS5003+PMS5003+BME280,61.0,0,1021.19,77.0,1568507000000.0,119886.0,4.85,5.13,4.63,2.72,3.93,5.58,Cambridge,United States,Middlesex County,Massachusetts,2141,42.0,-71.0,09/17/19,17:30:31,-4432959759473775883,14753,KMQE,MQE,2019,9,17,17,30,99,09/17/19,17:30:31,5-MIN,KMQE,172230Z,AUTO,True,140,7,False,,False,,False,17,42.212,-71.114,42,-71
37973,0,,outside,,False,18279,"Wellington-Harrington, Cardinal Medeiros, One ...",1568507482,42.367905,-71.091116,3.73,,616598,NRI5KG0D337Q9YPI,616599,2HQS4UWA3M3YQG8W,PMS5003+PMS5003+BME280,61.0,0,1021.19,77.0,1568507000000.0,119886.0,4.85,5.13,4.63,2.72,3.93,5.58,Cambridge,United States,Middlesex County,Massachusetts,2141,42.0,-71.0,09/17/19,17:30:31,-4432959759473775883,94746,KORH,ORH,2019,9,17,17,30,113,09/17/19,17:30:31,5-MIN,KORH,172230Z,,True,150,9,False,,False,,False,19,42.271,-71.873,42,-71
37974,0,,outside,,False,18279,"Wellington-Harrington, Cardinal Medeiros, One ...",1568507482,42.367905,-71.091116,3.73,,616598,NRI5KG0D337Q9YPI,616599,2HQS4UWA3M3YQG8W,PMS5003+PMS5003+BME280,61.0,0,1021.19,77.0,1568507000000.0,119886.0,4.85,5.13,4.63,2.72,3.93,5.58,Cambridge,United States,Middlesex County,Massachusetts,2141,42.0,-71.0,09/17/19,17:30:31,-4432959759473775883,54704,KOWD,OWD,2019,9,17,17,30,121,09/17/19,17:30:31,5-MIN,KOWD,172230Z,,True,140,4,False,,False,,True,21,42.191,-71.174,42,-71


In [42]:
# Number of countries for which we have wind and sensor data
merged_df.country.unique()

array(['United States', 'Mexico', 'Canada', 'Guam', 'U.S. Virgin Islands',
       'British Virgin Islands'], dtype=object)

Purple Air data had records for 105 countries while combining wind data restricts it to 6

In [66]:
# Purple Air state counts
pa_usa_st_ct = purpleair_df[purpleair_df.country == 'United States'].state.nunique()
pa_mex_st_ct = purpleair_df[purpleair_df.country == 'Mexico'].state.nunique()
pa_can_st_ct = purpleair_df[purpleair_df.country == 'Canada'].state.nunique()
# Merged data state counts
merged_usa_st_ct = merged_df[merged_df.country == 'United States'].state.nunique()
merged_mex_st_ct = merged_df[merged_df.country == 'Mexico'].state.nunique()
merged_can_st_ct = merged_df[merged_df.country == 'Canada'].state.nunique()

print("Purple Air data had records for {} states in USA and combined wind data has {}".format(pa_usa_st_ct, merged_usa_st_ct))
print("Purple Air data had records for {} states in Mexico and combined wind data has {}".format(pa_mex_st_ct, merged_mex_st_ct))
print("Purple Air data had records for {} states in Canada and combined wind data has {}".format(pa_can_st_ct, merged_can_st_ct))

Purple Air data had records for 51 states in USA while combining wind data has 51
Purple Air data had records for 11 states in Mexico while combining wind data has 1
Purple Air data had records for 7 states in Canada while combining wind data has 2


In [21]:
pandas_profiling.ProfileReport(merged_df)

