In [132]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt 
import json
import datetime
import re
import ast

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 500)

In [133]:
purpleair_df = pd.read_csv("/Users/apaul2/Documents/_Common/capstone/Project/data/PurpleAir/09141731_withaddress.csv")
noaa_df = pd.read_csv("/Users/apaul2/Documents/_Common/capstone/Project/data/NOAA/64010PHTO201908_withloc.csv")

In [134]:
purpleair_df = purpleair_df.drop(['Unnamed: 0'], axis=1)
purpleair_df.head(2)

Unnamed: 0,age,a_h,device_loc_typ,high_reading_flag,hidden,sensor_id,sensor_name,last_seen,lat,lon,pm2_5val,parent_id,thingspeak_primary_id,thingspeak_primary_id_read_key,thingspeak_secondary_id,thingspeak_secondary_id_read_key,sensor_type,humidity,is_owner,pressure,temp_f,av_stat_last_modified,av_stat_time_since_last_modified,pm2_5val_10m_avg,pm2_5val_30m_avg,pm2_5val_1h_avg,pm2_5val_6h_avg,pm2_5val_24h_avg,pm2_5val_1wk_avg,city,country,county,state,zipcode
0,357358,,inside,,False,24115,2nd South 12th East,1547065985,40.764907,-111.856653,0.15,,672791,CLV9HLXOGIYQNYD2,672792,WAZLM3J4Q9OHKNGE,PMS5003+PMS5003+BME280,15.0,0,869.14,89.0,1547066000000.0,80079.0,0.47,19.37,34.37,16.29,4.76,0.71,Salt Lake City,United States,Salt Lake County,Utah,84102
1,357358,,,,False,24116,2nd South 12th East B,1547065988,40.764907,-111.856653,,24115.0,672793,UQJBDQ2XXPP73U45,672795,5G9B9E4XFL32S845,,15.0,0,869.16,89.0,,,,,,,,,Salt Lake City,United States,Salt Lake County,Utah,84102


In [135]:
noaa_df = noaa_df.drop(['Unnamed: 0'], axis=1)
noaa_df.head(2)

Unnamed: 0,wban_number,call_sign,call_sign2,year,month,day,hour,minute,rec_length,date,timestamp,interval,call_sign3,zulu_time,report_modifier,wind_data,wind_direction,wind_speed,gusts,gust_speed,variable_winds,variable_wind_info,sys_maint_reqd,num_fields,descriptor,lat,lon,elev_m,begin_date,end_date
0,21504,PHTO,ITO,2019,8,1.0,0.0,0.0,129.0,08/01/19,00:00:31,5-MIN,PHTO,011000Z,AUTO,True,230,5.0,False,,False,,False,22,HILO INTERNATIONAL AIRPORT US HI PHTO,19.719,-155.053,11.6,19730101,20190920
1,21504,PHTO,ITO,2019,8,1.0,0.0,0.0,129.0,08/01/19,00:00:31,5-MIN,PHTO,011000Z,AUTO,True,230,5.0,False,,False,,False,22,HILO GENERAL LYMAN ARPT US HI PHTO,19.719,-155.053,11.0,19430415,19451228


In [136]:
noaa_df.lat.unique(), noaa_df.lon.unique()

(array([19.719]), array([-155.053]))

In [137]:
purpleair_df.lat.nunique(), purpleair_df.lon.nunique()

(8507, 8506)

The next few cells are being used to test out the initial thoughts for combining the two dataframes. 
- Since we don't have the noaa data for September, the month data in noaa df will be replaced to mimic September. 
- The purple air data being used for this test was downloaded on 09/17 at 5:30 PM. Hence new columns with the corresponding timestamp will be added to the purple air df. 
- New columns will be added to both the dataframes to represent the integer portion of the lat and lon values as the weather station data is not available for all the lat and lon data in the purple air df. For this test, an exact match of these values will be used. However, we may need to use a range for the match. 
- Key columns will be created in both the dataframes by combining the date, time and lat-lon integer values and the same will be used to combine the datasets.

In [138]:
# Add new lat-lon columns in with just the integer portions
# -- PURPLE AIR 
purpleair_df.loc[purpleair_df['lat'].notnull(), 'lat_int'] = purpleair_df.loc[purpleair_df['lat'].notnull(), 'lat'].apply(int)
purpleair_df.loc[purpleair_df['lon'].notnull(), 'lon_int'] = purpleair_df.loc[purpleair_df['lon'].notnull(), 'lon'].apply(int)
# -- NOAA DATA
noaa_df.loc[noaa_df['lat'].notnull(), 'lat_int'] = noaa_df.loc[noaa_df['lat'].notnull(), 'lat'].apply(int)
noaa_df.loc[noaa_df['lon'].notnull(), 'lon_int'] = noaa_df.loc[noaa_df['lon'].notnull(), 'lon'].apply(int)

In [139]:
# Number of purple air records matching noaa records when considering just the integer portion of lat and lon
purpleair_df[(purpleair_df.lat_int == 19) & (purpleair_df.lon_int == -155)].sensor_id.count()

105

In [140]:
noaa_df[(noaa_df.timestamp == '17:30:31') & (noaa_df.date == '08/17/19')].wban_number.count()

3

In [141]:
# Replace month values in noaa data - This should not be required once we have current data
noaa_df.month = '9'
noaa_df[['date']] = noaa_df[['date']].replace("08/", "09/", regex=True)

In [142]:
# Add date and time columns in purpleair df
purpleair_df['date'] = '09/17/19'
purpleair_df['timestamp'] = '17:30:31'

In [143]:
purpleair_df[['date','timestamp','lat_int','lon_int']].head()

Unnamed: 0,date,timestamp,lat_int,lon_int
0,09/17/19,17:30:31,40.0,-111.0
1,09/17/19,17:30:31,40.0,-111.0
2,09/17/19,17:30:31,37.0,-121.0
3,09/17/19,17:30:31,37.0,-121.0
4,09/17/19,17:30:31,18.0,-67.0


In [144]:
noaa_df[['date','timestamp','lat_int','lon_int']].head()

Unnamed: 0,date,timestamp,lat_int,lon_int
0,09/01/19,00:00:31,19,-155
1,09/01/19,00:00:31,19,-155
2,09/01/19,00:00:31,19,-155
3,09/01/19,00:05:31,19,-155
4,09/01/19,00:05:31,19,-155


In [94]:
# import hashlib
# hashlib.sha256(b"test").hexdigest()

In [145]:
def createKey(row):
    if np.isnan(row['lat_int']):
        str_lat = ''
    else:
        str_lat = str(int(row['lat_int']))
        
        
    if np.isnan(row['lon_int']):
        str_lon = ''
    else:
        str_lon = str(int(row['lon_int']))
        
    return hash(str(row['date']) + str(row['timestamp']) + str_lat + str_lon)

In [None]:
# Create hashed key columns

In [146]:
ntest = noaa_df.copy()
ptest = purpleair_df.copy()

In [155]:
noaa_df['df_key'] = ntest.apply (lambda row: createKey(row), axis=1)

In [156]:
purpleair_df['df_key'] = ptest.apply (lambda row: createKey(row), axis=1)

In [157]:
merged_df = pd.merge(purpleair_df, noaa_df, on='df_key')

In [158]:
merged_df.head()

Unnamed: 0,age,a_h,device_loc_typ,high_reading_flag,hidden,sensor_id,sensor_name,last_seen,lat_x,lon_x,pm2_5val,parent_id,thingspeak_primary_id,thingspeak_primary_id_read_key,thingspeak_secondary_id,thingspeak_secondary_id_read_key,sensor_type,humidity,is_owner,pressure,temp_f,av_stat_last_modified,av_stat_time_since_last_modified,pm2_5val_10m_avg,pm2_5val_30m_avg,pm2_5val_1h_avg,pm2_5val_6h_avg,pm2_5val_24h_avg,pm2_5val_1wk_avg,city,country,county,state,zipcode,lat_int_x,lon_int_x,date_x,timestamp_x,df_key,wban_number,call_sign,call_sign2,year,month,day,hour,minute,rec_length,date_y,timestamp_y,interval,call_sign3,zulu_time,report_modifier,wind_data,wind_direction,wind_speed,gusts,gust_speed,variable_winds,variable_wind_info,sys_maint_reqd,num_fields,descriptor,lat_y,lon_y,elev_m,begin_date,end_date,lat_int_y,lon_int_y
0,488273,,outside,1.0,False,11698,1st Lava Flow,1539211094,19.341286,-155.866901,2.73,,514935,U1L0W77JAK8LF8KH,514936,K1KFP034L1SAIOC6,PMS5003+PMS5003+BME280,52.0,0,964.0,88.0,1539211000000.0,79950.0,2.12,2.22,2.27,1.99,1.82,2.56,Captain Cook,United States,Hawaii County,Hawaii,96704,19.0,-155.0,09/17/19,17:30:31,8999040011973450260,21504,PHTO,ITO,2019,9,17.0,17.0,30.0,113.0,09/17/19,17:30:31,5-MIN,PHTO,180330Z,,True,80,6.0,False,,False,,False,19,HILO INTERNATIONAL AIRPORT US HI PHTO,19.719,-155.053,11.6,19730101,20190920,19,-155
1,488273,,outside,1.0,False,11698,1st Lava Flow,1539211094,19.341286,-155.866901,2.73,,514935,U1L0W77JAK8LF8KH,514936,K1KFP034L1SAIOC6,PMS5003+PMS5003+BME280,52.0,0,964.0,88.0,1539211000000.0,79950.0,2.12,2.22,2.27,1.99,1.82,2.56,Captain Cook,United States,Hawaii County,Hawaii,96704,19.0,-155.0,09/17/19,17:30:31,8999040011973450260,21504,PHTO,ITO,2019,9,17.0,17.0,30.0,113.0,09/17/19,17:30:31,5-MIN,PHTO,180330Z,,True,80,6.0,False,,False,,False,19,HILO GENERAL LYMAN ARPT US HI PHTO,19.719,-155.053,11.0,19430415,19451228,19,-155
2,488273,,outside,1.0,False,11698,1st Lava Flow,1539211094,19.341286,-155.866901,2.73,,514935,U1L0W77JAK8LF8KH,514936,K1KFP034L1SAIOC6,PMS5003+PMS5003+BME280,52.0,0,964.0,88.0,1539211000000.0,79950.0,2.12,2.22,2.27,1.99,1.82,2.56,Captain Cook,United States,Hawaii County,Hawaii,96704,19.0,-155.0,09/17/19,17:30:31,8999040011973450260,21504,PHTO,ITO,2019,9,17.0,17.0,30.0,113.0,09/17/19,17:30:31,5-MIN,PHTO,180330Z,,True,80,6.0,False,,False,,False,19,HILO INTERNATIONAL AP US HI PHTO,19.719,-155.053,11.0,19491001,19721231,19,-155
3,488273,,,1.0,False,11699,1st Lava Flow B,1539211124,19.341286,-155.866901,2.45,11698.0,514939,W32UOP0I63L5GE3P,514941,GWFLPF3YPJ234Q3T,,52.0,0,963.98,88.0,1539211000000.0,80020.0,2.18,2.28,2.32,1.96,1.79,2.51,Captain Cook,United States,Hawaii County,Hawaii,96704,19.0,-155.0,09/17/19,17:30:31,8999040011973450260,21504,PHTO,ITO,2019,9,17.0,17.0,30.0,113.0,09/17/19,17:30:31,5-MIN,PHTO,180330Z,,True,80,6.0,False,,False,,False,19,HILO INTERNATIONAL AIRPORT US HI PHTO,19.719,-155.053,11.6,19730101,20190920,19,-155
4,488273,,,1.0,False,11699,1st Lava Flow B,1539211124,19.341286,-155.866901,2.45,11698.0,514939,W32UOP0I63L5GE3P,514941,GWFLPF3YPJ234Q3T,,52.0,0,963.98,88.0,1539211000000.0,80020.0,2.18,2.28,2.32,1.96,1.79,2.51,Captain Cook,United States,Hawaii County,Hawaii,96704,19.0,-155.0,09/17/19,17:30:31,8999040011973450260,21504,PHTO,ITO,2019,9,17.0,17.0,30.0,113.0,09/17/19,17:30:31,5-MIN,PHTO,180330Z,,True,80,6.0,False,,False,,False,19,HILO GENERAL LYMAN ARPT US HI PHTO,19.719,-155.053,11.0,19430415,19451228,19,-155


In [159]:
merged_df.date_x.unique()

array(['09/17/19'], dtype=object)