# Investigation of PurpleAir's data
[PurpleAir](http://www.purpleair.com) sells low-cost air quality sensors that feed data to [real-time maps of PM2.5 pollution](https://www.purpleair.com/map?#11/37.789/-122.2048).   
This data will be used for a UC Berkeley capstone project [summarized here](https://docs.google.com/document/d/1NjCpqNd7rDnD6VOExVktGtquRzs21hpwZ8HhLQpYLO8/edit).

### Libraries and installs

In [1]:
import pandas as pd
import pandas_profiling
import numpy as np
import json
import os
import datetime, time
from dateutil import tz
import ast
import re
from matplotlib import pyplot as plt 
import seaborn as sns
import gmplot

import boto3
import s3fs
from fastparquet import ParquetFile, write

import urllib3
import json

import warnings
# Suppress every warning for the rest of the session
warnings.filterwarnings('ignore')

# Allow up to 500 columns to be shown when a DataFrame is displayed
pd.set_option('display.max_columns', 500)

# urllib3 connection pool; the ThingSpeak download functions issue their GET requests through it
https = urllib3.PoolManager()

### Data Folder Instructions

In [2]:
# Use this cell to specify the path of the data folder on your local machine.
# Set the variable 'datafolder' to that path.
# Comment out all the data paths except your own.
# Purple Air data is assumed to be in a subfolder called 'purpleair'.
# For example, if the base data folder is '/users/data', purpleair data should be in '/users/data/purpleair'

# Angshuman's local path
datafolder = "/Users/apaul2/Documents/_Common/capstone/Project/data"

### Historical Load

In [3]:
def createHashKey(row):
    """Return a hash built from the 'lat' and 'lon' values of a row.

    Each coordinate is rendered with str(); a NaN coordinate contributes an
    empty string. The two pieces are concatenated (latitude first, no
    separator) and passed to the built-in hash().
    """
    pieces = []
    for field in ('lat', 'lon'):
        coord = row[field]
        pieces.append('' if np.isnan(coord) else str(coord))
    return hash(''.join(pieces))

In [4]:
# Get data from sensor 2
def genTS2DF(sensordf, month, startday, yr):
    """Download one day of secondary-channel ThingSpeak feeds for each sensor.

    Args:
        sensordf: DataFrame with one row per sensor. The columns
            'thingspeak_secondary_id', 'thingspeak_secondary_id_read_key'
            and 'sensorhash' are read from every row.
        month: Month text inserted into the start/end dates of the query.
        startday: Day text inserted into the start/end dates of the query
            (also echoed in the summary line).
        yr: Year suffix; the query prefixes it with '20'.

    Returns:
        DataFrame holding the renamed feed columns, the 'sensorhash' of the
        sensor each row came from, and a 'created' key (YYYYmmddHHMM string)
        derived from 'created_at'.
    """
    feed_cols = ['created_at', '0_3um', '0_5um', '1_0um', '2_5um', '5_0um', '10_0um', 'pm1_0', 'pm10_0']
    # Start with an empty, correctly-shaped frame so the result has the
    # expected columns even when no request succeeds
    frames = [pd.DataFrame(columns=feed_cols + ['sensorhash'])]
    count, errCount = 0, 0

    for _, val in sensordf.iterrows():
        # NOTE(review): the backslash-newline inside the literal keeps the next
        # line's leading spaces in the URL, so there is whitespace in front of
        # 'timezone='. Left untouched because the timestamp parsing below expects
        # the 'Z'-suffixed format - confirm what the API returns before cleaning up.
        qrystr = "https://api.thingspeak.com/channels/{0}/feeds.json?api_key={1}&start=20{4}-{2}-{3}%2000:00:00&end=20{4}-{2}-{3}%2023:59:59& \
                    timezone=America/Los_Angeles&timescale=10".format(val['thingspeak_secondary_id'], val['thingspeak_secondary_id_read_key'], month, startday, yr)
        count += 1
        try:
            r = https.request('GET',qrystr)
            if r.status != 200:
                # A rejected request contributes no rows, so report it as a failure
                errCount += 1
                continue
            j = json.loads(r.data.decode('utf-8'))
            df = pd.DataFrame(j['feeds'])
            # Positional rename: raises (and is counted as an error) when the
            # feed does not have exactly nine columns, e.g. an empty feed
            df.columns = feed_cols
            df['sensorhash'] = val['sensorhash']
            frames.append(df)
        except Exception:
            # Best effort: one bad sensor must not stop the whole download
            errCount += 1
    print("For {}, Of the {} requests, {} errored out.".format(startday, count, errCount))

    # Concatenate once instead of growing the frame inside the loop
    ts_s_df = pd.concat(frames, ignore_index=True)

    # Add a key column based on time
    # This along with the sensorhash column will be used to join the two sensor datasets
    ts_s_df['created'] = ts_s_df['created_at'].apply(lambda x: datetime.datetime.strptime(x,"%Y-%m-%dT%H:%M:%SZ").strftime("%Y%m%d%H%M"))

    return ts_s_df

In [5]:
# Get data from sensor 1
def genTS1DF(sensordf, month, startday, yr):
    """Download one day of primary-channel ThingSpeak feeds for each sensor.

    Args:
        sensordf: DataFrame with one row per sensor. The columns
            'thingspeak_primary_id', 'thingspeak_primary_id_read_key'
            and 'sensorhash' are read from every row.
        month: Month text inserted into the start/end dates of the query.
        startday: Day text inserted into the start/end dates of the query.
        yr: Year suffix; the query prefixes it with '20'.

    Returns:
        DataFrame holding the renamed feed columns, the 'sensorhash' of the
        sensor each row came from, and a 'created' key (YYYYmmddHHMM string)
        derived from 'created_at'.
    """
    feed_cols = ['created_at', 'pm1_0_atm', 'pm2_5_atm', 'pm10_0_atm', 'uptime', 'rssi', 'temperature', 'humidity', 'pm2_5_cf_1']
    # Start with an empty, correctly-shaped frame so the result has the
    # expected columns even when no request succeeds
    frames = [pd.DataFrame(columns=feed_cols + ['sensorhash'])]
    count, errCount = 0, 0

    for _, val in sensordf.iterrows():
        # NOTE(review): the backslash-newline inside the literal keeps the next
        # line's leading spaces in the URL, so there is whitespace in front of
        # 'timezone='. Left untouched because the timestamp parsing below expects
        # the 'Z'-suffixed format - confirm what the API returns before cleaning up.
        qrystr = "https://api.thingspeak.com/channels/{0}/feeds.json?api_key={1}&start=20{4}-{2}-{3}%2000:00:00&end=20{4}-{2}-{3}%2023:59:59& \
                    timezone=America/Los_Angeles&timescale=10".format(val['thingspeak_primary_id'], val['thingspeak_primary_id_read_key'], month, startday, yr)
        count += 1
        try:
            r = https.request('GET',qrystr)
            if r.status != 200:
                # A rejected request contributes no rows, so report it as a failure
                errCount += 1
                continue
            j = json.loads(r.data.decode('utf-8'))
            df = pd.DataFrame(j['feeds'])
            # Positional rename: raises (and is counted as an error) when the
            # feed does not have exactly nine columns, e.g. an empty feed
            df.columns = feed_cols
            df['sensorhash'] = val['sensorhash']
            frames.append(df)
        except Exception:
            # Best effort: one bad sensor must not stop the whole download
            errCount += 1
    print("Of the {} requests, {} errored out.".format(count, errCount))

    # Concatenate once instead of growing the frame inside the loop
    ts_p_df = pd.concat(frames, ignore_index=True)

    # Add a key column based on time
    # This along with the sensorhash column will be used to join the two sensor datasets
    ts_p_df['created'] = ts_p_df['created_at'].apply(lambda x: datetime.datetime.strptime(x,"%Y-%m-%dT%H:%M:%SZ").strftime("%Y%m%d%H%M"))

    return ts_p_df

In [6]:
# Get thingspeak data:
def getThingspeakData(bayarea_purple_df, month, monthname, day, yr):
    """Fetch both ThingSpeak channels for one day, join them and save the result.

    Args:
        bayarea_purple_df: DataFrame containing 'sensorhash' and the four
            ThingSpeak id / read-key columns.
        month: Month text forwarded to the download functions.
        monthname: Month label used in the output file name.
        day: Day of month (formatted as two digits).
        yr: Year suffix forwarded to the download functions.

    Returns:
        The secondary-channel rows left-joined with the primary-channel rows on
        ('sensorhash', 'created'), without the 'created_at_y' column.
    """
    id_cols = ['sensorhash', 'thingspeak_primary_id','thingspeak_primary_id_read_key',
               'thingspeak_secondary_id','thingspeak_secondary_id_read_key']
    # One row per distinct sensor / channel combination
    sensors_df = bayarea_purple_df[id_cols].drop_duplicates().reset_index(drop=True)

    day_text = "{:02}".format(day)
    secondary_df = genTS2DF(sensors_df, month, day_text, yr)
    primary_df = genTS1DF(sensors_df, month, day_text, yr)

    # Left join keeps every secondary-channel (particle count) record
    join_keys = ['sensorhash','created']
    bay_ts_df = pd.merge(secondary_df, primary_df, how='left', left_on=join_keys, right_on=join_keys)
    bay_ts_df = bay_ts_df.drop(['created_at_y'], axis=1)

    # Persist the joined day
    out_path = "{}/thingspeak/thingspeak_{}{:02}.parquet".format(datafolder, monthname, day)
    write(out_path, bay_ts_df,compression='GZIP')

    return bay_ts_df

In [7]:
# Merge Purple Air data
def mergePurpleAir(pa_df, ts_df, address_df, month, day, yr):
    """Attach sensor metadata and addresses to one day of ThingSpeak data.

    Args:
        pa_df: Sensor metadata, joined on 'sensorhash'.
        ts_df: Joined ThingSpeak data for the day. Its measurement columns are
            converted to numeric in place before the merges.
        address_df: Address lookup, joined on ('lat', 'lon').
        month: Month text used in the output file name.
        day: Day of month used in the output file name (two digits).
        yr: Year suffix used in the output file name (prefixed with '20').

    Returns:
        The merged DataFrame, which is also written to
        '<datafolder>/pa_ts/20<yr><month><day>.parquet'.
    """
    numeric_cols = ['0_3um', '0_5um', '1_0um', '2_5um', '5_0um', '10_0um', 'pm1_0','pm10_0', 'created', 'pm1_0_atm', 'pm2_5_atm', 'pm10_0_atm', 'uptime',
                    'rssi', 'temperature', 'humidity', 'pm2_5_cf_1']

    # Some numeric columns may have "nan" as a string - convert these values to np.nan
    # so that the data type of these columns are correctly identified
    ts_df[numeric_cols] = ts_df[numeric_cols].replace("nan", np.nan, regex=True)
    ts_df[numeric_cols] = ts_df[numeric_cols].apply(pd.to_numeric)

    # Merge purple air data with sensor data
    # Only keep records having particle data
    ts_df = pd.merge(ts_df, pa_df,  how='left', left_on=['sensorhash'], right_on=['sensorhash'])

    # Join address dataframe with main dataframe
    ts_df = pd.merge(ts_df, address_df,  how='left', left_on=['lat','lon'], right_on=['lat','lon'])

    # Parse every timestamp once and derive all date-part columns from the result
    parsed = [datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ") for x in ts_df['created_at_x']]
    date_parts = (('created_at', "%Y/%m/%dT%H:%M"), ('year', "%Y"), ('month', "%m"),
                  ('day', "%d"), ('hour', "%H"), ('minute', "%M"))
    for col, fmt in date_parts:
        ts_df[col] = pd.Series([stamp.strftime(fmt) for stamp in parsed], index=ts_df.index, dtype=object)

    # Drop unwanted columns
    ts_df.drop(['created_at_x', 'sensorhash', 'country','state'], axis = 1, inplace=True)

    # Add nan values for purple air data as we dont have data for these elements prior to 09/14
    ts_df['a_h'] = None
    ts_df['high_reading_flag'] = np.nan
    ts_df['hidden'] = None

    # Convert data type of attributes to string
    str_cols = ['high_reading_flag','sensor_id','parent_id', 'is_owner']
    ts_df[str_cols] = ts_df[str_cols].astype(str)

    # Save final dataframe for future use
    parquet_file = "{}/pa_ts/20{}{}{:02}.parquet".format(datafolder, yr, month, day)
    write(parquet_file, ts_df,compression='GZIP')

    return ts_df

In [8]:
def createNOAAdf(lines, fileName):
    """Parse raw NOAA ASOS five-minute records and save them as a parquet file.

    Each line is split on whitespace. The leading fixed-position fields
    (station ids, date/time pieces, interval, Zulu time) are read first; a line
    that cannot provide them is skipped. The optional report modifier and wind
    group that follow are parsed on a best-effort basis, printing a notice when
    a record runs out of fields.

    Args:
        lines: Iterable of raw record strings, one observation per entry.
        fileName: Base name (without extension) of the parquet file written
            under '<datafolder>/noaa/'.

    Returns:
        DataFrame with one row per parsed record.
    """
    # Defined before the loop so the DataFrame can be built even when no line parses
    col_names = ["wban_number", "call_sign", "call_sign2", "year", "month", "day", "hour", "minute", "rec_length", "date", "timestamp", "interval", "call_sign3",
                 "zulu_time", "report_modifier", "wind_data", "wind_direction", "wind_speed", "gusts", "gust_speed", "variable_winds", "variable_wind_info", "sys_maint_reqd", "num_fields"]

    # split lines and data chunks
    data = [] # an array of arrays, inner arrays are all data for one record, outer array is all records
    for line in lines:

        # reset any variables if needed
        Report_Modifier = ''
        Wind_Data = False
        Variable_Winds = False
        Gusts = False
        Wind_Direction = ''
        Wind_Speed = ''
        Gust_Speed = ''
        Variable_Wind_Info = ''
        System_Maintenance_Reqd = False

        try:
            line = line.split() # take string of one record's data and split into space separated chunks
            WBAN_Number = line[0][0:5] # The WBAN (Weather Bureau, Army, Navy) number is a unique 5-digit number
            Call_Sign = line[0][5:] # The call sign is a location identifier, three or four characters in length
            suffix = line[1][-2:] # grab the last two digits that are the year (i.e. 19 for 2019)
            Year = '20'+suffix # in YYYY format
            # Split on the FIRST occurrence only: the digits after the year can
            # themselves contain the year string (e.g. day 20, hour 18 in 2018)
            CallSign_Date = line[1].split(Year, 1)
            Call_Sign2 = CallSign_Date[0] # this seems to be the same as Call_Sign but without initial letter
            Date = CallSign_Date[1]
            Month = Date[0:2] # in MM format
            Day = Date[2:4] # in DD format
            Hour = Date[4:6] # in HH format
            Minute = Date[6:8] # Observations are recorded on whole five-minute increments (i.e. 00,05,10,...,50,55)
            Record_Length = Date[8:11] # I'm not sure what this is yet - Length of record??
            Date = Date[11:] # MM/DD/YY format
            Timestamp = line[2] # in HH:MM:SS format
            Interval = line[3] # should be 5-MIN as opposed to 1-MIN
            Call_Sign3 = line[4] # for some reason, a THIRD output of the call sign. random.
            Zulu_Time = line[5] # Zulu Time, or military time, or UTC
        except Exception:
            # Malformed record (too few fields, non-string input, ...): skip it
            continue

        # after this point, data could be missing/optional and data positions are not fixed
        currIndx = 6
        try:
            Next_Data = line[currIndx]
            if not any(x in Next_Data for x in ['KT','SM']):
                Report_Modifier = Next_Data # AUTO for fully automated report, COR for correction to a previously disseminated report
                currIndx += 1
            Next_Data = line[currIndx]
            if "KT" in Next_Data:
                Wind_Data = True
                Wind_Direction = Next_Data[0:3] # in tens of degrees from true north
                if Next_Data[0:3] == 'VRB':
                    Variable_Winds = True
                Wind_Speed = Next_Data[3:5] # in whole knots (two digits)
                if Next_Data[5] == 'G':
                    Gusts = True
                    Gust_Speed = Next_Data[6:8] # speed in whole knots (two digits)
            else:
                Wind_Data = False
        except IndexError:
            print("OUT OF DATA AT FIELD {}".format(currIndx))
            print(line)
        finally:
            currIndx += 1

        try:
            Next_Data = line[currIndx]
            if Wind_Data:
                if (re.fullmatch(r'[0-9][0-9][0-9]V[0-9][0-9][0-9]', Next_Data)): #e.g. 180V240 = wind direction varies from 180 to 240 degrees
                    Variable_Wind_Info = Next_Data
                    Variable_Winds = True
        except IndexError:
            print("OUT OF DATA AT FIELD {}".format(currIndx))
            print(line)

        # A trailing '$' sets the maintenance flag
        if line[-1] == '$':
            System_Maintenance_Reqd = True

        #Sea_Level_Pressure = line[13] # given in tenths of hectopascals (millibars). The last digits are recorded (125 means 1012.5)
        #Station_Type = line[18]
        Num_Fields = len(line)
        record = [WBAN_Number, Call_Sign, Call_Sign2, Year, Month, Day, Hour, Minute, Record_Length, Date, Timestamp, Interval, Call_Sign3, Zulu_Time,
                  Report_Modifier, Wind_Data, Wind_Direction, Wind_Speed, Gusts, Gust_Speed, Variable_Winds, Variable_Wind_Info, System_Maintenance_Reqd, Num_Fields]
        data.append(record)

    sample_df = pd.DataFrame(data, columns = col_names)

    # save Dataframe to file
    parquet_file = "{}/noaa/{}.parquet".format(datafolder, fileName)
    write(parquet_file, sample_df,compression='GZIP')

    return sample_df

In [9]:
# Get noaa data for the month
def getNOAAData(month, monthyear, yr):
    """Download a month of ASOS five-minute data and join it with station metadata.

    Args:
        month: Two-character month text used in the remote file name.
        monthyear: Label passed to createNOAAdf as the parquet file name.
        yr: Year suffix; the remote path prefixes it with '20'.

    Returns:
        DataFrame of parsed records for stations inside the
        35 < lat < 40, -125 < lon < -120 box, with an integer 'datetime' key
        built by joining the year, month, day, hour and minute strings.
    """
    # Read station data from file that was stored earlier
    unique_station_df = pd.read_parquet("{}/noaa/uniq_station_data.parquet".format(datafolder))

    # List of NOAA stations in the 35 < lat < 40 and  -125 < lon < -120 bounding box
    station_list = ['KAPC', 'KBLU', 'KCCR', 'KHWD', 'KLVK', 'KMAE', 'KMCE', 'KMOD', 'KMRY', 'KMYV', 'KNUQ', 'KOAK', 'KOVE', 'KPRB', 'KSAC', 'KSBP', 'KSCK',
                    'KSFO', 'KSJC', 'KSMF', 'KSNS', 'KSTS', 'KUKI', 'KVCB', 'KWVI']

    # Get NOAA data for desired stations in a list
    lines = [] # an array of each read line
    for station in station_list:
        filepath = "ftp://ftp.ncdc.noaa.gov/pub/data/asos-fivemin/6401-20{2}/64010{0}20{2}{1}.dat".format(station, month, yr)
        try:
            for line in pd.read_csv(filepath_or_buffer=filepath , encoding='utf-8', header=None, chunksize=1):
                lines.append(line.iloc[0,0])
        except Exception as err:
            # Keep going with the remaining stations, but report the failure:
            # lines read before the error stay in the list, the rest are missing
            print("Could not read all NOAA data for station {} ({}): {!r}".format(station, filepath, err))

    # Create noaa dataframe for the month
    noaa_df = createNOAAdf(lines, monthyear)
    # Drop rows whose wind speed field was parsed as 'T'
    noaa_df = noaa_df[noaa_df.wind_speed != 'T']
    merged_noaa_df = pd.merge(noaa_df, unique_station_df, on='wban_number')
    # Convert data type of numeric columns
    numeric_cols = ['wind_speed','gust_speed','lat','lon']
    merged_noaa_df[numeric_cols] = merged_noaa_df[numeric_cols].apply(pd.to_numeric)

    # Get data for bounding box
    in_box = ((merged_noaa_df.lat > 35) & (merged_noaa_df.lat < 40)
              & (merged_noaa_df.lon > -125) & (merged_noaa_df.lon < -120))
    bay_noaa_df = merged_noaa_df[in_box].reset_index(drop=True)
    bay_noaa_df['datetime'] = bay_noaa_df[['year', 'month','day','hour','minute']].apply(lambda x: int(''.join(x)), axis=1)

    return bay_noaa_df

In [10]:
# Get daily noaa data for the given month
def getDailyNOAA(bay_noaa_df, month, day, yr):
    """Extract one day of NOAA records and save them as a parquet file.

    Rows are selected by comparing the 'date' column with '<month>/<day>/<yr>'
    (day zero-padded to two digits). The date-part columns are removed from the
    returned frame.
    """
    wanted_date = '{}/{:02}/{}'.format(month, day, yr)
    on_wanted_date = bay_noaa_df['date'] == wanted_date
    dropped_cols = ['year', 'month','day','hour','minute','date','timestamp']
    dly_noaa_df = bay_noaa_df[on_wanted_date].drop(columns=dropped_cols)

    out_path = "{0}/noaa/daily/asos_20{3}{1}{2:02}.parquet".format(datafolder, month, day, yr)
    write(out_path, dly_noaa_df,compression='GZIP')

    return dly_noaa_df

In [11]:
# Get epa data
def getEPAHistData():
    """Load the historical EPA PM2.5 file and add an integer 'created' key.

    The CSV at '<datafolder>/ambient/historical_PM25.csv' is read, its columns
    are renamed positionally, and 'created' is set to the 'utc' timestamp
    converted to the local timezone and formatted as YYYYmmddHHMM.
    """
    def to_local_key(utc_text):
        # NOTE(review): tz.tzlocal() is the timezone of the machine running
        # this code - confirm that is the intended zone for the join key.
        utc_stamp = datetime.datetime.strptime(utc_text, '%Y-%m-%d %H:%M:%S').replace(tzinfo=tz.tzutc())
        local_stamp = utc_stamp.astimezone(tz.tzlocal())
        return int(local_stamp.strftime("%Y%m%d%H%M"))

    csv_path = "{}/ambient/historical_PM25.csv".format(datafolder)
    epa_df = pd.read_csv(csv_path)
    epa_df.columns = ['lat', 'lon', 'utc', 'parameter', 'epa_pm25_unit', 'epa_pm25_value','raw_concentration', 'aqi', 'category', 'site_name', 'agency_name',
       'full_aqs_code', 'intl_aqs_code']

    # Add a datekey column based on local date
    epa_df['created'] = epa_df['utc'].apply(to_local_key)

    return epa_df

In [12]:
# Get daily interpolated epa data
def getEPADailyData(dateint, dt_ind, month, epa_df, yr):
    """Build one day of EPA data with extra rows between readings and save it.

    For every site, each selected record is followed by five placeholder rows
    whose 'created' key is advanced by 10, 20, ..., 50 and whose
    'epa_pm25_value', 'raw_concentration' and 'aqi' are NaN; the frame is then
    linearly interpolated forward.

    Args:
        dateint: Integer key of the month; dt_ind * 10000 is added to it to get
            the first key of the day (assumes the YYYYmmddHHMM layout of
            'created' - TODO confirm against the caller).
        dt_ind: Day of month; also used (two digits) in the output file name.
        month: Month text used in the output file name.
        epa_df: EPA data with an integer 'created' column.
        yr: Year suffix used in the output file name (prefixed with '20').

    Returns:
        The interpolated rows with start <= created < start + 10000, which are
        also written to '<datafolder>/ambient/daily_interpolated/'.
    """
    # The window is one key wider than the day (start + 10000 is included),
    # presumably so the last readings of the day have a following record to
    # interpolate towards; that extra key is filtered out again at the end.
    start = dateint + dt_ind * 10000
    end = start + 10001
    dly_epa_df = epa_df[(epa_df.created >= start) & (epa_df.created < end)]
    dly_epa_df.reset_index(inplace=True, drop=True)

    new_df = pd.DataFrame(columns=['lat', 'lon', 'utc', 'parameter', 'epa_pm25_unit', 'epa_pm25_value', 'raw_concentration', 'aqi', 'category', 'site_name', 'agency_name', 'full_aqs_code', 'intl_aqs_code', 'created'])
    for sitenm in dly_epa_df.site_name.unique():
        # indx_ct walks the site's rows by position while the loop below walks
        # its unique 'created' values - assumes one row per value, in the same
        # order (TODO confirm).
        indx_ct = 0
        site_df = dly_epa_df[dly_epa_df.site_name == sitenm]
        for i in site_df.created.unique():
            indx_ct += 1
            # Copy the real record first
            new_df =  pd.concat([new_df,site_df.iloc[indx_ct - 1:indx_ct]],ignore_index=True)

            if i != site_df.created.max(): # Don't interpolate the last record
                # Template row carrying only the descriptive columns of the record
                tmp_df = site_df.iloc[indx_ct - 1:indx_ct][['lat', 'lon', 'utc', 'parameter', 'epa_pm25_unit', 'category', 'site_name', 'agency_name', 'full_aqs_code', 'intl_aqs_code']]
                for j in range(1,6):
                    # Keys i+10 ... i+50 for the placeholder rows
                    new_dt = i + j * 10
            #         print(indx_start, j, new_dt)
                    tmp_df['created'] = int(new_dt)
                    tmp_df['epa_pm25_value'] = np.nan
                    tmp_df['raw_concentration'] = np.nan
                    tmp_df['aqi'] = np.nan
                    new_df =  pd.concat([new_df,tmp_df],ignore_index=True)

    # Convert aqi to numeric so that it gets interpolated
    new_df[['aqi']] = new_df[['aqi']].replace("nan", np.nan, regex=True)
    new_df[['aqi']] = new_df[['aqi']].apply(pd.to_numeric)

    # Fill the NaN placeholders row-wise, looking forward only
    new_df = new_df.interpolate(method='linear', limit_direction='forward', axis=0)

    # Keep only keys below start + 10000 (drops the extra key selected above)
    int_epa_df = new_df[(new_df.created >= start) & (new_df.created < (end - 1))]
    int_epa_df.reset_index(inplace=True, drop=True)

    parquet_file = "{0}/ambient/daily_interpolated/epa_20{3}{1}{2:02}.parquet".format(datafolder, month, dt_ind, yr)
    write(parquet_file, int_epa_df,compression='GZIP')
    
    return int_epa_df

In [13]:
# Function for mapping closest lat-lon data point
def mapLatLon(ts_df, ts_latlon_df, lkp_df, maphashcol, datecol):
    """Join each sensor record with the nearest lookup location's data.

    Args:
        ts_df: Sensor records containing 'tslatlonhash' and 'created'.
        ts_latlon_df: Distinct sensor coordinates ('lat', 'lon') indexed by
            'tslatlonhash'.
        lkp_df: Lookup data (e.g. weather or EPA rows) with 'lat', 'lon' and
            the column named by datecol. NOTE: modified in place - the hash
            column is added and 'lat'/'lon' are dropped.
        maphashcol: Name of the hash column to create on lkp_df.
        datecol: Column of lkp_df matched against 'created' of ts_df.

    Returns:
        ts_df rows that have a location mapping, left-joined with lkp_df on
        the nearest location's hash and the time key.
    """
    # Add a lat-lon based hash to the lookup dataframe
    lkp_df[maphashcol] = lkp_df.apply (lambda row: createHashKey(row), axis=1)

    # Keep only the lookup columns needed to determine the lat-lon mapping
    lkp_latlon_df = lkp_df[[maphashcol,'lat','lon']].drop_duplicates().set_index(maphashcol)

    # Find the closest lookup location for every sensor location
    # (straight-line distance computed directly on the lat/lon values)
    closest_points = {}
    for name, point in ts_latlon_df.iterrows():
        # skipna=False: a lookup point with a missing coordinate must get a NaN
        # distance (sorted last) instead of a partial or zero sum that would
        # make it look like the nearest location
        distances = (((lkp_latlon_df - point) ** 2).sum(axis=1, skipna=False)**.5)
        closest_points[name] = distances.sort_values().index[0]

    # Create dataframe from lat-lon mapping
    latlonmap_df = pd.DataFrame(list(closest_points.items()), columns=['tslatlonhash',maphashcol])

    # Merge sensor data to the lat-lon mappings first and then
    # merge the resulting dataframe to the lookup dataframe
    merged_df = pd.merge(ts_df, latlonmap_df, on='tslatlonhash')

    # Drop common and unwanted columns from the lookup dataframe
    lkp_df.drop(['lat','lon'], axis=1, inplace=True)

    # Combine with the lookup data
    combined_df = pd.merge(merged_df, lkp_df,  how='left', left_on=[maphashcol, 'created'], right_on=[maphashcol, datecol])

    return combined_df

In [14]:
# Function to combine data from various sources
def combineData(noaa_df, epa_df, bay_ts_df, month, day, yr):
    """Join sensor, NOAA and EPA data for one day and write the result to parquet.

    A 'tslatlonhash' column is added to bay_ts_df, each sensor location is
    mapped to its nearest NOAA and EPA location via mapLatLon, helper columns
    are dropped, and the remaining columns are renamed positionally and put in
    their final order. Nothing is returned.
    """
    helper_cols = ['tslatlonhash', 'asoslatlonhash', 'epalatlonhash', 'rec_length','num_fields', 'datetime', 'utc', 'parameter']

    # Names assigned by position to the columns left after the drop
    positional_names = ['0_3um', '0_5um', '1_0um', '2_5um', '5_0um', '10_0um', 'pm1_0', 'pm10_0', 'created', 'pm1_0_atm', 'pm2_5_atm', 'pm10_0_atm', 'uptime','rssi', 'temperature', 'humidity', 'pm2_5_cf_1', 
                        'device_loc_typ','is_owner', 'sensor_id', 'sensor_name', 'parent_id', 'lat', 'lon','thingspeak_primary_id', 'thingspeak_primary_id_read_key','thingspeak_secondary_id', 
                        'thingspeak_secondary_id_read_key', 'city', 'county', 'zipcode', 'created_at', 'year', 'month', 'day', 'hour', 'minute', 'a_h', 'high_reading_flag', 'hidden', 'wban_number', 
                        'call_sign', 'call_sign2', 'interval', 'call_sign3', 'zulu_time', 'report_modifier', 'wind_data', 'wind_direction', 'wind_speed', 'gusts','gust_speed', 'variable_winds', 'variable_wind_info', 
                        'sys_maint_reqd', 'agency_name', 'aqi', 'category', 'epa_pm25_unit', 'epa_pm25_value', 'full_aqs_code', 'intl_aqs_code', 'raw_concentration', 'site_name']

    # Column order of the saved file
    final_order = ['0_3um', '0_5um', '1_0um', '2_5um', '5_0um', '10_0um', 'pm1_0','pm10_0', 'created', 'pm1_0_atm', 'pm2_5_atm', 'pm10_0_atm', 'uptime','rssi', 
                   'temperature', 'humidity', 'pm2_5_cf_1', 'device_loc_typ', 'is_owner', 'sensor_id', 'sensor_name', 'parent_id','lat', 'lon',  'thingspeak_primary_id', 
                   'thingspeak_primary_id_read_key', 'thingspeak_secondary_id', 'thingspeak_secondary_id_read_key', 'a_h', 'high_reading_flag', 'hidden',
                   'city', 'county', 'zipcode', 'created_at', 'year', 'month', 'day', 'hour', 'minute', 'wban_number', 'call_sign', 'call_sign2', 'interval', 
                   'call_sign3', 'zulu_time', 'report_modifier', 'wind_data', 'wind_direction', 'wind_speed', 'gusts', 'gust_speed', 'variable_winds', 'variable_wind_info', 
                   'sys_maint_reqd', 'epa_pm25_unit', 'epa_pm25_value', 'raw_concentration', 'aqi', 'category', 'site_name', 'agency_name', 'full_aqs_code', 'intl_aqs_code']

    # Hash every sensor record by its coordinates
    bay_ts_df['tslatlonhash'] = bay_ts_df.apply(createHashKey, axis=1)

    # Distinct sensor coordinates, indexed by their hash
    ts_latlon_df = bay_ts_df[['tslatlonhash','lat','lon']].drop_duplicates().set_index('tslatlonhash')

    # Attach the nearest NOAA (asos) location's data, then the nearest EPA location's data
    with_noaa_df = mapLatLon(bay_ts_df, ts_latlon_df, noaa_df, 'asoslatlonhash', 'datetime')
    combined_df = mapLatLon(with_noaa_df, ts_latlon_df, epa_df, 'epalatlonhash', 'created')

    # Drop unwanted columns, then rename and reorder what is left
    combined_df = combined_df.drop(columns=helper_cols)
    combined_df.columns = positional_names
    combined_df = combined_df[final_order]

    # Write to file
    out_path = "{0}/combined_interpolated/20{3}{1}{2:02}.parquet".format(datafolder, month, day, yr)
    write(out_path, combined_df,compression='GZIP')

In [15]:
# Function for loading historical data
def loadHistory(month, monthname, monthyear, startindex, endindex, dateint, yr):
    """Build and save the combined daily datasets for a range of days in a month.

    Args:
        month: Two-character month text (e.g. '05').
        monthname: Month label used in the ThingSpeak file names (e.g. 'may').
        monthyear: Label used for the monthly NOAA parquet file (e.g. 'May2019').
        startindex: First day of month to process.
        endindex: Day of month at which to stop (exclusive).
        dateint: Integer key of the month, passed on to getEPADailyData.
        yr: Two-character year suffix (e.g. '19').

    A day that fails is reported and skipped; processing continues with the
    next day.
    """
    # Create dataframe for existing addresses
    address_df = pd.read_parquet("{}/purpleair/address_latlon.parquet".format(datafolder))

    # Since we don't have purple air data prior to 09/14, we are using the 09/14 dataset
    # to get a list of the sensors and corresponding location
    bay_purple_df = pd.read_parquet("{}/purpleair/dailyfiltered/20190914.parquet".format(datafolder))

    # Get lat lon info from purple air dataset into a separate dataframe
    sensor_cols = ['device_loc_typ', 'is_owner', 'sensor_id', 'sensor_name',  'parent_id', 'lat', 'lon', 'thingspeak_primary_id', 'thingspeak_primary_id_read_key', 'thingspeak_secondary_id',
                   'thingspeak_secondary_id_read_key', 'sensorhash']
    bay_purple_latlon_df = bay_purple_df[sensor_cols].drop_duplicates()

    # Get noaa data for the entire month
    bay_noaa_df = getNOAAData(month, monthyear, yr)

    # Get historical epa data
    epa_df = getEPAHistData()

    for i in range(startindex, endindex):
        try:
            # Get thingspeak data
            bay_ts_df = getThingspeakData(bay_purple_df, month, monthname, i, yr)

            # Merge purple air data
            bay_ts_df = mergePurpleAir(bay_purple_latlon_df, bay_ts_df, address_df, month, i, yr)

            # Get noaa data
            dly_noaa_df = getDailyNOAA(bay_noaa_df, month, i, yr)

            # Get epa data
            int_epa_df = getEPADailyData(dateint, i, month, epa_df, yr)

            # Combine data and save to file
            combineData(dly_noaa_df, int_epa_df, bay_ts_df, month, i, yr)
        except Exception as err:
            # Report the day that actually failed (i, not startindex) and why,
            # then move on to the next day
            print(month, i, repr(err))
            continue

In [None]:
# Load May 2019, days 30-31 (the end index is exclusive); '19' is the required year suffix
loadHistory('05', 'may', 'May2019', 30, 32, 201905000000, '19')
# Errors - 06, 19,

In [None]:
# Load February 2019, days 6-28 (the end index is exclusive); '19' is the required year suffix
loadHistory('02', 'feb', 'Feb2019', 6, 29, 201902000000, '19')
# Errors - 05

In [None]:
# Load April 2019, days 21-30 (endindex is exclusive)
loadHistory(month='04', monthname='apr', monthyear='Apr2019',
            startindex=21, endindex=31, dateint=201904000000, yr='19')
# NEED TO REPROCESS WHOLE MONTH

In [16]:
# Load December 2018, days 1-31 (endindex is exclusive)
loadHistory(month='12', monthname='dec', monthyear='Dec2018',
            startindex=1, endindex=32, dateint=201812000000, yr='18')
# Errors - 1,2

OUT OF DATA AT FIELD 8
['93210KOVE', 'OVE20181212002505212/12/18', '00:25:31', '5-MIN', 'KOVE', '120825Z', 'AUTO', '090090056']
For 01, Of the 593 requests, 377 errored out.
Of the 593 requests, 363 errored out.
For 02, Of the 593 requests, 372 errored out.
Of the 593 requests, 353 errored out.
For 03, Of the 593 requests, 372 errored out.
Of the 593 requests, 353 errored out.
For 04, Of the 593 requests, 376 errored out.
Of the 593 requests, 355 errored out.
For 05, Of the 593 requests, 361 errored out.
Of the 593 requests, 337 errored out.
For 06, Of the 593 requests, 345 errored out.
Of the 593 requests, 320 errored out.
For 07, Of the 593 requests, 356 errored out.
Of the 593 requests, 331 errored out.
For 08, Of the 593 requests, 362 errored out.
Of the 593 requests, 337 errored out.
For 09, Of the 593 requests, 356 errored out.
Of the 593 requests, 331 errored out.
For 10, Of the 593 requests, 355 errored out.
Of the 593 requests, 331 errored out.
For 11, Of the 593 requests, 331

In [16]:
# Load September 2018, days 1-30 (endindex is exclusive)
loadHistory(month='09', monthname='sep', monthyear='Sep2018',
            startindex=1, endindex=31, dateint=201809000000, yr='18')
# Errors - 

OUT OF DATA AT FIELD 8
['93210KOVE', 'OVE20180929001005709/29/18', '00:10:31', '5-MIN', 'KOVE', '290810Z', 'AUTO', '15016T01610094']
For 01, Of the 593 requests, 528 errored out.
Of the 593 requests, 525 errored out.
For 02, Of the 593 requests, 531 errored out.
Of the 593 requests, 531 errored out.
For 03, Of the 593 requests, 529 errored out.
Of the 593 requests, 529 errored out.
For 04, Of the 593 requests, 528 errored out.
Of the 593 requests, 527 errored out.
For 05, Of the 593 requests, 529 errored out.
Of the 593 requests, 529 errored out.
For 06, Of the 593 requests, 531 errored out.
Of the 593 requests, 531 errored out.
For 07, Of the 593 requests, 529 errored out.
Of the 593 requests, 529 errored out.
For 08, Of the 593 requests, 532 errored out.
Of the 593 requests, 531 errored out.
For 09, Of the 593 requests, 532 errored out.
Of the 593 requests, 531 errored out.
For 10, Of the 593 requests, 530 errored out.
Of the 593 requests, 529 errored out.
For 11, Of the 593 requests