### Functions for getting data

In [11]:
import datetime
from datetime import date, timedelta
from os import path
import pandas as pd

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Raise pandas' display.max_columns limit to 500 so wide frames are not truncated.
pd.set_option('display.max_columns', 500)

In [None]:
# Class for throwing custom errors
class CustomError(Exception):
    """Exception carrying a human-readable message.

    Args:
        m: The message. It is stored on ``self.message`` and is what
            ``str(error)`` returns.
    """

    def __init__(self, m):
        # Forward the message to Exception so that ``args`` and ``repr()``
        # are populated like they are for any built-in exception.
        super().__init__(m)
        self.message = m

    def __str__(self):
        # str() keeps __str__ from raising TypeError on a non-string message.
        return str(self.message)

In [None]:
# Helper function for getting dates in a given range
def getDates(start, end):
    """Return every day from ``start`` to ``end`` (inclusive) as 'YYYYMMDD' strings.

    Args:
        start: First day, formatted as 'YYYY/MM/DD'.
        end: Last day, formatted as 'YYYY/MM/DD'.

    Returns:
        A list with one 'YYYYMMDD' string per day, in chronological order.
        The list is empty when ``end`` is earlier than ``start``.
    """
    input_format = "%Y/%m/%d"
    first_day = datetime.datetime.strptime(start, input_format).date()
    last_day = datetime.datetime.strptime(end, input_format).date()

    # Whole days between the two dates; +1 below makes the range inclusive.
    span_days = (last_day - first_day).days

    return [
        (first_day + timedelta(days=offset)).strftime("%Y%m%d")
        for offset in range(span_days + 1)
    ]

In [None]:
# Helper function for loading data into a dataframe
def loadDataframe(files):
    """Load the parquet files named by ``files`` into a single dataframe.

    Args:
        files: Iterable of file stems; ``".parquet"`` is appended to each
            stem to form the file name that is read.

    Returns:
        A dataframe with a fresh 0..n-1 index holding the rows of every file
        that exists. The columns listed in ``expected_columns`` always come
        first, in that order (filled with NaN when no file provides them),
        followed by any additional columns found in the files. When no file
        could be read, an empty dataframe with the expected columns is
        returned.

    Files that do not exist are reported with ``print`` and skipped.
    """
    expected_columns = ['0_3um', '0_5um', '1_0um', '2_5um', '5_0um', '10_0um', 'pm1_0','pm10_0', 'created', 'pm1_0_atm', 'pm2_5_atm', 'pm10_0_atm', 'uptime','rssi',
                        'temperature', 'humidity', 'pm2_5_cf_1', 'device_loc_typ', 'is_owner', 'sensor_id', 'sensor_name', 'parent_id','lat', 'lon',  'thingspeak_primary_id',
                        'thingspeak_primary_id_read_key', 'thingspeak_secondary_id', 'thingspeak_secondary_id_read_key', 'a_h', 'high_reading_flag', 'hidden',
                        'city', 'county', 'zipcode', 'created_at', 'year', 'month', 'day', 'hour', 'minute', 'wban_number', 'call_sign', 'call_sign2', 'interval',
                        'call_sign3', 'zulu_time', 'report_modifier', 'wind_data', 'wind_direction', 'wind_speed', 'gusts', 'gust_speed', 'variable_winds', 'variable_wind_info',
                        'sys_maint_reqd', 'epa_pm25_unit', 'epa_pm25_value', 'raw_concentration', 'aqi', 'category', 'site_name', 'agency_name', 'full_aqs_code', 'intl_aqs_code']

    # Collect the per-file frames and concatenate once at the end: calling
    # pd.concat inside the loop re-copies all previously loaded rows for
    # every file, and seeding it with an empty all-object frame can turn the
    # files' real dtypes into object.
    frames = []
    for file in files:
        file_name = "{}.parquet".format(file)
        if path.exists(file_name):
            frames.append(pd.read_parquet(file_name))
        else:
            print("File {} does not exist".format(file_name))

    if not frames:
        return pd.DataFrame(columns=expected_columns)

    combined = pd.concat(frames, ignore_index=True)

    # Keep the declared schema first so downstream code can rely on those
    # columns existing; columns only present in the files are kept after it.
    extra_columns = [col for col in combined.columns if col not in expected_columns]
    return combined.reindex(columns=expected_columns + extra_columns)

In [None]:
# Main function for getting data
def get_data(UP_LEFT, UP_RIGHT, DOWN_RIGHT, DOWN_LEFT, START_DATE, END_DATE, START_HOUR, END_HOUR):
    """Load the daily files for a date range and filter them by bounding box and hour.

    Args:
        UP_LEFT: Accepted as part of the signature but not used.
        UP_RIGHT: ``(lat, lon)`` pair supplying the maximum latitude and longitude.
        DOWN_RIGHT: Accepted as part of the signature but not used.
        DOWN_LEFT: ``(lat, lon)`` pair supplying the minimum latitude and longitude.
        START_DATE: First day to load, as a 'YYYY/MM/DD' string.
        END_DATE: Last day to load (inclusive), as a 'YYYY/MM/DD' string.
        START_HOUR: First hour to keep (inclusive); a number or numeric string.
        END_HOUR: Last hour to keep (inclusive); a number or numeric string.

    Returns:
        A dataframe, with a fresh 0..n-1 index, of the rows whose lat/lon lie
        strictly inside the bounding box and whose hour lies within
        ``[START_HOUR, END_HOUR]``. Returns ``None`` after printing the error
        message if the start date is after the end date or if loading or
        filtering raises.

    Raises:
        ValueError: If a date is not numeric once the '/' separators are removed.
    """
    # Create variables from parameters.
    # The dates are compared as YYYYMMDD integers, which is only chronological
    # when month and day are zero-padded.
    startfile = int(START_DATE.replace('/',''))
    endfile = int(END_DATE.replace('/',''))
    lat_min = DOWN_LEFT[0]
    lat_max = UP_RIGHT[0]
    lon_min = DOWN_LEFT[1]
    lon_max = UP_RIGHT[1]

    try:
        # Check Parameters
        if startfile > endfile:
            raise CustomError("INPUT ERROR: Start Date is greater than End Date")

        file_list = getDates(START_DATE, END_DATE)
        df = loadDataframe(file_list)

        # Filter data for input bounding box (boundaries excluded)
        in_box = ((df.lat > lat_min) & (df.lat < lat_max)
                  & (df.lon > lon_min) & (df.lon < lon_max))
        df = df[in_box]

        # Filter data for input hours (boundaries included). Compare
        # numerically: the hour column and the hour arguments may be strings,
        # and string comparison is lexicographic ('10' >= '9' is False).
        # The hour column itself is left untouched.
        hours = pd.to_numeric(df.hour, errors='coerce')
        in_hours = (hours >= float(START_HOUR)) & (hours <= float(END_HOUR))

        return df[in_hours].reset_index(drop=True)
    except Exception as e:
        # Best-effort contract kept for existing callers: report and return None.
        print(e)
        return None

### Use the Python file (`getData.py`) to get data

In [13]:
from getData import get_data

In [14]:
# Bounding-box corners as (latitude, longitude) pairs.
UP_LEFT, UP_RIGHT = (38.008050, -122.536985), (38.008050, -122.186437)
DOWN_LEFT, DOWN_RIGHT = (37.701933, -122.536985), (37.701933, -122.186437)

# Date range as 'YYYY/MM/DD' strings.
START_DATE, END_DATE = '2019/09/27', '2019/10/02'

# Hour range, given as strings.
START_HOUR, END_HOUR = '10', '18'

In [16]:
# Fetch the rows for the parameters defined above; the lines printed below
# list the daily files that were not found.
df = get_data(UP_LEFT, UP_RIGHT, DOWN_RIGHT, DOWN_LEFT, START_DATE, END_DATE, START_HOUR, END_HOUR)

File 20190928.parquet does not exist
File 20190929.parquet does not exist
File 20190930.parquet does not exist
File 20191001.parquet does not exist
File 20191002.parquet does not exist


In [17]:
# Number of rows returned.
len(df)

27612

In [18]:
# Sanity check: latitude/longitude extremes of the returned rows, to compare with the requested corners.
df.lat.min(), df.lat.max(), df.lon.min(), df.lon.max()

(37.706918, 37.98261, -122.535636, -122.188555)

In [19]:
# Sanity check: hour extremes of the returned rows (the output shows the hour column holds strings).
df.hour.min(), df.hour.max()

('10', '18')

In [20]:
# Preview the first rows.
df.head()

Unnamed: 0,0_3um,0_5um,10_0um,1_0um,2_5um,5_0um,a_h,agency_name,aqi,call_sign,call_sign2,call_sign3,category,city,county,created,created_at,day,device_loc_typ,epa_pm25_unit,epa_pm25_value,full_aqs_code,gust_speed,gusts,hidden,high_reading_flag,hour,humidity,interval,intl_aqs_code,is_owner,lat,lon,minute,month,parent_id,pm10_0,pm10_0_atm,pm1_0,pm1_0_atm,pm2_5_atm,pm2_5_cf_1,raw_concentration,report_modifier,rssi,sensor_id,sensor_name,site_name,sys_maint_reqd,temperature,uptime,variable_wind_info,variable_winds,wban_number,wind_data,wind_direction,wind_speed,year,zipcode,zulu_time
0,477.31,134.21,0.0,22.26,2.15,1.29,,San Francisco Bay Area AQMD,7.0,KSFO,SFO,KSFO,1.0,San Francisco,San Francisco County,201909271000,2019/09/27T10:00,27,outside,UG/M3,1.6,60750005.0,,0.0,False,,10,72.0,5-MIN,840060800000.0,0,37.72244,-122.439302,0,9,,3.41,3.41,1.84,1.84,2.68,2.68,-1.0,,-75.0,16939,#SAFQ11,San Francisco,0.0,69.0,468.0,,0.0,23234,1.0,290,11.0,2019,94112,271800Z
1,444.75,130.32,0.0,20.85,2.71,0.88,,San Francisco Bay Area AQMD,7.0,KSFO,SFO,KSFO,1.0,San Francisco,San Francisco County,201909271010,2019/09/27T10:10,27,outside,UG/M3,1.6,60750005.0,,0.0,False,,10,73.0,5-MIN,840060800000.0,0,37.72244,-122.439302,10,9,,3.25,3.25,1.47,1.47,2.69,2.69,-1.0,,-70.0,16939,#SAFQ11,San Francisco,0.0,69.0,478.0,,0.0,23234,1.0,280,12.0,2019,94112,271810Z
2,477.72,141.43,0.0,24.33,2.0,0.39,,San Francisco Bay Area AQMD,7.0,KSFO,SFO,KSFO,1.0,San Francisco,San Francisco County,201909271020,2019/09/27T10:20,27,outside,UG/M3,1.6,60750005.0,,0.0,False,,10,72.0,5-MIN,840060800000.0,0,37.72244,-122.439302,20,9,,3.28,3.28,1.85,1.85,3.04,3.04,-1.0,,-70.0,16939,#SAFQ11,San Francisco,0.0,69.0,488.0,,0.0,23234,1.0,270,13.0,2019,94112,271820Z
3,590.0,173.65,0.0,31.58,1.45,0.0,,San Francisco Bay Area AQMD,7.0,KSFO,SFO,KSFO,1.0,San Francisco,San Francisco County,201909271030,2019/09/27T10:30,27,outside,UG/M3,1.6,60750005.0,,0.0,False,,10,73.0,5-MIN,840060800000.0,0,37.72244,-122.439302,30,9,,3.83,3.83,2.26,2.26,3.83,3.83,-1.0,,-70.0,16939,#SAFQ11,San Francisco,0.0,69.0,498.0,,0.0,23234,1.0,250,11.0,2019,94112,271830Z
4,435.9,118.88,0.0,20.88,1.55,0.36,,San Francisco Bay Area AQMD,7.0,KSFO,SFO,KSFO,1.0,San Francisco,San Francisco County,201909271040,2019/09/27T10:40,27,outside,UG/M3,1.6,60750005.0,18.0,1.0,False,,10,73.0,5-MIN,840060800000.0,0,37.72244,-122.439302,40,9,,2.31,2.31,0.96,0.96,2.13,2.13,-1.0,,-72.0,16939,#SAFQ11,San Francisco,0.0,69.0,512.0,,0.0,23234,1.0,240,12.0,2019,94112,271840Z
