### Function for getting data

In [None]:
import datetime
from datetime import date, timedelta
from os import path
import pandas as pd

import boto3
import s3fs
from fastparquet import ParquetFile

import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Allow pandas to render up to 500 columns before truncating the display.
pd.set_option('display.max_columns', 500)

In [None]:
# Class for throwing custom errors
class CustomError(Exception):
    """Exception carrying a human-readable message for invalid input.

    Attributes:
        message: The value supplied at construction; ``str()`` of the
            exception returns its string form.
    """

    def __init__(self, m):
        # Forward the message to Exception so ``args`` and ``repr()`` carry
        # it instead of being left empty.
        super().__init__(m)
        self.message = m

    def __str__(self):
        # __str__ must return a str; coerce in case a non-string was passed.
        return str(self.message)

In [None]:
# Helper function for getting dates in a given range
def getDates(start, end):
    """Return every calendar day from ``start`` through ``end``, inclusive.

    Args:
        start: First day of the range, formatted ``YYYY/MM/DD``.
        end: Last day of the range, formatted ``YYYY/MM/DD``.

    Returns:
        list[str]: One ``YYYYMMDD`` string per day in ascending order.
        The list is empty when ``end`` falls before ``start``.
    """
    input_format = "%Y/%m/%d"
    first_day = datetime.datetime.strptime(start, input_format).date()
    last_day = datetime.datetime.strptime(end, input_format).date()

    # Number of whole days between the two endpoints (negative if reversed).
    span_days = (last_day - first_day).days

    return [
        (first_day + timedelta(days=offset)).strftime("%Y%m%d")
        for offset in range(span_days + 1)
    ]

In [None]:
# Helper function for loading data into a dataframe
def loadDataframe(files, s3_prefix='midscapstone-whos-polluting-my-air/CombinedDaily'):
    """Load the named daily parquet files from S3 into a single dataframe.

    Args:
        files: Iterable of file stems (no extension); each one is read from
            ``<s3_prefix>/<stem>.parquet``.
        s3_prefix: Bucket/key prefix that holds the parquet files. Defaults
            to the location this notebook has always read from.

    Returns:
        pandas.DataFrame: Rows of all files stacked in the order given, with
        a fresh 0..n-1 index. The columns listed below come first, in this
        order; any additional columns found in the files follow. When
        ``files`` is empty, an empty frame with just the listed columns is
        returned and S3 is not contacted.
    """
    expected_columns = [
        '0_3um', '0_5um', '1_0um', '2_5um', '5_0um', '10_0um', 'pm1_0', 'pm10_0', 'created',
        'pm1_0_atm', 'pm2_5_atm', 'pm10_0_atm', 'uptime', 'rssi',
        'temperature', 'humidity', 'pm2_5_cf_1', 'device_loc_typ', 'is_owner', 'sensor_id',
        'sensor_name', 'parent_id', 'lat', 'lon', 'thingspeak_primary_id',
        'thingspeak_primary_id_read_key', 'thingspeak_secondary_id',
        'thingspeak_secondary_id_read_key', 'a_h', 'high_reading_flag', 'hidden',
        'city', 'county', 'zipcode', 'created_at', 'year', 'month', 'day', 'hour', 'minute',
        'wban_number', 'call_sign', 'call_sign2', 'interval',
        'call_sign3', 'zulu_time', 'report_modifier', 'wind_data', 'wind_direction',
        'wind_speed', 'gusts', 'gust_speed', 'variable_winds', 'variable_wind_info',
        'sys_maint_reqd', 'epa_pm25_unit', 'epa_pm25_value', 'raw_concentration', 'aqi',
        'category', 'site_name', 'agency_name', 'full_aqs_code', 'intl_aqs_code',
    ]

    files = list(files)
    if not files:
        # Nothing to read: hand back the empty schema without touching S3.
        return pd.DataFrame(columns=expected_columns)

    s3 = s3fs.S3FileSystem()

    # Read every file first and concatenate once; re-concatenating the
    # accumulated frame inside the loop copies all prior rows each iteration.
    frames = [
        ParquetFile('{}/{}.parquet'.format(s3_prefix, filenm), open_with=s3.open).to_pandas()
        for filenm in files
    ]
    df = pd.concat(frames, ignore_index=True)

    # Put the declared columns first (adding any that are missing as NaN),
    # followed by whatever extra columns the files contained.
    known = set(expected_columns)
    extra_columns = [col for col in df.columns if col not in known]
    return df.reindex(columns=expected_columns + extra_columns)

In [None]:
# Main function for getting data
def get_data(UP_LEFT, UP_RIGHT, DOWN_RIGHT, DOWN_LEFT, START_DATE, END_DATE, START_HOUR, END_HOUR):
    """Load readings for a date range and filter them by bounding box and hour.

    Args:
        UP_LEFT: ``(lat, lon)`` of the upper-left corner. Accepted for
            interface symmetry; not used by the filter.
        UP_RIGHT: ``(lat, lon)`` of the upper-right corner; supplies the
            maximum latitude and longitude.
        DOWN_RIGHT: ``(lat, lon)`` of the lower-right corner. Accepted for
            interface symmetry; not used by the filter.
        DOWN_LEFT: ``(lat, lon)`` of the lower-left corner; supplies the
            minimum latitude and longitude.
        START_DATE: First day to load, formatted ``YYYY/MM/DD``.
        END_DATE: Last day to load (inclusive), formatted ``YYYY/MM/DD``.
        START_HOUR: First hour to keep (inclusive); an int or numeric string.
        END_HOUR: Last hour to keep (inclusive); an int or numeric string.

    Returns:
        pandas.DataFrame with a fresh 0..n-1 index containing the rows whose
        ``lat``/``lon`` lie strictly inside the box and whose ``hour`` is
        within ``[START_HOUR, END_HOUR]``. Returns ``None`` after printing
        the error message if the inputs are invalid or loading fails.
    """
    # Only two opposite corners are needed to describe the box.
    lat_min, lon_min = DOWN_LEFT[0], DOWN_LEFT[1]
    lat_max, lon_max = UP_RIGHT[0], UP_RIGHT[1]

    try:
        # Compare real dates rather than the digits of the string, so that
        # non-zero-padded inputs such as '2019/9/30' are ordered correctly.
        start_day = datetime.datetime.strptime(START_DATE, "%Y/%m/%d").date()
        end_day = datetime.datetime.strptime(END_DATE, "%Y/%m/%d").date()
        if start_day > end_day:
            raise CustomError("INPUT ERROR: Start Date is greater than End Date")

        first_hour = int(START_HOUR)
        last_hour = int(END_HOUR)

        df = loadDataframe(getDates(START_DATE, END_DATE))

        # Filter data for input bounding box (boundaries excluded).
        inside_box = (
            (df.lat > lat_min) & (df.lat < lat_max)
            & (df.lon > lon_min) & (df.lon < lon_max)
        )
        df = df[inside_box]

        # Filter data for input hours. Compare numerically: string comparison
        # would order '9' after '18'. Rows whose hour cannot be parsed become
        # NaN and are therefore excluded.
        hours = pd.to_numeric(df.hour, errors='coerce')
        df = df[(hours >= first_hour) & (hours <= last_hour)]

        return df.reset_index(drop=True)
    except Exception as e:
        # Existing contract: report the problem and return None, not raise.
        print(e)
        return None

### Use the `getData` Python module to get data

In [1]:
from getData import get_data

In [2]:
# Bounding-box corners as (latitude, longitude). The get_data defined above
# reads only DOWN_LEFT (minimum lat/lon) and UP_RIGHT (maximum lat/lon).
UP_LEFT = (38.008050, -122.536985)
UP_RIGHT = (38.008050, -122.186437)
DOWN_RIGHT = (37.701933, -122.186437)
DOWN_LEFT = (37.701933, -122.536985)
# Inclusive date range, formatted YYYY/MM/DD.
START_DATE = '2019/09/18'
END_DATE = '2019/09/21'
# Inclusive hour range, passed as strings.
START_HOUR = '10'
END_HOUR = '18'

In [3]:
df = get_data(UP_LEFT, UP_RIGHT, DOWN_RIGHT, DOWN_LEFT, START_DATE, END_DATE, START_HOUR, END_HOUR)

In [4]:
len(df)

112237

In [5]:
df.lat.min(), df.lat.max(), df.lon.min(), df.lon.max()

(37.706918, 37.98261, -122.535636, -122.188555)

In [6]:
df.hour.min(), df.hour.max()

('10', '18')

In [7]:
df.day.unique()

array(['18', '19', '20', '21'], dtype=object)

In [8]:
df.head()

Unnamed: 0,0_3um,0_5um,1_0um,2_5um,5_0um,10_0um,pm1_0,pm10_0,created,pm1_0_atm,...,sys_maint_reqd,epa_pm25_unit,epa_pm25_value,raw_concentration,aqi,category,site_name,agency_name,full_aqs_code,intl_aqs_code
0,431.41,128.38,21.32,5.18,1.94,0.58,1.32,4.08,201909181000,1.32,...,False,UG/M3,2.8,2.0,12,1,San Francisco,San Francisco Bay Area AQMD,60750005,840060750005
1,426.2,124.74,23.25,5.34,1.26,0.34,1.08,3.67,201909181010,1.08,...,False,UG/M3,2.8,2.0,12,1,San Francisco,San Francisco Bay Area AQMD,60750005,840060750005
2,347.96,102.4,18.71,3.5,1.88,0.94,0.78,3.07,201909181020,1.21,...,False,UG/M3,2.8,2.0,12,1,San Francisco,San Francisco Bay Area AQMD,60750005,840060750005
3,361.39,106.0,24.86,5.48,2.06,0.38,0.62,3.39,201909181030,0.62,...,False,UG/M3,2.8,2.0,12,1,San Francisco,San Francisco Bay Area AQMD,60750005,840060750005
4,333.17,95.73,16.6,5.03,2.29,1.2,0.91,3.64,201909181040,0.91,...,False,UG/M3,2.8,2.0,12,1,San Francisco,San Francisco Bay Area AQMD,60750005,840060750005
