In [1]:
import pandas as pd
import pandas_profiling
import numpy as np
import json
import datetime, time
from dateutil import tz
import ast
from matplotlib import pyplot as plt 
import seaborn as sns
import gmplot

import boto3
import s3fs
from fastparquet import ParquetFile, write

import warnings

# Suppress every warning for the remainder of the session.
warnings.simplefilter('ignore')

# Allow up to 500 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)

In [2]:
# Local data folder configuration.
# Set the variable 'datafolder' to the base data folder on your own machine,
# and comment out every data path except your own.
# Purple Air data is assumed to be in a subfolder called 'purpleair'.
# For example, if the base data folder is '/users/data', the Purple Air data
# should be in '/users/data/purpleair'.

# Angshuman's local path
datafolder = '/Users/apaul2/Documents/_Common/capstone/Project/data'

In [3]:
# The historical_PM25.csv file was downloaded locally for the initial merge; uncomment the next line to read that local copy.
# epa_df = pd.read_csv("{}/ambient/historical_PM25.csv".format(datafolder))

In [4]:
# Load the historical EPA PM2.5 CSV from S3 into a DataFrame.
bucket = "capstone-air-pollution"
file_name = "EPA/historical_PM25.csv"  # historical
# file_name = "EPA/201910_PM25.csv"  # current

# No credentials are passed explicitly to the client.
s3 = boto3.client("s3")
obj = s3.get_object(Bucket=bucket, Key=file_name)

# The response body is handed straight to read_csv; nothing is saved locally.
epa_df = pd.read_csv(obj["Body"])

In [4]:
# Relabel the columns positionally: the list needs one name per existing
# column, in the same left-to-right order as the loaded frame.
epa_df.columns = [
    'lat',
    'lon',
    'utc',
    'parameter',
    'epa_pm25_unit',
    'epa_pm25_value',
    'raw_concentration',
    'aqi',
    'category',
    'site_name',
    'agency_name',
    'full_aqs_code',
    'intl_aqs_code',
]

In [5]:
# Add an integer datetime key of the form YYYYMMDDHHMM, expressed in local
# (Pacific) time rather than UTC.
#
# The zone is pinned to 'America/Los_Angeles' instead of the zone of the machine
# running the notebook, so the key is the same wherever this cell is executed.
# NOTE(review): assumes every monitoring site is in US Pacific time (the rows
# previewed below are Bay Area sites) — confirm before loading other regions.
#
# The conversion is vectorised: parse the 'utc' strings as UTC timestamps,
# convert the zone, format, then cast the digit string to an integer.
epa_df['created'] = (
    pd.to_datetime(epa_df['utc'], format='%Y-%m-%d %H:%M:%S', utc=True)
    .dt.tz_convert('America/Los_Angeles')
    .dt.strftime('%Y%m%d%H%M')
    .astype('int64')
)

In [6]:
# Preview the first five rows, including the new 'created' key.
epa_df.head(n=5)

Unnamed: 0,lat,lon,utc,parameter,epa_pm25_unit,epa_pm25_value,raw_concentration,aqi,category,site_name,agency_name,full_aqs_code,intl_aqs_code,created
0,37.9722,-122.5189,2018-09-01 00:00:00,PM2.5,UG/M3,9.4,12.0,39,1,San Rafael,San Francisco Bay Area AQMD,60410001,840060410001,201808311700
1,37.7658,-122.3978,2018-09-01 00:00:00,PM2.5,UG/M3,6.2,6.0,26,1,San Francisco,San Francisco Bay Area AQMD,60750005,840060750005,201808311700
2,37.9604,-122.3571,2018-09-01 00:00:00,PM2.5,UG/M3,10.0,11.0,42,1,San Pablo - Rumrill,San Francisco Bay Area AQMD,60131004,840060131004,201808311700
3,37.864767,-122.302741,2018-09-01 00:00:00,PM2.5,UG/M3,2.7,4.0,11,1,Berkeley Aquatic Park,San Francisco Bay Area AQMD,60010013,840060010013,201808311700
4,37.8148,-122.282402,2018-09-01 00:00:00,PM2.5,UG/M3,11.0,9.0,46,1,Oakland West,San Francisco Bay Area AQMD,60010011,840060010011,201808311700


In [7]:
# Number of distinct UTC timestamps versus the total number of rows.
epa_df['utc'].nunique(), epa_df.shape[0]

(9481, 55603)

In [8]:
# Earliest and latest 'created' keys (YYYYMMDDHHMM) present in the data.
epa_df['created'].min(), epa_df['created'].max()

(201808311700, 201909301700)

In [23]:
# Split the September 2019 readings into one GZIP-compressed parquet file per
# day, written under '<datafolder>/ambient/daily/'.
#
# 'created' is an integer of the form YYYYMMDDHHMM, so adding day * 10000 to
# 201909000000 gives that day's 00:00 key.
dateint = 201909000000
for i in range(1, 31):
    start = dateint + i * 10000  # 00:00 of day i (inclusive)
    # Exclusive upper bound: the 00:00 key of the following day slot. Using the
    # next day's midnight itself (not midnight + 1 minute) keeps rows stamped
    # 00:00 out of the previous day's file, so no row lands in two daily files.
    end = start + 10000
    dly_epa_df = epa_df[(epa_df.created >= start) & (epa_df.created < end)]

    parquet_file = "{}/ambient/daily/epa_201909{:02}.parquet".format(datafolder, i)
    write(parquet_file, dly_epa_df, compression='GZIP')

In [24]:
# Sanity-check the frame left over from the final loop iteration: key range,
# number of distinct UTC timestamps, and row count.
dly_epa_df['created'].min(), dly_epa_df['created'].max(), dly_epa_df['utc'].nunique(), dly_epa_df.shape[0]

(201909300000, 201909301700, 18, 98)