### OpenAQ API

In [None]:
import requests
import json

def pp(data):
    print(json.dumps(data, indent=4, sort_keys=True))

# get reference grade sensors in the United States
url = "https://api.openaq.org/v2/locations?limit=4000&page=1&offset=0&sort=desc&country=US&order_by=lastUpdated&sensor_type=reference%20grade&dump_raw=false"

headers = {
    'accept': 'application/json',
    'X-API-Key': '111aa3992802fb389df494260d28256eb6988ea71562749d7dc0b9e9e9c4d152'
}

api_data = requests.get(url, headers=headers).json()

# get all location ids
location_ids = []
for i in api_data["results"]:
    if i['isMobile'] == False:
        location_ids.append(i["id"])

### AWS Data Download

In [1]:
import boto3
from botocore import UNSIGNED
from botocore.config import Config

import pandas as pd
import csv
import gzip
import os

import concurrent.futures
from tqdm import tqdm

s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

#----- Functions -----
def download_and_process_files(key):
    # download object from S3
    # print(f'Downloading {key}')
    obj = s3.get_object(Bucket=bucket_name, Key=key)

    # unzip and store in dataframe
    with gzip.open(obj['Body'], 'rb') as f:
        df = pd.read_csv(f)
    return df
# --------------------

# OpenAQ bucket name and base path
bucket_name = 'openaq-data-archive'
base_s3_path = 'records/csv.gz'

# location of CSV with location IDs
location_ids_csv = './location_ids.csv'

# parquet output path
parquet_path = 'G:/_data'

# number of workers used to download from S3 in parallel
num_workers = 200

location_ids = []

with open(location_ids_csv, 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        location_ids.append(row[0])

for i in range(len(location_ids)):

    location_id = location_ids[i]

    
    # check for existing local file
    if os.path.exists(f'{parquet_path}/{location_id}.parquet'):
        print(f'Location {location_id} [{i + 1}/{len(location_ids)}] : Already downloaded')
        continue

    # set S3 path
    s3_path = f'{base_s3_path}/locationid={location_id}'

    # init pagination variables
    continuation_token = None
    s3_files = []

    # paginate through the objects
    while True:
        if continuation_token:
            response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_path, ContinuationToken=continuation_token)
        else:
            response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_path)

        s3_files.extend([obj['Key'] for obj in response.get('Contents', [])])
        
        if 'NextContinuationToken' in response:
            continuation_token = response['NextContinuationToken']
        else:
            break

    # finished grabbing list of files for this location, print number of files
    print(f'Location {location_id} [{i + 1}/{len(location_ids)}] : {len(s3_files)} files')

    # download objects in parallel, using tqdm to track progress
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        dataframes = list(tqdm(executor.map(download_and_process_files, s3_files), total=len(s3_files)))

    # write Parquet file
    try:
        parquet_file = f'{parquet_path}/{location_id}.parquet'
        print(f'Writing Parquet file: {parquet_file}')
        pd.concat(dataframes, ignore_index=True).to_parquet(parquet_file, engine='pyarrow', index=False)
    except Exception as e:
        print(f'Error writing Parquet file: {e}')

Location 1414589 [1/3012] : Already downloaded
Location 1370216 [2/3012] : Already downloaded
Location 1254 [3/3012] : Already downloaded
Location 162 [4/3012] : Already downloaded
Location 992 [5/3012] : Already downloaded
Location 994 [6/3012] : Already downloaded
Location 996 [7/3012] : Already downloaded
Location 999 [8/3012] : Already downloaded
Location 1000 [9/3012] : Already downloaded
Location 1002 [10/3012] : Already downloaded
Location 1003 [11/3012] : Already downloaded
Location 1006 [12/3012] : Already downloaded
Location 1008 [13/3012] : Already downloaded
Location 1010 [14/3012] : Already downloaded
Location 1013 [15/3012] : Already downloaded
Location 1014 [16/3012] : Already downloaded
Location 1015 [17/3012] : Already downloaded
Location 1016 [18/3012] : Already downloaded
Location 1017 [19/3012] : Already downloaded
Location 1022 [20/3012] : Already downloaded
Location 1023 [21/3012] : Already downloaded
Location 1027 [22/3012] : Already downloaded
Location 1028 [23/

0it [00:00, ?it/s]

Writing Parquet file: G:/_data/2852032.parquet
Error writing Parquet file: No objects to concatenate
Location 173 [1351/3012] : Already downloaded
Location 186 [1352/3012] : Already downloaded
Location 187 [1353/3012] : Already downloaded
Location 188 [1354/3012] : Already downloaded
Location 191 [1355/3012] : Already downloaded
Location 193 [1356/3012] : Already downloaded
Location 195 [1357/3012] : Already downloaded
Location 199 [1358/3012] : Already downloaded
Location 213 [1359/3012] : Already downloaded
Location 215 [1360/3012] : Already downloaded
Location 216 [1361/3012] : Already downloaded
Location 218 [1362/3012] : Already downloaded
Location 220 [1363/3012] : Already downloaded
Location 221 [1364/3012] : Already downloaded
Location 222 [1365/3012] : Already downloaded
Location 223 [1366/3012] : Already downloaded
Location 227 [1367/3012] : Already downloaded
Location 229 [1368/3012] : Already downloaded
Location 230 [1369/3012] : Already downloaded
Location 231 [1370/3012] 




Location 277 [1387/3012] : 16869 files


100%|██████████| 16869/16869 [00:47<00:00, 352.63it/s]


Writing Parquet file: G:/_data/277.parquet
Location 278 [1388/3012] : 12163 files


100%|██████████| 12163/12163 [00:59<00:00, 205.44it/s]


Writing Parquet file: G:/_data/278.parquet
Location 279 [1389/3012] : 3870 files


100%|██████████| 3870/3870 [00:12<00:00, 308.30it/s] 


Writing Parquet file: G:/_data/279.parquet
Location 280 [1390/3012] : 4091 files


100%|██████████| 4091/4091 [00:11<00:00, 349.09it/s]


Writing Parquet file: G:/_data/280.parquet
Location 283 [1391/3012] : 9587 files


100%|██████████| 9587/9587 [00:26<00:00, 357.00it/s] 


Writing Parquet file: G:/_data/283.parquet
Location 284 [1392/3012] : 4708 files


100%|██████████| 4708/4708 [00:06<00:00, 753.84it/s]  


Writing Parquet file: G:/_data/284.parquet
Location 288 [1393/3012] : 9948 files


100%|██████████| 9948/9948 [00:27<00:00, 358.54it/s]


Writing Parquet file: G:/_data/288.parquet
Location 293 [1394/3012] : 10974 files


100%|██████████| 10974/10974 [00:29<00:00, 365.89it/s]


Writing Parquet file: G:/_data/293.parquet
Location 305 [1395/3012] : 20181 files


100%|██████████| 20181/20181 [00:13<00:00, 1511.72it/s] 


Writing Parquet file: G:/_data/305.parquet
Location 314 [1396/3012] : 18976 files


100%|██████████| 18976/18976 [00:51<00:00, 366.17it/s] 


Writing Parquet file: G:/_data/314.parquet
Location 315 [1397/3012] : 19570 files


100%|██████████| 19570/19570 [00:55<00:00, 350.03it/s] 


Writing Parquet file: G:/_data/315.parquet
Location 319 [1398/3012] : 17367 files


100%|██████████| 17367/17367 [00:35<00:00, 484.18it/s] 


Writing Parquet file: G:/_data/319.parquet
Location 321 [1399/3012] : 18761 files


 10%|█         | 1893/18761 [00:06<00:57, 291.45it/s]
