In [1]:
import requests
import json
from datetime import datetime, timedelta, timezone
import pandas as pd

In [98]:
from datetime import datetime, timedelta

def generate_urls(base_url, start_date, end_date, step_days):
    """Build one query URL per consecutive time window.

    Windows begin at ``start_date`` and advance by ``step_days`` days. A URL is
    produced for every window whose start is not later than ``end_date``. The
    final window is not clamped, so its ``date_to`` may lie past ``end_date``.
    Each window's ``date_to`` equals the following window's ``date_from``.

    Args:
        base_url: URL that already carries a query string; the ``date_from``
            and ``date_to`` parameters are appended with ``&``.
        start_date: ``datetime`` at which the first window starts.
        end_date: ``datetime`` after which no further window is started.
        step_days: Length of each window in days; must be greater than zero.

    Returns:
        list[str]: The URLs in chronological order. Empty when ``start_date``
        is later than ``end_date``.

    Raises:
        ValueError: If ``step_days`` does not describe a positive duration
            (the loop below would otherwise never terminate).
    """
    # Function-scope import so the cell does not depend on another cell's imports.
    from urllib.parse import quote

    step = timedelta(days=step_days)
    if step <= timedelta(0):
        raise ValueError(f"step_days must be greater than zero, got {step_days!r}")

    date_format = "%Y-%m-%dT%H:%M:%S%z"

    def _format(moment):
        # For timezone-aware datetimes %z yields an offset such as "+0000".
        # A raw "+" in a query string is decoded as a space, so it is
        # percent-encoded here. ":" is kept literal so that URLs built from
        # naive datetimes are unchanged.
        return quote(moment.strftime(date_format), safe=":")

    urls = []
    current_date = start_date
    while current_date <= end_date:
        next_date = current_date + step
        urls.append(
            f"{base_url}&date_from={_format(current_date)}&date_to={_format(next_date)}"
        )
        current_date = next_date  # move to the next interval

    return urls

# Measurements query for location 63272. One pair of parameters is requested
# per run; the commented-out alternatives select the other pairs.
base_url = "https://api.openaq.org/v2/measurements?location_id=63272&parameter=um010&parameter=pm1&limit=10000"
#base_url = "https://api.openaq.org/v2/measurements?location_id=63272&parameter=pm25&parameter=pm10&limit=10000"
#base_url = "https://api.openaq.org/v2/measurements?location_id=63272&parameter=um100&parameter=um025&limit=10000"

# Requested time range (naive datetimes) and the length of each query window.
step_days = 4
start_date = datetime(year=2021, month=12, day=31, hour=23)
end_date = datetime(year=2023, month=1, day=1, hour=23)

urls = generate_urls(
    base_url=base_url,
    start_date=start_date,
    end_date=end_date,
    step_days=step_days,
)

In [107]:
def get_data_and_save(urls, output_file, start_index=0, timeout=60):
    """Download every URL and write the combined results to one JSON file.

    Each response is expected to be a JSON object with a ``"results"`` list.
    The lists are concatenated in URL order and written to ``output_file`` as
    ``{"results": [...]}``. A URL whose request fails, whose body is not valid
    JSON, or whose payload has no ``"results"`` key is reported on stdout and
    skipped, so one bad window does not discard what was already downloaded.

    Args:
        urls: Sequence of URLs to fetch.
        output_file: Path of the JSON file to (over)write.
        start_index: Index of the first URL to fetch. Defaults to 0 (all
            URLs); pass a larger value to resume an interrupted download.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. The result is the file written to ``output_file``.
    """
    accumulated_data = []

    for url in urls[start_index:]:
        try:
            response = requests.get(url, timeout=timeout)
            data = response.json()
        except (requests.RequestException, ValueError) as exc:
            # ValueError covers bodies that are not valid JSON.
            print(f"Error: request failed for {url}: {exc}")
            continue

        # A membership test never raises KeyError, so the missing-key case
        # has to be reported from the else branch.
        if isinstance(data, dict) and "results" in data:
            accumulated_data.extend(data["results"])
        else:
            print(f"Error: 'results' key not found in the {url}")

    # Save as JSON
    output = {"results": accumulated_data}
    with open(output_file, "w") as file:
        json.dump(output, file)

# File that receives the combined download.
output_file = "air_quality_data.json"

get_data_and_save(urls=urls, output_file=output_file)


In [108]:
# Read the downloaded measurements back from disk.
with open("air_quality_data.json", "r") as json_file:
    data = json.load(json_file)

# Disabled: append the results of a second download file to `data` and save
# the combined set under a new name.
#with open("air_quality_data_1.json") as file:
#    data1 = json.load(file)
#
#data['results'].extend(data1['results'])
#
#with open('air_quality_um010_pm1.json', 'w') as f:
#    json.dump(data, f)

In [109]:
# Long table: one row per measurement. Each "date" entry is a mapping whose
# "local" value is parsed into the new "datetime" column. Rows are ordered by
# that column and only the first row per (datetime, parameter) pair is kept.
df = (
    pd.DataFrame(data["results"])
    .assign(
        datetime=lambda frame: pd.to_datetime(
            frame["date"].apply(lambda stamp: stamp["local"])
        )
    )
    .sort_values("datetime")
    .drop_duplicates(subset=["datetime", "parameter"])
)

# Wide table: one row per timestamp, one column per parameter.
df_wide = df.pivot(index="datetime", columns="parameter", values="value")
df_wide.head(5)


325324

In [113]:
# Average the pm1 and um010 readings within each one-hour bin, then store the
# hourly table as CSV.
df_hourly_mean = (
    df_wide.loc[:, ['pm1', 'um010']]
    .resample('1H')
    .mean()
)
df_hourly_mean.to_csv('air_quality_um10_pm1.csv')
df_hourly_mean.head(10)


parameter,pm1,um010
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-12-31 22:00:00-01:00,11.111111,0.926667
2021-12-31 23:00:00-01:00,9.154545,0.780909
2022-01-01 00:00:00-01:00,7.24,0.634
2022-01-01 01:00:00-01:00,6.190909,0.601818
2022-01-01 02:00:00-01:00,5.083333,0.455833
2022-01-01 03:00:00-01:00,4.828571,0.435714
2022-01-01 04:00:00-01:00,4.71,0.406
2022-01-01 05:00:00-01:00,4.523077,0.411538
2022-01-01 06:00:00-01:00,3.958333,0.3725
2022-01-01 07:00:00-01:00,3.390909,0.328182


In [119]:
# Hourly CSV tables, one per pair of parameters (presumably produced by
# running the cells above once for each parameter pair).
df1 = pd.read_csv('air_quality_um100_um025.csv')
df2 = pd.read_csv('air_quality_pm10_pm25.csv')
df3 = pd.read_csv('air_quality_um10_pm1.csv')

# Join the three tables on their shared datetime column; with the default
# join type only timestamps present in all three survive.
merged_df = df1.merge(df2, on='datetime')
final_df = merged_df.merge(df3, on='datetime')

# The CSV round trip leaves datetime as text, so parse it before filtering.
final_df['datetime'] = pd.to_datetime(final_df['datetime'])

# Keep the rows that fall in calendar year 2022 and write them out.
filtered_df = final_df.loc[lambda frame: frame['datetime'].dt.year == 2022]
filtered_df.to_csv('air_quality_full.csv')
filtered_df.head(10)


Unnamed: 0,datetime,um100,um025,pm10,pm25,pm1,um010
2,2022-01-01 00:00:00-01:00,0.0,0.049,12.86,11.46,7.24,0.634
3,2022-01-01 01:00:00-01:00,0.000909,0.048182,11.936364,10.145455,6.190909,0.601818
4,2022-01-01 02:00:00-01:00,0.0,0.036667,9.35,8.041667,5.083333,0.455833
5,2022-01-01 03:00:00-01:00,0.0,0.037143,9.114286,7.628571,4.828571,0.435714
6,2022-01-01 04:00:00-01:00,0.0,0.027,8.49,7.31,4.71,0.406
7,2022-01-01 05:00:00-01:00,0.0,0.030769,8.6,7.315385,4.523077,0.411538
8,2022-01-01 06:00:00-01:00,0.0,0.030833,7.825,6.466667,3.958333,0.3725
9,2022-01-01 07:00:00-01:00,0.0,0.023636,6.4,5.336364,3.390909,0.328182
10,2022-01-01 08:00:00-01:00,0.0,0.017778,5.7,4.811111,3.222222,0.287778
11,2022-01-01 09:00:00-01:00,0.0,0.01375,4.2,3.675,2.4625,0.2175
