In [80]:
# Import relevant packages
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import openaq
import warnings
import datetime as dt
import requests

warnings.simplefilter('ignore')

%matplotlib inline

# Set major seaborn asthetics
sns.set("notebook", style='ticks', font_scale=1.0)

# Increase the quality of inline plots
mpl.rcParams['figure.dpi']= 500

print ("pandas v{}".format(pd.__version__))
print ("matplotlib v{}".format(mpl.__version__))
print ("seaborn v{}".format(sns.__version__))
print ("openaq v{}".format(openaq.__version__))

pandas v1.1.3
matplotlib v3.3.1
seaborn v0.11.0
openaq v1.1.0


In [79]:
# Create the OpenAQ client, asking for version 'v2' of the API
# Documentation: http://dhhagan.github.io/py-openaq/api.html
api = openaq.OpenAQ(version="v2")

In [141]:
# TODO
# 1. Drop the ID column, don't need to write that into the file
# 2. The dates are not the same for all cities. Look into that.
# 3. Maybe count the locations and add that count to the dataframe as well.
#    It's relevant to know how many measures are in the city.
# 4. Can we choose certain types of measures?



# Cities we want data for, each mapped to the country code used to
# disambiguate cities that share a name
cities_of_interest = {
    "Akureyri": "IS",
    "London": "GB",
    "Mexico City": "MX",
    "Newcastle": "GB",
    "Reykjavík": "IS",
}

# Pick the geocoding result that belongs to the city we actually mean
def filter_results_to_country(geoapi_response_data, city):
    '''
    Select the most populated geocoding result in the city's configured country.

    Input:
        geoapi_response_data: list of result dicts from the geocoding API
        city: name of a city (string); must be a key of cities_of_interest
    Output:
        dict with the keys 'latitude', 'longitude', 'elevation' and 'population'
        taken from the chosen result. A key the result does not have is
        returned as None.
    Raises:
        KeyError: if city is not a key of cities_of_interest
        IndexError: if no result lies in the configured country
    '''
    country_code = cities_of_interest[city]

    # Keep only the results in the expected country. Entries without a
    # country code are skipped instead of raising.
    in_country = [resp for resp in (geoapi_response_data or [])
                  if resp.get('country_code') == country_code]

    if not in_country:
        # Same exception type that indexing an empty list produced before,
        # but with a message that says what went wrong.
        raise IndexError(f'no geocoding result for {city!r} in country {country_code!r}')

    # If there are multiple cities with the same name, we choose the most
    # populated; a missing (or empty) population counts as 0
    most_populated = max(in_country, key=lambda resp: resp.get('population') or 0)

    # Relevant subset of the result dict. .get is used because the ranking
    # above already accepts results that lack 'population'.
    return {key: most_populated.get(key)
            for key in ('latitude', 'longitude', 'elevation', 'population')}

# Get the longitude and latitude for a certain city
def query_lat_long(city, timeout=10):
    '''
    Look a city up in the Open-Meteo geocoding API.

    Input:
        city: name of a city (string); must be a key of cities_of_interest,
            because the result is narrowed down with filter_results_to_country
        timeout: seconds to wait for the HTTP request before giving up
    Output:
        whatever filter_results_to_country returns for the API results
    Raises:
        requests.HTTPError: if the API answers with an error status
        KeyError: if the response contains no results for the city
    '''
    params_dict = {
        'name': city,
        #Default is 10 but since newcastle gives 9, we might hit the limit
        'count': 100
    }

    # Since the user gives us more than 3 chars, the api performs fuzzy matching. So we do not
    # need to worry abt spelling
    # A timeout is passed so a stalled connection cannot hang the notebook forever.
    resp = requests.get('https://geocoding-api.open-meteo.com/v1/search',
                        params=params_dict, timeout=timeout)

    # Fail on 4xx/5xx here instead of on a confusing JSON/key error below
    resp.raise_for_status()
    data = resp.json()

    # Previously a missing 'results' key surfaced as a bare KeyError('results');
    # keep the exception type but say which city was not found.
    if not data.get('results'):
        raise KeyError(f'results: the geocoding API returned no match for {city!r}')

    return filter_results_to_country(data['results'], city)



def measurement_to_csv(city,date_from,date_to):
    '''
    Fetch pm25 / pm10 measurements around a city and write the daily mean,
    max and min of each parameter to Data_measurements/<city>.csv.

    Input:
        city: name of a city (string); its coordinates come from query_lat_long
        date_from: measurements after this date will be calculated (anything
            pd.to_datetime accepts). It is moved forward to the earliest
            'firstUpdated' of the nearby locations when that is later.
        date_to: measurements until this date will be calculated (anything
            pd.to_datetime accepts)
    Raises:
        ValueError: if the (adjusted) date range is empty, or if the API
            returned no measurements for it
    '''
    import os  # local import: only needed to create the output folder

    date_from = pd.to_datetime(date_from)
    date_to = pd.to_datetime(date_to)

    # Get the longitude and latitude for the city
    location = query_lat_long(city)
    coords = f'{location["latitude"]},{location["longitude"]}'

    # Call the location api to check for the first date updated
    # NOTE(review): locations are searched within 10000 but measurements
    # within 5000 below - confirm the different radii are intended.
    locations = api.locations(coordinates = coords, radius = 10000, df = True)

    min_date = pd.to_datetime(locations["firstUpdated"].min())
    if min_date.tzinfo is not None:
        # Drop the timezone so the value can be compared with a tz-naive date_from
        min_date = min_date.tz_convert(None)

    if min_date > date_from:
        date_from = min_date

    if date_from >= date_to:
        raise ValueError(f'empty date range for {city!r}: {date_from} to {date_to}')

    # How we split the call between days to the API
    split_days = 30

    # Fetch the measurements one block of at most 30 days at a time.
    # Each block is cut off at date_to, so nothing past the requested range
    # is downloaded or aggregated.
    frames = []
    start = date_from
    while start < date_to:
        end = min(start + dt.timedelta(days = split_days), date_to)

        # Fetch the data from the measurement api
        # NOTE(review): limit = 30000 caps the rows per request; presumably a
        # busier block would be truncated - confirm against the API.
        df_api = api.measurements(coordinates = coords, radius = 5000, df = True,
                                  limit = 30000, parameter = ["pm25", "pm10"], value_from = 0,
                                  date_from = start, date_to = end)
        if not df_api.empty:
            frames.append(df_api)

        # The next block starts where this one ended
        start = end

    if not frames:
        raise ValueError(f'no measurements returned for {city!r} between {date_from} and {date_to}')

    # Combine all blocks once (DataFrame.append is deprecated and re-copies
    # the data on every call)
    df = pd.concat(frames)

    ## Data prepping

    # Turn the datetime index into a column and derive the calendar date
    df.index.name = 'Date.local'
    df = df.reset_index()
    df['Date'] = df['Date.local'].dt.strftime('%Y-%m-%d')
    df['value'] = df['value'].astype(float, errors = 'raise')

    # Calculate mean, max and min value for each date and parameter in one pass
    stats_long = df.groupby(['Date', 'parameter'])['value'].agg(['mean', 'max', 'min'])

    # Wide format: one row per date, columns <parameter>_<statistic>,
    # ordered as all means, then all maxes, then all mins
    ResultWide = pd.concat(
        [stats_long[stat].unstack('parameter').add_suffix(f'_{stat}')
         for stat in ('mean', 'max', 'min')],
        axis=1,
    )
    ResultWide.columns.name = None

    # Date becomes a regular column; the running row number is written as 'ID'
    ResultWide.index.name = 'Date'
    ResultWide = ResultWide.reset_index()
    ResultWide.index.name = 'ID'

    # Write to a file, creating the output folder on first use
    os.makedirs('Data_measurements', exist_ok=True)
    csv_path = f'Data_measurements/{city}.csv'
    ResultWide.to_csv(csv_path)
    
    
# Export the London measurements for 2020-01-01 through 2021-05-01
city = 'London'
date_from = pd.Timestamp('2020-01-01')
date_to = pd.Timestamp('2021-05-01')

measurement_to_csv(city=city, date_from=date_from, date_to=date_to)

In [None]:
# Call the function for a second date range.
# The dates are parsed to Timestamps (the function does date arithmetic on
# them), the stray trailing comma that turned date_from into a tuple is gone,
# and the function name is spelled as it is defined.
city = 'London'
date_from = pd.to_datetime('2020-06-01')
date_to = pd.to_datetime('2021-10-01')

measurement_to_csv(city,date_from,date_to)

In [2]:
# Read the csv written for a city back in and preview the first rows
city = 'London'
csv_path = f'Data_measurements/{city}.csv'
df_city = pd.read_csv(csv_path)
df_city.head()


Unnamed: 0,ID,Date,pm10_mean,pm25_mean,pm10_max,pm25_max,pm10_min,pm25_min
0,0,2021-06-23,12.045865,5.692241,27.0,13.6,6.0,3.0
1,1,2021-06-24,16.277833,9.444379,32.0,20.6,7.8,5.0
2,2,2021-06-25,8.691071,4.954861,19.5,12.4,4.0,2.0
3,3,2021-06-26,11.347642,6.922283,21.0,20.5,3.9,2.6
4,4,2021-06-27,14.990957,11.269939,29.0,33.4,6.0,5.0
