# Add AQI to data

This code will take a pandas df and return it with a new column added on, which is the AQI for the location and time the person was in that area.

In [5]:
# install HTTP requests package
%pip install requests

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 23.3.1
[notice] To update, run: C:\Users\Admin\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [32]:
# imports
import pandas as pd
import numpy as np
import datetime as dt
import requests as req
import matplotlib as plt
import json
import math
from collections import Counter

In [2]:
# constants
DATE_FORMAT = "%Y-%m-%d"

In [26]:
# load the 100,000 lines of my csv into a df (don't need that many for testing the script)
# show the first 10 rows, and the columns and their types
# remove the first row, since the first row has a string in all the rows
csv_data = pd.read_csv('../gsur23013.csv', nrows=400000, skiprows=[1])

# Strip the date from a string
# params
#   date: the str representation of the date
# returns
#   a datetime object
def GetDate(date: str) -> dt.datetime:
    day = str.split(date, " ")[0]
        
    # strip out the datetime object .
    date = dt.datetime.strptime(day, DATE_FORMAT)
    return date

# Applies the GetDate function to the entire given series
# params
#   date: the series to apply the date to
# returns
#   the changed date column
def GetDateSeries(date: pd.Series) -> pd.Series:
    return date.apply(GetDate)

# change date column to just the day
csv_data["stamp"] = GetDateSeries(csv_data["stamp"])

# show some info about the dataframe
print(csv_data.dtypes)
print(csv_data.head(10))

  csv_data = pd.read_csv('../gsur23013.csv', nrows=400000, skiprows=[1])


stamp                  datetime64[ns]
yaw                           float64
pitch                         float64
roll                          float64
rotation_rate_x               float64
rotation_rate_y               float64
rotation_rate_z               float64
user_acceleration_x           float64
user_acceleration_y           float64
user_acceleration_z           float64
latitude                      float64
longitude                     float64
altitude                      float64
course                        float64
speed                         float64
horizontal_accuracy           float64
vertical_accuracy             float64
battery_state                  object
user_activity_label            object
dtype: object
       stamp       yaw     pitch      roll  rotation_rate_x  rotation_rate_y  \
0 2023-08-28 -0.033335  0.040521  1.376791         0.000530         0.000567   
1 2023-08-28  0.166034  0.051798 -0.922997        -0.002101        -0.001292   
2 2023-08-28  0.911395 -

In [43]:
# Takes the given date and location and returns an AQI for that date and location. The AQI measure is *mainly* used with PM 2.5
# params
#   date: the date to retrieve the AQI for
#   latitude: the float representation of the latitude
#   longitude: the float representation of the longitude.
# returns
#   The AQI as a float
# Throws
#   Raises an HTTP Error if the request failed.
def GetAqi(date: dt.datetime, latitude: float, longitude: float) -> (float, dt.datetime):
    # this is the link to the historical API for AQI
    url = "https://www.airnowapi.org/aq/observation/latLong/historical?api_key={}&latitude={}&longitude={}&format=application/json&date={}T00-0000"
    
    # the api_key is stored in a config.json file, and kept off the github to avoid public exposure of your api_key for AirNow
    api_key = ""
    with open("../config.json", "r") as f:
        api_key = json.load(f)["api_key"]
    
    # formats the url with the right data, and sends it to AirNow
    response = req.get(url.format(api_key, latitude, longitude, date))
    
    # if the response is okay, it returns the AQI, if not, it raises an HTTP error
    if response.ok:
        # response returns a list of dictionaries (in this case, of length 1)
        # so access the first element, and retrieve the AQI from the dict
        # looks like: [{'DateObserved': '2023-08-28', 'HourObserved': 0, 'LocalTimeZone': 'PST', 'ReportingArea': 'Pullman', 'StateCode': 'WA', 'Latitude': 46.7245, 'Longitude': -117.1801, 'ParameterName': 'PM2.5', 'AQI': 38, 'Category': {'Number': 1, 'Name': 'Good'}}]
        return (response.json()[0].get("AQI", None), date)
    else:
        response.raise_for_status()

# Add an AQI column to a new dataframe
# params
#    df: the dataframe to add the column to
#    date_col: The title of the date column
#    lat_col: The title of the lat column
#    long_col: The title of the long column
# returns
# throws
def AddAQI(df: pd.DataFrame, date_col: str, lat_col: str, long_col: str) -> pd.DataFrame:
    # get the unique dates and the latitude and longitude for those dates
    unique_dates = UniqueDates(df, date_col, lat_col, long_col)
    
    # grab all the AQI's for that day
    AQIs = []
    for info in unique_dates:
        AQIs.append(GetAqi(info[0].date(), info[1], info[2]))
    
    # copy the df so we don't mess with the passed copy
    new_df = df.copy(deep=True)
    
    # get the counts of unique dates
    number_of_records = Counter(df[date_col].astype(str))
    
    # get a list of the aqis (should match the length of the dataframe)
    aqi_list = []
    for aqi, date in AQIs:
        aqi_list = aqi_list + [aqi for x in range(number_of_records[str(date)])]
        
    # set the AQI column
    new_df["AQI"] = aqi_list
    return new_df        
        
# Gets the unique dates, and their associated latitude and longitudes
# params
#    df: The dataframe to get the dates, lat, and long from
#    date_col: The title of the date column
#    lat_col: The title of the lat column
#    long_col: The title of the long column
# returns
#    a list of tuples, where the first spot is the date, the second the latitude, and the third the longitude
def UniqueDates(df: pd.DataFrame, date_col: str, lat_col: str, long_col: str) -> list:
    unique_dates = []
    last_date = dt.datetime.min.day
    for i in range(len(df)):
        date = df.loc[i, date_col]
        
        # if they aren't equal, add that unique date to the list
        #next day
        if (date.date() != last_date.date()):
            
            # get the lat and long
            lat = df.loc[i, lat_col]
            long = df.loc[i, long_col]
            
            # if they are nan, move to the next available lat and long
            if (math.isnan(lat) or math.isnan(long)):
                continue
                        
            unique_dates.append((date, lat, long))
        
        last_date = date
        
    return unique_dates

AddAQI(csv_data, "stamp")

396310
396310
3690
400000
       stamp       yaw     pitch      roll  rotation_rate_x  rotation_rate_y  \
0 2023-08-28 -0.033335  0.040521  1.376791         0.000530         0.000567   
1 2023-08-28  0.166034  0.051798 -0.922997        -0.002101        -0.001292   
2 2023-08-28  0.911395 -0.037857  0.041807        -0.001991        -0.000616   
3 2023-08-28  0.910933 -0.024698  0.026261         0.002564         0.000466   
4 2023-08-28  0.910843 -0.020273  0.021116         0.001488         0.002050   
5 2023-08-28  0.910789 -0.018194  0.018532         0.003596         0.004182   
6 2023-08-28  0.910780 -0.016943  0.017129         0.002520         0.002087   
7 2023-08-28  0.910708 -0.016079  0.016075         0.000699         0.000615   
8 2023-08-28  0.910740 -0.015435  0.015343         0.003869         0.001404   
9 2023-08-28  0.910719 -0.014973  0.014860        -0.000675         0.000058   

   rotation_rate_z  user_acceleration_x  user_acceleration_y  \
0         0.003458           