# Data Import & Cleaning

In [1]:
# import dependencies
import json
import pprint
from pathlib import Path

import pandas as pd
import requests

from api_keys import api_key

In [2]:
# api call: request all stations inside the given lat/lng bounds
url = f"https://api.waqi.info/v2/map/bounds?latlng=50.10319,-7.64133,60.15456,1.75159&networks=all&token={api_key}"

# a timeout stops the notebook from hanging forever if the service is unreachable
response = requests.get(url, timeout=30)

# fail here on HTTP errors instead of trying to parse an error page as JSON
response.raise_for_status()
data = response.json()

# the payload carries its own 'status' flag; later cells index data['data'] as a
# list of station records, so stop early if the API did not report success
if data.get('status') != 'ok':
    raise RuntimeError(f"WAQI API request failed: {data.get('data')!r}")

In [3]:
# display the raw API response to inspect its structure:
# a dict with a 'status' flag and a 'data' list holding one record per station
data

{'status': 'ok',
 'data': [{'lat': 52.411563,
   'lon': -1.560228,
   'uid': 8913,
   'aqi': '13',
   'station': {'name': 'Coventry Allesley, United Kingdom',
    'time': '2023-08-09T03:00:00+09:00'}},
  {'lat': 53.54914,
   'lon': -2.638139,
   'uid': 3220,
   'aqi': '28',
   'station': {'name': 'Wigan Centre, United Kingdom',
    'time': '2023-08-09T03:00:00+09:00'}},
  {'lat': 56.124331,
   'lon': -3.141347,
   'uid': 5994,
   'aqi': '8',
   'station': {'name': 'Fife Kirkcaldy, United Kingdom',
    'time': '2023-08-09T01:00:00+09:00'}},
  {'lat': 55.97671,
   'lon': -3.59731,
   'uid': 6026,
   'aqi': '10',
   'station': {'name': 'West Lothian Linlithgow High Street 2, United Kingdom',
    'time': '2023-08-09T01:00:00+09:00'}},
  {'lat': 55.938253,
   'lon': -4.317753,
   'uid': 5981,
   'aqi': '10',
   'station': {'name': 'East Dunbartonshire Milngavie , United Kingdom',
    'time': '2023-08-09T03:00:00+09:00'}},
  {'lat': 53.285833000178,
   'lon': -6.1319440004234,
   'uid': 1338

In [4]:
# count number of datapoints (station records in the response's 'data' list)
len(data['data'])

159

In [5]:
# build a dataframe with one row per station record in the response
station_records = data['data']
df = pd.DataFrame(station_records)
df

Unnamed: 0,lat,lon,uid,aqi,station
0,52.411563,-1.560228,8913,13,"{'name': 'Coventry Allesley, United Kingdom', ..."
1,53.549140,-2.638139,3220,28,"{'name': 'Wigan Centre, United Kingdom', 'time..."
2,56.124331,-3.141347,5994,8,"{'name': 'Fife Kirkcaldy, United Kingdom', 'ti..."
3,55.976710,-3.597310,6026,10,{'name': 'West Lothian Linlithgow High Street ...
4,55.938253,-4.317753,5981,10,"{'name': 'East Dunbartonshire Milngavie , Unit..."
...,...,...,...,...,...
154,53.370000,-6.588000,-92452,-,"{'name': 'Carton Court', 'time': '2023-08-09T0..."
155,51.425286,-0.345606,3195,13,"{'name': 'London Teddington Bushy Park, United..."
156,51.453090,-0.944067,3206,15,"{'name': 'Reading New Town, United Kingdom', '..."
157,55.945589,-3.182186,3176,22,"{'name': 'Edinburgh St Leonards, United Kingdo..."


In [7]:
# normalise the 'station' column and expand it into separate columns.
# Guarded so the cell can be re-run safely: once 'station' has been expanded
# and dropped, running it again is a no-op instead of raising a KeyError.
if 'station' in df.columns:
    # build the expanded frame from a plain list and reuse df's index, so the
    # concat below pairs rows correctly even if df's index is not 0..n-1
    normalised = pd.json_normalize(df['station'].tolist())
    normalised.index = df.index

    # concatenate the normalised data with the original dataframe, minus the
    # original 'station' column (drop returns a new frame rather than mutating df)
    df = pd.concat([df.drop(columns='station'), normalised], axis=1)

In [8]:
# preview the dataframe now that 'station' has been expanded into 'name' and 'time'
df

Unnamed: 0,lat,lon,uid,aqi,name,time
0,52.411563,-1.560228,8913,13,"Coventry Allesley, United Kingdom",2023-08-09T03:00:00+09:00
1,53.549140,-2.638139,3220,28,"Wigan Centre, United Kingdom",2023-08-09T03:00:00+09:00
2,56.124331,-3.141347,5994,8,"Fife Kirkcaldy, United Kingdom",2023-08-09T01:00:00+09:00
3,55.976710,-3.597310,6026,10,"West Lothian Linlithgow High Street 2, United ...",2023-08-09T01:00:00+09:00
4,55.938253,-4.317753,5981,10,"East Dunbartonshire Milngavie , United Kingdom",2023-08-09T03:00:00+09:00
...,...,...,...,...,...,...
154,53.370000,-6.588000,-92452,-,Carton Court,2023-08-09T04:50:57+09:00
155,51.425286,-0.345606,3195,13,"London Teddington Bushy Park, United Kingdom",2023-08-09T03:00:00+09:00
156,51.453090,-0.944067,3206,15,"Reading New Town, United Kingdom",2023-08-09T03:00:00+09:00
157,55.945589,-3.182186,3176,22,"Edinburgh St Leonards, United Kingdom",2023-08-09T03:00:00+09:00


In [9]:
# split the ISO-8601 'time' column into 'date' and 'time'.
# Guarded so that re-running the cell does not try to split already-split values.
if 'date' not in df.columns:
    # capture the date and HH:MM:SS parts explicitly instead of slicing a fixed
    # number of characters off the end, so the result does not depend on the
    # UTC offset being written as '+HH:MM'
    # NOTE(review): the values stay as wall-clock time in whatever offset the API
    # reported (+09:00 in the sample output); nothing is converted to UTC or to
    # the stations' local time - confirm that this is intended
    timestamp_parts = df['time'].str.extract(
        r'^(?P<date>\d{4}-\d{2}-\d{2})T(?P<time>\d{2}:\d{2}:\d{2})'
    )
    df['date'] = timestamp_parts['date']
    df['time'] = timestamp_parts['time']

In [10]:
# preview the dataframe with the timestamp split into separate 'time' and 'date' columns
df

Unnamed: 0,lat,lon,uid,aqi,name,time,date
0,52.411563,-1.560228,8913,13,"Coventry Allesley, United Kingdom",03:00:00,2023-08-09
1,53.549140,-2.638139,3220,28,"Wigan Centre, United Kingdom",03:00:00,2023-08-09
2,56.124331,-3.141347,5994,8,"Fife Kirkcaldy, United Kingdom",01:00:00,2023-08-09
3,55.976710,-3.597310,6026,10,"West Lothian Linlithgow High Street 2, United ...",01:00:00,2023-08-09
4,55.938253,-4.317753,5981,10,"East Dunbartonshire Milngavie , United Kingdom",03:00:00,2023-08-09
...,...,...,...,...,...,...,...
154,53.370000,-6.588000,-92452,-,Carton Court,04:50:57,2023-08-09
155,51.425286,-0.345606,3195,13,"London Teddington Bushy Park, United Kingdom",03:00:00,2023-08-09
156,51.453090,-0.944067,3206,15,"Reading New Town, United Kingdom",03:00:00,2023-08-09
157,55.945589,-3.182186,3176,22,"Edinburgh St Leonards, United Kingdom",03:00:00,2023-08-09


In [11]:
# check datatypes - aqi is currently an object type (strings such as '13' and '-'),
# so it has to be converted before it can be sorted or compared numerically
df.dtypes

lat     float64
lon     float64
uid       int64
aqi      object
name     object
time     object
date     object
dtype: object

In [12]:
# clean the 'aqi' column: values that cannot be parsed as numbers (e.g. the '-'
# placeholder) become NaN
aqi_numeric = pd.to_numeric(df['aqi'], errors='coerce')

# build the cleaned frame in one chain instead of mutating df in place:
# swap in the numeric column, drop rows without a reading, then store 'aqi'
# as an explicit 64-bit integer so the dtype is the same on every platform
df = (
    df.assign(aqi=aqi_numeric)
    .dropna(subset=['aqi'])
    .astype({'aqi': 'int64'})
)

# check datatypes - aqi is now integer so ready for analysis
df.dtypes

lat     float64
lon     float64
uid       int64
aqi       int64
name     object
time     object
date     object
dtype: object

In [13]:
# final df: cleaned station readings (rows without a numeric aqi removed)
df

Unnamed: 0,lat,lon,uid,aqi,name,time,date
0,52.411563,-1.560228,8913,13,"Coventry Allesley, United Kingdom",03:00:00,2023-08-09
1,53.549140,-2.638139,3220,28,"Wigan Centre, United Kingdom",03:00:00,2023-08-09
2,56.124331,-3.141347,5994,8,"Fife Kirkcaldy, United Kingdom",01:00:00,2023-08-09
3,55.976710,-3.597310,6026,10,"West Lothian Linlithgow High Street 2, United ...",01:00:00,2023-08-09
4,55.938253,-4.317753,5981,10,"East Dunbartonshire Milngavie , United Kingdom",03:00:00,2023-08-09
...,...,...,...,...,...,...,...
153,52.338033,-6.462139,13413,9,"Wexford Opera House, Ireland",02:00:00,2023-08-09
155,51.425286,-0.345606,3195,13,"London Teddington Bushy Park, United Kingdom",03:00:00,2023-08-09
156,51.453090,-0.944067,3206,15,"Reading New Town, United Kingdom",03:00:00,2023-08-09
157,55.945589,-3.182186,3176,22,"Edinburgh St Leonards, United Kingdom",03:00:00,2023-08-09


In [14]:
# export to csv file
# create the output folder first so the export also works on a fresh checkout
Path('data').mkdir(parents=True, exist_ok=True)
df.to_csv('data/air_pollution_data.csv', index=False)

In [15]:
# export to JSON file
# note: this writes the raw API response held in `data`, not the cleaned dataframe
# create the output folder first so the export also works on a fresh checkout
Path('data').mkdir(parents=True, exist_ok=True)
with open('data/air_pollution_data.json', 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, indent=4)

# Exploratory Analysis

In [18]:
# rank stations by aqi, lowest first (least polluted), and preview the first five
least_polluted = df.sort_values('aqi')
least_polluted.head(5)

Unnamed: 0,lat,lon,uid,aqi,name,time,date
54,51.2434,-1.1275,-125002,1,IBASINGS39,04:50:04,2023-08-09
58,54.00313,-6.392628,14211,1,"Dundalk, Co. Louth, Ireland",03:00:00,2023-08-09
99,52.896705,-7.332026,-196696,2,L5731,04:48:51,2023-08-09
112,51.074793,-4.041924,4903,2,"Barnstaple A39, United Kingdom",03:00:00,2023-08-09
32,52.090561,-7.620018,13381,3,"Dungarvan, Co. Waterford, Ireland",04:00:00,2023-08-09


In [19]:
# rank stations by aqi, highest first (most polluted), and preview the first five
most_polluted = df.sort_values('aqi', ascending=False)
most_polluted.head(5)

Unnamed: 0,lat,lon,uid,aqi,name,time,date
152,56.018951,-3.721013,5989,88,"Falkirk Grangemouth MC, United Kingdom",02:00:00,2023-08-09
66,53.33,-6.288,-354841,65,Aughavanagh Road,04:49:22,2023-08-09
133,51.379312,-0.281259,14619,57,"Kingston Upon Thames - Tolworth Broadway, Unit...",03:00:00,2023-08-09
6,51.54421,-0.175269,3166,50,"Camden Kerbside, United Kingdom",03:00:00,2023-08-09
89,51.579656,-3.766465,8063,46,"Port Talbot Prince Street 2, United Kingdom",04:00:00,2023-08-09
