In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import seaborn as sns


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under Kaggle's input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)

# Date-axis helpers for matplotlib time-series plots.
years, months = mdates.YearLocator(), mdates.MonthLocator()  # one tick per year / per month
years_fmt = mdates.DateFormatter('%Y-%m')  # formats tick labels as year-month, e.g. "2019-04"

In [None]:
# Load the Lambeth Bondway readings and give the pollutant and timestamp
# columns the names used throughout the rest of the notebook.
POLLUTION_CSV = '../input/london-air-quality-at-bondway-interchange-lambeth/AirQualityData_LambethBondway_2018-2023.csv'

pollution_df = (
    pd.read_csv(POLLUTION_CSV, parse_dates=True)
    .rename(columns={'Species': 'Pollutant', 'ReadingDateTime': 'DateTime'})
)
pollution_df

Each variable can be described as follows:
1. Site: The code for the site where the data was collected. This analysis focuses on one site in Vauxhall with the code LB5.
[Details for site LB5](https://www.londonair.org.uk/london/asp/publicdetails.asp?Site=LB5&View=Details&region=0)
2. Pollutant: The type of pollutant that is measured at this site.
* Nitrogen Dioxide (µg/m-3)
* PM10 Particulate (by BAMH) (µg/m-3)
3. The date and time that this measurement is valid for. The readings are taken hourly.
4. The value of the reading.
5. The units in which the values are measured. This data is measured in units of mass per unit volume. A concentration of 1 µg/m3 means that one cubic metre of air contains one microgram (10^-6 grams) of pollutant.
6. Whether the data has yet been ratified, or if it is provisional.

For the air quality standards, I have created a CSV using data from [here](https://www.eea.europa.eu/publications/status-of-air-quality-in-Europe-2022/europes-air-quality-status-2022/world-health-organization-who-air)

In [None]:
# Load the WHO air-quality guideline table. The first data row of the CSV
# holds the real column names, so it is promoted to the header and the
# remaining rows become the body of the frame.
aqg_temp = pd.read_csv('../input/london-air-quality-at-bondway-interchange-lambeth/WHO_AQG.csv', header=0)
header_row = aqg_temp.iloc[0]
aqg_df = pd.DataFrame(aqg_temp.values[1:], columns=header_row)

# Tag every guideline with the same unit string used for the readings.
aqg_df['Unit'] = "µg/m-3"

# Persist the cleaned table, then display it. The display must be the last
# expression in the cell, otherwise the notebook does not render it.
aqg_df.to_csv('/kaggle/working/aqg_df.csv')
aqg_df

# Data types
Checking what the data types are and if they need to be converted.


In [None]:
# Show the dtype pandas inferred for each column of the readings table.
pollution_df.dtypes

Formatting data for consistency

In [None]:
# Parse the timestamp strings (day-first, e.g. "31/12/2020 23:00") into datetimes.
pollution_df['DateTime'] = pd.to_datetime(pollution_df['DateTime'], format='%d/%m/%Y %H:%M')

# Index by timestamp as well as keeping it as a column: the index enables
# time-based grouping later, the column is used for filtering and plotting.
pollution_df = pollution_df.set_index(pollution_df['DateTime'])

# Use one consistent unit string for every reading.
pollution_df['Unit'] = "µg/m-3"

# Keep only the columns used by the analysis, in a fixed order.
pollution_df = pollution_df[['DateTime', 'Pollutant', 'Value', 'Unit']]

# Restrict to 2018-2020 around the congestion charge -> ULEZ change.
# The window is half-open (< 2021-01-01) so that every hour of 31 Dec 2020 is
# kept; an upper bound of "<= 2020-12-31 00:00:00" would drop 01:00-23:00 of
# that final day.
in_window = (pollution_df['DateTime'] >= '2018-01-01') & (pollution_df['DateTime'] < '2021-01-01')

# .copy() makes the filtered frame independent, so later column assignments
# do not operate on a slice of the unfiltered data.
pollution_df = pollution_df.loc[in_window].copy()

pollution_df

# Missing Data
Assessing how much data is missing for each column.

In [None]:
# Per-column tally of missing values, plus the same tally expressed as a
# percentage of the total row count (rounded to two decimals).
total_rows = len(pollution_df)
null_count = pollution_df.isna().sum()
null_percentage = (null_count / total_rows * 100).round(2)

print('Total null count:\n', null_count, '\n\n\nPercentage of values missing:\n', null_percentage)

Only 1.14% of the data is missing, which is below the acceptable 5% of missing data.

In [None]:
# Forward-fill missing readings separately for each pollutant. The table is in
# long format (both pollutants share the 'Value' column), so a plain ffill
# could carry a reading from one pollutant into a gap of the other.
# A gap at the very start of a pollutant's series has no earlier reading to
# copy and therefore stays NaN; the count below makes that visible.
pollution_df = pollution_df.assign(
    Value=pollution_df.groupby('Pollutant')['Value'].ffill()
)

# Re-count missing values to confirm the fill.
null_count = pollution_df.isnull().sum()
null_count

# Creation of pivot table
Pivoting the data gives one column per pollutant, so I can see the hourly readings for each pollutant side by side.

In [None]:
# Reshape to wide format: one row per DateTime, one column per Pollutant,
# with each cell holding the corresponding Value.
pivot_df = pollution_df.pivot(index='DateTime', columns='Pollutant', values= 'Value')
pivot_df
# Optional export of the pivoted table (currently disabled).
#pivot_df.to_csv('/kaggle/working/pollution_df.csv')

Plotting a chart to show average monthly values

In [None]:
# Mean reading per pollutant for each calendar month ("ME" = month-end bins).
monthly_avg = pivot_df.resample("ME").mean()

In [None]:
# Draw one line per pollutant column of the monthly means on a single wide axes.
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=monthly_avg, ax=ax)
ax.set_title("Average Monthly Measurements of Air Pollutants in Vauxhall 2018 – 2020")

# Vertical tick labels keep the dates readable on the x-axis.
plt.xticks(rotation='vertical')
ax.set(ylabel='Volume / µg/m-3', xlabel='Date')

Plotting the average monthly values of both pollutants shows that they follow a similar trend, which is what we would expect.

In [None]:
# Derive calendar attributes from each reading's timestamp: weekday name,
# month name and year.
timestamps = pollution_df['DateTime']
pollution_df = pollution_df.assign(
    Day=timestamps.dt.day_name(),
    Month=timestamps.dt.month_name(),
    Year=timestamps.dt.year,
)

# Pollution by Month

In [None]:
# Split the readings into one frame per pollutant.
NO2_df = pollution_df[pollution_df['Pollutant'] == 'NO2']
PM10_df = pollution_df[pollution_df['Pollutant'] == 'PM10']

# Two stacked panels (NO2 on top, PM10 below); each draws one line per year
# of the mean reading for every month name.
fig, ax = plt.subplots(2, 1, figsize=(20, 20))

for panel, (label, readings) in zip(ax, (('NO2', NO2_df), ('PM10', PM10_df))):
    sns.lineplot(
        x=readings['Month'],
        y=readings['Value'],
        hue='Year',
        data=readings,
        ax=panel,
        errorbar=None,
    )
    panel.set_title(f"{label} Concentrate by Month")
    panel.set(ylabel='Volume / µg/m-3', xlabel='Month')

There is a slight trend that shows that there was a decrease in both NO2 and PM10 concentrations in the summer months. It could be theorised that in winter months more people use cars to travel due to bad weather, thus increasing pollution. It is notable in 2020 that the levels started increasing overall in May rather than August/September as it was reported people were nervous about spreading Covid-19 and were travelling via personal transport to avoid it.\ \
The NO2 decreases the most over the course of three years, indicating that the LEZ could have a positive effect in reducing levels, but the same cannot be said for the PM10 levels.\ \
The WHO limits are intended for averages over the course of a day or year, rather than month, so it would be unsuitable to add these to the graphs.

In [None]:
# Split the readings into one frame per pollutant.
NO2_df = pollution_df[pollution_df['Pollutant'] == 'NO2']
PM10_df = pollution_df[pollution_df['Pollutant'] == 'PM10']

# Two stacked panels (NO2 on top, PM10 below); each draws one line per year
# of the mean reading for every weekday name.
fig, ax = plt.subplots(2, 1, figsize=(20, 20))

for panel, (label, readings) in zip(ax, (('NO2', NO2_df), ('PM10', PM10_df))):
    sns.lineplot(
        x=readings['Day'],
        y=readings['Value'],
        hue='Year',
        data=readings,
        ax=panel,
        errorbar=None,
    )
    panel.set_title(f"{label} Concentrate by day of week")
    panel.set(ylabel='Volume / µg/m-3', xlabel='Day of week')

The graphs show a slight rise throughout the week, with the levels dropping off substantially over the weekend where there will be fewer vehicles travelling on the road. However, there is a rise in the PM10 levels over the course of the three years sampled, rather than the opposite, which is what I expected.

# WHO Levels

In [None]:
# Daily mean reading per pollutant (one column each), from the hourly pivot.
daily_avg_pivot = pivot_df.resample("D").mean()

# Build the working table as a new frame rather than an alias, so that
# daily_avg_pivot is not modified behind the scenes. The column-axis name is
# cleared and DateTime becomes an ordinary column.
daily_avg = daily_avg_pivot.rename_axis(columns=None).reset_index()

# Calendar attributes for grouping and colouring the plots.
daily_avg['Day'] = daily_avg['DateTime'].dt.day_name()
daily_avg['Month'] = daily_avg['DateTime'].dt.month_name()
daily_avg['Year'] = daily_avg['DateTime'].dt.year

pollutants = ('NO2', 'PM10')

# Daily ('D') guideline value for each pollutant, stored as an integer
# column so it can be compared against the daily means.
for pollutant in pollutants:
    guideline = aqg_df.query(f"Pollutant =='{pollutant}' & AVG_Period=='D'")['AQGValue'].iloc[0]
    daily_avg[f'{pollutant}_AQG'] = int(guideline)

# Flag the days on which the daily mean reaches or exceeds the guideline.
for pollutant in pollutants:
    daily_avg[f'{pollutant}_BREACH'] = daily_avg[pollutant] >= daily_avg[f'{pollutant}_AQG']

# Size of the breach as a percentage of the guideline: 0 means exactly at the
# guideline, 100 means double it, negative means below it. The guideline is
# the denominator because dividing by the measured value distorts the figure
# and blows up as readings approach zero.
for pollutant in pollutants:
    excess = daily_avg[pollutant] - daily_avg[f'{pollutant}_AQG']
    daily_avg[f'{pollutant}_BREACH_PC'] = ((excess / daily_avg[f'{pollutant}_AQG']) * 100).round(2)

print(daily_avg)
daily_avg.dtypes

In [None]:

# Count missing values per column of the daily table.
null_count = daily_avg.isnull().sum()
null_count

In [None]:
# Daily deviation of each pollutant from its WHO daily guideline, coloured by
# year. Titles and axis labels describe what is actually plotted: a percentage
# against date, not a concentration by month.
fig, ax = plt.subplots(2, 1, figsize=(20, 20))

for panel, pollutant in zip(ax, ('NO2', 'PM10')):
    sns.lineplot(
        x='DateTime',
        y=f'{pollutant}_BREACH_PC',
        hue='Year',
        data=daily_avg,
        ax=panel,
        errorbar=None,
    )
    # Zero marks the guideline itself: points above it are days whose mean
    # exceeded the guideline.
    panel.axhline(0, color='grey', linestyle='--', linewidth=1)
    panel.set_title(f"{pollutant} daily mean relative to WHO daily guideline")
    panel.set(ylabel='Deviation from WHO guideline / %', xlabel='Date')

In [None]:
# Daily mean reading of each pollutant over time. The x-axis uses the actual
# date instead of the row number, and the titles name what is plotted (the
# data here is a daily time series, not a day-of-week breakdown).
fig, ax = plt.subplots(2, 1, figsize=(20, 20))

for panel, pollutant in zip(ax, ('NO2', 'PM10')):
    sns.lineplot(x='DateTime', y=pollutant, data=daily_avg, ax=panel, errorbar=None)
    panel.set_title(f"{pollutant} daily mean concentration")
    panel.set(ylabel='Volume / µg/m-3', xlabel='Date')

In [None]:
# Disabled draft of a monthly PM10 plot, kept as a bare string literal so none
# of it executes (the notebook only echoes the string as the cell output).
# NOTE(review): it refers to `daily_df` and a 'ReadingDateTime' column, neither
# of which is defined in the visible cells, so it would need updating before
# being re-enabled.
"""monthly_avg = daily_df.resample("M").mean()

plt.title("Average Monthly Measurements of PM10 in Vauxhall 2019–2022")
ax = sns.lineplot(data=monthly_avg, x = 'ReadingDateTime', y = 'PM10')
ax.set(ylabel='PM10 concentration / µg/m-3', xlabel='Date')"""

# Days of the week vs year
I hypothesise that the amount of pollution will be affected by the day of the week, across the years, with a notable decline in 2020 to correlate with the lockdown for Covid-19.

Currently, the measurements are in hourly form, which is going to be too much. I want to create a new data frame that lists the daily average as well as the day of the week.

# NO2 on a monthly basis

In [None]:
# Keep only the timestamp and reading columns of the NO2 frame.
NO2_new = NO2_df[['DateTime', 'Value']]

# Average per month-end bin of the datetime index. Both columns are averaged,
# so 'DateTime' becomes the mean timestamp of the readings in each month.
NO2_month = NO2_new.groupby([pd.Grouper(freq='ME')]).mean()

# Month number and year are taken from that averaged timestamp.
NO2_month = NO2_month.assign(
    Month=NO2_month['DateTime'].dt.month,
    Year=NO2_month['DateTime'].dt.year,
)
NO2_month

In [None]:
# Month-by-year table of mean NO2: one row per month number, one column per year.
NO2_pivot_df = NO2_month.pivot(index='Month', columns='Year', values='Value')
NO2_pivot_df.columns.name = None


def _pct_reduction(old, new):
    """Return the percentage reduction from `old` to `new`, rounded to 2 d.p.

    This is the usual percentage change, (new - old) / old * 100, with the
    sign flipped: a positive result means the value went down, a negative
    result means it went up.
    """
    return -(((new - old) / old) * 100).round(2)


# Year-on-year reductions. 'Month' stays as the index; the earlier
# reset_index call discarded its result and so had no effect.
NO2_pivot_df['%1819Change'] = _pct_reduction(NO2_pivot_df[2018], NO2_pivot_df[2019])
NO2_pivot_df['%1920Change'] = _pct_reduction(NO2_pivot_df[2019], NO2_pivot_df[2020])
NO2_pivot_df['-'] = '-'  # visual separator column in the displayed table
NO2_pivot_df['%1820Change'] = _pct_reduction(NO2_pivot_df[2018], NO2_pivot_df[2020])
NO2_pivot_df

O 56.79 N 39.95

In [None]:
# Average each change column over the months and print one labelled value
# per year pair.
change1819, change1920, change1820 = (
    NO2_pivot_df[column].mean()
    for column in ('%1819Change', '%1920Change', '%1820Change')
)
print('18-19', change1819, '\n19-20', change1920, '\n18-20', change1820)