## Summary Statistics

This notebook creates a box-plot visualization of the variation of different pollutants in each month for different cities. The data is read from the corresponding CSV file `data/<year>_7cities.csv`. This is AQI data that has not been filtered.

In this notebook, I plot two versions, the median and max of each day's readings.  

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*' and 3.8 removed
# the old names, so use the new name when it exists and fall back to the legacy one otherwise.
_style_name = 'seaborn-v0_8-bright' if 'seaborn-v0_8-bright' in plt.style.available else 'seaborn-bright'
plt.style.use(_style_name)

In [None]:
# Year to analyse: it selects the input file here and is reused further down
# to filter the rows and to name the saved figures.
year = 2019
csv_path = f'data/{year}_7cities.csv'
df = pd.read_csv(csv_path)

## A quick look at the data, make sure we understand what's what

In [None]:
# Summary statistics of the numeric columns, as a first sanity check on the value ranges
df.describe()

In [None]:
# Peek at the first few rows of the raw file
df.head()

In [None]:
# Check how pandas parsed each column
df.dtypes

OK! Most of the columns seem fine, but the Date is listed as an object. Let's fix this, so we can use this data correctly.

We will want to group the data according to month, so let's also create the corresponding columns.

In [None]:
# Parse the date strings with an explicit day/month/year format so pandas does not have to
# guess which field is the day and which is the month.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# Month helpers: the number is what the box plots below group by; the name is kept for readability.
df['month_num'] = df['Date'].dt.month
df['month_name'] = df['Date'].dt.month_name()

# Date should now show up as a datetime column
df.dtypes

In [None]:
# Look at the parsed dates and the new month_num / month_name columns
df.head()

In [None]:
# Keep only the rows dated within `year`; the shape is printed before and after
# so the number of discarded rows is visible.
print(df.shape)
in_selected_year = df['Date'].dt.year == year
df = df.loc[in_selected_year]
print(df.shape)

In [None]:
# Peek at the frame after the year filter
df.head()

In [None]:
# Take the cities and pollutants from the data itself so nothing below is hard-coded
cities = df['City'].unique()
pollutants = df['Specie'].unique()

print(cities)
print(pollutants)

## WHO guideline values, drawn as the green reference line in the plots below.
## 24-hour means unless otherwise specified. Using 8-hour as a proxy when unavailable.
## A value of 0 means "no line": the plotting cells test the value for truthiness before drawing.
pollutant_who = {
    'co':   0,
    'no2':  0,
    'o3':   100,  ##  8-hour mean
    'pm10': 50,
    'pm25': 25,
    'so2':  20,
}

## NAAQS limits for 'industrial, residential, rural, others' areas, drawn as the red reference line.
## NOTE(review): originally labelled "annual", but 80/100/60/80 look like the 24-hour NAAQS limits
## (the annual ones are lower) -- confirm against the CPCB notification.
## NOTE(review): the CO limit is presumably in mg/m3 while the others are in ug/m3 -- confirm that
## these units are comparable with the values stored in the data.
pollutant_naaqs = {
    'co':   2,    ##  8-hour mean
    'no2':  80,
    'o3':   100,  ##  8-hour mean
    'pm10': 100,
    'pm25': 60,
    'so2':  80,
}


---

I'm really, really partial to making grids of plots myself!

In [None]:
# Grid of box plots, one row per pollutant and one column per city, showing the monthly
# distribution of the daily median readings. Each row shares its y-axis across cities.
# squeeze=False keeps `ax` two-dimensional even with a single pollutant or a single city,
# so the ax[i][j] and ax[:,0] indexing below always works.
fig, ax = plt.subplots(len(pollutants), len(cities), figsize=(24,36), sharey='row', squeeze=False)
fig.subplots_adjust(hspace=0.3) ## the hspace value is a fraction of the average axes height

# For each pollutant, and each city, create a box plot, and put it in the correct (pollutant, city) location
for i, p in enumerate(pollutants):
    d = df[(df.Specie == p)] # extract the pollutant data
    # .get() instead of [] so a species with no entry in the threshold tables is still
    # plotted (without reference lines) rather than raising KeyError
    who_limit = pollutant_who.get(p)
    naaqs_limit = pollutant_naaqs.get(p)
    for j, c in enumerate(cities):
        a = ax[i][j] # figure out the axes to use
        d[d.City == c].boxplot(column='median', by='month_num', ax=a)
        if who_limit:
            a.axhline(y=who_limit, c='g')    # WHO guideline (green)
        if naaqs_limit:
            a.axhline(y=naaqs_limit, c='r')  # NAAQS limit (red)
        a.set_title(c)

# Label each row with its pollutant on the left-most axes
for a, r in zip(ax[:,0], pollutants):
    a.set_ylabel(r, rotation=90, fontsize=24, color='b')

# DataFrame.boxplot(by=...) adds a figure-level "Boxplot grouped by ..." suptitle, which is
# cleared here. plt.title('') does not help: it sets the title of the current axes only,
# which is why its argument shows up on the last subplot.
fig.suptitle('')
plt.show()
fig.savefig(f'images/boxplot-pollutants-cities-{year}.png', bbox_inches='tight')
plt.close(fig)

In [None]:
## same as the plot above, but using the daily max, rather than the daily median values for each month
# squeeze=False keeps `ax` two-dimensional even with a single pollutant or a single city.
fig, ax = plt.subplots(len(pollutants), len(cities), figsize=(24,36), sharey='row', squeeze=False)
fig.subplots_adjust(hspace=0.3)

for i, p in enumerate(pollutants):
    d = df[df.Specie == p]
    # .get() instead of [] so a species with no entry in the threshold tables is still
    # plotted (without reference lines) rather than raising KeyError
    who_limit = pollutant_who.get(p)
    naaqs_limit = pollutant_naaqs.get(p)
    for j, c in enumerate(cities):
        a = ax[i][j]
        d[d.City == c].boxplot(column='max', by='month_num', ax=a)
        a.set_title(c, color='blue')
        if who_limit:
            a.axhline(y=who_limit, c='g')    # WHO guideline (green)
        if naaqs_limit:
            a.axhline(y=naaqs_limit, c='r')  # NAAQS limit (red)

# Label each row with its pollutant on the left-most axes
for a, r in zip(ax[:,0], pollutants):
    a.set_ylabel(r, rotation=90, fontsize=24, color='b')

# Clear the figure-level suptitle that DataFrame.boxplot(by=...) adds
# (plt.title('') would only retitle the current axes).
fig.suptitle('')
plt.show()
fig.savefig(f'images/boxplot-pollutants-cities-max-{year}.png', bbox_inches='tight')
plt.close(fig)

And finally, we can plot these per city:

In [None]:
# One row of box plots for a single city: one panel per pollutant, daily medians grouped by month.
city = 'Delhi'

# squeeze=False + ravel() gives a 1-D array of axes even when there is only one pollutant
# (plt.subplots would otherwise return a bare Axes that cannot be indexed).
fig, ax = plt.subplots(1, len(pollutants), figsize=(24,6), squeeze=False)
ax = ax.ravel()

for i, p in enumerate(pollutants):
    d = df[df.Specie == p]
    a = ax[i]
    d[d.City == city].boxplot(column='median', by='month_num', ax=a)
    a.set_title(p, color='blue')
    # .get() so a species missing from the threshold tables is plotted without
    # reference lines instead of raising KeyError
    who_limit = pollutant_who.get(p)
    naaqs_limit = pollutant_naaqs.get(p)
    if who_limit:
        a.axhline(y=who_limit, c='g')    # WHO guideline (green)
    if naaqs_limit:
        a.axhline(y=naaqs_limit, c='r')  # NAAQS limit (red)

# Clear the figure-level suptitle that DataFrame.boxplot(by=...) adds
fig.suptitle('')
plt.show()
#fig.savefig(f'images/Delhi-poll-{year}.png')

In [None]:
## individual plots of pollutants for each city
for c in cities:
    # squeeze=False + ravel() gives a 1-D array of axes even when there is only one pollutant
    fig, ax = plt.subplots(1, len(pollutants), figsize=(24,6), squeeze=False)
    ax = ax.ravel()
    for i, p in enumerate(pollutants):
        d = df[df.Specie == p]
        a = ax[i]
        d[d.City == c].boxplot(column='median', by='month_num', ax=a)
        a.set_title(p, color='blue')
        # .get() so a species missing from the threshold tables is plotted without
        # reference lines instead of raising KeyError
        who_limit = pollutant_who.get(p)
        naaqs_limit = pollutant_naaqs.get(p)
        if who_limit:
            a.axhline(y=who_limit, c='g')    # WHO guideline (green)
        if naaqs_limit:
            a.axhline(y=naaqs_limit, c='r')  # NAAQS limit (red)
    # Title and save once per figure, after every panel is drawn. Doing this inside the
    # pollutant loop rewrote the same PNG once per pollutant. Setting the suptitle last also
    # replaces the one DataFrame.boxplot(by=...) puts on the figure.
    fig.suptitle(f'{c}', c='b', fontsize=24)
    fig.savefig(f"images/boxplot-{c}-pollutants-{year}.png")
    plt.close(fig)  # close explicitly so figures do not accumulate across the loop

In [None]:
## plots of pollutants across cities
for p in pollutants:
    # squeeze=False + ravel() gives a 1-D array of axes even when there is only one city
    fig, ax = plt.subplots(1, len(cities), figsize=(24,6), sharey=True, squeeze=False)
    ax = ax.ravel()
    d = df[df.Specie == p]
    # The thresholds depend only on the pollutant, so look them up once per figure.
    # .get() so a species missing from the threshold tables is plotted without
    # reference lines instead of raising KeyError
    who_limit = pollutant_who.get(p)
    naaqs_limit = pollutant_naaqs.get(p)
    for i, c in enumerate(cities):
        a = ax[i]
        d[d.City == c].boxplot(column='median', by='month_num', ax=a)
        a.set_title(c)
        if who_limit:
            a.axhline(y=who_limit, c='g')    # WHO guideline (green)
        if naaqs_limit:
            a.axhline(y=naaqs_limit, c='r')  # NAAQS limit (red)
    # Title and save once per figure, after every panel is drawn. Doing this inside the
    # city loop rewrote the same PNG once per city. Setting the suptitle last also
    # replaces the one DataFrame.boxplot(by=...) puts on the figure.
    fig.suptitle(f'{p}', c='b', fontsize=20)
    fig.savefig(f"images/boxplot-{p}-cities-{year}.png")
    plt.close(fig)  # close explicitly so figures do not accumulate across the loop


In [None]:
# Transposed grid: one row per city, one column per pollutant, using the daily medians.
# No shared y-axis here because each column holds a different pollutant.
# squeeze=False keeps `ax` two-dimensional even with a single city or a single pollutant,
# so the ax[i][j] and ax[:,0] indexing below always works.
fig, ax = plt.subplots(len(cities), len(pollutants), figsize=(24,36), squeeze=False)
fig.subplots_adjust(hspace=0.3)

for i, c in enumerate(cities):
    d = df[df.City == c]
    for j, p in enumerate(pollutants):
        a = ax[i][j]
        d[d.Specie == p].boxplot(column='median', by='month_num', ax=a)
        a.set_title(p, color='blue')
        # .get() so a species missing from the threshold tables is plotted without
        # reference lines instead of raising KeyError
        who_limit = pollutant_who.get(p)
        naaqs_limit = pollutant_naaqs.get(p)
        if who_limit:
            a.axhline(y=who_limit, c='g')    # WHO guideline (green)
        if naaqs_limit:
            a.axhline(y=naaqs_limit, c='r')  # NAAQS limit (red)

# Label each row with its city on the left-most axes
for a, r in zip(ax[:,0], cities):
    a.set_ylabel(r, rotation=90, fontsize=24, color='b')

# Clear the figure-level suptitle that DataFrame.boxplot(by=...) adds
fig.suptitle('')
plt.show()
fig.savefig(f'images/boxplot-cities-pollutants-{year}.png', bbox_inches='tight')
plt.close(fig)

In [None]:
# City-by-pollutant grid again, this time with the daily max instead of the daily median.
# squeeze=False keeps `ax` two-dimensional even with a single city or a single pollutant,
# so the ax[i][j] and ax[:,0] indexing below always works.
fig, ax = plt.subplots(len(cities), len(pollutants), figsize=(24,36), squeeze=False)
fig.subplots_adjust(hspace=0.3)

for i, c in enumerate(cities):
    d = df[df.City == c]
    for j, p in enumerate(pollutants):
        a = ax[i][j]
        d[d.Specie == p].boxplot(column='max', by='month_num', ax=a)
        a.set_title(p, color='blue')
        # .get() so a species missing from the threshold tables is plotted without
        # reference lines instead of raising KeyError
        who_limit = pollutant_who.get(p)
        naaqs_limit = pollutant_naaqs.get(p)
        if who_limit:
            a.axhline(y=who_limit, c='g')    # WHO guideline (green)
        if naaqs_limit:
            a.axhline(y=naaqs_limit, c='r')  # NAAQS limit (red)

# Label each row with its city on the left-most axes
for a, r in zip(ax[:,0], cities):
    a.set_ylabel(r, rotation=90, fontsize=24, color='b')

# Clear the figure-level suptitle that DataFrame.boxplot(by=...) adds
fig.suptitle('')
plt.show()
fig.savefig(f'images/boxplot-cities-pollutants-max-{year}.png', bbox_inches='tight')
plt.close(fig)