In [28]:
import json
import csv
import pandas
import numpy

In [13]:
# Load the raw air-quality measurements from the shared data directory.
# NOTE(review): date_local is declared as str in `dtype` and is also listed in
# `parse_dates`; later cells use `.dt` on it, so they rely on the date parsing
# taking effect — confirm on the pandas version in use.
df = pandas.read_csv(
    '../data/airquality.csv',
    dtype={'county': str, 'date_local': str},
    parse_dates=['date_local'],
)

In [14]:
# Preview the first five rows of the raw frame
df.head(n=5)

Unnamed: 0,latitude,longitude,date_local,parameter,county,city,aqi,observation_count
0,36.72639,-119.733,2003-01-03,PM2.5 - Local Conditions,Fresno,Fresno,121.0,1.0
1,36.72639,-119.733,2003-01-06,PM2.5 - Local Conditions,Fresno,Fresno,136.0,1.0
2,36.72639,-119.733,2003-01-09,PM2.5 - Local Conditions,Fresno,Fresno,80.0,1.0
3,36.72639,-119.733,2003-01-12,PM2.5 - Local Conditions,Fresno,Fresno,89.0,1.0
4,36.72639,-119.733,2003-01-15,PM2.5 - Local Conditions,Fresno,Fresno,102.0,1.0


In [25]:
# Row count for each calendar year, taken from the year part of date_local
df.groupby(df['date_local'].dt.year).size()

date_local
2003     46010
2004     46580
2005     46996
2006     12670
2007     12434
2008     20466
2009     75458
2010    240999
2011    332153
2012    433021
2013    501516
2014    545168
2015    569746
2016    162456
2017    168426
2018    177543
2019    179286
dtype: int64

In [20]:
# Row count for each county (the grouping key is the `county` column, not `city`)
df.groupby(by='county').size()

county
Alameda            186685
Butte                7915
Calaveras           56027
Colusa              52888
Contra Costa        72480
Del Norte            2373
El Dorado             280
Fresno             215235
Humboldt            36159
Imperial            52011
Inyo                80110
Kern                80772
Kings               87007
Lake                 2500
Los Angeles        103471
Madera              81079
Marin               56273
Mendocino          121701
Merced              64241
Modoc                   8
Mono                 6486
Monterey           147229
Napa                33313
Nevada              11545
Orange              20607
Placer              46567
Plumas              13252
Riverside          146925
Sacramento          82363
San Benito          60115
San Bernardino      47816
San Diego          175871
San Francisco       61669
San Joaquin        163788
San Luis Obispo    226677
San Mateo           60409
Santa Barbara      150740
Santa Clara        121597
Santa

In [40]:
# Row count for every (year, county) pair, flattened into a regular table with
# columns `year`, `county` and `size`
df_group = (
    df.groupby([df['date_local'].dt.year, df['county']])
    .size()
    .reset_index(name='size')
    .rename(columns={'date_local': 'year'})
)

In [41]:
# Preview the first five (year, county) counts
df_group.head(n=5)

Unnamed: 0,year,county,size
0,2003,Alameda,828
1,2003,Butte,244
2,2003,Calaveras,244
3,2003,Colusa,476
4,2003,Contra Costa,1104


In [47]:
# Reshape the counts into a county (rows) by year (columns) grid.
# pivot_table's default aggregation is the mean; each (year, county) pair occurs
# once in df_group, so the counts are unchanged but come out as floats.
df_pivot = df_group.pivot_table(values='size', index=['county'], columns='year')

In [50]:
# Preview the first five counties of the county-by-year grid
df_pivot.head(n=5)

year,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Alameda,828.0,760.0,740.0,191.0,211.0,342.0,2244.0,15684.0,17410.0,17559.0,25076.0,33402.0,34368.0,7878.0,8970.0,10310.0,10712.0
Butte,244.0,244.0,244.0,63.0,60.0,62.0,240.0,63.0,68.0,93.0,365.0,365.0,365.0,1312.0,1268.0,1384.0,1475.0
Calaveras,244.0,244.0,240.0,61.0,69.0,65.0,240.0,4831.0,8760.0,8784.0,8667.0,8650.0,8760.0,1742.0,1179.0,1789.0,1702.0
Colusa,476.0,492.0,468.0,127.0,99.0,70.0,212.0,67.0,6479.0,8853.0,8823.0,8827.0,8820.0,2314.0,2349.0,2272.0,2140.0
Contra Costa,1104.0,1036.0,1132.0,279.0,306.0,309.0,1100.0,297.0,305.0,783.0,17135.0,17149.0,17004.0,3428.0,3599.0,3547.0,3967.0


In [61]:
# Keep only the fields needed for aggregation: discard the descriptive columns,
# shorten `date_local` to `date`, and fix the column order.
df_drop = (
    df.drop(columns=['parameter', 'city', 'observation_count'])
    .rename(columns={'date_local': 'date'})
    .loc[:, ['date', 'county', 'latitude', 'longitude', 'aqi']]
)

In [62]:
# Preview the first five rows of the trimmed frame
df_drop.head(n=5)

Unnamed: 0,date,county,latitude,longitude,aqi
0,2003-01-03,Fresno,36.72639,-119.733,121.0
1,2003-01-06,Fresno,36.72639,-119.733,136.0
2,2003-01-09,Fresno,36.72639,-119.733,80.0
3,2003-01-12,Fresno,36.72639,-119.733,89.0
4,2003-01-15,Fresno,36.72639,-119.733,102.0


In [64]:
# Collapse to one row per (date, county); latitude, longitude and aqi are each
# replaced by their mean within the group.
df_agg = (
    df_drop
    .groupby(by=['date', 'county'])
    .mean()
)

In [67]:
# Plain-text dump of the aggregated frame; pandas abbreviates the middle rows
print(df_agg)

                          latitude   longitude         aqi
date       county                                         
2003-01-01 Contra Costa  37.936013 -122.026154   82.000000
           Fresno        36.781333 -119.773190  152.000000
           Kern          35.356615 -119.062613   91.000000
           Los Angeles   34.008950 -118.113333   85.333333
           Orange        33.830620 -117.938450   63.000000
...                            ...         ...         ...
2019-12-31 Sonoma        38.403765 -122.818294   34.000000
           Stanislaus    37.565241 -120.915110   68.000000
           Sutter        39.138773 -121.618549   64.000000
           Tehama        40.170930 -122.255560   39.000000
           Ventura       34.311255 -118.903723    6.333333

[178959 rows x 3 columns]


In [70]:
# Persist the aggregated table; the (date, county) index is written out as the
# leading columns of the CSV.
df_agg.to_csv('../data/airquality_aggregated.csv', index=True)