In [1]:
import json
import csv
import pandas
import numpy

In [3]:
# Read airquality data
df = pandas.read_csv('../data/airquality.csv', dtype={'county': str, 'date_local': str}, parse_dates=['date_local']) 

In [5]:
df.head()
print(df)

         latitude  longitude date_local                 parameter  county  \
0        36.72639 -119.73300 2003-01-03  PM2.5 - Local Conditions  Fresno   
1        36.72639 -119.73300 2003-01-06  PM2.5 - Local Conditions  Fresno   
2        36.72639 -119.73300 2003-01-09  PM2.5 - Local Conditions  Fresno   
3        36.72639 -119.73300 2003-01-12  PM2.5 - Local Conditions  Fresno   
4        36.72639 -119.73300 2003-01-15  PM2.5 - Local Conditions  Fresno   
...           ...        ...        ...                       ...     ...   
3570923  37.64571 -118.96652 2019-12-27  PM2.5 - Local Conditions    Mono   
3570924  37.64571 -118.96652 2019-12-28  PM2.5 - Local Conditions    Mono   
3570925  37.64571 -118.96652 2019-12-29  PM2.5 - Local Conditions    Mono   
3570926  37.64571 -118.96652 2019-12-30  PM2.5 - Local Conditions    Mono   
3570927  37.64571 -118.96652 2019-12-31  PM2.5 - Local Conditions    Mono   

                  city  measure  
0               Fresno     43.6  
1      

In [6]:
# Filter out negative values
df = df[(df['measure'] >= 0) | (df['measure'].isnull())]
print(df)

         latitude  longitude date_local                 parameter  county  \
0        36.72639 -119.73300 2003-01-03  PM2.5 - Local Conditions  Fresno   
1        36.72639 -119.73300 2003-01-06  PM2.5 - Local Conditions  Fresno   
2        36.72639 -119.73300 2003-01-09  PM2.5 - Local Conditions  Fresno   
3        36.72639 -119.73300 2003-01-12  PM2.5 - Local Conditions  Fresno   
4        36.72639 -119.73300 2003-01-15  PM2.5 - Local Conditions  Fresno   
...           ...        ...        ...                       ...     ...   
3570923  37.64571 -118.96652 2019-12-27  PM2.5 - Local Conditions    Mono   
3570924  37.64571 -118.96652 2019-12-28  PM2.5 - Local Conditions    Mono   
3570925  37.64571 -118.96652 2019-12-29  PM2.5 - Local Conditions    Mono   
3570926  37.64571 -118.96652 2019-12-30  PM2.5 - Local Conditions    Mono   
3570927  37.64571 -118.96652 2019-12-31  PM2.5 - Local Conditions    Mono   

                  city  measure  
0               Fresno     43.6  
1      

In [7]:
# How many datapoints per year
df.groupby(
  df.date_local.dt.year,
).size()

date_local
2003     46010
2004     46580
2005     46996
2006     12670
2007     12434
2008     20466
2009     75458
2010    240527
2011    329584
2012    423921
2013    490475
2014    531046
2015    553379
2016    161462
2017    167336
2018    176338
2019    178851
dtype: int64

In [8]:
# How many datapoints per city
df.groupby(
  'county'
).size()

county
Alameda            184786
Butte                7915
Calaveras           55746
Colusa              49478
Contra Costa        71739
Del Norte            2373
El Dorado             280
Fresno             213128
Humboldt            35677
Imperial            51913
Inyo                77990
Kern                78672
Kings               86763
Lake                 2500
Los Angeles        103451
Madera              80707
Marin               55802
Mendocino          121625
Merced              63909
Modoc                   8
Mono                 6486
Monterey           142328
Napa                32648
Nevada              11272
Orange              20607
Placer              43713
Plumas              13252
Riverside          144468
Sacramento          82141
San Benito          57050
San Bernardino      47464
San Diego          172683
San Francisco       60037
San Joaquin        162265
San Luis Obispo    223010
San Mateo           58785
Santa Barbara      149584
Santa Clara        119772
Santa

In [9]:
# How many datapoints per city x year
df_group = df.groupby([
    df.date_local.dt.year,
    df.county
]).size().rename('size').reset_index().rename(columns={'date_local': 'year'})

In [10]:
df_group.head()

Unnamed: 0,year,county,size
0,2003,Alameda,828
1,2003,Butte,244
2,2003,Calaveras,244
3,2003,Colusa,476
4,2003,Contra Costa,1104


In [11]:
df_pivot = df_group.pivot_table('size', ['county'], 'year')

In [12]:
df_pivot.head()

year,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Alameda,828.0,760.0,740.0,191.0,211.0,342.0,2244.0,15684.0,17410.0,17326.0,24600.0,32828.0,33784.0,7861.0,8968.0,10309.0,10700.0
Butte,244.0,244.0,244.0,63.0,60.0,62.0,240.0,63.0,68.0,93.0,365.0,365.0,365.0,1312.0,1268.0,1384.0,1475.0
Calaveras,244.0,244.0,240.0,61.0,69.0,65.0,240.0,4831.0,8759.0,8783.0,8619.0,8577.0,8602.0,1742.0,1179.0,1789.0,1702.0
Colusa,476.0,492.0,468.0,127.0,99.0,70.0,212.0,67.0,5987.0,8453.0,8263.0,8273.0,8082.0,2202.0,2091.0,1982.0,2134.0
Contra Costa,1104.0,1036.0,1132.0,279.0,306.0,309.0,1100.0,297.0,305.0,775.0,17007.0,16924.0,16665.0,3397.0,3599.0,3547.0,3957.0


In [14]:
# Remove unnecessary columns to make aggregation easier
df_drop = df.drop(['parameter', 'city'], axis=1).rename(columns={'date_local': 'date'})
df_drop = df_drop[['date', 'county', 'latitude', 'longitude', 'measure']]

In [15]:
df_drop.head()

Unnamed: 0,date,county,latitude,longitude,measure
0,2003-01-03,Fresno,36.72639,-119.733,43.6
1,2003-01-06,Fresno,36.72639,-119.733,49.7
2,2003-01-09,Fresno,36.72639,-119.733,26.1
3,2003-01-12,Fresno,36.72639,-119.733,30.0
4,2003-01-15,Fresno,36.72639,-119.733,36.1


In [16]:
# Aggregate by date and county; average lat, long, and measure
df_agg = df_drop.groupby(['date', 'county']).mean()

In [17]:
print(df_agg)

                          latitude   longitude    measure
date       county                                        
2003-01-01 Contra Costa  37.936013 -122.026154  26.900000
           Fresno        36.781333 -119.773190  58.000000
           Kern          35.356615 -119.062613  30.900000
           Los Angeles   34.008950 -118.113333  27.800000
           Orange        33.830620 -117.938450  17.600000
...                            ...         ...        ...
2019-12-31 Sonoma        38.403765 -122.818294   8.105000
           Stanislaus    37.565241 -120.915110  20.161667
           Sutter        39.138773 -121.618549  18.100000
           Tehama        40.170930 -122.255560   9.411667
           Ventura       34.311255 -118.903723   1.509722

[178655 rows x 3 columns]


In [18]:
df_agg.to_csv('../data/airquality_aggregated.csv')