# Where are the most sensors by region?
The US government has a number of ways to classify regions. One is the Metropolitan Statistical Area (MSA). This looks to be about the right unit of measurement, as it works mostly with urban/suburban areas rather than drawing in huge exurban spaces. 

In [1]:
import pandas as pd
import pandas_profiling
import numpy as np 
import json
import datetime
import re
import ast

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 500)

In [2]:
# Use this cell to specify the paths for the data folder in your local machines
# Use the variable 'datafolder' to specify the path
# Comment out all the data paths except your own
# Purple Air data ia assumed to be in a subfolder called 'purpleair' 
# For example, if the base data folder is '/users/data', purpleair data should be in '/users/data/purpleair'

# Ben's local path
datafolder = "../my_stash/data"

In [3]:
purpleair_df = pd.read_csv("{}/purpleair/09141731_withaddress.csv".format(datafolder))
zipmsa_df = pd.read_csv("{}/zip_msa.csv".format(datafolder)) # purchased from:
# zipinfo.com/products/z5msa/z5msa.htm
regions_df = pd.read_csv("{}/msapmsa.csv".format(datafolder)) # purchased from same

In [4]:
zipmsa_df.rename(columns = {'ZIP':'zipcode'}, inplace = True) # standardize the column name

In [5]:
# handle NaNs in the PurpleAir zipcode field
replace_col = []
for i in purpleair_df['zipcode']:
    try:
        i = int(i)
    except:
        i = 0
    replace_col.append(i)
purpleair_df['zipcode'] = replace_col

In [6]:
purpleair_df.astype({'zipcode': 'int64'}) # set them to integer type

Unnamed: 0.1,Unnamed: 0,age,a_h,device_loc_typ,high_reading_flag,hidden,sensor_id,sensor_name,last_seen,lat,lon,pm2_5val,parent_id,thingspeak_primary_id,thingspeak_primary_id_read_key,thingspeak_secondary_id,thingspeak_secondary_id_read_key,sensor_type,humidity,is_owner,pressure,temp_f,av_stat_last_modified,av_stat_time_since_last_modified,pm2_5val_10m_avg,pm2_5val_30m_avg,pm2_5val_1h_avg,pm2_5val_6h_avg,pm2_5val_24h_avg,pm2_5val_1wk_avg,city,country,county,state,zipcode
0,0,357358,,inside,,False,24115,2nd South 12th East,1547065985,40.764907,-111.856653,0.15,,672791,CLV9HLXOGIYQNYD2,672792,WAZLM3J4Q9OHKNGE,PMS5003+PMS5003+BME280,15.0,0,869.14,89.0,1.547066e+12,80079.0,0.47,19.37,34.37,16.29,4.76,0.71,Salt Lake City,United States,Salt Lake County,Utah,84102
1,1,357358,,,,False,24116,2nd South 12th East B,1547065988,40.764907,-111.856653,,24115.0,672793,UQJBDQ2XXPP73U45,672795,5G9B9E4XFL32S845,,15.0,0,869.16,89.0,,,,,,,,,Salt Lake City,United States,Salt Lake County,Utah,84102
2,2,99019,,outside,,False,27699,CHA1,1562566345,37.929030,-121.940127,3.13,,716094,M57A4C3IH3QFM9N6,716095,7U439FCFH7XFYY2V,PMS5003+PMS5003+BME280,26.0,0,995.19,74.0,1.562566e+12,119968.0,3.38,3.43,3.93,6.40,6.79,4.48,Clayton,United States,Contra Costa County,California,94517
3,3,99019,,,,False,27700,CHA1 B,1562566345,37.929030,-121.940127,3.22,27699.0,716096,9CI7CR8RAJJ4UZC7,716097,6CE24ETYNU310ABD,,,0,,,1.562566e+12,119969.0,3.44,3.52,4.08,6.69,7.13,4.76,Clayton,United States,Contra Costa County,California,94517
4,4,0,,outside,,False,16791,DW0435,1568507486,18.082454,-67.039027,1.47,,589048,61GKVZGTCZSBUGB5,589049,5HBLH5R8GPLM6J88,PMS5003+PMS5003+BME280,58.0,0,1009.83,96.0,1.568507e+12,119763.0,1.78,1.53,1.51,2.29,2.97,3.95,San Germán,Puerto Rico,San Germán,,683
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17539,17539,381884,,,,False,22218,Zuffi Domicilio B,1545594436,37.303701,-121.773327,,22217.0,652930,YF5HTYQKFTLLWWSE,652931,H05KWOGIXM4CR999,,36.0,0,1014.88,73.0,,,,,,,,,San Jose,United States,Santa Clara County,California,95135
17540,17540,1,,outside,1.0,False,15207,__SilvanaTerraceSensor1,1568507432,48.203071,-122.289081,3.16,,573192,2JVEOVLU389JBIKL,573194,5K8M358MOAFRRBRG,PMS5003+PMS5003+BME280,51.0,0,1011.76,76.0,1.568507e+12,120175.0,2.92,2.87,2.67,2.55,3.13,5.19,Stanwood,United States,Snohomish County,Washington,98292
17541,17541,1,,,1.0,False,15208,__SilvanaTerraceSensor1 B,1568507432,48.203071,-122.289081,3.06,15207.0,573195,0538JMKEJZY8I4K0,573196,6VJ9ZQQJE2GU6KL6,,,0,,,1.568507e+12,120175.0,3.26,3.15,2.93,2.88,3.50,5.70,Stanwood,United States,Snohomish County,Washington,98292
17542,17542,0,,outside,,False,15211,__SilvanaTerraceSensor2,1568507469,48.203800,-122.288882,2.09,,573199,NBCUX44B85C1D1LH,573201,GP83RBKIOKCJJAPP,PMS5003+PMS5003+BME280,58.0,0,1011.20,74.0,1.568507e+12,120024.0,3.08,3.04,2.83,3.03,3.56,5.52,Stanwood,United States,Snohomish County,Washington,98292


In [7]:
# merge the purple air data with the metro data
merged_df = pd.merge(purpleair_df, zipmsa_df, on='zipcode')

In [8]:
merged_df.head()

Unnamed: 0.1,Unnamed: 0,age,a_h,device_loc_typ,high_reading_flag,hidden,sensor_id,sensor_name,last_seen,lat,lon,pm2_5val,parent_id,thingspeak_primary_id,thingspeak_primary_id_read_key,thingspeak_secondary_id,thingspeak_secondary_id_read_key,sensor_type,humidity,is_owner,pressure,temp_f,av_stat_last_modified,av_stat_time_since_last_modified,pm2_5val_10m_avg,pm2_5val_30m_avg,pm2_5val_1h_avg,pm2_5val_6h_avg,pm2_5val_24h_avg,pm2_5val_1wk_avg,city,country,county,state,zipcode,City,ST,A/C,FIPS,County,T/Z,DST?,MSA,PMSA,Type
0,0,357358,,inside,,False,24115,2nd South 12th East,1547065985,40.764907,-111.856653,0.15,,672791,CLV9HLXOGIYQNYD2,672792,WAZLM3J4Q9OHKNGE,PMS5003+PMS5003+BME280,15.0,0,869.14,89.0,1547066000000.0,80079.0,0.47,19.37,34.37,16.29,4.76,0.71,Salt Lake City,United States,Salt Lake County,Utah,84102,Salt Lake City,UT,801.0,49035.0,Salt Lake,MST,Y,7160.0,,
1,1,357358,,,,False,24116,2nd South 12th East B,1547065988,40.764907,-111.856653,,24115.0,672793,UQJBDQ2XXPP73U45,672795,5G9B9E4XFL32S845,,15.0,0,869.16,89.0,,,,,,,,,Salt Lake City,United States,Salt Lake County,Utah,84102,Salt Lake City,UT,801.0,49035.0,Salt Lake,MST,Y,7160.0,,
2,122,390462,,inside,,False,22845,115s 1100e slc;ut.,1545079733,40.766638,-111.859077,0.06,,656576,OBFA3BXTIOMTQFT9,656577,V7R2K8M07V7RDZ59,PMS5003+PMS5003+BME280,18.0,0,867.77,77.0,1545080000000.0,79299.0,20.38,30.45,21.9,4.92,1.29,0.19,Salt Lake City,United States,Salt Lake County,Utah,84102,Salt Lake City,UT,801.0,49035.0,Salt Lake,MST,Y,7160.0,,
3,123,390462,,,,False,22846,115s 1100e slc;ut. B,1545079763,40.766638,-111.859077,,22845.0,656578,26J28OUCV6WFFHVU,656579,NYDECD3UY4TF4U62,,18.0,0,867.76,77.0,,,,,,,,,Salt Lake City,United States,Salt Lake County,Utah,84102,Salt Lake City,UT,801.0,49035.0,Salt Lake,MST,Y,7160.0,,
4,592,25726,,inside,,False,23505,8th&8th,1566963933,40.751973,-111.868187,4.36,,660357,7KO16799H7RHQFI8,660358,4NH5ZIFKCCFPKUSH,PMS5003+PMS5003+BME280,20.0,0,871.56,93.0,1566964000000.0,119983.0,3.9,4.15,3.89,3.41,3.62,3.64,Salt Lake City,United States,Salt Lake County,Utah,84102,Salt Lake City,UT,801.0,49035.0,Salt Lake,MST,Y,7160.0,,


In [9]:
# handle NaNs in the metro data
replace_col = []
for row in range(len(merged_df)):
    i = merged_df.loc[row].MSA
    j = merged_df.loc[row].PMSA
    try:
        i = int(i)
    except:
        try:
            i = int(j)
        except:
            i = 0
    replace_col.append(i)
merged_df['metrocode'] = replace_col

In [10]:
merged_df

Unnamed: 0.1,Unnamed: 0,age,a_h,device_loc_typ,high_reading_flag,hidden,sensor_id,sensor_name,last_seen,lat,lon,pm2_5val,parent_id,thingspeak_primary_id,thingspeak_primary_id_read_key,thingspeak_secondary_id,thingspeak_secondary_id_read_key,sensor_type,humidity,is_owner,pressure,temp_f,av_stat_last_modified,av_stat_time_since_last_modified,pm2_5val_10m_avg,pm2_5val_30m_avg,pm2_5val_1h_avg,pm2_5val_6h_avg,pm2_5val_24h_avg,pm2_5val_1wk_avg,city,country,county,state,zipcode,City,ST,A/C,FIPS,County,T/Z,DST?,MSA,PMSA,Type,metrocode
0,0,357358,,inside,,False,24115,2nd South 12th East,1547065985,40.764907,-111.856653,0.15,,672791,CLV9HLXOGIYQNYD2,672792,WAZLM3J4Q9OHKNGE,PMS5003+PMS5003+BME280,15.0,0,869.14,89.0,1.547066e+12,80079.0,0.47,19.37,34.37,16.29,4.76,0.71,Salt Lake City,United States,Salt Lake County,Utah,84102,Salt Lake City,UT,801.0,49035.0,Salt Lake,MST,Y,7160.0,,,7160
1,1,357358,,,,False,24116,2nd South 12th East B,1547065988,40.764907,-111.856653,,24115.0,672793,UQJBDQ2XXPP73U45,672795,5G9B9E4XFL32S845,,15.0,0,869.16,89.0,,,,,,,,,Salt Lake City,United States,Salt Lake County,Utah,84102,Salt Lake City,UT,801.0,49035.0,Salt Lake,MST,Y,7160.0,,,7160
2,122,390462,,inside,,False,22845,115s 1100e slc;ut.,1545079733,40.766638,-111.859077,0.06,,656576,OBFA3BXTIOMTQFT9,656577,V7R2K8M07V7RDZ59,PMS5003+PMS5003+BME280,18.0,0,867.77,77.0,1.545080e+12,79299.0,20.38,30.45,21.90,4.92,1.29,0.19,Salt Lake City,United States,Salt Lake County,Utah,84102,Salt Lake City,UT,801.0,49035.0,Salt Lake,MST,Y,7160.0,,,7160
3,123,390462,,,,False,22846,115s 1100e slc;ut. B,1545079763,40.766638,-111.859077,,22845.0,656578,26J28OUCV6WFFHVU,656579,NYDECD3UY4TF4U62,,18.0,0,867.76,77.0,,,,,,,,,Salt Lake City,United States,Salt Lake County,Utah,84102,Salt Lake City,UT,801.0,49035.0,Salt Lake,MST,Y,7160.0,,,7160
4,592,25726,,inside,,False,23505,8th&8th,1566963933,40.751973,-111.868187,4.36,,660357,7KO16799H7RHQFI8,660358,4NH5ZIFKCCFPKUSH,PMS5003+PMS5003+BME280,20.0,0,871.56,93.0,1.566964e+12,119983.0,3.90,4.15,3.89,3.41,3.62,3.64,Salt Lake City,United States,Salt Lake County,Utah,84102,Salt Lake City,UT,801.0,49035.0,Salt Lake,MST,Y,7160.0,,,7160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14422,17521,178057,,,,False,12247,Zeughaushof 2 B,1557824059,47.377600,8.530420,8.50,12246.0,527112,6850DM8F8XT5DCW1,527113,EXINYPXBQGJ7V8P0,,22.0,0,979.26,68.0,1.557824e+12,77115.0,10.85,11.63,10.49,7.88,7.61,10.45,Zürich,Switzerland,Zürich,Zürich,8004,Atco,NJ,856.0,34007.0,Camden,EST,Y,,6160.0,,6160
14423,17524,321180,,outside,,False,11660,Zillmere,1549236669,-27.364016,153.033164,1.28,,511805,33QYYA4PHRX0MB7Y,511806,EU23TN1RRSTF9WXH,PMS5003+PMS5003+BME280,57.0,0,1011.03,86.0,1.549237e+12,80018.0,1.93,2.48,3.26,3.24,2.28,2.46,Zillmere,Australia,Brisbane City,Queensland,4034,Freeport,ME,207.0,23005.0,Cumberland,EST,Y,6400.0,,U,6400
14424,17525,321180,,,,False,11661,Zillmere B,1549236673,-27.364016,153.033164,1.87,11660.0,511807,7JDC0KWXY8NAZSD3,511808,T1BEQOBSIIZH8UCH,,57.0,0,1011.02,86.0,1.549237e+12,79850.0,2.14,2.70,3.47,3.46,2.45,2.52,Zillmere,Australia,Brisbane City,Queensland,4034,Freeport,ME,207.0,23005.0,Cumberland,EST,Y,6400.0,,U,6400
14425,17536,0,,outside,,False,22607,zombieland,1568507459,36.158086,-115.362767,4.28,,654347,O7NK1R67KFEOL0K9,654348,6T8TV55XJ5QUDATV,PMS5003+PMS5003+BME280,5.0,0,902.16,107.0,1.568507e+12,119986.0,3.93,3.88,3.87,3.99,4.33,3.36,Las Vegas,United States,Clark County,Nevada,89138,Las Vegas,NV,702.0,32003.0,Clark,PST,Y,4120.0,,,4120


In [11]:
# count the sensors in each metro region
MSA_counts = merged_df['metrocode'].value_counts()

In [12]:
# remove NaNs from the regional name lookup data
replace_col = []
for i in range(len(regions_df)):
    try:
        i = int(regions_df.loc[i]['Metro Area'])
    except:
        i = -1
    replace_col.append(i)
regions_df['Metro Area'] = replace_col

In [13]:
# output sorted list of all metro regions with the sensor numbers, in descending order

metro_name = ''
for MSA in MSA_counts.keys():
    population = 0
    if MSA == 0:
        metro_name = "No MSA region"
    else:
        metro_name = regions_df[regions_df['Metro Area']==MSA].Name.iloc[0]
    print(metro_name, MSA_counts[MSA])


No MSA region 2384
Los Angeles-Long Beach 1523
Oakland 838
San Francisco 801
Salt Lake City-Ogden 760
Riverside-San Bernardino 515
San Jose 480
Sacramento 396
Pittsburgh 343
Seattle-Bellevue-Everett 260
Orange County 234
Austin-San Marcos 229
Santa Rosa 216
Provo-Orem 198
Portland-Vancouver, OR-WA 188
Fresno 177
Washington, DC-MD-VA-WV 173
Chicago 169
Vallejo-Fairfield-Napa 158
Eugene-Springfield 148
San Luis Obispo-Atascadero-Paso Robles 145
New York 122
Boston, MA-NH 120
Raleigh-Durham-Chapel Hill 115
Houston 115
Grand Junction 105
Santa Barbara-Santa Maria-Lompoc 105
Minneapolis-St. Paul, MN-WI 101
San Diego 100
Indianapolis 96
Redding 94
Charlotte-Gastonia-Rock Hill, NC-SC 93
Denver 91
Santa Cruz-Watsonville 89
Baltimore 83
Yolo 76
Bakersfield 70
Phoenix-Mesa 68
Chico-Paradise 62
Boise City 60
Philadelphia, PA-NJ 60
Tacoma 58
Albuquerque 56
Fayetteville-Springdale-Rogers 52
New Orleans 50
Medford-Ashland 48
Detroit 46
Salinas 46
Ventura 44
Madison 41
Atlanta 40
Dallas 36
Las Vegas,

### Conclusions

LA-Long Beach has the most sensors but over a larger area than the Bay Area. Riverside is an extremely large area, making the number of sensors deceptive. Unfotunately, it's impossible to get geographic area data for these MSAa, but eyeballing it looks like Oakland + San Franciso would be a smaller area than LA but would have more sensors. We could also include San Jose and Santa Rosa for a larger Bay Area look, but that would add a lot of area for not many more sensors. The only other intriguing place is Salt Lake City, which could possibly be combined with Provo-Orem.