# Investigation of PurpleAir's data
[PurpleAir](http://www.purpleair.com) sells low-cost air quality sensors that feed data to [real-time maps of PM2.5 pollution](https://www.purpleair.com/map?#11/37.789/-122.2048).   
This data will be used for a UC Berkeley capstone project [summarized here](https://docs.google.com/document/d/1NjCpqNd7rDnD6VOExVktGtquRzs21hpwZ8HhLQpYLO8/edit).

### Libraries and installs

In [1]:
import pandas as pd
import pandas_profiling
import numpy as np
import json
import datetime, time
from dateutil import tz
import ast
from matplotlib import pyplot as plt 
import seaborn as sns
import gmplot
from math import floor

import boto3
import s3fs
from fastparquet import ParquetFile, write

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 500)

### Data Folder Instructions

In [2]:
# Use this cell to specify the paths for the data folder in your local machines
# Use the variable 'datafolder' to specify the path
# Comment out all the data paths except your own
# Purple Air data ia assumed to be in a subfolder called 'purpleair' 
# For example, if the base data folder is '/users/data', purpleair data should be in '/users/data/purpleair'

# Angshuman's local path
datafolder = "/Users/apaul2/Documents/_Common/capstone/Project/data"

### Helper Functions

In [4]:
def createHashKey(row, col1, col2):
    
    str_col1 = row[col1]
    
    str_col2 = row[col2]
        
    return hash(str_col1 + str_col2)

### Consolidate s3 files into daily raw data files

This data is being pulled from the s3 bucket (midscapstone-whos-polluting-my-air) where we are storing all the purple air data files. The data files are being stored in a folder called 'PurpleAir' in this bucket.

In [14]:
s3 = s3fs.S3FileSystem()
myopen = s3.open

s3 = boto3.resource('s3')
bucket = s3.Bucket('midscapstone-whos-polluting-my-air')
objs = bucket.objects.filter(Prefix='PurpleAir/2019')

purple_df = pd.DataFrame(columns=['mapVersion', 'baseVersion', 'mapVersionString', 'results'])

for obj in objs:
    file_name = int(obj.key.replace('PurpleAir/2019','').replace('.parquet',''))
    if file_name >= 9300659 and file_name < 10010659:
        pf=ParquetFile('midscapstone-whos-polluting-my-air/{}'.format(obj.key), open_with=myopen)
        df=pf.to_pandas()
        purple_df = pd.concat([purple_df,df],ignore_index=True)

In [24]:
try:
    purple_df = pd.DataFrame.from_records(purple_df.results)
except:
    purple_df['results'] =  purple_df['results'].map(lambda d : ast.literal_eval(d))
    purple_df = pd.DataFrame.from_records(purple_df.results)

In [18]:
# purple_df['Stats'] = purple_df['Stats'].str.replace('15"v3":','15,"v3":')

In [25]:
# split the dict in the 'Stats' column into separate columns
purple_df['Stats'] = purple_df['Stats'].replace(np.nan, '{}', regex=True)
purple_df['Stats'] =  purple_df['Stats'].map(lambda d : ast.literal_eval(d))
purple_df = purple_df.join(pd.DataFrame(purple_df["Stats"].to_dict()).T)
purple_df.drop(['Stats', 'pm','v'], axis=1, inplace=True)   # 'pm' and 'v' are the same as 'PM2_5Value'

In [10]:
purple_df = purple_df[['AGE', 'A_H', 'DEVICE_LOCATIONTYPE', 'Flag', 'Hidden', 'ID', 'Label',
       'LastSeen', 'Lat', 'Lon', 'PM2_5Value', 'ParentID',
       'THINGSPEAK_PRIMARY_ID', 'THINGSPEAK_PRIMARY_ID_READ_KEY',
       'THINGSPEAK_SECONDARY_ID', 'THINGSPEAK_SECONDARY_ID_READ_KEY', 'Type',
       'humidity', 'isOwner', 'pressure', 'temp_f', 'lastModified',
       'timeSinceModified', 'v1', 'v2', 'v3', 'v4', 'v5',
       'v6']]

In [15]:
purple_df.columns

Index(['AGE', 'A_H', 'DEVICE_LOCATIONTYPE', 'Flag', 'Hidden', 'ID', 'Label',
       'LastSeen', 'Lat', 'Lon', 'PM2_5Value', 'ParentID',
       'THINGSPEAK_PRIMARY_ID', 'THINGSPEAK_PRIMARY_ID_READ_KEY',
       'THINGSPEAK_SECONDARY_ID', 'THINGSPEAK_SECONDARY_ID_READ_KEY', 'Type',
       'humidity', 'isOwner', 'pressure', 'temp_f', 'lastModified',
       'timeSinceModified', 'v1', 'v2', 'v3', 'v4', 'v5', 'v6'],
      dtype='object')

In [21]:
# purple_df.drop(['Ozone1','Voc'], axis=1, inplace=True)

In [26]:
# rename columns to something easily understandable
purple_df.columns = ['age','a_h','device_loc_typ','high_reading_flag', 'hidden','sensor_id','sensor_name','last_seen','lat','lon',
                      'pm2_5val','parent_id','thingspeak_primary_id','thingspeak_primary_id_read_key','thingspeak_secondary_id',
                      'thingspeak_secondary_id_read_key','sensor_type','humidity','is_owner','pressure','temp_f','av_stat_last_modified',
                      'av_stat_time_since_last_modified','pm2_5val_10m_avg','pm2_5val_30m_avg','pm2_5val_1h_avg','pm2_5val_6h_avg',
                      'pm2_5val_24h_avg','pm2_5val_1wk_avg']

In [27]:
# Write to file
parquet_file = "{}/purpleair/dailyraw/20190916.parquet".format(datafolder)
write(parquet_file, purple_df,compression='GZIP')

### Get only required columns from raw data

In [127]:
purple_df.head()

Unnamed: 0.1,Unnamed: 0,baseVersion,mapVersion,mapVersionString,results
0,0,6,0.88,,"{'ID': 24115, 'Label': ' 2nd South 12th East',..."
1,1,6,0.88,,"{'ID': 24116, 'ParentID': 24115, 'Label': ' 2n..."
2,2,6,0.88,,"{'ID': 27699, 'Label': ' CHA1', 'DEVICE_LOCATI..."
3,3,6,0.88,,"{'ID': 27700, 'ParentID': 27699, 'Label': ' CH..."
4,4,6,0.88,,"{'ID': 16791, 'Label': ' DW0435', 'DEVICE_LOCA..."


In [28]:
for i in range(14,17):
    purple_df = pd.read_parquet("{}/purpleair/dailyraw/201909{}.parquet".format(datafolder,i))
    # Drop unwanted columns
    purple_df.drop(['age','av_stat_last_modified', 'av_stat_time_since_last_modified','pm2_5val_10m_avg', 'pm2_5val_30m_avg', 'pm2_5val_1h_avg',
           'pm2_5val_6h_avg', 'pm2_5val_24h_avg', 'pm2_5val_1wk_avg', 'pm2_5val','humidity','pressure','temp_f','sensor_type'], axis=1, inplace=True)
    # There may be duplicates in sensor data in case no new readings we obtained since the last refresh
    purple_df.drop_duplicates(inplace=True)
    
    bayarea_purple_df = purple_df[(purple_df.lat > 37.701933) & (purple_df.lat < 38.008050) 
                              & (purple_df.lon > -122.536985) & (purple_df.lon < -122.186437)]
    bayarea_purple_df.reset_index(inplace=True, drop=True)
    
    # Get date and time columns in local timezone
    bayarea_purple_df['year'] = bayarea_purple_df['last_seen'].apply(lambda x: datetime.datetime.fromtimestamp(x).replace(tzinfo=tz.tzutc()).astimezone(tz.tzlocal()).strftime("%Y"))
    bayarea_purple_df['month'] = bayarea_purple_df['last_seen'].apply(lambda x: datetime.datetime.fromtimestamp(x).replace(tzinfo=tz.tzutc()).astimezone(tz.tzlocal()).strftime("%m"))
    bayarea_purple_df['day'] = bayarea_purple_df['last_seen'].apply(lambda x: datetime.datetime.fromtimestamp(x).replace(tzinfo=tz.tzutc()).astimezone(tz.tzlocal()).strftime("%d"))
    bayarea_purple_df['hour'] = bayarea_purple_df['last_seen'].apply(lambda x: datetime.datetime.fromtimestamp(x).replace(tzinfo=tz.tzutc()).astimezone(tz.tzlocal()).strftime("%H"))
    bayarea_purple_df['minute'] = bayarea_purple_df['last_seen'].apply(lambda x: datetime.datetime.fromtimestamp(x).replace(tzinfo=tz.tzutc()).astimezone(tz.tzlocal()).strftime("%M"))
    bayarea_purple_df['10min'] = bayarea_purple_df['minute'].apply(lambda x: "{:02}".format(10 * floor(int(x)/10)))
    
    bayarea_purple_df['datetime'] = bayarea_purple_df[['year', 'month','day','hour','10min']].apply(lambda x: int(''.join(x)), axis=1)
    
    # Drop unwanted columns from purple air data
    bayarea_purple_df.drop(['last_seen', 'hour', 'minute', '10min'], axis = 1, inplace=True)
    bayarea_purple_df.drop_duplicates(inplace=True)
    
    # Write to file
    parquet_file = "{}/purpleair/dailyfull/201909{}.parquet".format(datafolder,i)
    write(parquet_file, bayarea_purple_df,compression='GZIP')
    
    bayarea_purple_dly_df =bayarea_purple_df[(bayarea_purple_df.year == '2019') & (bayarea_purple_df.month == '09') & (bayarea_purple_df.day == str(i))]
    
    # Drop unwanted columns from purple air data
    bayarea_purple_dly_df.drop(['year', 'month', 'day'], axis = 1, inplace=True)
    bayarea_purple_dly_df.drop_duplicates(inplace=True)
    
    # Add hash column based on the primary and secondary keys
    bayarea_purple_dly_df['sensorhash'] = bayarea_purple_dly_df.apply (lambda row: createHashKey(row,'thingspeak_primary_id_read_key',
                                                                                                    'thingspeak_secondary_id_read_key'), axis=1)
    # Write to file
    parquet_file = "{}/purpleair/dailyfiltered/201909{}.parquet".format(datafolder,i)
    write(parquet_file, bayarea_purple_dly_df,compression='GZIP')

In [124]:
bayarea_purple_dly_df.sensor_id.count(), bayarea_purple_dly_df.lat.nunique(), bayarea_purple_dly_df.lon.nunique(), bayarea_purple_dly_df.sensorhash.nunique()

(58896, 294, 294, 587)

In [3]:
tst = pd.read_parquet("{}/purpleair/dailyfiltered/20190930.parquet".format(datafolder))

In [5]:
tst.datetime.unique()

array([201909300000, 201909300010, 201909300020, 201909300030,
       201909300040, 201909300050, 201909300100, 201909300110,
       201909300120, 201909300130, 201909300140, 201909300150,
       201909300200, 201909300210, 201909300220, 201909300230,
       201909300240, 201909300250, 201909300300, 201909300310,
       201909300320, 201909300330, 201909300340, 201909300350,
       201909300400, 201909300410, 201909300420, 201909300430,
       201909300440, 201909300450, 201909300500, 201909300510,
       201909300520, 201909300530, 201909300540, 201909300550,
       201909300600, 201909300610, 201909300620, 201909300630,
       201909300640, 201909300650, 201909300700, 201909300710,
       201909300720, 201909300730, 201909300740, 201909300750,
       201909300800, 201909300810, 201909300820, 201909300830,
       201909300840, 201909300850, 201909300900, 201909300910,
       201909300920, 201909300930, 201909300940, 201909300950,
       201909301000, 201909301010, 201909301020, 201909