# Investigation of PurpleAir's data
[PurpleAir](http://www.purpleair.com) sells low-cost air quality sensors that feed data to [real-time maps of PM2.5 pollution](https://www.purpleair.com/map?#11/37.789/-122.2048).   
This data will be used for a UC Berkeley capstone project [summarized here](https://docs.google.com/document/d/1NjCpqNd7rDnD6VOExVktGtquRzs21hpwZ8HhLQpYLO8/edit).

### Libraries and installs

In [1]:
import pandas as pd
import pandas_profiling
import numpy as np
import json
import datetime, time
from dateutil import tz
import ast
from matplotlib import pyplot as plt 
import seaborn as sns
import gmplot

import boto3
import s3fs
from fastparquet import ParquetFile, write

import urllib3
import json

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas chained-assignment warnings - confirm that is intended.
warnings.filterwarnings('ignore')

# Let pandas display up to 500 columns before truncating wide DataFrames.
pd.set_option('display.max_columns', 500)

# Shared urllib3 connection pool; the download helpers below issue their GET requests through it.
https = urllib3.PoolManager()

### Data Folder Instructions

In [4]:
# Use this cell to specify the path of the data folder on your local machine.
# Use the variable 'datafolder' to specify the path.
# Comment out all the data paths except your own.
# PurpleAir data is assumed to be in a subfolder called 'purpleair'.
# For example, if the base data folder is '/users/data', PurpleAir data should be in '/users/data/purpleair'.

# Angshuman's local path
datafolder = "/Users/apaul2/Documents/_Common/capstone/Project/data"

### Helper Functions

In [3]:
def createHashKey(row, col1, col2):
    """Return the built-in hash of two fields of ``row`` joined with ``+``.

    Args:
        row: Any object indexable by ``col1`` and ``col2`` (for example a
            pandas row or a dict).
        col1: Key of the first field.
        col2: Key of the second field.

    Returns:
        ``hash(row[col1] + row[col2])``.

    NOTE: for ``str`` operands Python salts ``hash`` per interpreter process
    unless ``PYTHONHASHSEED`` is fixed, so the value is only stable within
    one session.
    """
    combined = row[col1] + row[col2]
    return hash(combined)

In [4]:
# Get data from sensor 2
def genTS2DF(sensordf, startday):
    """Download one day of ThingSpeak secondary-channel feeds for every sensor.

    Args:
        sensordf: DataFrame with one row per sensor. The columns read are
            'thingspeak_secondary_id', 'thingspeak_secondary_id_read_key'
            and 'sensorhash'.
        startday: Two-character day of month; it is inserted into the
            hard-coded "2019-09-{day}" start and end timestamps of the query.

    Returns:
        DataFrame with the columns 'created_at', '0_3um', '0_5um', '1_0um',
        '2_5um', '5_0um', '10_0um', 'pm1_0', 'pm10_0', 'sensorhash' plus a
        'created' key formatted as YYYYMMDDHHMM. Sensors whose request fails
        contribute no rows; the failure count is printed, never raised.
    """
    feed_columns = ['created_at', '0_3um', '0_5um', '1_0um', '2_5um', '5_0um', '10_0um', 'pm1_0', 'pm10_0']
    # Start from an empty, labelled frame so the result keeps its schema even
    # when every request fails.
    frames = [pd.DataFrame(columns=feed_columns + ['sensorhash'])]
    count, errCount = 0, 0

    for _, sensor in sensordf.iterrows():
        # The read key (argument 1) must be sent as api_key; the channel id
        # (argument 0) belongs only in the path.
        # NOTE(review): the backslash continuation keeps the next line's leading
        # spaces inside the URL, so "timezone" is probably not received as a
        # clean parameter. Left unchanged because the "...Z" timestamp parsing
        # below appears to rely on UTC responses - confirm before cleaning up.
        qrystr = "https://api.thingspeak.com/channels/{0}/feeds.json?api_key={1}&start=2019-09-{2}%2000:00:00&end=2019-09-{2}%2023:59:59& \
                    timezone=America/Los_Angeles&timescale=10".format(sensor['thingspeak_secondary_id'], sensor['thingspeak_secondary_id_read_key'], startday)
        count += 1
        try:
            r = https.request('GET', qrystr)
            if r.status != 200:
                # A non-200 reply yields no data, so report it as a failure
                # instead of skipping it silently.
                errCount += 1
                continue
            j = json.loads(r.data.decode('utf-8'))
            df = pd.DataFrame(j['feeds'])
            # Positional rename: assumes each feed entry has exactly these nine
            # fields in this order - a mismatch raises and is counted below.
            df.columns = feed_columns
            df['sensorhash'] = sensor['sensorhash']
        except Exception:
            # Deliberate best effort: one bad sensor must not abort the whole day.
            errCount += 1
            continue
        frames.append(df)
    print("For {}, Of the {} requests, {} errored out.".format(startday, count, errCount))

    # Concatenate once at the end; concatenating inside the loop re-copies all
    # previously collected rows on every iteration.
    ts_s_df = pd.concat(frames, ignore_index=True)

    # Add a key column based on time
    # This along with the sensorhash column will be used to join the two sensor datasets
    ts_s_df['created'] = ts_s_df['created_at'].apply(lambda x: datetime.datetime.strptime(x,"%Y-%m-%dT%H:%M:%SZ").strftime("%Y%m%d%H%M"))

    return ts_s_df

In [5]:
# Get data from sensor 1
def genTS1DF(sensordf, startday):
    """Download one day of ThingSpeak primary-channel feeds for every sensor.

    Args:
        sensordf: DataFrame with one row per sensor. The columns read are
            'thingspeak_primary_id', 'thingspeak_primary_id_read_key' and
            'sensorhash'.
        startday: Two-character day of month; it is inserted into the
            hard-coded "2019-09-{day}" start and end timestamps of the query.

    Returns:
        DataFrame with the columns 'created_at', 'pm1_0_atm', 'pm2_5_atm',
        'pm10_0_atm', 'uptime', 'rssi', 'temperature', 'humidity',
        'pm2_5_cf_1', 'sensorhash' plus a 'created' key formatted as
        YYYYMMDDHHMM. Sensors whose request fails contribute no rows; the
        failure count is printed, never raised.
    """
    feed_columns = ['created_at', 'pm1_0_atm', 'pm2_5_atm', 'pm10_0_atm', 'uptime', 'rssi', 'temperature', 'humidity', 'pm2_5_cf_1']
    # Start from an empty, labelled frame so the result keeps its schema even
    # when every request fails.
    frames = [pd.DataFrame(columns=feed_columns + ['sensorhash'])]
    count, errCount = 0, 0

    for _, sensor in sensordf.iterrows():
        # NOTE(review): the backslash continuation keeps the next line's leading
        # spaces inside the URL, so "timezone" is probably not received as a
        # clean parameter. Left unchanged because the "...Z" timestamp parsing
        # below appears to rely on UTC responses - confirm before cleaning up.
        qrystr = "https://api.thingspeak.com/channels/{0}/feeds.json?api_key={1}&start=2019-09-{2}%2000:00:00&end=2019-09-{2}%2023:59:59& \
                    timezone=America/Los_Angeles&timescale=10".format(sensor['thingspeak_primary_id'], sensor['thingspeak_primary_id_read_key'], startday)
        count += 1
        try:
            r = https.request('GET', qrystr)
            if r.status != 200:
                # A non-200 reply yields no data, so report it as a failure
                # instead of skipping it silently.
                errCount += 1
                continue
            j = json.loads(r.data.decode('utf-8'))
            df = pd.DataFrame(j['feeds'])
            # Positional rename: assumes each feed entry has exactly these nine
            # fields in this order - a mismatch raises and is counted below.
            df.columns = feed_columns
            df['sensorhash'] = sensor['sensorhash']
        except Exception:
            # Deliberate best effort: one bad sensor must not abort the whole day.
            errCount += 1
            continue
        frames.append(df)
    print("Of the {} requests, {} errored out.".format(count, errCount))

    # Concatenate once at the end; concatenating inside the loop re-copies all
    # previously collected rows on every iteration.
    ts_p_df = pd.concat(frames, ignore_index=True)

    # Add a key column based on time
    # This along with the sensorhash column will be used to join the two sensor datasets
    ts_p_df['created'] = ts_p_df['created_at'].apply(lambda x: datetime.datetime.strptime(x,"%Y-%m-%dT%H:%M:%SZ").strftime("%Y%m%d%H%M"))

    return ts_p_df

In [6]:
# Days of September 2019 to process; each needs a matching
# purpleair/dailyfiltered/201909DD.parquet file under 'datafolder'.
days_list = ['15','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30']
# days_list = ['01','02','03','04','05','06','07','08','09','10','11','12','13','14']
# bayarea_purple_df = pd.read_parquet("{}/purpleair/dailyfiltered/20190914.parquet".format(datafolder))

for day in days_list:
    bayarea_purple_df = pd.read_parquet("{}/purpleair/dailyfiltered/201909{}.parquet".format(datafolder, day))

    # One row per sensor with its ThingSpeak channel ids and read keys.
    # The calls are chained so a new frame is built, rather than mutating a
    # column slice of bayarea_purple_df in place.
    bay_pa_thingspeak_df = (
        bayarea_purple_df[['sensorhash', 'thingspeak_primary_id','thingspeak_primary_id_read_key',
                           'thingspeak_secondary_id','thingspeak_secondary_id_read_key']]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    ts_s_df = genTS2DF(bay_pa_thingspeak_df, day)
    ts_p_df = genTS1DF(bay_pa_thingspeak_df, day)
    # Merge data from the two sensors
    # Only keep records having particle data: the left join keeps every
    # secondary-channel row and attaches primary-channel values where the
    # (sensorhash, created) key matches.
    bay_ts_df = pd.merge(ts_s_df, ts_p_df,  how='left', left_on=['sensorhash','created'], right_on=['sensorhash','created'])
    # Both inputs carry 'created_at'; drop the primary-channel copy.
    bay_ts_df = bay_ts_df.drop(columns=['created_at_y'])

    # Write to file
    parquet_file = "{}/thingspeak/thingspeak_sep{}.parquet".format(datafolder, day)
    write(parquet_file, bay_ts_df,compression='GZIP')

For 01, Of the 593 requests, 102 errored out.
Of the 593 requests, 36 errored out.
For 02, Of the 593 requests, 102 errored out.
Of the 593 requests, 37 errored out.
For 03, Of the 593 requests, 103 errored out.
Of the 593 requests, 37 errored out.
For 04, Of the 593 requests, 101 errored out.
Of the 593 requests, 34 errored out.
For 05, Of the 593 requests, 98 errored out.
Of the 593 requests, 30 errored out.
For 06, Of the 593 requests, 97 errored out.
Of the 593 requests, 29 errored out.
For 07, Of the 593 requests, 93 errored out.
Of the 593 requests, 26 errored out.
For 08, Of the 593 requests, 91 errored out.
Of the 593 requests, 21 errored out.
For 09, Of the 593 requests, 89 errored out.
Of the 593 requests, 20 errored out.
For 10, Of the 593 requests, 84 errored out.
Of the 593 requests, 15 errored out.
For 11, Of the 593 requests, 85 errored out.
Of the 593 requests, 16 errored out.
For 12, Of the 593 requests, 79 errored out.
Of the 593 requests, 10 errored out.
For 13, Of t

In [124]:
# Reload the September 30 file (same path pattern the download loop writes) to spot-check it.
tst = pd.read_parquet("{}/thingspeak/thingspeak_sep30.parquet".format(datafolder))

In [125]:
# List the distinct 'created' time keys (YYYYMMDDHHMM strings) present in the file.
tst.created.unique()

array(['201909300000', '201909300010', '201909300020', '201909300030',
       '201909300040', '201909300050', '201909300100', '201909300110',
       '201909300120', '201909300130', '201909300140', '201909300150',
       '201909300200', '201909300210', '201909300220', '201909300230',
       '201909300240', '201909300250', '201909300300', '201909300310',
       '201909300320', '201909300330', '201909300340', '201909300350',
       '201909300400', '201909300410', '201909300420', '201909300430',
       '201909300440', '201909300450', '201909300500', '201909300510',
       '201909300520', '201909300530', '201909300540', '201909300550',
       '201909300600', '201909300610', '201909300620', '201909300630',
       '201909300640', '201909300650', '201909300700', '201909300710',
       '201909300720', '201909300730', '201909300740', '201909300750',
       '201909300800', '201909300810', '201909300820', '201909300830',
       '201909300840', '201909300850', '201909300900', '201909300910',
      