# Investigation of PurpleAir's data
[PurpleAir](http://www.purpleair.com) sells low-cost air quality sensors that feed data to [real-time maps of PM2.5 pollution](https://www.purpleair.com/map?#11/37.789/-122.2048).   
This data will be used for a UC Berkeley capstone project [summarized here](https://docs.google.com/document/d/1NjCpqNd7rDnD6VOExVktGtquRzs21hpwZ8HhLQpYLO8/edit).

### Libraries and installs

In [1]:
import pandas as pd
import pandas_profiling
import numpy as np
import json
import datetime, time
from dateutil import tz
import ast
from matplotlib import pyplot as plt 
import seaborn as sns
import gmplot

import boto3
import s3fs
from fastparquet import ParquetFile, write

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. chained-assignment) raised by later cells.
warnings.filterwarnings('ignore')

# Allow pandas to display up to 500 columns so wide dataframes are not truncated.
pd.set_option('display.max_columns', 500)

### Data Folder Instructions

In [2]:
# Use this cell to specify the path of the data folder on your local machine.
# Set the variable 'datafolder' to that path.
# Comment out all the data paths except your own.
# PurpleAir data is assumed to be in a subfolder called 'purpleair'.
# For example, if the base data folder is '/users/data', PurpleAir data should be in '/users/data/purpleair'.

# Angshuman's local path
datafolder = "/Users/apaul2/Documents/_Common/capstone/Project/data"

### Get data for one full day

This data is pulled from the S3 bucket (`midscapstone-whos-polluting-my-air`) where we store all the PurpleAir data files, in a folder called 'PurpleAir'. The cells below read the parquet files from the local `datafolder` path.

In [5]:
# Load the stored address lookup (city, county, state, zipcode, ... keyed by lat/lon)
# from the 'purpleair' subfolder of the data folder, then preview the first rows.
address_path = "{}/purpleair/address_latlon.parquet".format(datafolder)
address_df = pd.read_parquet(address_path)
address_df.head()

Unnamed: 0,city,country,county,lat,lon,state,zipcode
0,Kardia,Greece,,40.465755,22.992308,,575 00
1,Thessaloniki,Greece,,40.633926,22.956742,,546 36
2,Thessaloniki,Greece,,40.597275,22.954437,,546 46
3,Thessaloniki,Greece,,40.633927,22.939293,,546 24
4,Keizer,United States,Marion County,45.017528,-123.016639,Oregon,97303


In [20]:
# Build one merged PurpleAir + ThingSpeak dataset per day and save it as parquet.
# For every day of September 2019 between FIRST_DAY and LAST_DAY (inclusive) the loop:
#   1. reads that day's filtered PurpleAir file and ThingSpeak file,
#   2. converts the ThingSpeak measurement columns to numeric,
#   3. left-joins sensor attributes, per-timestamp PurpleAir fields and addresses
#      onto the ThingSpeak rows (so only rows present in the ThingSpeak data are kept),
#   4. derives formatted timestamp columns and writes the result under 'pa_ts'.
# Relies on `datafolder` and `address_df` defined in earlier cells.
FIRST_DAY, LAST_DAY = 14, 30

# Per-sensor attributes taken from the PurpleAir file (one row per sensor after de-duplication).
SENSOR_COLUMNS = ['device_loc_typ', 'is_owner', 'sensor_id', 'sensor_name', 'parent_id', 'lat', 'lon',
                  'thingspeak_primary_id', 'thingspeak_primary_id_read_key', 'thingspeak_secondary_id',
                  'thingspeak_secondary_id_read_key', 'sensorhash']

# Per-timestamp PurpleAir fields, joined on (sensorhash, datetime).
READING_COLUMNS = ['a_h', 'high_reading_flag', 'hidden', 'datetime', 'sensorhash']

# ThingSpeak columns that must end up with a numeric dtype.
NUMERIC_COLUMNS = ['0_3um', '0_5um', '1_0um', '2_5um', '5_0um', '10_0um', 'pm1_0', 'pm10_0', 'created',
                   'pm1_0_atm', 'pm2_5_atm', 'pm10_0_atm', 'uptime', 'rssi', 'temperature', 'humidity',
                   'pm2_5_cf_1']

# Format of the raw 'created_at_x' strings and the columns derived from them.
SOURCE_TS_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
DERIVED_TS_COLUMNS = [('created_at', "%Y/%m/%dT%H:%M"),
                      ('year', "%Y"),
                      ('month', "%m"),
                      ('day', "%d"),
                      ('hour', "%H"),
                      ('minute', "%M")]

# Columns stored as strings in the final dataframe.
STRING_COLUMNS = ['high_reading_flag', 'sensor_id', 'parent_id', 'is_owner']

for day in range(FIRST_DAY, LAST_DAY + 1):
    # Zero-pad the day so single-digit days map to '01'..'09' file names
    # (identical to the unpadded value for days >= 10).
    day_tag = "{:02d}".format(day)

    bay_purple_df = pd.read_parquet("{}/purpleair/dailyfiltered/201909{}.parquet".format(datafolder, day_tag))

    # drop_duplicates() returns a new frame, so the column selections are never mutated in place
    bay_purple_latlon_df = bay_purple_df[SENSOR_COLUMNS].drop_duplicates()
    bay_purple_data_df = bay_purple_df[READING_COLUMNS].drop_duplicates()

    bay_ts_df = pd.read_parquet("{}/thingspeak/thingspeak_sep{}.parquet".format(datafolder, day_tag))

    # Some numeric columns may have "nan" as a string - convert these values to np.nan
    # so that the data type of these columns are correctly identified
    bay_ts_df[NUMERIC_COLUMNS] = (
        bay_ts_df[NUMERIC_COLUMNS]
        .replace("nan", np.nan, regex=True)
        .apply(pd.to_numeric)
    )

    # Merge purple air data with sensor data
    # Only keep records having particle data (left joins keep every ThingSpeak row)
    bay_ts_df = pd.merge(bay_ts_df, bay_purple_latlon_df, how='left', on='sensorhash')
    bay_ts_df = pd.merge(bay_ts_df, bay_purple_data_df, how='left',
                         left_on=['sensorhash', 'created'], right_on=['sensorhash', 'datetime'])

    # Join address dataframe with main dataframe
    bay_ts_df = pd.merge(bay_ts_df, address_df, how='left', on=['lat', 'lon'])

    # Parse each raw timestamp once, then format it into every derived column
    parsed_ts = bay_ts_df['created_at_x'].apply(
        lambda raw: datetime.datetime.strptime(raw, SOURCE_TS_FORMAT))
    for column, fmt in DERIVED_TS_COLUMNS:
        bay_ts_df[column] = parsed_ts.apply(lambda moment: moment.strftime(fmt))

    # Drop unwanted columns
    bay_ts_df = bay_ts_df.drop(columns=['created_at_x', 'sensorhash', 'datetime', 'country', 'state'])

    # Convert data type of attributes to string
    bay_ts_df[STRING_COLUMNS] = bay_ts_df[STRING_COLUMNS].astype(str)

    # Save final dataframe for future use
    parquet_file = "{}/pa_ts/201909{}.parquet".format(datafolder, day_tag)
    write(parquet_file, bay_ts_df, compression='GZIP')

In [164]:
# Reload the merged file saved for 30 Sep 2019 from the 'pa_ts' folder to inspect it.
tst = pd.read_parquet("{}/pa_ts/20190930.parquet".format(datafolder))

In [165]:
# List the distinct values of the 'created' column in the reloaded file.
tst['created'].unique()

array([201909300000, 201909300010, 201909300020, 201909300030,
       201909300040, 201909300050, 201909300100, 201909300110,
       201909300120, 201909300130, 201909300140, 201909300150,
       201909300200, 201909300210, 201909300220, 201909300230,
       201909300240, 201909300250, 201909300300, 201909300310,
       201909300320, 201909300330, 201909300340, 201909300350,
       201909300400, 201909300410, 201909300420, 201909300430,
       201909300440, 201909300450, 201909300500, 201909300510,
       201909300520, 201909300530, 201909300540, 201909300550,
       201909300600, 201909300610, 201909300620, 201909300630,
       201909300640, 201909300650, 201909300700, 201909300710,
       201909300720, 201909300730, 201909300740, 201909300750,
       201909300800, 201909300810, 201909300820, 201909300830,
       201909300840, 201909300850, 201909300900, 201909300910,
       201909300920, 201909300930, 201909300940, 201909300950,
       201909301000, 201909301010, 201909301020, 201909