In [1]:
import plotly.graph_objects as go
import plotly
import plotly.io as pio
import pandas as pd
import numpy as np
import io
import os
import sys
import requests
import glob
import datetime
from itertools import accumulate
from pathlib import Path

In [2]:
# Richie Woo, Dosenet Intern 2020

In [3]:
# Naming note: in this notebook "pair" and "file" in variable names both refer to PurpleAir data.

def main(data_root='sensor-data', output_dir=None):
    """Compare each PurpleAir CSV with the DoseNet feed of the same location.

    Every sub-folder of ``data_root`` is treated as a DoseNet location name:
    the name is passed to ``get_url`` to download the DoseNet data, and every
    ``*.csv`` file inside the folder is read as a PurpleAir export.  For each
    file the two sources are aligned with ``fix_data``, the per-hour
    differences for PM 1, PM 2.5 and PM 10 are added as the columns
    ``pm1diff``, ``pm25diff`` and ``pm10diff``, and the result is written to
    ``<output_dir>/<folder>_<n>.csv``.

    Parameters
    ----------
    data_root : str or Path, optional
        Directory holding one sub-folder per location.
    output_dir : str or Path, optional
        Where the processed CSVs are written.  Defaults to
        ``python-scripts/processed_data`` relative to the current working
        directory; the directory is created if it does not exist.

    Notes
    -----
    The working directory is never changed, so the function can be run again
    in the same kernel.  Results are written to the location that is printed
    (not into the input folder), so a later run does not pick its own output
    up as PurpleAir input.
    """
    data_root = Path(data_root)
    if output_dir is None:
        # Built from parts so the separator is right on every OS (a literal
        # 'python-scripts\processed_data' only works on Windows).
        output_dir = Path('python-scripts') / 'processed_data'
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Only real, non-hidden directories are locations; stray files or
    # notebook checkpoint folders are skipped.
    folder_lists = sorted(
        entry.name
        for entry in data_root.iterdir()
        if entry.is_dir() and not entry.name.startswith('.')
    )
    print(folder_lists)

    for folder in folder_lists:
        print("")
        print("---------------")
        print(folder)
        # One download per location is enough; it is reused for every file.
        dosenet_data = get_url(folder).sort_values('deviceTime_utc')

        # Sorted so that the file numbering is the same on every run.
        pair_file_list = sorted((data_root / folder).glob('*.csv'))
        print('found ', len(pair_file_list), ' files')
        for n, file in enumerate(pair_file_list, start=1):
            print('file ', n, ':')
            file_data = pd.read_csv(file)

            # A copy keeps the shared download untouched whatever fix_data does.
            all_data = fix_data(dosenet_data.copy(), file_data)
            for pm in ('1', '25', '10'):
                all_data.loc[:, 'pm' + pm + 'diff'] = compare_data(all_data, pm)

            filestring = str(folder) + '_' + str(n) + '.csv'
            data_path = output_dir / filestring
            print('saving to:', str(data_path))
            all_data.to_csv(data_path, index=False)
            
    

In [4]:
def fix_data(dosenet_data, file_data):
    """Align DoseNet and PurpleAir readings on a shared hourly time base.

    Parameters
    ----------
    dosenet_data : pandas.DataFrame
        Must contain ``deviceTime_unix`` (seconds since the Unix epoch) and
        the value columns ``PM1``, ``PM25`` and ``PM10``.  Other columns are
        ignored.
    file_data : pandas.DataFrame
        Must contain ``created_at`` (strings such as
        ``'2020-06-01 00:00:00 UTC'``) and the value columns
        ``PM1.0_CF1_ug/m3``, ``PM10.0_CF1_ug/m3`` and ``PM2.5_ATM_ug/m3``.
        Other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per hour in which both sources have data, with the columns
        ``date_time``, ``PM1d``, ``PM25d``, ``PM10d`` (DoseNet) and ``PM1p``,
        ``PM10p``, ``PM25p`` (PurpleAir), each holding the hourly mean.

    Raises
    ------
    KeyError
        If a required column is missing from either input.
    ValueError
        If one of the sources runs out of samples while the common time
        window is being narrowed (for example when the time ranges do not
        overlap).
    """
    dosenet_cols = ['PM1', 'PM25', 'PM10']
    # NOTE(review): PM2.5 uses the ATM column while PM1 and PM10 use CF1; this
    # mirrors the original column choice -- confirm it is intentional.
    pair_cols = ['PM1.0_CF1_ug/m3', 'PM10.0_CF1_ug/m3', 'PM2.5_ATM_ug/m3']

    # Keep only what is needed instead of dropping a fixed list of unwanted
    # columns: extra or missing auxiliary columns (e.g. 'Unnamed: 10') no
    # longer matter.  Rows are ordered newest first.
    dosenet_data = (
        dosenet_data[['deviceTime_unix'] + dosenet_cols]
        .rename(columns={'deviceTime_unix': 'date_time'})
        .sort_values('date_time')
        .iloc[::-1]
        .reset_index(drop=True)
    )

    # PurpleAir timestamps are text; strip the ' UTC' suffix and convert the
    # whole column at once to whole seconds since the Unix epoch.
    created_at = pd.to_datetime(
        file_data['created_at'].str.replace(' UTC', '', regex=False),
        format='%Y-%m-%d %H:%M:%S',
    )
    unix_seconds = (created_at - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
    file_data = (
        file_data[pair_cols]
        .assign(date_time=unix_seconds)
        .sort_values('date_time')
        .iloc[::-1]
        .reset_index(drop=True)
    )

    # Narrow both sources to their common time window.  A pass can expose a
    # tighter window, because the samples kept from one source need not reach
    # the edges of the window just applied; exactly three passes are run.
    d_times = dosenet_data['date_time']
    f_times = file_data['date_time']
    for _ in range(3):
        if d_times.empty or f_times.empty:
            raise ValueError(
                'no samples left to compare: the DoseNet and PurpleAir data '
                'do not share a common time window'
            )
        fmin, fmax = compare_length(d_times, f_times)
        d_times = d_times[d_times.between(fmin, fmax)]
        f_times = f_times[f_times.between(fmin, fmax)]

    dosenet_data = dosenet_data.loc[d_times.index].reset_index(drop=True)
    file_data = file_data.loc[f_times.index].reset_index(drop=True)

    def hourly_mean(frame, value_cols):
        # Unix seconds -> timestamps, then the mean of each value column per hour.
        frame = frame.assign(date_time=pd.to_datetime(frame['date_time'], unit='s'))
        # Columns are selected with a list: selecting with a tuple is rejected
        # by current pandas.
        return frame.resample('h', on='date_time')[value_cols].mean().reset_index()

    # Inner join on the hour, then drop hours where any mean is missing.
    all_data = pd.merge(
        hourly_mean(dosenet_data, dosenet_cols),
        hourly_mean(file_data, pair_cols),
        on='date_time',
    ).dropna(axis=0)

    #Shows how many hour chunks are being compared, useful for seeing how much data is being processed.
    print('comparing ', len(all_data), ' hours')

    # Suffix 'd' marks DoseNet columns, 'p' marks PurpleAir columns.
    return all_data.rename(columns={
        'PM1': 'PM1d',
        'PM25': 'PM25d',
        'PM10': 'PM10d',
        'PM1.0_CF1_ug/m3': 'PM1p',
        'PM2.5_ATM_ug/m3': 'PM25p',
        'PM10.0_CF1_ug/m3': 'PM10p',
    })
    

In [5]:
def get_url(location_name, timeout=30):
    '''Download the DoseNet CSV for one location and return it as a DataFrame.

    Parameters
    ----------
    location_name : str
        File name (without ``.csv``) under
        ``https://radwatch.berkeley.edu/test/dosenet/``.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (e.g. 404 for an unknown
        location), instead of the error page being parsed as if it were CSV.
    requests.RequestException
        For connection problems and timeouts.
    '''
    url = 'https://radwatch.berkeley.edu/test/dosenet/' + location_name + '.csv'

    # Browser-like headers, sent with every request.
    header = {
      "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
      "X-Requested-With": "XMLHttpRequest"
    }

    response = requests.get(url, headers=header, timeout=timeout)
    response.raise_for_status()

    return pd.read_csv(io.StringIO(response.text))

In [6]:
def compare_data(all_data, pm):
    """Print a heading for one PM size and return its DoseNet-minus-PurpleAir differences.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Frame holding the ``PM<size>d`` and ``PM<size>p`` columns for the
        requested size.
    pm : str
        Particle size selector: ``'1'``, ``'25'`` (PM 2.5) or ``'10'``.

    Returns
    -------
    Whatever ``report_diff_avg`` returns for the selected pair of columns.

    Raises
    ------
    ValueError
        If ``pm`` is not one of the supported selectors.
    """
    # selector -> (DoseNet column, PurpleAir column, heading to print)
    columns_by_pm = {
        '1': ('PM1d', 'PM1p', 'PM 1:'),
        '25': ('PM25d', 'PM25p', 'PM 2.5:'),
        '10': ('PM10d', 'PM10p', 'PM 10:'),
    }
    if pm not in columns_by_pm:
        # Raising a plain string is itself a TypeError in Python 3; raise a
        # real exception that names the offending value.
        raise ValueError(
            "unsupported pm value {!r}; expected '1', '25' or '10'".format(pm)
        )

    pm_string_d, pm_string_p, heading = columns_by_pm[pm]
    print(heading)

    return report_diff_avg(all_data, pm_string_d, pm_string_p)

In [7]:
def compare_length(dataset1, dataset2):
    """Return the ``(fmin, fmax)`` bounds of the range shared by two datasets.

    ``fmin`` is the larger of the two minima and ``fmax`` the smaller of the
    two maxima, i.e. the window in which both datasets have values.  When the
    ranges do not overlap, ``fmin`` is greater than ``fmax``.

    Parameters
    ----------
    dataset1, dataset2 : iterable of comparable values
        Each is scanned more than once, so pass a sequence, not a one-shot
        iterator.

    Returns
    -------
    tuple
        ``(fmin, fmax)``.

    Raises
    ------
    ValueError
        If either dataset is empty (raised by ``max``/``min``).
    """
    fmin = max(min(dataset1), min(dataset2))
    fmax = min(max(dataset1), max(dataset2))
    return fmin, fmax

In [8]:
def report_diff_avg(all_data, pm_string_d, pm_string_p):
    """Subtract one column from another, print the mean gap, return the gaps.

    ``all_data[pm_string_d] - all_data[pm_string_p]`` is computed element by
    element; its mean is printed and the element-wise result is returned.
    """
    dosenet_minus_pair = all_data[pm_string_d] - all_data[pm_string_p]
    print(
        'Average difference between dosenet and pair data: ',
        np.mean(dosenet_minus_pair),
    )
    return dosenet_minus_pair

In [9]:
# Run the comparison for every sensor folder; progress is printed below.
main()

['chs_os_aq', 'etch_roof_aq', 'exploratorium_aq', 'hb_os_aq', 'miramonte_os_aq', 'pinewood_os_aq', 'uw_aq']

---------------
chs_os_aq
found  1  files
file  1 :
comparing  116  hours
PM 1:
Average difference between dosenet and pair data:  -3.453708923649621
PM 2.5:
Average difference between dosenet and pair data:  -4.644089673537596
PM 10:
Average difference between dosenet and pair data:  -4.731930214333527
saving to: python-scripts\processed_data\chs_os_aq_1.csv

---------------
etch_roof_aq
found  2  files
file  1 :
comparing  2820  hours
PM 1:
Average difference between dosenet and pair data:  -0.8806518885117048
PM 2.5:
Average difference between dosenet and pair data:  -1.4504320506474309
PM 10:
Average difference between dosenet and pair data:  -0.6884993573297779
saving to: python-scripts\processed_data\etch_roof_aq_1.csv
file  2 :
comparing  2153  hours
PM 1:
Average difference between dosenet and pair data:  -0.38137660285672076
PM 2.5:
Average difference between dosenet an

In [10]:
# Utility: uncomment to move the kernel's working directory up one level.
# os.chdir(os.path.dirname(os.getcwd()))

Notes:
- Daily averages
- PM 2.5: p (PurpleAir), d (DoseNet), diff (difference)
- Within 1 mile
