In [1]:
#------------------------------------#
#------------- packages--------------#
#------------------------------------#

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import ruptures as rpt

import datetime as dt
import pytz
import os

In [2]:
pids = pd.read_csv('StudyParticipants.csv')

aq_ll = []
aq_off = []

for r,d,f in os.walk("/Users/beatriceli/Documents/PhD_Research/GitHub/well-being/lll_awair/linklab"):
    for file in f:
        if file.endswith("awair.csv"):
            temp = pd.read_csv(os.path.join(r,file))
            # drop device_id
            temp = temp.drop(columns=["device_id"])
            # rename time to timestamp
            temp = temp.rename(columns={"time":"timestamp"})
            # convert to datetime
            temp["timestamp"] = pd.to_datetime(temp["timestamp"])
            aq_ll.append(temp)

for r,d,f in os.walk("/Users/beatriceli/Documents/PhD_Research/GitHub/well-being/lll_awair/offgrounds"):
    for file in f:
        if file.endswith("awair.csv"):
            temp = pd.read_csv(os.path.join(r,file))
            # drop device_uuid
            temp = temp.drop(columns=["device_uuid"])
            # read in timestamp that is in UTC as Eastern datetime
            temp["timestamp"] = pd.to_datetime(temp["timestamp"])
            # convert utc to eastern
            temp["timestamp"] = temp["timestamp"].dt.tz_convert('US/Eastern')
            aq_off.append(temp)

aq_off = pd.concat(aq_off)
aq_ll = pd.concat(aq_ll)

# # concat off and ll
aq = pd.concat([aq_ll,aq_off])

# merge Work column from pids with aq on ParticipantID
aq = aq.merge(pids[["ParticipantID","Work"]],on="ParticipantID",how="left")
# drop score, temp, humid, voc, lum
aq = aq.drop(columns=["score","temp","humid","lum"])
aq['weekday'] = aq['timestamp'].apply(lambda x: x.weekday())
# map 0=Monday, 1=Tuesday, 2=Wednesday, 3=Thursday, 4=Friday, 5=Saturday, 6=Sunday
aq['weekday'] = aq['weekday'].map({0:"Monday",1:"Tuesday",2:"Wednesday",3:"Thursday",4:"Friday",5:"Saturday",6:"Sunday"})
# cast weekend to string where 1 = weekend and 0 = weekday
aq["weekend"] = aq["weekday"].apply(lambda x: "weekend" if x in ["Saturday","Sunday"] else "weekday")
# reorder columns
aq = aq[["ParticipantID","Work","weekday","weekend","timestamp","co2","noise","voc"]]
aq.head()

Unnamed: 0,ParticipantID,Work,weekday,weekend,timestamp,co2,noise,voc
0,oyb,Link Lab,Monday,weekday,2022-05-16 22:59:04-04:00,551.0,53.9,20.0
1,oyb,Link Lab,Monday,weekday,2022-05-16 22:59:14-04:00,551.0,53.9,22.0
2,oyb,Link Lab,Monday,weekday,2022-05-16 22:59:24-04:00,550.0,53.9,26.0
3,oyb,Link Lab,Monday,weekday,2022-05-16 22:59:34-04:00,550.0,53.8,24.0
4,oyb,Link Lab,Monday,weekday,2022-05-16 22:59:44-04:00,551.0,53.9,26.0


In [3]:
# group by ParticipantID and weekend and calculate mean and std for co2 and noise
aq_wk = aq.groupby(["ParticipantID","Work","weekend"]).agg({"co2":["mean","std"],"noise":["mean","std"], "voc":["mean","std"]})
# flatten column names
aq_wk.columns = [f"{col[0]}_{col[1]}" for col in aq_wk.columns]
aq_wk = aq_wk.reset_index()

aq_wk.head()

Unnamed: 0,ParticipantID,Work,weekend,co2_mean,co2_std,noise_mean,noise_std,voc_mean,voc_std
0,edr,Link Lab,weekday,529.039306,162.175311,52.424589,1.785004,180.870741,104.323242
1,edr,Link Lab,weekend,458.449361,45.165719,51.911907,0.941343,173.140324,94.495806
2,egl,Link Lab,weekday,529.039306,162.175311,52.424589,1.785004,180.870741,104.323242
3,egl,Link Lab,weekend,458.449361,45.165719,51.911907,0.941343,173.140324,94.495806
4,h9u,Off Grounds,weekday,783.061519,253.044044,54.318073,2.75865,208.340938,82.628711


In [4]:
# convert to dictionary
aq_dict = {}

for _, row in aq_wk.iterrows():
    participant_id = row["ParticipantID"]
    weekend = row["weekend"]

    if participant_id not in aq_dict:
        aq_dict[participant_id] = {}

    if weekend not in aq_dict[participant_id]:
        aq_dict[participant_id][weekend] = {}

    aq_dict[participant_id][weekend]["co2_mean"] = row["co2_mean"]
    aq_dict[participant_id][weekend]["co2_std"] = row["co2_std"]
    aq_dict[participant_id][weekend]["noise_mean"] = row["noise_mean"]
    aq_dict[participant_id][weekend]["noise_std"] = row["noise_std"]
    aq_dict[participant_id][weekend]["voc_mean"] = row["voc_mean"]
    aq_dict[participant_id][weekend]["voc_std"] = row["voc_std"]

# first element 
list(aq_dict.items())[0]

('edr',
 {'weekday': {'co2_mean': 529.0393063415204,
   'co2_std': 162.17531080484113,
   'noise_mean': 52.424589434688755,
   'noise_std': 1.785003720422536,
   'voc_mean': 180.87074120529724,
   'voc_std': 104.32324197647927},
  'weekend': {'co2_mean': 458.449361218889,
   'co2_std': 45.165718687572266,
   'noise_mean': 51.911906879909154,
   'noise_std': 0.9413432442076584,
   'voc_mean': 173.14032364909625,
   'voc_std': 94.4958055920536}})

In [5]:
# get date range for each participant
aq.groupby("ParticipantID").agg({"timestamp":["min","max"]})

Unnamed: 0_level_0,timestamp,timestamp
Unnamed: 0_level_1,min,max
ParticipantID,Unnamed: 1_level_2,Unnamed: 2_level_2
edr,2022-04-24 20:00:05-04:00,2022-06-02 02:12:17-04:00
egl,2022-04-24 20:00:05-04:00,2022-06-02 02:12:17-04:00
h9u,2022-05-31 13:50:00-04:00,2022-07-19 11:00:00-04:00
heh,2022-05-16 21:30:00-04:00,2022-07-19 10:55:00-04:00
j02,2022-04-12 20:00:00-04:00,2022-07-18 20:05:00-04:00
lpz,2022-05-16 21:30:00-04:00,2022-07-18 23:35:00-04:00
mr1,2022-05-13 13:58:30-04:00,2022-07-19 19:59:54-04:00
oyb,2022-05-16 22:59:04-04:00,2022-06-16 08:08:58-04:00
pgm,2022-04-27 17:56:56-04:00,2022-07-19 19:59:57-04:00
ssg,2022-05-13 13:58:39-04:00,2022-07-16 06:58:58-04:00


In [6]:
# # iterate over rows of aq dataframe
# for _, row in aq.iterrows():
#     # get participant ID and weekday
#     participant_id = row['ParticipantID']
#     wknd = row['weekend']

#     # look in aq_dict for participant ID and wknd
#     if participant_id in aq_dict and wknd in aq_dict[participant_id]:
#         # standardize values for co2 and noise using mean and std according to wknd
#         aq.loc[_, 'co2'] = (row['co2'] - aq_dict[participant_id][wknd]['co2_mean']) / aq_dict[participant_id][wknd]['co2_std']
#         aq.loc[_, 'noise'] = (row['noise'] - aq_dict[participant_id][wknd]['noise_mean']) / aq_dict[participant_id][wknd]['noise_std']
#         aq.loc[_, 'voc'] = (row['voc'] - aq_dict[participant_id][wknd]['voc_mean']) / aq_dict[participant_id][wknd]['voc_std']
# # to csv without index
# aq.to_csv("ieqmetrics.csv",index=False)

# read csv
aq = pd.read_csv("ieqmetrics.csv", parse_dates = ["timestamp"])
aq.head()

Unnamed: 0,ParticipantID,Work,weekday,weekend,timestamp,co2,noise,voc
0,oyb,Link Lab,Monday,weekday,2022-05-16 22:59:04-04:00,0.923363,1.211566,-1.576642
1,oyb,Link Lab,Monday,weekday,2022-05-16 22:59:14-04:00,0.923363,1.211566,-1.548247
2,oyb,Link Lab,Monday,weekday,2022-05-16 22:59:24-04:00,0.90651,1.211566,-1.491458
3,oyb,Link Lab,Monday,weekday,2022-05-16 22:59:34-04:00,0.90651,1.091809,-1.519852
4,oyb,Link Lab,Monday,weekday,2022-05-16 22:59:44-04:00,0.923363,1.211566,-1.491458


In [7]:
# how many values for each ParticipantID that are over 4 std
co2outliers = aq[aq['co2'] >= 4]
co2outliers.groupby('ParticipantID').count()

Unnamed: 0_level_0,Work,weekday,weekend,timestamp,co2,noise,voc
ParticipantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
edr,2601,2601,2601,2601,2601,2601,2601
egl,2601,2601,2601,2601,2601,2601,2601
h9u,19,19,19,19,19,19,19
j02,253,253,253,253,253,253,253
mr1,1114,1114,1114,1114,1114,1114,1114
oyb,812,812,812,812,812,812,812
pgm,1205,1205,1205,1205,1205,1205,1205
ssg,1021,1021,1021,1021,1021,1021,1021
uja,2244,2244,2244,2244,2244,2244,2244
vxx,3958,3958,3958,3958,3958,3958,3958


In [8]:
# # remove outliers
# aqDrop = aq[aq['co2'] < 4]

# # usimg the dictionary to revert the standardization
# for _, row in aqDrop.iterrows():
#     # get participant ID and weekday
#     participant_id = row['ParticipantID']
#     wknd = row['weekend']

#     # look in aq_dict for participant ID and wknd
#     if participant_id in aq_dict and wknd in aq_dict[participant_id]:
#         # standardize values for co2 and noise using mean and std according to wknd
#         aqDrop.loc[_, 'co2ppm'] = (row['co2'] * aq_dict[participant_id][wknd]['co2_std']) + aq_dict[participant_id][wknd]['co2_mean']
#         aqDrop.loc[_, 'noisedb'] = (row['noise'] * aq_dict[participant_id][wknd]['noise_std']) + aq_dict[participant_id][wknd]['noise_mean']
#         aqDrop.loc[_, 'vocppb'] = (row['voc'] * aq_dict[participant_id][wknd]['voc_std']) + aq_dict[participant_id][wknd]['voc_mean']

# # reset index
# aqDrop = aqDrop.reset_index(drop=True)
# aqDrop.to_csv("ieq_cleaned.csv",index=False)

# read csv
aqDrop = pd.read_csv("ieq_cleaned.csv", parse_dates=["timestamp"])
aqDrop.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aqDrop.loc[_, 'co2ppm'] = (row['co2'] * aq_dict[participant_id][wknd]['co2_std']) + aq_dict[participant_id][wknd]['co2_mean']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aqDrop.loc[_, 'noisedb'] = (row['noise'] * aq_dict[participant_id][wknd]['noise_std']) + aq_dict[participant_id][wknd]['noise_mean']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Unnamed: 0,ParticipantID,Work,weekday,weekend,timestamp,co2,noise,voc,co2ppm,noisedb,vocppb
0,oyb,Link Lab,Monday,weekday,2022-05-16 22:59:04-04:00,0.923363,1.211566,-1.576642,551.0,53.9,20.0
1,oyb,Link Lab,Monday,weekday,2022-05-16 22:59:14-04:00,0.923363,1.211566,-1.548247,551.0,53.9,22.0
2,oyb,Link Lab,Monday,weekday,2022-05-16 22:59:24-04:00,0.90651,1.211566,-1.491458,550.0,53.9,26.0
3,oyb,Link Lab,Monday,weekday,2022-05-16 22:59:34-04:00,0.90651,1.091809,-1.519852,550.0,53.8,24.0
4,oyb,Link Lab,Monday,weekday,2022-05-16 22:59:44-04:00,0.923363,1.211566,-1.491458,551.0,53.9,26.0


In [12]:
# get mr1 data from aqDrop
mr1 = aqDrop[aqDrop["ParticipantID"] == "mr1"]
# in aqDrop, replace participant data for edr and egl with mr1
edr = mr1.copy()
edr["ParticipantID"] = "edr"

egl = mr1.copy()
egl["ParticipantID"] = "egl"

# drop edr and egl from aqDrop
aqDrop = aqDrop[aqDrop["ParticipantID"] != "edr"]
aqDrop = aqDrop[aqDrop["ParticipantID"] != "egl"]

# concat edr, egl, and aqDrop
aqDrop = pd.concat([aqDrop,edr,egl])

In [13]:
# aggregate columns based on timestamp to 1 minute
# drop seconds from timestamp
aqDrop["timestamp"] = aqDrop["timestamp"].dt.floor("min")
# group by timestamp and calculate mean for co2 and noise
aqDrop = aqDrop.groupby(["ParticipantID","Work","timestamp"]).agg({"co2ppm":"mean","noisedb":"mean","vocppb":"mean"})
# flatten column names
aqDrop.columns = [f"{col[0]}_{col[1]}" for col in aqDrop.columns]
aqDrop = aqDrop.reset_index()
# rename columns
aqDrop = aqDrop.rename(columns={"c_o":"co2ppm","n_o":"noisedb","v_o":"vocppb"})
# to csv
aqDrop.to_csv("ieq_iid.csv",index=False)
aqDrop.head()

Unnamed: 0,ParticipantID,Work,timestamp,co2ppm,noisedb,vocppb
0,edr,Link Lab,2022-05-13 13:58:00-04:00,776.0,52.1,20.333333
1,edr,Link Lab,2022-05-13 13:59:00-04:00,776.166667,51.883333,20.0
2,edr,Link Lab,2022-05-13 14:00:00-04:00,777.833333,51.516667,20.166667
3,edr,Link Lab,2022-05-13 14:01:00-04:00,777.166667,51.8,20.0
4,edr,Link Lab,2022-05-13 14:02:00-04:00,776.5,51.15,20.0


In [14]:
# get lpz
aqDrop[aqDrop["ParticipantID"] == "lpz"].head()

Unnamed: 0,ParticipantID,Work,timestamp,co2ppm,noisedb,vocppb
241475,lpz,Off Grounds BR,2022-05-16 21:30:00-04:00,774.43,51.66,545.29
241476,lpz,Off Grounds BR,2022-05-16 21:35:00-04:00,839.75,54.88,773.75
241477,lpz,Off Grounds BR,2022-05-17 13:25:00-04:00,936.5,56.7,1798.5
241478,lpz,Off Grounds BR,2022-05-17 21:00:00-04:00,1176.84,56.33,439.81
241479,lpz,Off Grounds BR,2022-05-17 21:05:00-04:00,945.53,54.31,329.3


In [15]:
# get timestamp range for each participant
aqDrop.groupby("ParticipantID").agg({"timestamp":["min","max"]})

Unnamed: 0_level_0,timestamp,timestamp
Unnamed: 0_level_1,min,max
ParticipantID,Unnamed: 1_level_2,Unnamed: 2_level_2
edr,2022-05-13 13:58:00-04:00,2022-07-19 19:59:00-04:00
egl,2022-05-13 13:58:00-04:00,2022-07-19 19:59:00-04:00
h9u,2022-05-31 13:50:00-04:00,2022-07-19 11:00:00-04:00
heh,2022-05-16 21:30:00-04:00,2022-07-19 10:55:00-04:00
j02,2022-04-12 20:00:00-04:00,2022-07-18 20:05:00-04:00
lpz,2022-05-16 21:30:00-04:00,2022-07-18 23:35:00-04:00
mr1,2022-05-13 13:58:00-04:00,2022-07-19 19:59:00-04:00
oyb,2022-05-16 22:59:00-04:00,2022-06-16 08:08:00-04:00
pgm,2022-04-27 17:56:00-04:00,2022-07-19 19:59:00-04:00
ssg,2022-05-13 13:58:00-04:00,2022-07-16 06:58:00-04:00
