# Create session-level measurements

This notebook concatenates the measurements collected on various dates into one CSV per site. The metrics consist of the following measures, each summarized once per session-treatment group:

- indoor air temperature (mean, std dev, quartiles)
- indoor relative humidity (mean, std dev, quartiles)
- outdoor air temperature (avg/min/max over the date of each session)
- outdoor relative humidity (avg/min/max over the date of each session)
- participant-level mean T and RH, averaged over the duration of each session
- operative temperature in the control and treatment (tx) groups
- CO2 in the control and treatment (tx) groups

In [1]:
import sys

# make the project's helper scripts importable before the star-import below
sys.path.append('../scripts/')
from utilities import *

import pandas as pd
from os.path import join

import pickle
import pdb

# shorthand for building MultiIndex slicers
idx = pd.IndexSlice

In [2]:
# NOTE(review): machine-specific absolute path; this must be edited before the
# notebook can run on another machine.
home_dir = '/Users/ianbolliger/Dropbox/Temperature & Behavior/Experiments'
# `Settings` is not defined in this notebook; presumably it comes from the
# `utilities` star-import. It exposes per-site settings as `s.berk` / `s.bus`.
s = Settings(home_dir)

# output CSV paths, one per site, written inside each site's home directory
save_fpath_berk = join(s.berk.home_dir,'session_level_environmental_data.csv')
save_fpath_bus = join(s.bus.home_dir,'session_level_environmental_data.csv')

In [3]:
def calc_session_level_indoor_vals(dfs, timing_df, site_settings):
    """Summarise indoor sensor readings per session for both treatment groups.

    For each group (``'control'`` is matched to ``Treatment group == 0`` and
    ``'treatment'`` to ``1``) every sensor location's time series is binned
    into sessions with ``pd.cut`` and summarised with ``describe()``. The
    room-level sensors are combined by ``average_two_sensors``; the six
    participant-level sensors, operative temperature (``'Top'``) and CO2
    (``'co2'``) summaries are then outer-joined on, and finally the session
    metadata from ``timing_df`` is inner-joined on.

    Parameters
    ----------
    dfs : mapping
        ``dfs[group][location]`` must support ``reset_index()`` and produce a
        frame with a ``time`` column. Locations read are the entries of
        ``site_settings.sens_locs``, ``'1'``..``'6'``, ``'Top'`` and ``'co2'``.
    timing_df : pandas.DataFrame
        Must contain the columns ``'Treatment group'``, ``'Date'``,
        ``'Session in day'``, ``'start_time'`` and ``'end_time'``. Its index is
        passed to ``pd.cut`` as the session bins (presumably an IntervalIndex
        of session start/end times -- TODO confirm against the timing loader).
    site_settings : object
        Only ``site_settings.sens_locs`` (names of the room-level sensors) is
        used. ``average_two_sensors`` combines the first two entries only.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(date, session, treatment)``, sorted, with flattened
        column names and rows that are entirely null removed.
    """
    idx = pd.IndexSlice

    room_locs = list(site_settings.sens_locs)
    n_room = len(room_locs)
    participant_ids = list(range(1, 7))
    locs = room_locs + [str(i) for i in participant_ids] + ['Top', 'co2']

    def flatten(columns, sep='_'):
        """Collapse two-level (variable, statistic) columns into flat names."""
        return [(j[0] + sep + j[1]).rstrip('_') for j in columns.values]

    # clarify that these are indoor temps and RH vals
    def renamer(x):
        if x == 'one_sensor_only':
            return 'one_sensor_only_in'
        return x.split('_')[0] + '_in_' + x.split('_')[1]

    group_frames = []
    for grp_ix, grp in enumerate(['control', 'treatment']):
        grp_df = timing_df[timing_df['Treatment group'] == grp_ix]

        # one dataframe of per-session statistics for each sensor location
        loc_dfs = []
        for loc in locs:
            tmp_df = dfs[grp][loc].reset_index()
            tmp_df['sess'] = pd.cut(tmp_df['time'], bins=grp_df.index)
            grouped = tmp_df.groupby('sess').describe()

            # drop pilot sessions where we don't have room-level sensors (only individual)
            grouped = grouped[grouped.loc[:, idx[:, 'count']].sum(axis=1) != 0]

            # don't need # of measurements taken in session
            grouped = grouped.drop(columns='count', level=1)

            loc_dfs.append(grouped)

        # locate the non-room summaries by position relative to the number of
        # room sensors rather than by hard-coded list indices
        personal_dfs = loc_dfs[n_room:n_room + len(participant_ids)]
        top_df, co2_df = loc_dfs[-2], loc_dfs[-1]

        # estimate average values for sessions where only one sensor recording
        # then average the sensors for each session
        room_sesh_vals = average_two_sensors(loc_dfs)
        sesh_vals = room_sesh_vals.rename(renamer, axis=1)

        ## add participant-level sensors
        for s_ix, this_df in zip(participant_ids, personal_dfs):
            # convert to single-level index, tagging the participant number
            this_df.columns = flatten(this_df.columns, '_p{}_'.format(s_ix))
            sesh_vals = sesh_vals.join(this_df, how='outer')

        ## add operative temp, then CO2
        for this_df in (top_df, co2_df):
            this_df.columns = flatten(this_df.columns)
            sesh_vals = sesh_vals.join(this_df, how='outer')

        ## join onto treatment group / session # data
        sesh_vals = sesh_vals.join(grp_df, how='inner')
        group_frames.append(sesh_vals)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    out_df = pd.concat(group_frames)

    # format nicely
    out_df = out_df.rename(columns={'Date':'date','Session in day':'session','Treatment group':'treatment'})
    out_df = out_df.set_index(['date','session','treatment'])
    out_df = out_df.drop(columns=['start_time','end_time'])
    # quartile columns come out of describe() as e.g. '25%'; strip the percent sign
    out_df.columns = [c.rstrip('%') for c in out_df.columns]
    out_df = out_df.sort_index()
    out_df = out_df[out_df.notnull().any(axis=1)]

    return out_df

def average_two_sensors(loc_dfs):
    """Average the two room-level sensors for every session.

    ``loc_dfs`` is first passed through ``bias_correct_one_sensor``, which
    returns a list whose first two entries are the per-session summaries of
    the two room sensors, each carrying a boolean ``one_sensor_only`` column.

    Returns a DataFrame holding the element-wise mean of the two sensors'
    statistic columns, followed by a ``one_sensor_only`` column that is True
    wherever either sensor was flagged.
    """
    flag = 'one_sensor_only'
    loc_dfs = bias_correct_one_sensor(loc_dfs)
    first, second = loc_dfs[0], loc_dfs[1]

    # average only the statistic columns; adding the boolean flag columns
    # together is meaningless and makes pandas warn about '+' on bool data
    sesh_vals = (first.drop(columns=flag) + second.drop(columns=flag)) / 2

    # a session is flagged if either sensor had to be estimated
    sesh_vals[flag] = first[flag] | second[flag]
    return sesh_vals

def bias_correct_one_sensor(loc_dfs):
    """When only one sensor in a room, bias correct that sensor to estimate
    the average of the two sensors for that session.

    Parameters
    ----------
    loc_dfs : list of pandas.DataFrame
        The first two entries are per-session summaries of the two room
        sensors, with two-level ``(variable, statistic)`` columns. Any further
        entries are passed through untouched.

    Returns
    -------
    list
        A new list (the caller's list is not modified) whose first two entries
        are replaced by frames that share one index covering every session
        seen by either sensor, have flattened ``variable_statistic`` column
        names, and end with a boolean ``one_sensor_only`` column marking rows
        that had a missing value and were filled from the other sensor shifted
        by the mean between-sensor difference.
    """
    # defined locally so the function does not depend on a notebook-level global
    idx = pd.IndexSlice
    # shallow copy: entries 0 and 1 are rebound below
    loc_dfs = list(loc_dfs)

    # find mean difference in sensor
    mean_diff = (loc_dfs[1] - loc_dfs[0]).mean()
    # flatten index
    mean_diff.index = [(i[0]+'_'+i[1]).rstrip('_') for i in mean_diff.index.values]

    # get df with all sessions where at least one sensor was working
    # (outer join: the default left join would drop sessions that only
    # sensor 0 recorded)
    all_locs = loc_dfs[1].join(loc_dfs[0],rsuffix='0',lsuffix='1',how='outer')

    # adjust for one-sensor-only times
    # workaround b/c reindex fails with intervalIndex (bug)
    for i in [0,1]:
        # the join suffixes the top-level column labels with the sensor number
        loc_dfs[i] = all_locs.loc[:,idx[[k for k in all_locs.columns.levels[0] if k[-1] == str(i)],:]]

        # convert to single-level index
        loc_dfs[i] = loc_dfs[i].rename(lambda x: x[:-1],axis=1,level=0)
        loc_dfs[i].columns = [(j[0]+'_'+j[1]).rstrip('_') for j in loc_dfs[i].columns.values]

        # mark where we did bias correction
        loc_dfs[i]['one_sensor_only'] = loc_dfs[i].isnull().any(axis=1)

    # fill sensor 1 from sensor 0 first; sensor 0 is then filled from the
    # (already completed) sensor 1 values
    loc_dfs[1].iloc[:,:-1] = loc_dfs[1].iloc[:,:-1].where(~loc_dfs[1]['one_sensor_only'],loc_dfs[0].iloc[:,:-1]+mean_diff)
    loc_dfs[0].iloc[:,:-1] = loc_dfs[0].iloc[:,:-1].where(~loc_dfs[0]['one_sensor_only'],loc_dfs[1].iloc[:,:-1]-mean_diff)

    return loc_dfs

def add_outdoor_vals(output_df, dfs_outdoor, timing_df):
    """Attach daily outdoor T and RH to the session-level table.

    For each of the 'min', 'mean' and 'max' frames in ``dfs_outdoor`` (indexed
    by date), the 'T' and 'RH' values are matched to sessions through the
    'Date' column of ``timing_df`` and outer-joined onto a copy of
    ``output_df`` as ``T_out_daily<stat>`` / ``RH_out_daily<stat>`` columns.
    ``output_df`` itself is left untouched.
    """
    session_keys = ['Date', 'Session in day', 'Treatment group']
    combined = output_df.copy()

    for stat in ['min', 'mean', 'max']:
        daily = dfs_outdoor[stat].copy()
        # dates must be datetimes to match timing_df['Date']
        daily.index = pd.to_datetime(daily.index)

        per_session = timing_df.join(daily, on='Date', how='inner')
        per_session = per_session.drop_duplicates()
        per_session = per_session.set_index(session_keys)
        per_session = per_session.loc[:, ['T', 'RH']]

        per_session.columns = [var + '_out_daily' + stat for var in per_session.columns]
        # align index level names so the join matches on the same keys
        per_session.index.names = combined.index.names
        combined = combined.join(per_session, how='outer')

    return combined

## Berkeley

### Load data

In [4]:
# session timing table for the Berkeley site; get_timing_df_berk is not defined
# in this notebook (presumably provided by the `utilities` star-import)
timing_df_berk = get_timing_df_berk(s)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [5]:
# load dataframes
# (the result is indexed below with the 'indoor' and 'outdoor' keys)
dfs_berk = load_vals_berkeley(s)

KeyboardInterrupt: 

### Indoor vals

In [None]:
# idx is already assigned in the first cell; this re-assignment is redundant
# unless cells were run out of order
idx = pd.IndexSlice
# per-session indoor summaries for Berkeley
out_df_berk = calc_session_level_indoor_vals(dfs_berk['indoor'], timing_df_berk, s.berk)

### Outdoor temp/rh

In [None]:
# add daily outdoor T/RH columns; note this rebinds out_df_berk, so the cell
# appends the outdoor columns again if re-run without re-running the one above
out_df_berk = add_outdoor_vals(out_df_berk, dfs_berk['outdoor'], timing_df_berk)

### Save

In [None]:
# write the Berkeley table to CSV, formatting floats with two decimals
out_df_berk.to_csv(save_fpath_berk,float_format='%.2f')

## Busara

### Load data

In [4]:
# load dataframes
# (dfs_bus is indexed below with the 'indoor' and 'outdoor' keys)
dfs_bus = load_vals_bus(s)
# session timing table for the Busara site
timing_df_bus = get_timing_df_bus(s)

Downloading control room data...
Downloading 20180208...
Downloading far sensor...
Downloading near sensor...
Downloading treatment room data...
Downloading 20180208...
Downloading far sensor...
Downloading near sensor...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


### Indoor vals

In [5]:
# per-session indoor summaries for Busara
out_df_bus = calc_session_level_indoor_vals(dfs_bus['indoor'], timing_df_bus, s.bus)

  .format(op=op_str, alt_op=unsupported[op_str]))


### Outdoor temp/rh

In [None]:
# add daily outdoor T/RH columns; note this rebinds out_df_bus, so the cell
# appends the outdoor columns again if re-run without re-running the one above
out_df_bus = add_outdoor_vals(out_df_bus, dfs_bus['outdoor'], timing_df_bus)

### Save

In [8]:
# write the Busara table to CSV, formatting floats with two decimals
out_df_bus.to_csv(save_fpath_bus,float_format='%.2f')