In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
pd.set_option('display.max_rows', 8)
!date

import methods

%load_ext autoreload
%autoreload 2

Mon May 17 12:24:03 PDT 2021


# Calculations for results section

## Load data files

In [2]:
data = methods.load_data()

  mask |= (ar1 == a)


In [3]:
data['start_date'].strftime(methods.date_fmt), data['end_date'].strftime(methods.date_fmt)

('September 8, 2020', 'October 20, 2020')

In [4]:
n_weeks = (data['end_date'] - data['start_date']) / pd.Timedelta(days=7)
print(f'{n_weeks:.2f} weeks of data')

6.00 weeks of data


In [5]:
rows = (data['df'].date >= data['start_date']) & (data['df'].date <= data['end_date']) & (data['df'].test_required == 1)
sum(rows)

43430

In [6]:
df = methods.subset_data(data, rows, exposure='hcw', outcome='test_positive')

# text for Results Section paragraph 1:

In [7]:
print(f'The survey data contained {len(df):,.0f} respondents who were tested due to workplace requirements '
      f'in the time period we focused on,'
      f' {df.exposure.sum():,.0f} HCWs and {(1-df.exposure).sum():,.0f} non-HCWs (see Table 1 for demographic details).'
      f' There were '
      f'{df.outcome.sum():,.0f} respondents who reported a positive test for COVID-19 in the last 14 days '
      f'({df[df.exposure==1].outcome.sum():,.0f} among HCWs and {df[df.exposure==0].outcome.sum():,.0f} among non-HCWs).')

The survey data contained 43,430 respondents who were tested due to workplace requirements in the time period we focused on, 14,660 HCWs and 28,770 non-HCWs (see Table 1 for demographic details). There were 2,145 respondents who reported a positive test for COVID-19 in the last 14 days (588 among HCWs and 1,557 among non-HCWs).


## Calculate relative risk with uncertainty

In [8]:
%%time

# set random seed for reproducibility
np.random.seed(12345)
rr_draws = methods.sample_rr_draws(df)
rr_draws

CPU times: user 51.7 s, sys: 999 ms, total: 52.7 s
Wall time: 52.7 s


point_est    0.731588
draw_0       0.746737
draw_1       0.759099
draw_2       0.759189
               ...   
draw_996     0.727963
draw_997     0.740747
draw_998     0.719303
draw_999     0.751744
Length: 1001, dtype: float64

In [9]:
rr_mean, rr_lb, rr_ub = methods.my_summarize(rr_draws)

In [10]:
print(f'Among HCWs with a required test, {df[df.exposure==1].outcome.sum():,.0f} of {df.exposure.sum():,.0f}'
      f' ({df[df.exposure==1].outcome.mean()*100:,.1f}%) reported a positive test in the last 14 days,'
      f' while among non-HCWs with a required test, '
      f'{df[df.exposure==0].outcome.sum():,.0f} of {(df.exposure==0).sum():,.0f}'
      f' ({df[df.exposure==0].outcome.mean()*100:,.2f}%) reported a positive test,'
      f' for a relative COVID-19 prevalence ratio of {rr_mean:.1f} (95% UI {rr_lb:.2f} to {rr_ub:.2f}).'
      )

Among HCWs with a required test, 588 of 14,660 (4.0%) reported a positive test in the last 14 days, while among non-HCWs with a required test, 1,557 of 28,770 (5.41%) reported a positive test, for a relative COVID-19 prevalence ratio of 0.7 (95% UI 0.68 to 0.80).


In [11]:
# table version of results
t = pd.DataFrame(index=[0])
t.loc[0, 'HCW - tested'] = f'{df.exposure.sum():,.0f}'
t.loc[0, 'HCW - positive'] = f'{df[df.exposure==1].outcome.sum():,.0f}'
t.loc[0, 'HCW - (%)'] = f'{df[df.exposure==1].outcome.mean()*100:,.1f}'

t.loc[0, 'non-HCW - tested'] = f'{(1-df.exposure).sum():,.0f}'
t.loc[0, 'non-HCW - positive'] = f'{df[df.exposure==0].outcome.sum():,.0f}'
t.loc[0, 'non-HCW - (%)'] = f'{df[df.exposure==0].outcome.mean()*100:,.1f}'

t.loc[0, 'RR'] = f'{rr_mean:.2f}'
t.loc[0, '95% UI'] = f'{rr_lb:.2f} to {rr_ub:.2f}'

t

Unnamed: 0,HCW - tested,HCW - positive,HCW - (%),non-HCW - tested,non-HCW - positive,non-HCW - (%),RR,95% UI
0,14660,588,4.0,28770,1557,5.4,0.73,0.68 to 0.80


# Sensitivity analyses

## Unweighted version of main result

In [12]:
df['real_weight'] = df.weight
df['weight'] = 1
df['weighted_outcome'] = df.outcome

In [13]:
# set random seed for reproducibility
np.random.seed(12345)
unweighted_rr_draws = methods.sample_rr_draws(df)
methods.my_summarize(unweighted_rr_draws)

(0.7411303614096791, 0.6969560721128465, 0.792334081143401)

In [14]:
unweighted_rr_mean, unweighted_rr_lb, unweighted_rr_ub = \
    methods.my_summarize(unweighted_rr_draws)

In [15]:
print(f'When we did not use the sample weights to calculate the COVID-19 endorsement rates we found'
      f' a relative COVID-19 incidence ratio of {unweighted_rr_mean:.1f} '
      f'(95% UI {unweighted_rr_lb:.1f} to {unweighted_rr_ub:.1f}).'
      )

When we did not use the sample weights to calculate the COVID-19 endorsement rates we found a relative COVID-19 incidence ratio of 0.7 (95% UI 0.7 to 0.8).


In [16]:
df.weight = df.real_weight

# For a different project, what is the RR of people who mask in different ways

In [17]:
# data['df']['mask_var'] = (data['df'].mask_past5days == 1).astype(float)
# df = methods.subset_data(data, data['df'].test_required, exposure='mask_var', outcome='test_positive')
# methods.my_calc_and_summarize(df) # RR for all-the-time masked in past 5 days

In [18]:
# for i in [2,3,4,5,6]:
#     data['df']['mask_var'] = (data['df'].mask_past5days == i).astype(float)
#     df = methods.subset_data(data, data['df'].test_required, exposure='mask_var', outcome='test_positive')
#     print('Mask intensity:', i)
#     print(methods.my_calc_and_summarize(df))
#     print()

In [19]:
# for i in [1,2,3,4,5,6,7]:
#     data['df']['exposure_var'] = (data['df'].activities_past24hrs.fillna('').str.contains(f'{i}')).astype(float)
#     df = methods.subset_data(data, data['df'].test_required,
#                              exposure='exposure_var', outcome='test_positive')
#     print('Activities includes:', i)
#     print(methods.my_calc_and_summarize(df))
#     print()

In [20]:
# for i in [1,2,3,4,5,6,7]:
#     data['df']['activity_var'] = (data['df'].activities_past24hrs.fillna('').str.contains(f'{i}')).astype(float)
#     data['df']['exposure_var'] = (data['df'].masked_activities_past24hrs.fillna('').str.contains(f'{i}')).astype(float)
#     df = methods.subset_data(data, data['df'].test_required & (data['df'].activity_var == 1),
#                              exposure='exposure_var', outcome='test_positive')
#     print('Masked activities includes:', i)
#     print(methods.my_calc_and_summarize(df))
#     print()

## Sub-analyses for specific subsets of HCWs

To understand how the results might vary between subgroups of HCWs, we repeated our analysis restricted to only doctors, only nurses, only home health aides, and only medical assistants:

In [21]:
results = {}

In [22]:
rows = (data['df'].date >= data['start_date']) & (data['df'].date <= data['end_date']) & (data['df'].test_required == 1)
sum(rows)

43430

In [23]:
df = methods.subset_data(data, rows, exposure='hcw', outcome='test_positive')
results['All HCWs'] = methods.my_calc_and_summarize(df)

In [24]:
df['exposure'] = (data['df'].loc[rows, 'Q68'] == 1)  # Physician or surgeon
results['Physician or surgeon'] = methods.my_calc_and_summarize(df)

In [25]:
df['exposure'] = (data['df'].loc[rows, 'Q68'] == 2)  # Registered nurse (including nurse practitioner)
results['Registered nurse (including nurse practitioner)'] = methods.my_calc_and_summarize(df)

In [26]:
df['exposure'] = (data['df'].loc[rows, 'Q68'] == 3) 
results['Licensed practical or licensed vocational nurse'] = methods.my_calc_and_summarize(df)

In [27]:
df['exposure'] = (data['df'].loc[rows, 'Q68'] == 4)
results['Physician assistant'] = methods.my_calc_and_summarize(df)

In [28]:
df['exposure'] = (data['df'].loc[rows, 'Q68'] == 5)
results['Dentist'] = methods.my_calc_and_summarize(df)

In [29]:
df['exposure'] = (data['df'].loc[rows, 'Q68'] == 6)
results['Any other treating practitioner'] = methods.my_calc_and_summarize(df)

In [30]:
df['exposure'] = (data['df'].loc[rows, 'Q68'] == 7)
results['Pharmacist'] = methods.my_calc_and_summarize(df)

In [31]:
df['exposure'] = (data['df'].loc[rows, 'Q68'] == 8)
results['Any therapist'] = methods.my_calc_and_summarize(df)

In [32]:
df['exposure'] = (data['df'].loc[rows, 'Q68'] == 9)
results['Any health technologist or technician'] = methods.my_calc_and_summarize(df)

In [33]:
df['exposure'] = (data['df'].loc[rows, 'Q68'] == 10)
results['Veterinarian'] = methods.my_calc_and_summarize(df)

In [34]:
df['exposure'] = (data['df'].loc[rows, 'Q69'] == 1)
results['Nursing assistant or psychiatric aide'] = methods.my_calc_and_summarize(df)

In [35]:
df['exposure'] = (data['df'].loc[rows, 'Q69'] == 2)
results['Home health or personal care aide'] = methods.my_calc_and_summarize(df)

In [36]:
df['exposure'] = (data['df'].loc[rows, 'Q69'] == 3)
results['Occupational or physical therapy assistant or aide'] = methods.my_calc_and_summarize(df)

In [37]:
df['exposure'] = (data['df'].loc[rows, 'Q69'] == 4)
results['Massage therapist'] = methods.my_calc_and_summarize(df)

In [38]:
df['exposure'] = (data['df'].loc[rows, 'Q69'] == 5)
results['Dental assistant'] = methods.my_calc_and_summarize(df)

In [39]:
df['exposure'] = (data['df'].loc[rows, 'Q69'] == 6)
results['Medical assistant'] = methods.my_calc_and_summarize(df)

In [40]:
df['exposure'] = (data['df'].loc[rows, 'Q69'] == 7)
results['Medical transcriptionist'] = methods.my_calc_and_summarize(df)

In [41]:
df['exposure'] = (data['df'].loc[rows, 'Q69'] == 8)
results['Pharmacy aide'] = methods.my_calc_and_summarize(df)

In [42]:
df['exposure'] = (data['df'].loc[rows, 'Q69'] == 9)
results['Phlebotomist'] = methods.my_calc_and_summarize(df)

In [43]:
df['exposure'] = (data['df'].loc[rows, 'Q69'] == 10)
results['Veterinary assistant'] = methods.my_calc_and_summarize(df)

In [44]:
df['exposure'] = (data['df'].loc[rows, 'Q69'] == 11)
results['Any other healthcare support worker'] = methods.my_calc_and_summarize(df)

In [45]:
hcw_types_results = pd.DataFrame(results).T
hcw_types_results.columns = 'n_nonHCW', 'n_HCW', 'RR', 'lb', 'ub'

In [None]:
# pd.set_option('display.max_rows', 80)
np.round(hcw_types_results,2)

# Sensitivity analysis of those with required testing _and_ symptoms

Herbie writes:
> The one question I have at this point is how to account for work required testing.  I worry that there may be systematic differences in testing policies in the healthcare and non-healthcare setting.  Specifically, are individuals being tested because they are symptomatic, or is this part of routine testing?

To investigate this, we can dig deeper into the "why you were tested" question that we used to subset on the individuals who were tested because of employeer/educational requirements (value "4").

This question has a "select all that apply" answer, and includes "I felt sick" (value "1") as a response as well.

In [47]:
test_reason = data['df'].test_reason.fillna('')
required_testing_and_felt_sick_rows = (test_reason.str.contains("4")
                                       & test_reason.str.contains("1")) & (
    (data['df'].date >= data['start_date']) & (data['df'].date <= data['end_date'])
)

sum(required_testing_and_felt_sick_rows)

4584

In [48]:
required_testing_and_not_felt_sick_rows = (test_reason.str.contains("4")
                                           & ~test_reason.str.contains("1")) & (
    (data['df'].date >= data['start_date']) & (data['df'].date <= data['end_date'])
)
sum(required_testing_and_not_felt_sick_rows)

38846

In [49]:
results = {}

df = methods.subset_data(data, required_testing_and_not_felt_sick_rows, exposure='hcw', outcome='test_positive')
results['Test required, did not feel sick'] = methods.my_calc_and_summarize(df)

In [50]:
df = methods.subset_data(data, required_testing_and_felt_sick_rows, exposure='hcw', outcome='test_positive')
results['Test required, felt sick'] = methods.my_calc_and_summarize(df)

In [51]:
required_testing_results = pd.DataFrame(results).T
required_testing_results.columns = 'n_nonHCW', 'n_HCW', 'RR', 'lb', 'ub'
np.round(required_testing_results, 2)

Unnamed: 0,n_nonHCW,n_HCW,RR,lb,ub
"Test required, did not feel sick",25236.0,13610.0,1.09,1.01,1.27
"Test required, felt sick",3534.0,1050.0,0.8,0.69,0.92


# Sensitivity analysis of those who work outside the home

Dan writes:
> HCW in this study encompasses a broad group that likely has varying hospital exposures. My sister-in-law is an OT who works from home currently. Do these study HCW represent the HCW that a patient encounters when seeking medical care?

The survey does contain some information about whether the workers are working from home, but it is pretty coarse, “Was any of your work for pay in the last four weeks outside your home?” I’m going to check if it changes any results though, because I bet it doesn’t (because who would be required to test if they were working from home?).

In [52]:
required_testing_and_outside_work = data['df'].test_required & (data['df'].work_outside_home == 1) & (
    (data['df'].date >= data['start_date']) & (data['df'].date <= data['end_date'])
)
sum(required_testing_and_outside_work)

28586

In [53]:
required_testing_and_not_outside_work = data['df'].test_required & (data['df'].work_outside_home != 1) & (
    (data['df'].date >= data['start_date']) & (data['df'].date <= data['end_date'])
)
sum(required_testing_and_not_outside_work)

14844

In [54]:
results = {}

df = methods.subset_data(data, required_testing_and_outside_work, exposure='hcw', outcome='test_positive')
results['Test required, worked outside of home'] = methods.my_calc_and_summarize(df)

In [55]:
df = methods.subset_data(data, required_testing_and_not_outside_work, exposure='hcw', outcome='test_positive')
results['Test required, did not work outside of home'] = methods.my_calc_and_summarize(df)

In [56]:
outside_work_results = pd.DataFrame(results).T
outside_work_results.columns = 'n_nonHCW', 'n_HCW', 'RR', 'lb', 'ub'
np.round(outside_work_results, 2)

Unnamed: 0,n_nonHCW,n_HCW,RR,lb,ub
"Test required, worked outside of home",15797.0,12789.0,0.81,0.72,0.88
"Test required, did not work outside of home",12973.0,1871.0,0.77,0.59,0.95


# For a different project, what is the RR of people who attend mass gatherings, compared to those who did not?

In [57]:
data['df']['mass_gathering'] = data['df'].test_reason.fillna('').str.contains('5')  # outdoor gathering
df = methods.subset_data(data, data['df'].test_required, exposure='mass_gathering', outcome='test_positive')
methods.my_calc_and_summarize(df)

[199188, 5450, 5.839862783640819, 5.737774327264093, 5.969629004362062]

In [58]:
data['df']['mass_gathering'] = data['df'].test_reason.fillna('').str.contains('6')  # indoor gathering
df = methods.subset_data(data, data['df'].test_required, exposure='mass_gathering', outcome='test_positive')
methods.my_calc_and_summarize(df)

[197143, 7495, 5.35384040449148, 5.262049922837521, 5.4681418387354785]

# For another question work from home vs not among all workers

In [59]:
rows = (data['df'].date >= data['start_date']) & (data['df'].date <= data['end_date']) & (data['df'].test_required == 1)
sum(rows)

43430

In [60]:
df = methods.subset_data(data, rows, exposure='work_outside_home', outcome='test_positive')
methods.my_calc_and_summarize(df)

[14844, 28586, 0.7387742622684594, 0.7030626583122089, 0.8127058118756061]

In [61]:
df = methods.subset_data(data, rows, exposure='work_outside_home', outcome='test_negative')
methods.my_calc_and_summarize(df)

[14844, 28586, 1.031253205615104, 1.023148548465216, 1.0375152612908656]