In [25]:
import os
from pathlib import Path

import altair as alt
import numpy as np
import pandas as pd
from IPython.display import display, Markdown

In [367]:
# Root folder of the extraction outputs. The default is the original local
# checkout; set the DEN_DATA_DIR environment variable to run this notebook
# from another machine without editing the cell.
DATA_DIR = Path(os.environ.get(
    'DEN_DATA_DIR',
    '/Users/kaitlinsinger/wa_notify/DEN_future/data_extraction',
))
BASELINE_DIR = DATA_DIR / 'datasets' / 'baselines'

# Wide study-metrics table that every chart in this notebook is built from.
df = pd.read_csv(DATA_DIR / 'manipulated_data' / 'simplified_wide_df_with_date_loc_imputed.csv')

# Baseline tables: the 'clean' sheet of the population workbook and the weekly state metrics.
state_pop = pd.read_excel(BASELINE_DIR / 'NST-EST2024-POP.xlsx', sheet_name='clean')
case_counts = pd.read_csv(BASELINE_DIR / 'weekly_metrics_by_state.csv')

In [368]:
# Split rows into two groups by the start of the source string.
# na=False matters: for a missing source, str.startswith would otherwise return
# NaN, and np.where treats NaN as truthy, labelling that row 'Stargel Only'.
STARGEL_SOURCE_PREFIX = 'Case Investigation and Contact Tracing Efforts From Health Departments'
is_stargel = df['source'].str.startswith(STARGEL_SOURCE_PREFIX, na=False)
df['Study Type'] = np.where(is_stargel, 'Stargel Only', 'All Others')

In [369]:
# Disabled derived-ratio computations, kept for reference only (never executed):
# df['ratio_cases_contacts_intv_computed'] = df['cases_interviewed_count']/df['contacts_interviewed_count']
# df['ratio_contacts_cases_intv_computed'] = df['contacts_interviewed_count']/df['cases_interviewed_count']
# df['contacts_named_ratio_computed'] = df['contacts_named_count']/df['cases_interviewed_count']
# df['contacts_named_naming_ratio_computed'] = df['contacts_named_count']/df['cases_named_contacts_count']
# df['staff_per_assigned_ratio'] = (df['staff_hired_ci_count']+df['staff_hired_ct_count'])/(df['cases_assigned_count'] + df['contacts_named_count'])
# df['case_assigned_per_staff_ratio'] = (df['cases_assigned_count'])/df['staff_hired_ci_count']
# df['contact_assigned_per_staff_ratio'] = (df['contacts_named_count'])/df['staff_hired_ct_count']
# df['ci_per_case_ratio'] = (df['staff_hired_ci_count'])/(df['cases_assigned_count'])
# df['ct_per_contact_ratio'] = (df['staff_hired_ct_count'])/(df['contacts_named_count'])

# Maps raw column names to the human-readable labels used as chart fields below.
rename_dict = {
    # Provenance and collection window
    'source': 'Data Source',
    'pm_start_date': 'Start Date',
    'pm_end_date': 'End Date of Collection',

    # Case-level counts and percentages
    'cases_assigned_count': 'Count of Cases Assigned',
    'cases_interviewed_count': 'Count of Cases Interviewed',
    'cases_interviewed_perc': 'Percent of Cases Interviewed',
    'cases_named_contacts_count': 'Count of Cases Naming Contacts',
    'cases_not_assigned_count': 'Count of Cases Not Assigned',
    'cases_reached_count': 'Count of Cases Reached',
    'cases_reached_perc': 'Percent of Cases Reached',
    'percent_naming_contacts': 'Percent of Cases Naming Contacts',

    # Contact-level counts, percentages and reported ratios
    'contacts_assigned_count': 'Count of Contacts Assigned',
    'contacts_interviewed_count': 'Count of Contacts Interviewed',
    'contacts_interviewed_perc': 'Percent of Contacts Interviewed',
    'contacts_named_count': 'Count of Contacts Named',
    'contacts_named_ratio_mean_cases_naming': 'Mean Number of Contacts Per Case Naming at Least One',
    'contacts_named_ratio_mean': 'Mean Number of Contacts Per Case',
    'contacts_named_ratio_med': 'Median Number of Contacts Per Case',
    'contacts_reached_count': 'Count of Contacts Reached',
    'contacts_reached_perc': 'Percent of Contacts Reached',

    # Computed ratios
    'ratio_cases_contacts_intv_computed': 'Number of Cases Interviewed / Number of Contacts Interviewed',
    'ratio_contacts_cases_intv_computed': 'Number of Contacts Interviewed / Number of Cases Interviewed',
    'contacts_named_ratio_computed': 'Mean Number of Contacts Per Case (All Others, computed)',
    'contacts_named_naming_ratio_computed': 'Mean Number of Contacts Per Case Naming at Least One (All Others, computed)',

    # Staffing
    'staff_hired_ci_count': 'Count of Case Investigators Hired',
    'staff_hired_ct_count': 'Count of Contact Tracers Hired',
    'ci_per_case_ratio': 'Number of Case Investigators per Case',
    'ct_per_contact_ratio': 'Number of Contact Tracers per Contact',
    'case_per_ci_ratio': 'Number of Cases per Case Investigator',
    'contact_per_ct_ratio': 'Number of Contacts per Contact Tracer',

    # Assigned-but-not-interviewed cases
    'cases_missed_count': 'Count of Cases Assigned, Not Interviewed',
    'cases_missed_perc': 'Percent of Cases Assigned, Not Interviewed',

    # Stargel-specific means
    'hd_contacts_named_ratio_mean_mean': 'Mean Number of Contacts Per Case (Stargel)',
    'hd_contacts_named_ratio_mean_cases_naming_mean': 'Mean Number of Contacts Per Case Naming at Least One (Stargel)',
}

In [370]:
# Switch to the display labels, then relabel the health-plan cohort's location as 'CA'.
df = (
    df
    .rename(columns=rename_dict)
    .assign(
        pm_location=lambda frame: frame['pm_location'].replace(
            'Members of large CA health care plan', 'CA'
        )
    )
)

In [371]:
# One label per row: "<location> <start>-<end>", used as the y axis of the per-study charts.
df['Study Summary'] = [
    f"{location} {start}-{end}"
    for location, start, end in zip(
        df['pm_location'], df['Start Date'], df['End Date of Collection']
    )
]

In [372]:
# Preview the first two rows to check the renamed columns and the derived
# 'Study Type' / 'Study Summary' fields.
df.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Data Source,Start Date,End Date of Collection,pm_location,Count of Cases Assigned,Count of Cases Interviewed,Percent of Cases Interviewed,Count of Cases Naming Contacts,...,"Mean Number of Contacts Per Case Naming at Least One (All Others, computed)",staff_per_assigned_ratio,case_assigned_per_staff_ratio,contact_assigned_per_staff_ratio,Number of Case Investigators per Case,Number of Contact Tracers per Contact,Number of Cases per Case Investigator,Number of Contacts per Contact Tracer,Study Type,Study Summary
0,0,0,A Multifaceted Evaluation of a COVID-19 Contac...,2021-03-01,03/31/2021,King County,,,,,...,,,,,,,,,All Others,King County 2021-03-01-03/31/2021
1,1,1,A Multifaceted Evaluation of a COVID-19 Contac...,2021-04-01,06/30/2021,King County,,,76.0,,...,,,,,,,,,All Others,King County 2021-04-01-06/30/2021


In [373]:
# Subset of rows labelled 'Stargel Only'; the monthly charts below read from it.
df_st = df.loc[df['Study Type'].eq('Stargel Only')]

In [374]:
# Grouped bars of the Stargel count metrics, summed per calendar month.
count_metrics = [
    'Count of Cases Assigned',
    'Count of Cases Interviewed',
    'Count of Cases Naming Contacts',
    'Count of Contacts Named',
    'Count of Contacts Reached',
]

# Long form: one row per (Start Date, Metric) so each metric gets its own bar colour.
long_df = pd.melt(
    df_st,
    id_vars=['Start Date'],
    value_vars=count_metrics,
    var_name='Metric',
    value_name='count',
)

title = alt.TitleParams(
    "Count metrics by month",
    subtitle=["Data from Stargel et al"],
    anchor='start',
)

comb = (
    alt.Chart(long_df)
    .mark_bar()
    .encode(
        x=alt.X('yearmonth(Start Date):T', title=''),
        y=alt.Y('sum(count):Q', title=None),
        color='Metric:N',
        xOffset='Metric:N',  # side-by-side bars within each month
    )
    .properties(width=550, height=200, title=title)
)

comb

In [375]:
# Case investigators hired (bars) against cases per investigator (line), by month.
title = alt.TitleParams(
    "Assignment load vs staff maintenance by month",
    subtitle=["Data from Stargel et al"],
    anchor='start',
)

# Both layers use the same month axis and panel size.
month_axis = alt.X('yearmonth(Start Date):T', title='Start Date')
panel_size = dict(width=650, height=200)

c2 = (
    alt.Chart(df_st)
    .mark_bar()
    .encode(
        x=month_axis,
        y=alt.Y('Count of Case Investigators Hired:Q', title='Count of Case Investigators Hired (blue)'),
    )
    .properties(**panel_size)
)
c3 = (
    alt.Chart(df_st)
    .mark_line(color='#F28E2B')
    .encode(
        x=month_axis,
        y=alt.Y('Number of Cases per Case Investigator:Q', title='Number of Cases per Case Investigator (orange)'),
    )
    .properties(**panel_size)
)

# Independent y scales: the two measures are on different magnitudes.
ch1 = (
    alt.layer(c2, c3, data=df_st)
    .resolve_scale(y='independent')
    .properties(title=title)
)
ch1

In [376]:
# Contact tracers hired (bars) against contacts per tracer (line), by month.
title = alt.TitleParams(
    "Assignment load vs staff maintenance by month",
    subtitle=["Data from Stargel et al"],
    anchor='start',
)

panel_size = dict(width=650, height=200)

c2 = (
    alt.Chart(df_st)
    .mark_bar()
    .encode(
        x=alt.X('yearmonth(Start Date):T', title=''),
        y=alt.Y('Count of Contact Tracers Hired:Q', title='Count of Contact Tracers Hired (blue)'),
    )
    .properties(**panel_size)
)
c3 = (
    alt.Chart(df_st)
    .mark_line(color='#F28E2B')
    .encode(
        # No explicit x title on the line layer (an earlier custom title is disabled).
        x='yearmonth(Start Date):T',
        y=alt.Y('Number of Contacts per Contact Tracer:Q', title='Number of Contacts per Contact Tracer (orange)'),
    )
    .properties(**panel_size)
)

# Independent y scales so the bar and line measures each get their own axis.
ch2 = (
    alt.layer(c2, c3, data=df_st)
    .resolve_scale(y='independent')
    .properties(title=title)
)

ch2

In [377]:
# Stargel rows: investigators hired against cases assigned.
# Point size encodes cases per investigator; colour encodes the start date.
c1 = (
    alt.Chart(df_st)
    .mark_point()
    .encode(
        x='Count of Cases Assigned:Q',
        y='Count of Case Investigators Hired:Q',
        size='Number of Cases per Case Investigator:Q',
        color=alt.Color('Start Date:T', scale=alt.Scale(scheme='blueorange')),
    )
    .properties(width=650, height=200)
)

# Regression of investigators hired on cases assigned, drawn over the points.
line = (
    alt.Chart(df_st)
    .transform_regression('Count of Cases Assigned', 'Count of Case Investigators Hired')
    .mark_line(color='#F28E2B')
    .encode(
        x='Count of Cases Assigned:Q',
        y='Count of Case Investigators Hired:Q',
    )
)

alt.layer(c1, line)

In [378]:
# Stargel rows: percent of cases interviewed against cases assigned.
# Point size encodes investigators hired; colour encodes the start date.
c1 = (
    alt.Chart(df_st)
    .mark_point()
    .encode(
        x='Count of Cases Assigned:Q',
        y='Percent of Cases Interviewed:Q',
        size='Count of Case Investigators Hired:Q',
        color=alt.Color('Start Date:T', scale=alt.Scale(scheme='blueorange')),
    )
    .properties(width=650, height=200)
)

# Regression of percent interviewed on cases assigned, drawn over the points.
line = (
    alt.Chart(df_st)
    .transform_regression('Count of Cases Assigned', 'Percent of Cases Interviewed')
    .mark_line(color='#F28E2B')
    .encode(
        x='Count of Cases Assigned:Q',
        y='Percent of Cases Interviewed:Q',
    )
)

alt.layer(c1, line)

In [379]:
# Monthly average of the percent of cases naming contacts, one bar per study type.
# The subtitle previously had an unbalanced quote ('All Others); it is closed here.
# NOTE(review): the chart draws both study types from df, so confirm the
# subtitle wording still describes what is shown.
title = alt.TitleParams(
    "Percent of Cases Naming Contacts",
    subtitle=["Averaged by Month for 'All Others'"],
    anchor='start'
)

alt.Chart(df).mark_bar().encode(
    alt.X('yearmonth(Start Date):T', title='Start Date'),
    alt.Y('average(Percent of Cases Naming Contacts):Q', title=''),
    alt.Color('Study Type:N'),
    xOffset='Study Type:N'  # side-by-side bars per study type within a month
).properties(
    width=650,
    height=200,
    title=title
)

In [380]:
# Percent of assigned cases not interviewed against cases assigned, both on sqrt scales.
title = alt.TitleParams(
    "Count of Cases Assigned vs Cases Missed",
    subtitle=["Data from 'All Others', Square Root Scale"],
    anchor='start',
)

# Keep rows with fewer than 1,000,000 assigned cases; rows with a missing count
# fail the comparison and drop out as well.
under_cap = df['Count of Cases Assigned'] < 1000000

# Same axis definitions for the points and the fitted line.
x_sqrt = alt.X('Count of Cases Assigned:Q', scale=alt.Scale(type='sqrt'))
y_sqrt = alt.Y('Percent of Cases Assigned, Not Interviewed:Q', scale=alt.Scale(type='sqrt'))

c1 = alt.Chart(df[under_cap]).mark_point().encode(x=x_sqrt, y=y_sqrt)

# Regression line over the same two variables.
line = (
    alt.Chart(df[under_cap])
    .transform_regression(
        'Count of Cases Assigned',
        'Percent of Cases Assigned, Not Interviewed',
    )
    .mark_line(color='#F28E2B')
    .encode(x=x_sqrt, y=y_sqrt)
)

# Layer the two marks on shared axes.
chart = (
    alt.layer(c1, line)
    .resolve_scale(x='shared', y='shared')
    .properties(width=650, height=200, title=title)
)

chart

In [381]:
# Per-study dot plots of assigned-but-not-interviewed cases, sized by cases assigned.
study_axis = alt.Y("Study Summary:N")
caseload_size = alt.Size("Count of Cases Assigned:Q")

# Count view.
c1 = alt.Chart(df).mark_point().encode(
    x="Count of Cases Assigned, Not Interviewed:Q",
    y=study_axis,
    size=caseload_size,
)
# Percent view.
c2 = alt.Chart(df).mark_point().encode(
    x="Percent of Cases Assigned, Not Interviewed:Q",
    y=study_axis,
    size=caseload_size,
)

# Only the percent view is rendered; c1 is built above but not displayed by this cell.
c2

In [382]:
# Monthly average of the four "contacts per case" measures, grouped by metric.
contact_ratio_cols = [
    'Mean Number of Contacts Per Case (Stargel)',
    'Mean Number of Contacts Per Case (All Others, computed)',
    'Mean Number of Contacts Per Case Naming at Least One (Stargel)',
    'Mean Number of Contacts Per Case Naming at Least One (All Others, computed)',
]

long_df = pd.melt(
    df,
    id_vars=['Start Date'],
    value_vars=contact_ratio_cols,
    var_name='Metric',
    value_name='value',
)

title = alt.TitleParams(
    "Contact Reporting Trends by Month",
    subtitle=["Data from all sources"],
    anchor='start',
)

# Only values below 8 are plotted; missing values fail the comparison and are dropped too.
plotted = long_df.loc[long_df['value'].lt(8)]

comb = (
    alt.Chart(plotted)
    .mark_bar()
    .encode(
        x=alt.X('yearmonth(Start Date):T', title=''),
        y=alt.Y('average(value):Q', title=None),
        # Large labelLimit so the long metric names are not truncated in the legend.
        color=alt.Color('Metric:N', legend=alt.Legend(labelLimit=1000)),
        xOffset='Metric:N',
    )
    .properties(width=550, height=200, title=title)
)

comb

In [357]:
# Case interview/reach percentages by region, in long form so the metric drives the colour.
case_percent_cols = ['Percent of Cases Interviewed', 'Percent of Cases Reached']
long_df = pd.melt(
    df,
    id_vars=['Start Date', 'region'],
    value_vars=case_percent_cols,
    var_name='Reached',
    value_name='Percent',
)

# Built here and displayed by the next cell, side by side with the contacts chart.
c1 = (
    alt.Chart(long_df)
    .mark_point()
    .encode(
        x="Percent:Q",
        y=alt.Y("region:N", title='Region'),
        color='Reached:N',
    )
)

In [358]:
# Contact interview/reach percentages by region, in long form so the metric drives the colour.
contact_percent_cols = ['Percent of Contacts Interviewed', 'Percent of Contacts Reached']
long_df = pd.melt(
    df,
    id_vars=['Start Date', 'region'],
    value_vars=contact_percent_cols,
    var_name='Reached',
    value_name='Percent',
)
c2 = (
    alt.Chart(long_df)
    .mark_point()
    .encode(
        x="Percent:Q",
        y=alt.Y("region:N", title='Region'),
        color='Reached:N',
    )
)

# c1 is not defined in this cell: it must hold the cases-by-region chart from
# the preceding cell, so run that cell first.
alt.hconcat(c1, c2)

In [387]:
# Case interview/reach percentages per data source.
case_percent_cols = ['Percent of Cases Interviewed', 'Percent of Cases Reached']
long_df = pd.melt(
    df,
    id_vars=['Start Date', 'Data Source'],
    value_vars=case_percent_cols,
    var_name='Reached',
    value_name='Percent',
)

(
    alt.Chart(long_df)
    .mark_point()
    .encode(
        x="Percent:Q",
        y="Data Source:N",
        color='Reached:N',
    )
)

In [359]:
# Case interview/reach percentages per study (location + collection window).
case_percent_cols = ['Percent of Cases Interviewed', 'Percent of Cases Reached']
long_df = pd.melt(
    df,
    id_vars=['Start Date', 'Study Summary'],
    value_vars=case_percent_cols,
    var_name='Reached',
    value_name='Percent',
)

(
    alt.Chart(long_df)
    .mark_point()
    .encode(
        x="Percent:Q",
        y="Study Summary:N",
        color='Reached:N',
    )
)

In [388]:
# Contact interview/reach percentages per data source.
contact_percent_cols = ['Percent of Contacts Interviewed', 'Percent of Contacts Reached']
long_df = pd.melt(
    df,
    id_vars=['Start Date', 'Data Source'],
    value_vars=contact_percent_cols,
    var_name='Reached',
    value_name='Percent',
)

(
    alt.Chart(long_df)
    .mark_point()
    .encode(
        x="Percent:Q",
        y="Data Source:N",
        color='Reached:N',
    )
)

In [360]:
# Contact interview/reach percentages per study (location + collection window).
contact_percent_cols = ['Percent of Contacts Interviewed', 'Percent of Contacts Reached']
long_df = pd.melt(
    df,
    id_vars=['Start Date', 'Study Summary'],
    value_vars=contact_percent_cols,
    var_name='Reached',
    value_name='Percent',
)

(
    alt.Chart(long_df)
    .mark_point()
    .encode(
        x="Percent:Q",
        y="Study Summary:N",
        color='Reached:N',
    )
)

In [361]:
# The two interview ratios (cases/contacts and its inverse) by region, side by side.
ratio = (
    alt.Chart(df)
    .mark_point()
    .encode(
        x="Number of Cases Interviewed / Number of Contacts Interviewed:Q",
        y="region:N",
    )
)
ratio2 = (
    alt.Chart(df)
    .mark_point()
    .encode(
        x="Number of Contacts Interviewed / Number of Cases Interviewed:Q",
        y="region:N",
    )
)
alt.hconcat(ratio, ratio2)

In [383]:
# The two interview ratios (cases/contacts and its inverse) per study, side by side.
ratio = (
    alt.Chart(df)
    .mark_point()
    .encode(
        x="Number of Cases Interviewed / Number of Contacts Interviewed:Q",
        y="Study Summary:N",
    )
)
ratio2 = (
    alt.Chart(df)
    .mark_point()
    .encode(
        x="Number of Contacts Interviewed / Number of Cases Interviewed:Q",
        y="Study Summary:N",
    )
)
alt.hconcat(ratio, ratio2)

In [386]:
# List the rows that report a household-contact percentage, alongside their source.
# NOTE(review): the displayed values mix magnitudes (e.g. 81.4 / 80.0 / 74.0 next to
# 0.78 / 0.7 / 0.66), which looks like some sources use a 0-100 scale and others 0-1.
# Confirm and normalise before comparing or plotting this column.
df[['Data Source','contacts_named_perc_household']].dropna()

Unnamed: 0,Data Source,contacts_named_perc_household
2,A Multifaceted Evaluation of a COVID-19 Contac...,81.4
21,COVID-19 Case Investigation and Contact Tracin...,0.78
38,COVID-19 Contact Tracing Outcomes in Washingto...,80.0
39,COVID-19 Contact Tracing Outcomes in Washingto...,74.0
101,Factors Influencing the Results of COVID-19 Ca...,0.7
104,Integrating Contact Tracers Into Point-of-Care...,0.66
