# Data Paper Figures and Statistics
This notebook produces figures and statistics for the data descriptor paper, using the openly available version of the UK COVID-19 Vocal Audio Dataset. Some subfigures use metadata from the protected version of the dataset and cannot be replicated here. All figures are written to .svg format and then further processed for appearance and layout. 

In [None]:
# python imports 
import os
import pandas as pd
#import kaleido # Required for exporting figures
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import datetime as dt
import io

In [None]:
# reads participant metadata file 
# The path is relative, so the CSV is looked up in the kernel's current working directory.
participant_df =  pd.read_csv('participant_metadata.csv')

In [None]:
# reads audio metadata file 
# The path is relative, so the CSV is looked up in the kernel's current working directory.
audio_df = pd.read_csv('audio_metadata.csv')

### Data Paper Statistics 

Total Participants

In [None]:
participant_df.shape[0]

Positive cases (COVID)

In [None]:
participant_df['covid_test_result'].value_counts(dropna=False)

In [None]:
100*participant_df['covid_test_result'].value_counts(dropna=False, normalize=True)

% PCR Test Results

In [None]:
participant_df[~participant_df['covid_test_result'].isna()].shape

In [None]:
participant_df[(participant_df['covid_test_method'].str.contains('PCR', na=False)) & (~participant_df['covid_test_result'].isna())].shape[0]

In [None]:
# Percentage of one hard-coded count relative to another, rounded to one decimal place.
# NOTE(review): 70565 and 72999 are magic numbers; presumably they are the outputs of the two
# preceding cells (PCR tests with a recorded result / all recorded results) — confirm, and
# recompute them if the metadata file changes.
round((70565/72999)*100, 1)

In [None]:
# Tos dataset contains 27101 PCR test results
70565/27101

In [None]:
round(100*participant_df[(participant_df['covid_test_method'].str.contains('PCR', na=False)) & (~participant_df['covid_test_result'].isna())].shape[0]/participant_df.shape[0],1)

PCR positive participants 

In [None]:
participant_df[participant_df['covid_test_method'].str.contains('PCR', na=False)]['covid_test_result'].value_counts(dropna=False)

% Respiratory Symptoms

In [None]:
respiratory_symptoms = ['symptom_new_continuous_cough', 'symptom_cough_any', 
                        'symptom_runny_or_blocked_nose', 'symptom_shortness_of_breath',
       'symptom_sore_throat',]

In [None]:
# Flag participants who reported at least one of the respiratory symptoms listed above.
# The indicators are summed column-wise in a single vectorised call instead of evaluating a
# Python lambda for every row via DataFrame.apply; `sum` skips missing values in both
# forms, so the resulting boolean flag is unchanged.
participant_df['symptoms_respiratory'] = participant_df[respiratory_symptoms].sum(axis=1) > 0
# Number of participants with any respiratory symptom (shown as the cell output).
participant_df['symptoms_respiratory'].sum()

In [None]:
round(100*participant_df['symptoms_respiratory'].sum()/participant_df.shape[0],1)

In [None]:
round(100*participant_df[participant_df['covid_test_result']=='Positive']['symptoms_respiratory'].value_counts(normalize=True, dropna=False),1)

In [None]:
round(100*participant_df[participant_df['covid_test_result']=='Negative']['symptoms_respiratory'].value_counts(normalize=True, dropna=False),1)

In [None]:
### % of SARS-CoV-2 positive participants who reported having no symptoms

# Boolean masks are named once and reused for both counts.
is_covid_positive = participant_df['covid_test_result'] == 'Positive'
reported_no_symptoms = participant_df['symptom_none'] == 1

participants_covid = participant_df[is_covid_positive].shape[0]
covid_no_symptoms = participant_df[is_covid_positive & reported_no_symptoms].shape[0]

round(100 * (covid_no_symptoms / participants_covid), 1)

In [None]:
### % participants testing positive for SARS-CoV-2 infection reporting no respiratory symptoms

# Depends on two names created by earlier cells: the `symptoms_respiratory` column and the
# `participants_covid` count of positive participants.
covid_no_respiratory_symptoms = participant_df[(participant_df['covid_test_result']=='Positive') & (participant_df['symptoms_respiratory']==0)].shape[0]

round(100*(covid_no_respiratory_symptoms/participants_covid), 1)

% Respiratory conditions

In [None]:
respiratory_conditions = ['respiratory_condition_asthma', 'respiratory_condition_other']

In [None]:
participant_df[participant_df[respiratory_conditions].sum(axis=1)>0].shape[0]

In [None]:
round(100*participant_df[participant_df[respiratory_conditions].sum(axis=1)>0].shape[0]/participant_df.shape[0],1)

In [None]:
### % participants testing positive for SARS-CoV-2 infection reporting a respiratory condition

covid_positive = participant_df['covid_test_result'] == 'Positive'
has_asthma = participant_df['respiratory_condition_asthma'] == 1
has_other_condition = participant_df['respiratory_condition_other'] == 1

participants_covid = int(covid_positive.sum())
participants_covid_asthma = int((covid_positive & has_asthma).sum())
participants_covid_other_condition = int((covid_positive & has_other_condition).sum())

# A participant reporting both asthma and another condition must be counted once.
# Adding the two counts above (as this cell previously did) counts such participants
# twice and inflates the percentage, so the union of the two masks is used instead.
participants_covid_any_condition = int((covid_positive & (has_asthma | has_other_condition)).sum())

round(100 * participants_covid_any_condition / participants_covid, 1)

% Asthma

In [None]:
participant_df[participant_df['respiratory_condition_asthma']==1].shape[0]

In [None]:
round(100*participant_df[participant_df['respiratory_condition_asthma']==1].shape[0]/participant_df.shape[0],1)

In [None]:
### % participants with asthma testing positive for SARS-CoV-2 infection

participants_asthma_covid = participant_df[(participant_df['respiratory_condition_asthma']==1) & (participant_df['covid_test_result']=='Positive')].shape[0]
participants_asthma = participant_df[participant_df['respiratory_condition_asthma']==1].shape[0]

round(100*participants_asthma_covid/participants_asthma, 1)

In [None]:
### % participants with asthma reporting another respiratory condition

participants_asthma_other_condition = participant_df[(participant_df['respiratory_condition_asthma']==1) & (participant_df['respiratory_condition_other']==1)].shape[0]

round(100*participants_asthma_other_condition/participants_asthma, 1)

In [None]:
### % participants with asthma testing positive for influenza infection

has_asthma = participant_df['respiratory_condition_asthma'] == 1
flu_a_positive = participant_df['influenza_a_test_result'] == 'Positive'
flu_b_positive = participant_df['influenza_b_test_result'] == 'Positive'

# Recomputed here so the cell does not depend on a value left behind by an earlier cell.
participants_asthma = int(has_asthma.sum())
participants_asthma_influenza_a = int((has_asthma & flu_a_positive).sum())
participants_asthma_influenza_b = int((has_asthma & flu_b_positive).sum())

# A participant positive for both influenza A and B is one participant: the union of the
# two masks is counted rather than the sum of the two counts, which would count them twice.
participants_asthma_influenza = int((has_asthma & (flu_a_positive | flu_b_positive)).sum())

round(100 * participants_asthma_influenza / participants_asthma, 1)

% Influenza A or B test results

In [None]:
participant_df['influenza_a_test_result'].value_counts()[['Positive', 'Negative']].sum()

In [None]:
participant_df['influenza_b_test_result'].value_counts()[['Positive', 'Negative']].sum()

In [None]:
round(100*participant_df['influenza_a_test_result'].value_counts()[['Positive', 'Negative']].sum()/participant_df.shape[0],1)

In [None]:
round(100*participant_df['influenza_b_test_result'].value_counts()[['Positive', 'Negative']].sum()/participant_df.shape[0], 1)

In [None]:
participant_df['influenza_a_test_result'].value_counts()

In [None]:
participant_df['influenza_b_test_result'].value_counts()

% Test method

In [None]:
participant_df['covid_test_method'].value_counts(dropna=False)

In [None]:
round(100*participant_df['covid_test_method'].value_counts(normalize=True, dropna=False),1)

In [None]:
participant_df['covid_test_result'].value_counts(dropna=False)

In [None]:
round(100*participant_df['covid_test_result'].value_counts(normalize=True, dropna=False),1)

% REACT, Test and Trace

In [None]:
participant_df['recruitment_source'].value_counts(dropna=False)

In [None]:
round(100*participant_df['recruitment_source'].value_counts(normalize=True, dropna=False),1) 

Recruitment rate

In [None]:
participant_df[['recruitment_source', 'survey_phase']].value_counts()

In [None]:
# REACT beta phase recruitment rate
# REACT beta phase recruitment: 295493 emails
round(100*36116/295493,2)

Age

In [None]:
participant_df['age'].value_counts(dropna=False)

Gender

In [None]:
participant_df['gender'].value_counts(dropna=False)

In [None]:
round(participant_df['gender'].value_counts(dropna=False, normalize=True)*100, 1)

% Wearing mask 

In [None]:
participant_df['wearing_mask'].value_counts(dropna=False)

In [None]:
round(100*participant_df['wearing_mask'].value_counts(dropna=False, normalize=True), 1)

Audio sample rate stats

In [None]:
audio_df[['cough_sample_rate', 'three_cough_sample_rate', 'exhalation_sample_rate']].value_counts()

In [None]:
round(100*audio_df[['cough_sample_rate', 'three_cough_sample_rate', 'exhalation_sample_rate']].value_counts(normalize=True),1)

Audio length

In [None]:
max([audio_df['cough_length'].max(),
      audio_df['three_cough_length'].max(),
     audio_df['exhalation_length'].max()])

Survey phase

In [None]:
participant_df['survey_phase'].value_counts(dropna=False)

In [None]:
round(100*participant_df['survey_phase'].value_counts(dropna=False, normalize=True),1)

Missing audio

In [None]:
audio_df['missing_audio'].value_counts(dropna=False)

In [None]:
round(100*audio_df['missing_audio'].value_counts(dropna=False, normalize=True),1)

Number of audio files

In [None]:
# Number of recordings, summed over the three audio modalities, whose size is recorded and
# is not below 45.
# NOTE(review): 45 is presumably a size threshold that excludes empty/header-only files — confirm.
sum(
    audio_df[audio_df[size_column] >= 45].shape[0]
    for size_column in ['exhalation_size', 'cough_size', 'three_cough_size']
)


In [None]:
# Four times the number of participants minus a hard-coded count.
# NOTE(review): 289696 is a magic number — presumably a file count obtained elsewhere
# (it is not produced by any visible cell); confirm its source and keep it in sync with the data.
participant_df.shape[0]*4 - 289696

Total size of audio in GB

In [None]:
# Total recorded audio size, divided by 1e9 to express it in GB (the size columns are
# therefore presumably in bytes — TODO confirm).
# Each modality is counted exactly once: the previous expression added
# `exhalation_size` twice, which over-stated the total.
# NOTE(review): if the duplicated term was standing in for a modality whose size column is
# not part of this metadata file, add that column to the list explicitly instead.
audio_df[['exhalation_size', 'cough_size', 'three_cough_size']].sum().sum() / 1000000000

In [None]:
# Metadata size in MB
27.8+35.2 #+7.4

Viral load data

In [None]:
participant_df[~participant_df['covid_viral_load'].isna()].shape[0]

In [None]:
round(100*participant_df[~participant_df['covid_viral_load'].isna()].shape[0]/participant_df.shape[0],1)

In [None]:
participant_df['covid_viral_load_category'].value_counts()

In [None]:
participant_df.columns

Variable completeness

In [None]:
# Answers that carry no information are turned into missing values so that the
# completeness figures below count them as absent. `replace` returns a new frame,
# leaving participant_df untouched.
non_response_labels = ['Prefer not to say', 'Unknown', 'Unknown/Void']
participant_na_df = participant_df.replace(non_response_labels, np.nan)

In [None]:
for column in participant_na_df.columns:
    print(column, round(100-(100*participant_na_df[column].isna().sum()/participant_na_df.shape[0]), 1), "%")

In [None]:
for column in audio_df.columns:
    print(column, round(100-(100*audio_df[column].isna().sum()/audio_df.shape[0]), 1), "%")

## Figure 1 - Recruitment

#### Figure 1.C Survey completion rates

In [None]:
# Data from survey provider, for beta phase only, in order of survey 
# One count per survey stage; the positions line up with `questions` below.
react_data = [43015, 42707, 41500, 41418, 41389, 41386, 41372, 41343, 41326, 41323, 37602, 37150, 36933, 36768]
tt_data = [41060, 40531, 28126, 28063, 28045, 28040, 28031, 28018, 28003, 28001, 26260, 26071, 25959, 25853]

# List survey questions
# The final label is the extra funnel stage appended from the processed dataset below.
# Its bold markup is now closed with </b>; it was previously written as "<b>...<b>".
questions = ["Privacy confirmation", "Participation agreement", 
               "Barcode entry", "Symptoms", 
               "Smoker status", "Respiratory conditions", 
               "First language", "Height", 
               "Weight", "Mask", 
               "Sentence audio", "Exhalations audio", 
               "Single cough audio", "Three coughs audio",
               "<b>Processed Data Set</b>"]

# Last funnel stage: number of beta-phase participants of each cohort present in the dataset.
# Both lists are rebuilt at the top of this cell, so re-running the cell does not append twice.
react_data.append(participant_df[participant_df['recruitment_source'].str.contains('REACT')]['survey_phase' ].value_counts()['beta'])
tt_data.append(participant_df[participant_df['recruitment_source']=='Test and Trace']['survey_phase' ].value_counts()['beta'])


In [None]:
# Plots figure: one horizontal funnel per recruitment cohort, added in the listed order
fig = go.Figure()

funnel_series = [
    ('NHS Test and Trace', tt_data, '#0072B2'),
    ('REACT', react_data, '#D55E00'),
]

for cohort_name, stage_counts, bar_colour in funnel_series:
    cohort_funnel = go.Funnel(
        name=cohort_name,
        orientation="h",
        width=0.75,
        y=questions,
        x=stage_counts,
        marker_color=bar_colour,
        textposition="inside",
        textinfo="percent initial",
    )
    fig.add_trace(cohort_funnel)

fig.update_layout(template='simple_white', font_family="Arial", font_size=12, width=500)

fig.show()

In [None]:
# Writes figure to image
# Files are written to the current working directory, as SVG at scale 1 and PNG at scale 3.
# The imports cell notes that kaleido is required for exporting figures.
fig.write_image('DataPaperFig1C.svg', scale=1)
fig.write_image('DataPaperFig1C.png', scale=3)

#### Combined subplots for figures 1.D, 1.E.  (combined to ensure matched font scale and width)

D. Submissions over time 

In [None]:
# Copies participant dataframe and gets number of participants 
all_df = participant_df.copy()

participants = participant_df.shape[0]

In [None]:
# Adds week and cohort variables 
# The submission date is parsed once (unparseable values become NaT) and then moved back
# by its weekday number of days.
submission_ts = pd.to_datetime(all_df['submission_date'], errors='coerce')
all_df['week'] = submission_ts.dt.date - submission_ts.dt.weekday * np.timedelta64(1, 'D')


def _cohort_label(source):
    """Map a recruitment source string to the short cohort key used in the plots."""
    if 'REACT' in source:
        return 'react'
    if 'Trace' in source:
        return 'tt'
    return 'None'


all_df['cohort'] = all_df['recruitment_source'].apply(_cohort_label)

In [None]:
# Groups by week 
# One row per (cohort, test result, week) with the number of submissions and that number
# as a percentage of `participants`.
week_df = (
    all_df
    .groupby(['cohort', 'covid_test_result', 'week'])
    .size()
    .reset_index(name='count')
)

week_df['percentage'] = 100 * week_df['count'] / participants

In [None]:
# Creates plotly traces to plot
def _weekly_series(cohort, test_result):
    """Return (weeks, percentages) from week_df for one cohort / COVID test result pair."""
    subset = week_df[(week_df['cohort'] == cohort) & (week_df['covid_test_result'] == test_result)]
    return subset['week'], subset['percentage']


# All four traces share stackgroup 'one', so they are stacked in the order they are plotted.
tt_pos_weeks, tt_pos_share = _weekly_series('tt', 'Positive')
tt_pos_timeline_trace = go.Scatter(
    x=tt_pos_weeks,
    y=tt_pos_share,
    stackgroup='one',
    name='NHS Test and Trace<br>COVID Positive',
    legendgroup=1,
    line=dict(width=0, color='rgba(0, 114, 178, 1)'),
)

tt_neg_weeks, tt_neg_share = _weekly_series('tt', 'Negative')
tt_neg_timeline_trace = go.Scatter(
    x=tt_neg_weeks,
    y=tt_neg_share,
    stackgroup='one',
    name='NHS Test and Trace<br>COVID Negative',
    legendgroup=1,
    line=dict(width=0, color='rgba(86, 180, 233, 1)'),
)

# Unlike the other three traces, this one sets its colour through fillcolor, not the line colour.
react_pos_weeks, react_pos_share = _weekly_series('react', 'Positive')
react_pos_timeline_trace = go.Scatter(
    x=react_pos_weeks,
    y=react_pos_share,
    stackgroup='one',
    name='REACT<br>COVID Positive',
    legendgroup=1,
    fillcolor='rgba(213,94,0,1)',
    line=dict(width=0),
)

react_neg_weeks, react_neg_share = _weekly_series('react', 'Negative')
react_neg_timeline_trace = go.Scatter(
    x=react_neg_weeks,
    y=react_neg_share,
    stackgroup='one',
    name='REACT<br>COVID Negative',
    legendgroup=1,
    line=dict(width=0, color='rgba(230, 159, 0, 1)'),
)

timeline_traces = [tt_pos_timeline_trace, tt_neg_timeline_trace, react_pos_timeline_trace, react_neg_timeline_trace]

E. Response Gap

In [None]:
# Splits participant dataframe by recruitment cohort 
# tt_df: rows whose recruitment source is exactly 'Test and Trace';
# react_df: rows whose recruitment source contains 'REACT'.
tt_df=participant_df[participant_df['recruitment_source']=='Test and Trace']
react_df= participant_df[participant_df['recruitment_source'].str.contains('REACT')]

In [None]:
# Run following cells 
# REACT-only version of the preparation that the loop cell further down repeats for both cohorts.
# Note: this rebinds the notebook-level `participants` to the REACT cohort size.
delay_df=react_df.copy()
participants=react_df.shape[0]

# format to be in context of submission date
# (the sign of symptom_onset is flipped)
delay_df['symptom_onset'] = 0-delay_df['symptom_onset'] 
# Make symptom onset na when no symptoms 
delay_df['symptom_onset'] = delay_df.apply(lambda x: np.nan if x['symptom_none']==1 else x['symptom_onset'], axis=1)
# Both test dates become offsets relative to the submission date.
# NOTE(review): this assumes the date columns are numeric (or otherwise subtractable) — confirm.
delay_df['covid_test_date'] = delay_df['covid_test_date'] - delay_df['submission_date']
delay_df['covid_test_processed_date'] = delay_df['covid_test_processed_date'] - delay_df['submission_date']



In [None]:
# Print % of participants completing survey within 72 hours 
# `covid_test_date` was converted in the previous cell to an offset from the submission date
# (presumably in days — confirm), so the flag is 1 when that offset is below -3.
# NOTE(review): the flag therefore marks submissions *outside* the 72-hour window, and rows with
# a missing test date fall into the 0 group because `NaN < -3` is False — confirm this reading
# is the intended one before quoting the printed percentages.
delay_df['submission_delay_72hrs'] = delay_df['covid_test_date'].apply(lambda x: 1 if x<-3 else 0)
print("percentage participants completing survey withing 72hrs of testing: ")
print(round(100*delay_df['submission_delay_72hrs'].value_counts(dropna=False, normalize=True), 1))

In [None]:
# format test completion date data
# Frequency table of the test-date offsets. The loop cell below rebinds `test_date_df`
# for each cohort, so this value is superseded once that cell runs.
test_date_df = delay_df['covid_test_date'].value_counts().reset_index().sort_values(by='covid_test_date', ascending=True)

In [None]:
# Formats data for both participant cohorts, prints 72 hours survey delay statistic

def _daily_share(day_offsets, cohort_size, as_int=True):
    """Return the percentage of a cohort observed at each day offset.

    Parameters
    ----------
    day_offsets : pandas.Series
        Offsets relative to survey submission; missing values are ignored.
    cohort_size : int
        Number of participants in the cohort (denominator of the percentage).
    as_int : bool
        Cast the offsets to int when True.

    Returns
    -------
    pandas.DataFrame
        Columns 'days', 'count' and 'percentage', sorted by ascending 'days'.
    """
    # Sorting the value_counts index and naming the columns explicitly keeps the rows in
    # day order on every pandas version. `reset_index().sort_values(by=<column name>)`
    # sorts by day on pandas >= 2 but by count on older releases, where the counts
    # column carries the original column name.
    share_df = (
        day_offsets
        .value_counts()
        .sort_index()
        .rename_axis('days')
        .reset_index(name='count')
    )
    share_df['percentage'] = 100 * share_df['count'] / cohort_size
    if as_int:
        share_df['days'] = share_df['days'].astype(int)
    return share_df


def _filled_area_trace(share_df, name, fillcolor):
    """Build the borderless filled-area trace used for every series in the delay panels."""
    return go.Scatter(
        x=share_df['days'],
        y=share_df['percentage'],
        name=name,
        fill='tozeroy',
        fillcolor=fillcolor,
        line=dict(width=0),
        legendgroup=2,
        showlegend=False,
        mode='lines')


subplot_traces=[]

# Order matters: index 0 holds the Test and Trace traces and index 1 the REACT traces.
for cohort_df in [tt_df, react_df]:

    delay_df = cohort_df.copy()
    participants = cohort_df.shape[0]

    # format to be in context of submission date
    delay_df['symptom_onset'] = 0 - delay_df['symptom_onset']
    # Make symptom onset na when no symptoms (vectorised equivalent of the row-wise apply)
    delay_df['symptom_onset'] = delay_df['symptom_onset'].mask(delay_df['symptom_none'] == 1)
    delay_df['covid_test_date'] = delay_df['covid_test_date'] - delay_df['submission_date']
    delay_df['covid_test_processed_date'] = delay_df['covid_test_processed_date'] - delay_df['submission_date']

    # Print % of participants completing survey within 72 hours
    # NOTE(review): the flag is 1 when the test offset is below -3, and missing offsets
    # compare False and so land in the 0 group — confirm this is the intended reading.
    delay_df['submission_delay_72hrs'] = (delay_df['covid_test_date'] < -3).astype(int)
    print("percentage participants completing survey withing 72hrs of testing: ")
    print(round(100*delay_df['submission_delay_72hrs'].value_counts(dropna=False, normalize=True), 1))

    # test completion date
    test_date_df = _daily_share(delay_df['covid_test_date'], participants)
    taken_trace = _filled_area_trace(test_date_df, 'Test Taken', 'rgba(213, 94, 0, 0.75)')

    # test processing date
    processed_date_df = _daily_share(delay_df['covid_test_processed_date'], participants)
    processed_trace = _filled_area_trace(processed_date_df, 'Test Processed', 'rgba(204, 121, 167, 0.75)')

    # symptom onset (offsets are left uncast, as before)
    onset_df = _daily_share(delay_df['symptom_onset'], participants, as_int=False)
    onset_trace = _filled_area_trace(onset_df, 'Symptom Onset', 'rgba(0, 114, 178, 0.75)')

    subplot_traces.append([processed_trace, taken_trace, onset_trace])

Combined Subplot

In [None]:
# Plots above traces 
# Layout: one full-width panel on the top row and two panels on the bottom row.
# Bold markup in the titles is now closed with </b> (it was previously written "<b>...<b>").
fig = make_subplots(rows=2, cols=2,  vertical_spacing=0.25, 
                    subplot_titles=('<b>Survey participation by week</b>', '<b>REACT</b>', '<b>NHS Test and Trace</b>'),
                    specs=[[{"colspan": 2}, None],
                          [{},{}]]
                   )


# The weekly timeline traces for the top panel are left disabled here:
# for trace in timeline_traces:
#     fig.add_trace(trace, row=1, col=1)

# subplot_traces was filled in the order [Test and Trace, REACT], so index 1 feeds the
# 'REACT' panel (row 2, col 1) and index 0 the 'NHS Test and Trace' panel (row 2, col 2).
for trace in subplot_traces[1]:
    fig.add_trace(trace, row=2, col=1)
    
for trace in subplot_traces[0]:
    fig.add_trace(trace, row=2, col=2)


# Dashed reference line at day 0 (the survey submission) on the bottom-row panels.
fig.add_vline(x=0, line_width=2, line_dash="dash", opacity=1,
              line_color="black", annotation_text = ' Survey<br> submission',
              annotation_position='top right', annotation_align='left', row=2)


fig.update_yaxes(ticksuffix='%')
fig.update_xaxes(title='Survey submission date', row=1)
# Axis title spelling corrected ("particpants" -> "participants").
fig.update_yaxes(title='Percentage of participants', title_standoff=10, range=[0,10], row=1)
fig.update_yaxes(title='Percentage of participants', title_standoff=5, row=2)
fig.update_xaxes(range=[-15, 5], title_text='Days relative to survey submission', row=2)
fig.update_yaxes(range=[0, 80], row=2)
  
fig.update_layout(template='simple_white',
                  font_family='Arial',
                  font_size=12,
                  height=600,
                 legend=dict(x=0.025,y=1, bgcolor='rgba(255, 255, 255, 0)', borderwidth=0,
                             tracegroupgap=130))

fig.show() 

In [None]:
# Writes plot to image
# Files are written to the current working directory, as SVG at scale 1 and PNG at scale 3.
# The imports cell notes that kaleido is required for exporting figures.
fig.write_image('DataPaperFig1DE.svg', scale=1) 
fig.write_image('DataPaperFig1DE.png', scale=3) 

## Figure 2 - Dataset summary

In [None]:
all_df = participant_df.copy()

In [None]:
participants = all_df.shape[0]

#### Audio Plot
Audio data shown in the manuscript is recorded by the author and not included in this repository. Similar plots can be recreated using the audio files listed in audio_df and read using scipy.io.wavfile.read

In [None]:
#my_audio_df = pd.read_csv('my_audio_df.csv')
#samplerate = 44100 

In [None]:
# One example audio file for each audio modality 
#sentence = np.array(my_audio_df['sentence'])
#exhalation = np.array(my_audio_df['exhalation'])
#one_cough = np.array(my_audio_df['one_cough'])
#three_coughs = np.array(my_audio_df['three_coughs'])

In [None]:
# Generates plotly traces and plots subplots

# fig = make_subplots(
#     rows=4, cols=1,
#     shared_xaxes=True,
#     shared_yaxes=True,
#     subplot_titles=("Speech (read sentence)", "Three sharp exhalations", "One volitional cough", "Three volitional coughs"))

# sentence_trace = go.Scatter(
#     x=[x/samplerate for x in range(0, len(sentence))],
#     y=sentence,
# showlegend=False,
# line=dict(color="#D55E00"))
    
# exhalation_trace = go.Scatter(
#     x=[x/samplerate for x in range(0, len(exhalation))],
#     y=exhalation,
# showlegend=False,
# line=dict(color="#CC79A7"))
    
# one_cough_trace = go.Scatter(
#     x=[x/samplerate for x in range(0, len(one_cough))],
#     y=one_cough,
# showlegend=False,
# line=dict(color="#CC79A7"))
    
# three_coughs_trace = go.Scatter(
#     x=[x/samplerate for x in range(0, len(three_coughs))],
#     y=three_coughs,
# showlegend=False,
# line=dict(color="#CC79A7"))

# fig.add_trace(sentence_trace, col=1, row=1)

# fig.add_trace(exhalation_trace, col=1, row=2)

# fig.add_trace(one_cough_trace, col=1, row=3)

# fig.add_trace(three_coughs_trace, col=1, row=4)

# fig.update_xaxes(title='Seconds', row=4)
# fig.update_yaxes(showticklabels=False)
# fig.update_yaxes(ticks="")
# fig.update_layout(template='simple_white', font_family='Arial', font_size=14, width=500)

# fig.show()

#### COVID Test Results
The following subsections generate plotly traces which are plotted later as subplots 

In [None]:
# Groups all PCR-type tests 
def test_simplify(test):
    try:
        if 'PCR' in test:
            return 'PCR'
        else:
            return test     
    except:
        return None

In [None]:
# Format data
# Test methods are simplified first, then participants are counted per (result, method) pair.
all_df['covid_test_method'] = all_df['covid_test_method'].apply(test_simplify)

method_counts = (
    all_df
    .groupby(['covid_test_result', 'covid_test_method'])
    .size()
    .reset_index(name='count')
)

# Void/unknown results and unknown methods are left out of the plot.
informative_rows = (
    (method_counts['covid_test_result'] != 'Unknown/Void')
    & (method_counts['covid_test_method'] != 'Unknown')
)

covid_df = method_counts[informative_rows].assign(
    percentage=lambda counts: 100 * counts['count'] / participants
)

In [None]:
# Generates plotly traces
def _share_by_result(method_rows):
    """Return the percentage per COVID test result for the covid_df rows selected by the mask."""
    return covid_df[method_rows].groupby('covid_test_result')['percentage'].sum()


pcr_share = _share_by_result(covid_df['covid_test_method']=='PCR')
lamp_share = _share_by_result(covid_df['covid_test_method']=='LAMP')
lft_share = _share_by_result(covid_df['covid_test_method']=='LFT')
# Everything stacked underneath the LFT bars (all non-LFT methods combined).
below_lft_share = _share_by_result(covid_df['covid_test_method']!='LFT')

# The three traces share offsetgroup 'A' and are stacked by hand through `base`.
# Each base is looked up by test-result label (reindex) rather than passed positionally, so
# a method lacking one of the results can no longer sit on top of the wrong bar; results
# absent from the lower layers get a base of 0.
covid_trace_1 = go.Bar(
    x=pcr_share.index,
    y=pcr_share.values,
    name='PCR',
    offsetgroup='A',
    marker=dict(line=dict(width=0)),
    marker_color='#0072B2')

covid_trace_2 = go.Bar(
    x=lamp_share.index,
    y=lamp_share.values,
    name='LAMP',
    offsetgroup='A',
    base=pcr_share.reindex(lamp_share.index, fill_value=0).values,
    marker=dict(line=dict(width=0)),
    marker_color='#009E73')

covid_trace_3 = go.Bar(
    x=lft_share.index,
    y=lft_share.values,
    name='LFT',
    offsetgroup='A',
    base=below_lft_share.reindex(lft_share.index, fill_value=0).values,
    marker=dict(line=dict(width=0)),
    marker_color='#F0E442')

covid_test_traces = [covid_trace_1, covid_trace_2, covid_trace_3]

#### Influenza Test Results

In [None]:
# Combines influenza A and B test results 

def flu_label(row):
    """Combine the influenza A and B results of one participant into a single label.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'influenza_a_test_result' and 'influenza_b_test_result'.

    Returns
    -------
    'Positive' if either test is positive, 'Negative' if both are negative, otherwise None
    (including when a result is missing or a key cannot be looked up).
    """
    try:
        if (row['influenza_a_test_result']=='Positive') or (row['influenza_b_test_result']=='Positive'):
            return 'Positive'
        if (row['influenza_a_test_result']=='Negative') and (row['influenza_b_test_result']=='Negative'):
            return 'Negative'
        return None
    except (KeyError, TypeError):
        # Only lookup failures are treated as "no result"; the previous bare `except`
        # silently swallowed every other error as well.
        return None

In [None]:
# Formats data 
# Row-wise combined influenza label, then the count and percentage of participants per label
# (rows without a label are dropped by the groupby).
all_df['influenza_test_result'] = all_df.apply(flu_label, axis=1)

flu_df = (
    all_df
    .groupby('influenza_test_result')
    .size()
    .reset_index(name='count')
)

flu_df['percentage'] = 100 * flu_df['count'] / participants

In [None]:
# Generates plotly trace
# Single bar series of the combined influenza result percentages; it is named 'PCR' and
# kept out of the legend (showlegend=False).
flu_trace = go.Bar(
x=flu_df['influenza_test_result'],
y=flu_df['percentage'],
name='PCR',
showlegend=False,
marker=dict(line=dict(width=0)),
marker_color='#0072B2')

#### Symptoms

In [None]:
# Groups non-respiratory symptoms for plot 

non_respiratory_symptoms =  [
       'symptom_change_to_sense_of_smell_or_taste',
       'symptom_abdominal_pain',
       'symptom_diarrhoea', 'symptom_fatigue',
       'symptom_fever_high_temperature', 'symptom_headache',
       'symptom_loss_of_taste',]

# 1 when at least one of the listed symptoms is reported, otherwise 0.
# The boolean result is cast explicitly: mapping True/False through `replace` depended on
# pandas silently downcasting the replaced values, a behaviour deprecated in pandas 2.2.
all_df['symptom_non_resp'] = all_df[non_respiratory_symptoms].any(axis=1).astype('int64')

In [None]:
# New symptom list with grouped non-respiratory symptoms 
# The order is significant: the plotting data is built in this order and only the first
# eight entries are kept for the figure, which leaves out 'symptom_prefer_not_to_say'.
symptoms = ['symptom_cough_any', 'symptom_new_continuous_cough', 'symptom_runny_or_blocked_nose',
            'symptom_sore_throat', 'symptom_shortness_of_breath',
       'symptom_non_resp', 'symptom_other', 'symptom_none', 'symptom_prefer_not_to_say',]

In [None]:
# Formats symptom names for plot
# Maps metadata column names to axis labels; '<br>' inserts a line break in plotly text.
symptom_dict = {'symptom_change_to_sense_of_smell_or_taste':'Change to sense<br>of smell or taste',
       'symptom_new_continuous_cough':'New continuous<br>cough',
        'symptom_abdominal_pain':'Abdominal pain',
       'symptom_cough_any': 'Cough (any)',
        'symptom_diarrhoea':'Diarrhoea',
        'symptom_fatigue':'Fatigue',
       'symptom_fever_high_temperature':'Fever',
        'symptom_headache':'Headache',
       'symptom_loss_of_taste':'Loss of taste',
        'symptom_runny_or_blocked_nose':'Runny or<br>blocked nose',
        'symptom_shortness_of_breath':'Shortness of<br>breath',
       'symptom_sore_throat':'Sore throat', 
        'symptom_other':'Other',
        'symptom_none':'None',
        'symptom_non_resp':'Non-respiratory<br>symptoms*',
        'symptom_prefer_not_to_say':'Prefer not to say',}

In [None]:
# Split by positive and negative COVID test results for plot 
# Rows with any other test result (or none) belong to neither frame.
pos_df = all_df[all_df['covid_test_result']=='Positive']
neg_df = all_df[all_df['covid_test_result']=='Negative']

In [None]:
# Format data for bar plot 
# One row per symptom: column name, count among COVID-positive and among COVID-negative participants.
symptom_counts = [
    [symptom_column, pos_df[symptom_column].sum(), neg_df[symptom_column].sum()]
    for symptom_column in symptoms
]

symptom_df = pd.DataFrame(symptom_counts, columns=['symptom', 'pos_count', 'neg_count'])

# Swap column names for display labels, then keep only the first eight rows.
symptom_df['symptom'] = symptom_df['symptom'].map(symptom_dict)
symptom_df = symptom_df.head(8)

In [None]:
# Generate plotly traces 
# Bar heights are symptom counts expressed as a percentage of `participants`
# (not of the positive or negative subgroup).
pos_symptom_trace = go.Bar(
x=symptom_df['symptom'],
y=100*symptom_df['pos_count']/participants, 
name='COVID<br>Positive', 
marker_color='rgba(230,159,0,1)',
)

neg_symptom_trace = go.Bar(
x=symptom_df['symptom'],
y=100*symptom_df['neg_count']/participants, 
name='COVID<br>Negative', 
marker_color='rgba(86,180,233,1)', 
)

# Order: positive first, then negative.
symptom_traces = [pos_symptom_trace, neg_symptom_trace]

#### Respiratory Conditions

In [None]:
# Create None columns and nan columns 
# Vectorised replacement for three row-wise DataFrame.apply calls; the resulting columns
# are identical, and a leftover `.sum()` statement with no effect has been dropped.
other_condition = all_df['respiratory_condition_other']
asthma_condition = all_df['respiratory_condition_asthma']
answer_missing = other_condition.isna()

# 1 where respiratory_condition_other is missing
# (presumably how a 'Prefer not to say' answer is stored — confirm).
all_df['respiratory_condition_prefer_not_to_say'] = answer_missing.astype(int)

# 1 where neither asthma nor another condition is reported, 0 otherwise, and NaN for the
# rows flagged above.
all_df['respiratory_condition_none'] = (
    ((other_condition + asthma_condition) == 0)
    .astype(int)
    .where(~answer_missing)
)
all_df['respiratory_condition_none'].value_counts(dropna=False)

In [None]:
# List of respiratory condition variables
# Order matters: the plotting frame built from this list keeps only its first three rows,
# which leaves out the trailing 'prefer not to say' entry.
conditions=['respiratory_condition_asthma',
       'respiratory_condition_other',
       'respiratory_condition_none',
       'respiratory_condition_prefer_not_to_say']

In [None]:
# Rename variables for plot 
# Maps the respiratory-condition column names to the labels shown on the x axis.
condition_dict = {'respiratory_condition_asthma':'Asthma',
       'respiratory_condition_other':'Other',
       'respiratory_condition_none':'None',
       'respiratory_condition_prefer_not_to_say':'Prefer not to say'}

In [None]:
# Split by positive and negative COVID test results for plot 
# Re-created here so the frames include the respiratory-condition columns added to all_df above.
pos_df = all_df[all_df['covid_test_result']=='Positive']
neg_df = all_df[all_df['covid_test_result']=='Negative']

In [None]:
# Get negative and positive counts 
# One [column name, positive count, negative count] entry per respiratory-condition column.
condition_counts = [
    [condition_column, pos_df[condition_column].sum(), neg_df[condition_column].sum()]
    for condition_column in conditions
]

In [None]:
# Format data for bar plot 
condition_df = pd.DataFrame(condition_counts, columns=['condition', 'pos_count', 'neg_count'])

# Column names become display labels.
condition_df['condition'] = condition_df['condition'].map(condition_dict)

condition_df = condition_df.head(3) # Removes 'prefer not to say' option

In [None]:
# Generate plotly traces
# Bar heights are condition counts expressed as a percentage of `participants`.
# The trace names contain 'Positive' / 'Negative'; the combined-figure cell uses those
# substrings to position the bars.
pos_condition_trace = go.Bar(
x=condition_df['condition'],
y=100*condition_df['pos_count']/participants, 
name='COVID Positive', 
showlegend=False,
marker_color='rgba(230,159,0,1)',
)

neg_condition_trace = go.Bar(
x=condition_df['condition'],
y=100*condition_df['neg_count']/participants, 
name='COVID Negative', 
showlegend=False,
marker_color='rgba(86,180,233,1)',
)


condition_traces = [pos_condition_trace, neg_condition_trace]

#### Smoker Status

In [None]:
# Group all variable options for current smokers
# Vectorised and safe for missing values: the previous lambdas evaluated `'...' in x`, which
# raises TypeError for missing entries — including the ones this cell itself creates, so
# running the cell a second time failed.
smoker_status = all_df['smoker_status']
is_current_smoker = smoker_status.str.contains('Current smoker', regex=False, na=False)
declined_to_answer = smoker_status.str.contains('Prefer not to say', regex=False, na=False)

# Every 'Current smoker ...' option collapses to one label; 'Prefer not to say' becomes missing.
all_df['smoker_status'] = (
    smoker_status
    .mask(is_current_smoker, 'Current smoker')
    .mask(declined_to_answer)
)

In [None]:
# Format data for bar plot 
smoker_df = (
    all_df
    .groupby(['smoker_status', 'covid_test_result'])
    .size()
    .reset_index(name='counts')
)
# Removes 'Prefer not to say' option by label. The previous positional slice `[0:8]` only did
# so when the grouped frame happened to have the expected number and order of rows, and
# could otherwise drop genuine categories.
smoker_df = smoker_df[smoker_df['smoker_status'] != 'Prefer not to say']

In [None]:
# Generate plotly traces 
# Bar heights are counts expressed as a percentage of `participants`. Unlike most of the other
# demographic traces, these two also set an explicit offsetgroup ('Pos' / 'Neg').
pos_smoker_trace = go.Bar(
    x=smoker_df[smoker_df['covid_test_result']=='Positive']['smoker_status'],
    y=100*smoker_df[smoker_df['covid_test_result']=='Positive']['counts']/participants,
    name='COVID Positive',
    showlegend=False,
    marker_color='rgba(230,159,0,1)', 
    offsetgroup='Pos',
)

neg_smoker_trace = go.Bar(
    x=smoker_df[smoker_df['covid_test_result']=='Negative']['smoker_status'],
    y=100*smoker_df[smoker_df['covid_test_result']=='Negative']['counts']/participants,
    name='COVID Negative',
    showlegend=False,
    marker_color='rgba(86,180,233,1)',
    offsetgroup='Neg',
)

smoker_traces = [pos_smoker_trace, neg_smoker_trace]


#### Age

In [None]:
# Format data for bar plot 
# Number of participants per (age, COVID test result) pair, in a 'counts' column.
age_df = (
    all_df
    .groupby(['age', 'covid_test_result'])
    .size()
    .reset_index(name='counts')
)

In [None]:
# Generate plotly traces
# Bar heights are counts per age value expressed as a percentage of `participants`.
pos_age_trace = go.Bar(
    x=age_df[age_df['covid_test_result']=='Positive']['age'],
    y=100*age_df[age_df['covid_test_result']=='Positive']['counts']/participants,
    name='COVID Positive',
    showlegend=False,
    marker_color='rgba(230,159,0,1)',
)

neg_age_trace = go.Bar(
    x=age_df[age_df['covid_test_result']=='Negative']['age'],
    y=100*age_df[age_df['covid_test_result']=='Negative']['counts']/participants,
    name='COVID Negative',
    showlegend=False,
    marker_color='rgba(86,180,233,1)'
)

age_traces = [pos_age_trace, neg_age_trace]

#### Gender

In [None]:
# Renames variables, formats data for bar plot, removes no response

# 'Female'/'Male' become 'Women'/'Men'; missing values get an explicit 'Not Recorded' label.
gender_labels = {'Female': 'Women', 'Male': 'Men'}
all_df['gender'] = all_df['gender'].replace(gender_labels).fillna('Not Recorded')

gender_counts = (
    all_df
    .groupby(['gender', 'covid_test_result'])
    .size()
    .reset_index(name='counts')
)
# The 'Not Recorded' group is excluded from the plot.
gender_df = gender_counts[gender_counts['gender'] != 'Not Recorded']

In [None]:
# Generates plotly traces
# Bar heights are counts per gender label expressed as a percentage of `participants`.
gender_pos_trace = go.Bar(
    x=gender_df[gender_df['covid_test_result']=='Positive']['gender'],
    y=100*gender_df[gender_df['covid_test_result']=='Positive']['counts']/participants,
    name='COVID Positive',
    showlegend=False,
    marker_color='rgba(230,159,0,1)',
)

gender_neg_trace = go.Bar(
    x=gender_df[gender_df['covid_test_result']=='Negative']['gender'],
    y=100*gender_df[gender_df['covid_test_result']=='Negative']['counts']/participants,
    name='COVID Negative',
    showlegend=False,
    marker_color='rgba(86,180,233,1)'
)

gender_traces = [gender_pos_trace, gender_neg_trace]

#### Combined subplot for all Figure 2 traces

In [None]:
# 4x4 grid: each row has a double-width panel in columns 1-2; rows 1-3 add single panels in
# columns 3 and 4, and row 4 adds a double-width panel in columns 3-4.
# Titles are listed row by row in subplot order. Their bold markup is now closed with </b>
# (it was previously written "<b>...<b>").
fig = make_subplots(rows=4, cols=4, vertical_spacing=0.115,
                     subplot_titles=('<b>Exhalation</b>', '<b>COVID Test Results</b>', '<b>Influenza Test Results</b>',
                                   '<b>One Cough</b>', '<b>Age</b>', '<b>Gender</b>',
                                   '<b>Three Coughs</b>', '<b>Respiratory Conditions</b>', '<b>Smoker Status</b>',
                                   '<b>"I love nothing more than an afternoon cream tea"</b>', '<b>Symptoms</b>' ),
                    specs = [[{"colspan":2}, None, {}, {}],
                    [{"colspan":2}, None, {}, {}],
                    [{"colspan":2}, None, {}, {}],
                    [{"colspan":2}, None, {"colspan":2}, None]] 
                        
                        )

# Audio traces can be uncommented to include if they are generated above 
#fig.add_trace(sentence_trace, row=4, col=1) 
#fig.add_trace(exhalation_trace, row=1, col=1) 
#fig.add_trace(one_cough_trace, row=2, col=1) 
#fig.add_trace(three_coughs_trace, row=3, col=1) 

# (traces, row, col) for every metadata panel, added in this order.
panel_layout = [
    (covid_test_traces, 1, 3),
    ([flu_trace], 1, 4),
    (age_traces, 2, 3),
    (gender_traces, 2, 4),
    (condition_traces, 3, 3),
    (smoker_traces, 3, 4),
    (symptom_traces, 4, 3),
]

for panel_traces, panel_row, panel_col in panel_layout:
    for trace in panel_traces:
        fig.add_trace(trace, row=panel_row, col=panel_col)
    
    
fig.for_each_trace(
    lambda trace: trace.update(colorbar=dict(x=0.925, y=0.16, len=0.15, borderwidth=0)) if trace.type == "choropleth" else ()
)

fig.for_each_trace(
    lambda trace: trace.update(width=0.3, marker=dict(line=dict(width=0))) if trace.type == "bar" else ()
)

fig.for_each_trace(
    lambda trace: trace.update(offset=0) if trace.type == "bar" and 'Negative' in trace.name else ()
)

fig.for_each_trace(
    lambda trace: trace.update(offset=-0.3) if trace.type == "bar" and 'Positive' in trace.name else ()
)

fig.for_each_trace(
    lambda trace: trace.update(legendgroup='A') if trace.type == "bar" and trace.offsetgroup == 'A' else trace.update(legendgroup='B')
)


fig.update_yaxes(showticklabels=False, ticks="", col=1, row=1)
fig.update_yaxes(showticklabels=False, ticks="", col=1, row=2)
fig.update_yaxes(showticklabels=False, ticks="", col=1, row=3)
fig.update_yaxes(showticklabels=False, ticks="", col=1, row=4)
fig.update_xaxes(title='Seconds', col=1, row=1)
fig.update_xaxes(title='Seconds', col=1, row=2)
fig.update_xaxes(title='Seconds', col=1, row=3)
fig.update_xaxes(title='Seconds', col=1, row=4)
fig.update_yaxes(ticksuffix="%", col=3)
fig.update_yaxes(ticksuffix="%", col=4)
fig.update_yaxes(ticksuffix="%", row=5, col=2)
fig.update_xaxes(tickangle=35, col=3)
fig.update_xaxes(tickangle=35, col=4)
fig.update_xaxes(tickangle=35, row=4, col=3)
fig.update_layout(template='simple_white',
                  font_family='Arial', font_size=12, 
                  width=1140, height=1040,
                  legend_tracegroupgap=100,
                 boxmode='group')

fig.show()

In [None]:
# Writes Figure 2 to disk as an SVG (scale 1) and as a PNG rendered at 3x scale.
# Image export relies on kaleido (see the note in the imports cell).
for image_path, image_scale in (('DataPaperFig2.svg', 1), ('DataPaperFig2.png', 3)):
    fig.write_image(image_path, scale=image_scale)

# Figure 3a.-c. - Technical Validation

In [None]:
# Lists the distinct values found in the `cough_channels` column of the audio
# metadata (presumably the number of audio channels of each cough recording).
# Note that only the cough recordings are inspected here.
cough_channel_values = audio_df['cough_channels']
cough_channel_values.unique()

In [None]:
# Figure 3a-c: violin plots of recording length, amplitude and signal mean/SD
# ratio, one violin per recording type, read from audio_df.

def _audio_violin(values, name, scalegroup, color='#CC79A7'):
    """Build one violin trace for a single audio_df column.

    Args:
        values: pandas Series holding the metric to plot on the y axis.
        name: trace name; as no x values are given it also labels the violin
            on the x axis.
        scalegroup: violins sharing a scalegroup are sized relative to each
            other, here by sample count (scalemode='count').
        color: marker colour of the violin.

    Returns:
        A go.Violin whose density is drawn only between the 0.5th and 99.5th
        percentiles of `values`, with an inner box plot, no individual points
        and no legend entry.
    """
    return go.Violin(
        y=values,
        name=name,
        marker_color=color,
        points=False,
        showlegend=False,
        fillcolor='rgba(204, 121, 167, 0.25)',
        scalegroup=scalegroup,
        scalemode='count',
        spanmode='manual',
        # Clip the drawn density to the central 99% of the data.
        span=[values.quantile(0.005), values.quantile(0.995)],
        box=dict(visible=True),
    )


fig = make_subplots(rows=1, cols=3, vertical_spacing=0.25, horizontal_spacing=0.15)

# (audio_df column prefix, display name, marker colour) for each recording type.
# The sentence recording is disabled; uncomment its entry to include it
# (requires the sentence_* columns to be present in audio_df).
fig3_recordings = [
    # ('sentence', 'Sentence', '#D55E00'),
    ('exhalation', 'Exhalation', '#CC79A7'),
    ('cough', 'One Cough', '#CC79A7'),
    ('three_cough', 'Three Coughs', '#CC79A7'),
]

# (audio_df column suffix, scalegroup, subplot column) for each metric:
# length, amplitude and variance (signal mean/SD ratio).
fig3_metrics = [
    ('length', 'A', 1),
    ('amplitude', 'B', 2),
    ('signal_mean_std_ratio', 'C', 3),
]

for column_suffix, scalegroup, subplot_col in fig3_metrics:
    for column_prefix, display_name, marker_color in fig3_recordings:
        fig.add_trace(
            _audio_violin(audio_df[f'{column_prefix}_{column_suffix}'],
                          display_name, scalegroup, color=marker_color),
            row=1, col=subplot_col,
        )

fig.update_xaxes(tickangle=45, row=1)
fig.update_yaxes(title='Length (seconds)', title_standoff=0.05, range=[0, 20], row=1, col=1)
fig.update_yaxes(title='Amplitude (AU)', title_standoff=0.05, row=1, col=2)
# On a log axis the range is given in log10 units, i.e. 10**-1.25 up to 1.
fig.update_yaxes(title='Mean/SD (ratio)', title_standoff=0.05, nticks=2,
                 type='log', range=[-1.25, 0], row=1, col=3)
# Kept as the last expression so the notebook displays the figure.
fig.update_layout(template='simple_white', violingap=0, violingroupgap=0.1, legend=dict(y=0.45, x=0.05),
                  font_family='Arial', font_size=13, width=750, height=300)


In [None]:
# Writes Figure 3 to disk as an SVG (scale 1) and as a PNG rendered at 3x scale.
# Image export relies on kaleido (see the note in the imports cell).
for image_path, image_scale in (("DataPaperFig3.svg", 1), ("DataPaperFig3.png", 3)):
    fig.write_image(image_path, scale=image_scale)

## Figure 4 - Bias
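Only the gender bias subfigures (the second row of the figure grid) can be recreated from the open dataset; the age group, ethnicity and region rows are left empty in this notebook.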

In [None]:
## ONS Census 2021 data
# from https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationestimates/datasets/populationandhouseholdestimatesenglandandwalescensus2021
# The published totals are embedded as CSV text, one line per row.

data = (
    ',Gender,Total,% Total\n'
    '0,Women,28833500,51.04195801719956\n'
    '1,Men,27656300,48.95804198280044\n'
)
census_csv = io.StringIO(data)
census_gender_df = pd.read_csv(census_csv, header=0)

In [None]:
# TT Data
# from Weekly statistics for NHS Test and Trace (England) 2 to 15 June 2022. 22 https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/1085136/NHS-test-and-trace-23-june-2022.pdf (2022)
# The published figures are embedded as CSV text, one line per row.

data = (
    ',Gender,Positive,Total,% Total,% Positive\n'
    '0,Women,5913615.0,68650051.0,59.40748549683927,5.1174469971827286\n'
    '1,Men,5239446.0,46907863.0,40.59251450316073,4.534043423456052\n'
)
tt_csv = io.StringIO(data)
tt_gender_df = pd.read_csv(tt_csv, header=0)

In [None]:
## REACT data
# from various REACT reports listed here https://www.imperial.ac.uk/medicine/research-and-impact/groups/react-study/real-time-assessment-of-community-transmission-findings/
# The figures are embedded as CSV text, one line per row.

data = (
    ',Gender,Positive,Total,% Total,% Positive\n'
    '1,Women,5798,330449,55.716971429053174,0.9776001753542916\n'
    '0,Men,4888,262613,44.27915054334539,0.8241651702538422\n'
    '2,Unknown,0,23,0.0038780276014399287,0.0\n'
)
# The row labelled 2 is the 'Unknown' gender row; it is dropped so that only
# Women and Men remain.
react_gender_df = pd.read_csv(io.StringIO(data), header=0).drop(index=2)

In [None]:
# Works on an independent (deep) copy so that the relabelling done for Figure 4
# does not modify participant_df.
bam_total_df = participant_df.copy(deep=True)

In [None]:
# Display labels for the gender values of the participant metadata, matching the
# 'Women' / 'Men' labels used by the reference datasets. Values that are not a
# key of this mapping become NaN.
gender_dict = {
    'Female':'Women',
    'Male':'Men',
    'Unknown': 'Unknown'
}

# Map from the untouched participant_df column rather than from bam_total_df itself:
# bam_total_df is a copy of participant_df (same index), so the result is the same
# on a first run, but re-running this cell no longer maps the already-relabelled
# values ('Women', 'Men') to NaN.
bam_total_df['gender'] = participant_df['gender'].map(gender_dict)

In [None]:
# Splits the study participants into the subsets compared in Figure 4, each with
# its COVID-positive counterpart.

# Whole study
bam_pos_total_df = bam_total_df[bam_total_df['covid_test_result']=='Positive']

# TT data only PCR & LAMP (filtered by testing types containing 'P')
is_tt_recruit = bam_total_df['recruitment_source']=='Test and Trace'
is_pcr_or_lamp = bam_total_df['covid_test_method'].str.contains('P', na=False)
bam_tt_df = bam_total_df[is_tt_recruit & is_pcr_or_lamp]
bam_pos_tt_df = bam_tt_df[bam_tt_df['covid_test_result']=='Positive']

# REACT recruits. na=False treats a missing recruitment source as "not REACT";
# without it a NaN in the mask makes the boolean indexing raise.
is_react_recruit = bam_total_df['recruitment_source'].str.contains('REACT', na=False)
bam_react_df = bam_total_df[is_react_recruit]
bam_pos_react_df = bam_react_df[bam_react_df['covid_test_result']=='Positive']

In [None]:
# Number of participants (rows) in each subset; used as the denominators of the
# percentages computed for Figure 4.
bam_tt, bam_react, bam_total = (
    len(subset) for subset in (bam_tt_df, bam_react_df, bam_total_df)
)

In [None]:
# Get gender data for each dataset subset

def _gender_breakdown(subset_df, denominator, count_col, pct_col, genders=('Women', 'Men')):
    """Count participants per gender and express the counts as percentages.

    Args:
        subset_df: DataFrame with a 'gender' column.
        denominator: number the counts are divided by to obtain percentages.
        count_col: name given to the column of counts.
        pct_col: name given to the column of percentages.
        genders: gender labels to keep in the result.

    Returns:
        DataFrame with columns 'Gender', `count_col` and `pct_col`, ordered by
        descending count and restricted to `genders`. The rows are selected by
        label rather than by position, so a large missing/other group can never
        displace Women or Men.
    """
    breakdown = subset_df['gender'].value_counts(dropna=False).reset_index()
    # Assigning the names positionally works regardless of how the installed
    # pandas version names the two columns produced by reset_index().
    breakdown.columns = ['Gender', count_col]
    breakdown[pct_col] = 100 * breakdown[count_col] / denominator
    return breakdown[breakdown['Gender'].isin(genders)].reset_index(drop=True)


# For every subset, the '% Positive' figures use the size of the whole subset
# (not the number of positives) as denominator.

# TT
bam_tt_gender_df = _gender_breakdown(bam_tt_df, bam_tt, 'Total', '% Total')
bam_pos_tt_gender_df = _gender_breakdown(bam_pos_tt_df, bam_tt, 'Positive', '% Positive')

# REACT
bam_react_gender_df = _gender_breakdown(bam_react_df, bam_react, 'Total', '% Total')
bam_pos_react_gender_df = _gender_breakdown(bam_pos_react_df, bam_react, 'Positive', '% Positive')

# Total
bam_total_gender_df = _gender_breakdown(bam_total_df, bam_total, 'Total', '% Total')
bam_pos_total_gender_df = _gender_breakdown(bam_pos_total_df, bam_total, 'Positive', '% Positive')

In [None]:
# Plot
# Figure 4: gender make-up of this study compared with Census 2021, NHS Test and
# Trace and REACT. Only row 2 (gender) receives traces in this cell; rows 1, 3
# and 4 only get their axis titles and ranges.

def _bias_bar(frame, y_col, name, color, offset):
    """Build one 0.4-wide bar trace of `frame[y_col]` against `frame['Gender']`.

    Args:
        frame: DataFrame with a 'Gender' column and the column named by `y_col`.
        y_col: column holding the bar heights (a percentage).
        name: trace name.
        color: bar fill colour.
        offset: horizontal shift of the bar relative to its category position.

    Returns:
        A go.Bar without outline and without legend entry.
    """
    return go.Bar(
        x=frame['Gender'],
        y=frame[y_col],
        width=0.4,
        offset=offset,
        showlegend=False,
        marker_color=color,
        marker=dict(line=dict(width=0)),
        name=name,
    )


fig=make_subplots(rows=4, cols=3,
                 vertical_spacing=0.175, horizontal_spacing=0.15,
                 subplot_titles=("<b>Census 2021</b>", "<b>NHS Test and Trace</b>", "<b>REACT</b>"))

# Colours: light/dark blue for the reference dataset (total/positive),
# orange/vermilion for this study (total/positive).
reference_total_color, reference_positive_color = "#56B4E9", "#0072B2"
study_total_color, study_positive_color = "#E69F00", "#D55E00"

# Reference bars start at the category position (offset 0) and study bars end
# there (offset -0.4). A 'positive' bar shares the offset of its 'total' bar and
# is added after it, so the two overlap.
reference_offset, study_offset = 0, -0.4

# Subplot column -> bars of the gender row, in the order they are added.
fig4_gender_bars = {
    # Census gender
    1: [
        (census_gender_df, '% Total', "Census Total", reference_total_color, reference_offset),
        (bam_total_gender_df, '% Total', "This Study Total", study_total_color, study_offset),
        (bam_pos_total_gender_df, '% Positive', "This Study Positive", study_positive_color, study_offset),
    ],
    # TT gender
    2: [
        (tt_gender_df, '% Total', "NHS Test and Trace Total", reference_total_color, reference_offset),
        (tt_gender_df, '% Positive', "NHS Test and Trace Positive", reference_positive_color, reference_offset),
        (bam_tt_gender_df, '% Total', "This Study Total (Test and Trace recruited)", study_total_color, study_offset),
        (bam_pos_tt_gender_df, '% Positive', "This Study Positive (Test and Trace recruited)", study_positive_color, study_offset),
    ],
    # REACT gender
    3: [
        (react_gender_df, '% Total', "REACT Study Total", reference_total_color, reference_offset),
        (react_gender_df, '% Positive', "REACT Study Positive", reference_positive_color, reference_offset),
        (bam_react_gender_df, '% Total', "This Study Total (REACT recruited)", study_total_color, study_offset),
        (bam_pos_react_gender_df, '% Positive', "This Study Positive (REACT recruited)", study_positive_color, study_offset),
    ],
}

for subplot_col, bar_specs in fig4_gender_bars.items():
    for frame, y_col, trace_name, color, offset in bar_specs:
        fig.add_trace(_bias_bar(frame, y_col, trace_name, color, offset), row=2, col=subplot_col)


fig.update_xaxes(title='Age Group', row=1)
fig.update_yaxes(range=[0,26], row=1)
fig.update_xaxes(title='Gender', row=2)
fig.update_yaxes(range=[0, 60], row=2)
fig.update_xaxes(title='Ethnicity', row=3)
fig.update_xaxes(title='Region', row=4)
fig.update_yaxes(range=[0, 20], row=4)
fig.update_xaxes(tickangle = 45, title_standoff = 5)
fig.update_yaxes(title='% of Total')

# Raise the three subplot titles (the figure's first three annotations).
for title_index in range(3):
    fig.layout.annotations[title_index].update(y=1.07)
# Kept as the last expression so the notebook displays the figure.
fig.update_layout(template='simple_white', font_family='Arial', font_size=14, height=850, width=1000)

In [None]:
# Writes Figure 4 to disk as an SVG (scale 1) and as a PNG rendered at 3x scale.
# Image export relies on kaleido (see the note in the imports cell).
for image_path, image_scale in (("DataPaperFig4.svg", 1), ("DataPaperFig4.png", 3)):
    fig.write_image(image_path, scale=image_scale)