# Data Paper Figures and Statistics
This notebook produces all figures and statistics for the data paper, using the openly available dataset

In [None]:
!pip install kaleido # to export high res plot images from plotly, requires kernel restart

In [None]:
pip install geopandas # to produce maps

In [None]:
# python imports 
import os
import pandas as pd
import geopandas as gpd
import kaleido
import numpy as np
import boto3
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import io
from io import StringIO
from scipy.io import wavfile
import datetime as dt
from botocore import UNSIGNED
from botocore.config import Config
import yaml

In [None]:
# Reads files from S3 bucket (unnecessary if not using S3)
def get_file(path, bucket_name):
    """Fetch an S3 object into memory using unsigned (anonymous) requests.

    Args:
        path: Key of the object inside the bucket.
        bucket_name: Name of the S3 bucket to read from.

    Returns:
        An io.BytesIO buffer holding the complete object body.
    """
    unsigned_config = Config(signature_version=UNSIGNED)
    s3 = boto3.resource('s3', config=unsigned_config, region_name='eu-west-2')
    payload = s3.Bucket(bucket_name).Object(path).get()['Body'].read()
    return io.BytesIO(payload)

In [None]:
# Reads yaml file containing data file paths
# A malformed file is allowed to raise yaml.YAMLError here: every later cell needs io_config,
# so failing at the source is clearer than a NameError further down.
with open('data_paper_io.yaml') as file:
    io_config = yaml.safe_load(file)

In [None]:
# reads participant metadata file 
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling; only point
# io_config at a bucket you trust.
participant_df =  pd.read_pickle(get_file(io_config['metadata']['participant_df'], io_config['bucket'])) 

In [None]:
# reads audio metadata file 
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling; only point
# io_config at a bucket you trust.
audio_df = pd.read_pickle(get_file(io_config['metadata']['audio_df'], io_config['bucket'])) 

### Data Paper Statistics 

Total Participants

In [None]:
participant_df.shape[0]

Positive cases (COVID)

In [None]:
participant_df['covid_test_result'].value_counts()

% PCR Test Results

In [None]:
participant_df[participant_df['covid_test_method'].str.contains('PCR', na=False)].shape[0]

In [None]:
# Tos dataset contains 27101 PCR test results
70794/27101

In [None]:
round(100*participant_df[participant_df['covid_test_method'].str.contains('PCR', na=False)].shape[0]/participant_df.shape[0],2)

PCR positive participants 

In [None]:
participant_df[participant_df['covid_test_method'].str.contains('PCR', na=False)]['covid_test_result'].value_counts()

% Respiratory Symptoms

In [None]:
respiratory_symptoms = ['symptom_new_continuous_cough', 'symptom_cough_any', 
                        'symptom_runny_or_blocked_nose', 'symptom_shortness_of_breath',
       'symptom_sore_throat',]

In [None]:
# Flag participants reporting at least one respiratory symptom (row sum over the symptom columns > 0).
# Vectorised column sum, matching the respiratory-conditions calculation further down, instead of a
# Python-level apply over every row.
participant_df['symptoms_respiratory'] = participant_df[respiratory_symptoms].sum(axis=1) > 0
participant_df['symptoms_respiratory'].sum()

In [None]:
round(100*participant_df['symptoms_respiratory'].sum()/participant_df.shape[0],2)

In [None]:
round(100*participant_df[participant_df['covid_test_result']=='Positive']['symptoms_respiratory'].value_counts(normalize=True, dropna=False),2)

In [None]:
round(100*participant_df[participant_df['covid_test_result']=='Negative']['symptoms_respiratory'].value_counts(normalize=True, dropna=False),2)

% Respiratory conditions

In [None]:
respiratory_conditions = ['respiratory_condition_asthma',
       'respiratory_condition_copd_or_emphysema',
       'respiratory_condition_other']

In [None]:
participant_df[participant_df[respiratory_conditions].sum(axis=1)>0].shape[0]

In [None]:
round(100*participant_df[participant_df[respiratory_conditions].sum(axis=1)>0].shape[0]/participant_df.shape[0],2)

% Asthma

In [None]:
participant_df[participant_df['respiratory_condition_asthma']==1].shape[0]

In [None]:
round(100*participant_df[participant_df['respiratory_condition_asthma']==1].shape[0]/participant_df.shape[0],2)

% Influenza A or B test results

In [None]:
participant_df['influenza_a_test_result'].value_counts()[['Positive', 'Negative']].sum()

In [None]:
participant_df['influenza_b_test_result'].value_counts()[['Positive', 'Negative']].sum()

In [None]:
round(100*participant_df['influenza_a_test_result'].value_counts()[['Positive', 'Negative']].sum()/participant_df.shape[0],2)

In [None]:
round(100*participant_df['influenza_b_test_result'].value_counts()[['Positive', 'Negative']].sum()/participant_df.shape[0], 2)

In [None]:
participant_df['influenza_a_test_result'].value_counts()

In [None]:
participant_df['influenza_b_test_result'].value_counts()

REACT rounds with influenza test results 

In [None]:
participant_df[['recruitment_source', 'influenza_a_test_result']].value_counts()

In [None]:
participant_df[['recruitment_source', 'influenza_b_test_result']].value_counts()

% Test method

In [None]:
participant_df['covid_test_method'].value_counts(dropna=False)

In [None]:
round(100*participant_df['covid_test_method'].value_counts(normalize=True, dropna=False),2)

In [None]:
participant_df['covid_test_result'].value_counts(dropna=False)

In [None]:
round(100*participant_df['covid_test_result'].value_counts(normalize=True, dropna=False),2)

Survey start and end dates

In [None]:
participant_df['submission_date'].min()

In [None]:
participant_df['submission_date'].max()

% REACT, Test and Trace

In [None]:
# Count for the most frequent recruitment source. value_counts() sorts descending, so position 0 is
# the largest group; .iloc makes the positional lookup explicit (plain [0] on a label index is deprecated).
participant_df['recruitment_source'].value_counts(dropna=False).iloc[0]

In [None]:
# Share of the most frequent recruitment source (Test and Trace); .iloc keeps the lookup positional
# without relying on deprecated integer indexing of a label index.
round(100*participant_df['recruitment_source'].value_counts(normalize=True, dropna=False).iloc[0],2) # Test and Trace

In [None]:
participant_df['recruitment_source'].value_counts(dropna=False)[1:].sum()

In [None]:
round(100*participant_df['recruitment_source'].value_counts(normalize=True, dropna=False)[1:].sum(),2) # REACT

Recruitment rate

In [None]:
participant_df['cohort'] = participant_df['recruitment_source'].apply(lambda x:'REACT' if 'REACT' in x else x)

In [None]:
participant_df[['cohort', 'survey_phase']].value_counts()

In [None]:
# REACT beta phase recruitment rate
# REACT beta phase recruitment: 295493 emails
round(100*36116/295493,2)

Median age 

In [None]:
age_df = participant_df.copy()

In [None]:
age_df = age_df[~age_df['age'].isna()]

In [None]:
# Ages are top-coded as the string '94+'; map it to 94 and coerce the column to a numeric dtype so
# that median() operates on numbers. assign() returns a new frame rather than writing into the
# filtered slice created in the previous cell.
age_df = age_df.assign(age=pd.to_numeric(age_df['age'].replace({'94+': 94})))

In [None]:
age_df['age'].median()

Gender

In [None]:
participant_df['gender'].value_counts(dropna=False)

In [None]:
round(participant_df['gender'].value_counts(dropna=False, normalize=True)*100, 2)

Ethnicity

In [None]:
participant_df['ethnicity'].value_counts(dropna=False)

In [None]:
round(participant_df['ethnicity'].value_counts(dropna=False, normalize=True)*100, 2)

% Wearing mask 

In [None]:
participant_df['wearing_mask'].value_counts(dropna=False)

In [None]:
round(100*participant_df['wearing_mask'].value_counts(dropna=False, normalize=True), 2)

Audio sample rate stats

In [None]:
audio_df[['cough_sample_rate', 'three_cough_sample_rate', 'exhalation_sample_rate', 'sentence_sample_rate']].value_counts()

In [None]:
round(100*audio_df[['cough_sample_rate', 'three_cough_sample_rate', 'exhalation_sample_rate', 'sentence_sample_rate']].value_counts(normalize=True),2)

Audio length

In [None]:
max([audio_df['cough_length'].max(),
      audio_df['three_cough_length'].max(),
     audio_df['exhalation_length'].max(),
     audio_df['sentence_length'].max()])

Pre-June stats

In [None]:
# Submissions before 1 June 2021. Both sides are normalised to pandas timestamps so the comparison
# works whether submission_date holds datetime.date objects or datetime64 values.
participant_df[pd.to_datetime(participant_df['submission_date'], errors='coerce') < pd.Timestamp(2021, 6, 1)].shape[0]

In [None]:
# Percentage of submissions before 1 June 2021 (timestamp comparison, independent of the column's date type).
100*participant_df[pd.to_datetime(participant_df['submission_date'], errors='coerce') < pd.Timestamp(2021, 6, 1)].shape[0]/participant_df.shape[0]

Survey phase

In [None]:
participant_df['survey_phase'].value_counts(dropna=False)

In [None]:
round(100*participant_df['survey_phase'].value_counts(dropna=False, normalize=True),2)

Missing audio

In [None]:
audio_df['missing_audio'].value_counts(dropna=False)

In [None]:
round(100*audio_df['missing_audio'].value_counts(dropna=False, normalize=True),2)

Number of audio files

In [None]:
# Count recordings across the four modalities whose size is present and not below 45.
size_columns = ['exhalation_size', 'sentence_size', 'cough_size', 'three_cough_size']
sum(
    audio_df[~(audio_df[size_column] < 45) & ~(audio_df[size_column].isna())].shape[0]
    for size_column in size_columns
)


In [None]:
participant_df.shape[0]*4 - 289696

Total size of audio in GB

In [None]:
# Total size over the four audio modalities, divided by 1e9 to report GB.
# Each size column is summed exactly once (the exhalation column used to be added twice).
size_columns = ['exhalation_size', 'sentence_size', 'cough_size', 'three_cough_size']
audio_df[size_columns].sum().sum()/1000000000

In [None]:
# Metadata size in MB
27.8+35.2 #+7.4

Audio Transcripts

In [None]:
round(100*audio_df[audio_df['sentence_transcript'] == "i love nothing more than an afternoon cream tea"].shape[0]/participant_df.shape[0], 2)

In [None]:
# contains 'nothing more'
round(100*audio_df[audio_df['sentence_transcript'].str.contains("nothing more", na=False)].shape[0]/participant_df.shape[0], 2)

In [None]:
# contains 'afternoon'
round(100*audio_df[audio_df['sentence_transcript'].str.contains("afternoon", na=False)].shape[0]/participant_df.shape[0], 2)

In [None]:
# contains 'afternoon' AND 'nothing more'
round(100*audio_df[(audio_df['sentence_transcript'].str.contains("afternoon", na=False)) & (audio_df['sentence_transcript'].str.contains("nothing more", na=False))].shape[0]/participant_df.shape[0], 2)

Viral load data

In [None]:
participant_df[~participant_df['covid_viral_load'].isna()].shape[0]

In [None]:
round(100*participant_df[~participant_df['covid_viral_load'].isna()].shape[0]/participant_df.shape[0],2)

In [None]:
participant_df['covid_viral_load_category'].value_counts()

English as first language

In [None]:
participant_df['language'].value_counts(dropna=False)

In [None]:
round(100*participant_df['language'].value_counts(dropna=False, normalize=True),2)

% Min height and weight

In [None]:
round(100*participant_df['height'].value_counts(dropna=False, normalize=True),2)

In [None]:
round(100*participant_df['weight'].value_counts(dropna=False, normalize=True),2)

Variable completeness

In [None]:
# Treat non-answers ('Prefer not to say', 'Unknown', 'Unknown/Void') as missing for the
# completeness calculation. replace() returns a new frame, so participant_df is left untouched.
non_answers = ['Prefer not to say', 'Unknown', 'Unknown/Void']
participant_na_df = participant_df.replace(non_answers, np.nan)

In [None]:
for column in participant_na_df.columns:
    print(column, round(100-(100*participant_na_df[column].isna().sum()/participant_na_df.shape[0]), 2), "%")

In [None]:
for column in audio_df.columns:
    print(column, round(100-(100*audio_df[column].isna().sum()/audio_df.shape[0]), 2), "%")

## Figure 1 - Recruitment

#### Figure 1.C Survey completion rates

In [None]:
# Data from survey provider, for beta phase only, in order of survey 
react_data = [43015, 42707, 41500, 41418, 41389, 41386, 41372, 41343, 41326, 41323, 37602, 37150, 36933, 36768]
tt_data = [41060, 40531, 28126, 28063, 28045, 28040, 28031, 28018, 28003, 28001, 26260, 26071, 25959, 25853]

# List survey questions; the last entry is the extra funnel stage appended below
questions = ["Privacy confirmation", "Participation agreement", 
               "Barcode entry", "Symptoms", 
               "Smoker status", "Respiratory conditions", 
               "First language", "Height", 
               "Weight", "Mask", 
               "Sentence audio", "Exhalations audio", 
               "Single cough audio", "Three coughs audio",
               "<b>Processed Data Set</b>"]  # bold tag is now properly closed

# Final funnel stage: number of beta-phase participants per cohort present in participant_df
react_data.append(participant_df[participant_df['recruitment_source'].str.contains('REACT')]['survey_phase' ].value_counts()['beta'])
tt_data.append(participant_df[participant_df['recruitment_source']=='Test and Trace']['survey_phase' ].value_counts()['beta'])


In [None]:
# Plots figure
fig = go.Figure()

# (legend name, stage counts, bar colour) for each cohort, in drawing order
funnel_series = [
    ('NHS Test and Trace', tt_data, '#0072B2'),
    ('REACT', react_data, '#D55E00'),
]

for cohort_name, stage_counts, bar_color in funnel_series:
    fig.add_trace(go.Funnel(
        name=cohort_name,
        orientation="h",
        width=0.75,
        y=questions,
        x=stage_counts,
        marker_color=bar_color,
        textposition="inside",
        textinfo="percent initial"))

fig.update_layout(template='simple_white', font_family="Arial", font_size=12, width=500)

fig.show()

In [None]:
# Writes figure to image
fig.write_image('DataPaperFig1C.svg', scale=1)
fig.write_image('DataPaperFig1C.png', scale=3)

#### Combined subplots for figures 1.D, 1.E.  (combined to ensure matched font scale and width)

D. Submissions over time 

In [None]:
# Copies participant dataframe and gets number of participants 
all_df = participant_df.copy()

participants = participant_df.shape[0]

In [None]:
# Adds week and cohort variables 
# 'week' is the submission date moved back by its weekday number (pandas: Monday=0)
submission_ts = pd.to_datetime(all_df['submission_date'], errors='coerce')
all_df['week'] = submission_ts.dt.date - submission_ts.dt.weekday * np.timedelta64(1, 'D')


def _cohort_code(source):
    """Map a recruitment source label to 'react', 'tt' or the string 'None'."""
    if 'REACT' in source:
        return 'react'
    if 'Trace' in source:
        return 'tt'
    return 'None'


all_df['cohort'] = all_df['recruitment_source'].apply(_cohort_code)

In [None]:
# Groups by week 
week_df = all_df.groupby(['cohort', 'covid_test_result', 'week']).size().to_frame('count').reset_index()

week_df['percentage']=100*week_df['count']/participants

In [None]:
# Creates plotly traces to plot
# One trace per (cohort, COVID test result) pair: x is the week, y is the weekly percentage
# computed in week_df. All four traces share stackgroup 'one', so plotly stacks them.

# NHS Test and Trace, COVID positive
tt_pos_timeline_trace=go.Scatter(x=week_df[(week_df['cohort']=='tt')&(week_df['covid_test_result']=='Positive')]['week'],
                         y=week_df[(week_df['cohort']=='tt')&(week_df['covid_test_result']=='Positive')]['percentage'],
                         stackgroup='one', name='NHS Test and Trace<br>COVID Positive', legendgroup=1,
                        line=dict(width=0, color='rgba(0, 114, 178, 1)'))

# NHS Test and Trace, COVID negative
tt_neg_timeline_trace=go.Scatter(x=week_df[(week_df['cohort']=='tt')&(week_df['covid_test_result']=='Negative')]['week'],
                         y=week_df[(week_df['cohort']=='tt')&(week_df['covid_test_result']=='Negative')]['percentage'],
                         stackgroup='one', name='NHS Test and Trace<br>COVID Negative', legendgroup=1,
                        line=dict(width=0, color='rgba(86, 180, 233, 1)'),)

# REACT, COVID positive (this trace sets fillcolor directly rather than a line colour)
react_pos_timeline_trace=go.Scatter(x=week_df[(week_df['cohort']=='react')&(week_df['covid_test_result']=='Positive')]['week'],
                         y=week_df[(week_df['cohort']=='react')&(week_df['covid_test_result']=='Positive')]['percentage'],
                         stackgroup='one', name='REACT<br>COVID Positive', legendgroup=1,
                         fillcolor='rgba(213,94,0,1)',
                        line=dict(width=0),)

# REACT, COVID negative
react_neg_timeline_trace=go.Scatter(x=week_df[(week_df['cohort']=='react')&(week_df['covid_test_result']=='Negative')]['week'],
                         y=week_df[(week_df['cohort']=='react')&(week_df['covid_test_result']=='Negative')]['percentage'],
                         stackgroup='one', name='REACT<br>COVID Negative', legendgroup=1,
                        line=dict(width=0, color='rgba(230, 159, 0, 1)'),)


# Order in which the traces are later added to the figure
timeline_traces = [tt_pos_timeline_trace, tt_neg_timeline_trace, react_pos_timeline_trace, react_neg_timeline_trace]

E. Response Gap

In [None]:
# Splits participant dataframe by recruitment cohort 
# tt_df: rows whose recruitment_source is exactly 'Test and Trace'
tt_df=participant_df[participant_df['recruitment_source']=='Test and Trace']
# react_df: rows whose recruitment_source contains 'REACT'
react_df= participant_df[participant_df['recruitment_source'].str.contains('REACT')]

In [None]:
# Formats data for both participant cohorts, prints 72 hours survey delay statistic
def _whole_days(deltas):
    """Convert a Series of time differences to integer day counts, dropping missing values."""
    return pd.to_timedelta(deltas).dt.days.dropna().astype(int)


def _offset_distribution(day_offsets, total):
    """Tabulate participants per day offset.

    Returns a DataFrame with columns 'days', 'count' and 'percentage' (count as a share of
    `total`), sorted by ascending day offset. Missing offsets are ignored.
    """
    # sort_index + reset_index(name=...) does not depend on the column names produced by
    # value_counts().reset_index(), which changed in pandas 2.0.
    distribution = (day_offsets.dropna()
                    .value_counts()
                    .sort_index()
                    .rename_axis('days')
                    .reset_index(name='count'))
    distribution['percentage'] = 100 * distribution['count'] / total
    return distribution


def _filled_area_trace(distribution, name, fillcolor):
    """Build a filled, outline-free area trace of percentage against day offset."""
    return go.Scatter(
        x=distribution['days'],
        y=distribution['percentage'],
        name=name,
        fill='tozeroy',
        fillcolor=fillcolor,
        line=dict(width=0),
        legendgroup=2,
        showlegend=False,
        mode='lines')


subplot_traces=[]

for cohort_df in [tt_df, react_df]:

    delay_df=cohort_df.copy()
    # Cohort-local denominator; kept separate from the notebook-wide `participants` total so
    # re-running earlier cells after this one does not pick up a cohort count.
    cohort_participants=cohort_df.shape[0]

    # format to be in context of submission date, and make symptom onset na when no symptoms
    delay_df['symptom_onset'] = (0-delay_df['symptom_onset']).mask(delay_df['symptom_none']==1)
    delay_df['covid_test_date'] = delay_df['covid_test_date'] - delay_df['submission_date']
    delay_df['covid_test_processed_date'] = delay_df['covid_test_processed_date'] - delay_df['submission_date']

    # Print % of participants completing survey within 72 hours.
    # Flag is 1 when the test date is more than 3 days before submission, otherwise 0.
    # NOTE(review): rows with a missing test date compare False and are therefore counted as 0
    # (i.e. alongside "within 72 hours") — confirm this is intended.
    test_offset_days = pd.to_timedelta(delay_df['covid_test_date']).dt.days
    delay_df['submission_delay_72hrs'] = (test_offset_days < -3).astype(int)
    print("percentage participants completing survey within 72hrs of testing: ")
    print(round(100*delay_df['submission_delay_72hrs'].value_counts(dropna=False, normalize=True), 2))

    # test completion date, in whole days relative to submission
    test_date_df = _offset_distribution(_whole_days(delay_df['covid_test_date']), cohort_participants)
    taken_trace = _filled_area_trace(test_date_df, 'Test Taken', 'rgba(213, 94, 0, 0.75)')

    # test processing date, in whole days relative to submission
    processed_date_df = _offset_distribution(_whole_days(delay_df['covid_test_processed_date']), cohort_participants)
    processed_trace = _filled_area_trace(processed_date_df, 'Test Processed', 'rgba(204, 121, 167, 0.75)')

    # symptom onset (already numeric, so no timedelta conversion)
    onset_df = _offset_distribution(delay_df['symptom_onset'], cohort_participants)
    onset_trace = _filled_area_trace(onset_df, 'Symptom Onset', 'rgba(0, 114, 178, 0.75)')

    subplot_traces.append([processed_trace, taken_trace, onset_trace])
    

Combined Subplot

In [None]:
# Plots above traces 
# Layout: row 1 is one full-width panel (weekly participation); row 2 holds the two
# response-gap panels, REACT on the left and NHS Test and Trace on the right.
fig = make_subplots(rows=2, cols=2,  vertical_spacing=0.25, #shared_yaxes=True,
                    subplot_titles=('<b>Survey participation by week</b>', '<b>REACT</b>', '<b>NHS Test and Trace</b>'),
                    specs=[[{"colspan": 2}, None],
                          [{},{}]]
                   )


for trace in timeline_traces:
    fig.add_trace(trace, row=1, col=1)

# subplot_traces was filled in the order [Test and Trace, REACT]
for trace in subplot_traces[1]:
    fig.add_trace(trace, row=2, col=1)
    
for trace in subplot_traces[0]:
    fig.add_trace(trace, row=2, col=2)


# Dashed reference line at day 0 (survey submission) on both row-2 panels
fig.add_vline(x=0, line_width=2, line_dash="dash", opacity=1,
              line_color="black", annotation_text = ' Survey<br> submission',
              annotation_position='top right', annotation_align='left', row=2)


fig.update_yaxes(ticksuffix='%')
fig.update_xaxes(title='Survey submission date', row=1)
fig.update_yaxes(title='Percentage of participants', title_standoff=10, range=[0,10], row=1)
fig.update_yaxes(title='Percentage of participants', title_standoff=5, row=2)
fig.update_xaxes(range=[-15, 5], title_text='Days relative to survey submission', row=2)
fig.update_yaxes(range=[0, 80], row=2)
  
fig.update_layout(template='simple_white',
                  font_family='Arial',
                  font_size=12,
                  height=600,
                 legend=dict(x=0.025,y=1, bgcolor='rgba(255, 255, 255, 0)', borderwidth=0,
                             tracegroupgap=130))

fig.show() 

In [None]:
# Writes plot to image
fig.write_image('DataPaperFig1DE.svg', scale=1) 
fig.write_image('DataPaperFig1DE.png', scale=3) 

## Figure 2 - Dataset summary

In [None]:
all_df = participant_df.copy()

In [None]:
participants = all_df.shape[0]

#### Audio Plot
Audio data shown in the manuscript is recorded by the author and not included in this repository. Similar plots can be recreated using the audio files listed in audio_df and read using scipy.io.wavfile.read

In [None]:
# The example recordings are not distributed with the repository: supply your own
# 'my_audio_df.csv' with 'sentence', 'exhalation', 'one_cough' and 'three_coughs' columns.
# Loading it here means a missing file is reported immediately, instead of the next cell
# failing with a NameError on my_audio_df.
my_audio_df = pd.read_csv('my_audio_df.csv')
samplerate = 44100 

In [None]:
# One example audio file for each audio modality 
sentence = np.array(my_audio_df['sentence'])
exhalation = np.array(my_audio_df['exhalation'])
one_cough = np.array(my_audio_df['one_cough'])
three_coughs = np.array(my_audio_df['three_coughs'])

In [None]:
# Generates plotly traces and plots subplots

def _waveform_trace(samples):
    """Build a line trace of `samples` against time in seconds (sample index / samplerate)."""
    seconds = [sample_index/samplerate for sample_index in range(len(samples))]
    return go.Scatter(x=seconds, y=samples, showlegend=False, line=dict(color="#CC79A7"))


fig = make_subplots(
    rows=4, cols=1,
    shared_xaxes=True,
    shared_yaxes=True,
    subplot_titles=("Speech (read sentence)", "Three sharp exhalations", "One volitional cough", "Three volitional coughs"))

sentence_trace = _waveform_trace(sentence)
exhalation_trace = _waveform_trace(exhalation)
one_cough_trace = _waveform_trace(one_cough)
three_coughs_trace = _waveform_trace(three_coughs)

# One modality per row, top to bottom, in the same order as subplot_titles
panel_traces = [sentence_trace, exhalation_trace, one_cough_trace, three_coughs_trace]
for panel_row, panel_trace in enumerate(panel_traces, start=1):
    fig.add_trace(panel_trace, col=1, row=panel_row)

fig.update_xaxes(title='Seconds', row=4)
fig.update_yaxes(showticklabels=False)
fig.update_yaxes(ticks="")
fig.update_layout(template='simple_white', font_family='Arial', font_size=14, width=500)

fig.show()

#### COVID Test Results
The following subsections generate plotly traces which are plotted later as subplots 

In [None]:
# Groups all PCR-type tests 
def test_simplify(test):
    try:
        if 'PCR' in test:
            return 'PCR'
        else:
            return test     
    except:
        return None

In [None]:
# Format data
all_df['covid_test_method'] = all_df['covid_test_method'].apply(test_simplify)

# Participants per (test result, test method) pair
covid_df = (
    all_df.groupby(['covid_test_result', 'covid_test_method'])
    .size()
    .reset_index(name='count')
)

# Drop pairs with an unknown/void result or an unknown method
has_known_result = covid_df['covid_test_result'] != 'Unknown/Void'
has_known_method = covid_df['covid_test_method'] != 'Unknown'
covid_df = covid_df[has_known_result & has_known_method]

covid_df = covid_df.assign(percentage=100 * covid_df['count'] / participants)

In [None]:
# Generates plotly traces
# Reshape to one row per test result and one column per method (0 where a combination is
# absent). Every trace then shares the same x ordering as its `base`, so stacked segments
# stay aligned even if a method is missing one of the result categories.
method_share = covid_df.pivot_table(index='covid_test_result', columns='covid_test_method',
                                    values='percentage', aggfunc='sum', fill_value=0)
for method in ('PCR', 'LAMP', 'LFT'):
    if method not in method_share.columns:
        method_share[method] = 0
test_results = method_share.index

# Bottom segment: PCR
covid_trace_1 = go.Bar(
x=test_results,
y=method_share['PCR'],
name='PCR',
offsetgroup='A',
marker=dict(line=dict(width=0)),
marker_color='#0072B2')

# Middle segment: LAMP, stacked on top of PCR
covid_trace_2 = go.Bar(
x=test_results,
y=method_share['LAMP'],
name='LAMP',
offsetgroup='A',
base = method_share['PCR'],
marker=dict(line=dict(width=0)),
marker_color='#009E73')

# Top segment: LFT, stacked on top of every other method
covid_trace_3 = go.Bar(
x=test_results,
y=method_share['LFT'],
name='LFT',
offsetgroup='A',
base = method_share.drop(columns='LFT').sum(axis=1),
marker=dict(line=dict(width=0)),
marker_color='#F0E442')

covid_test_traces = [covid_trace_1, covid_trace_2, covid_trace_3]

#### Influenza Test Results

In [None]:
# Combines influenza A and B test results 

def flu_label(row):
    """Combine the influenza A and B results of one participant into a single label.

    Args:
        row: Mapping-like record with 'influenza_a_test_result' and
            'influenza_b_test_result' entries (e.g. a DataFrame row).

    Returns:
        'Positive' if either result is 'Positive', 'Negative' if both are 'Negative',
        otherwise None (including when a field is absent or cannot be compared).
    """
    try:
        if (row['influenza_a_test_result']=='Positive') or (row['influenza_b_test_result']=='Positive'):
            return 'Positive'
        if (row['influenza_a_test_result']=='Negative') and (row['influenza_b_test_result']=='Negative'):
            return 'Negative'
        return None
    except (KeyError, TypeError):
        # Missing field or un-comparable value: no combined result. Other exceptions
        # (including KeyboardInterrupt) are no longer swallowed.
        return None

In [None]:
# Formats data 
all_df['influenza_test_result'] = all_df.apply(flu_label, axis=1)

# Participants per combined influenza result, as a count and as a share of all participants
flu_df = all_df.groupby('influenza_test_result').size().reset_index(name='count')
flu_df = flu_df.assign(percentage=100 * flu_df['count'] / participants)

In [None]:
# Generates plotly trace
flu_trace = go.Bar(
x=flu_df['influenza_test_result'],
y=flu_df['percentage'],
name='PCR',
showlegend=False,
marker=dict(line=dict(width=0)),
marker_color='#0072B2')

#### Symptoms

In [None]:
# Groups non-respiratory symptoms for plot 

non_respiratory_symptoms =  [
       'symptom_change_to_sense_of_smell_or_taste',
       'symptom_abdominal_pain',
       'symptom_diarrhoea', 'symptom_fatigue',
       'symptom_fever_high_temperature', 'symptom_headache',
       'symptom_loss_of_taste',]

# 1 when any non-respiratory symptom is reported, else 0. astype(int) converts the boolean
# result directly and avoids the silent downcasting that replace({True: 1, False: 0}) relies on.
all_df['symptom_non_resp'] = all_df[non_respiratory_symptoms].any(axis=1).astype(int)

In [None]:
# New symptom list with grouped non-respiratory symptoms 
symptoms = ['symptom_cough_any', 'symptom_new_continuous_cough', 'symptom_runny_or_blocked_nose',
            'symptom_sore_throat', 'symptom_shortness_of_breath',
       'symptom_non_resp', 'symptom_other', 'symptom_none', 'symptom_prefer_not_to_say',]

In [None]:
# Formats symptom names for plot
symptom_dict = {'symptom_change_to_sense_of_smell_or_taste':'Change to sense<br>of smell or taste',
       'symptom_new_continuous_cough':'New continuous<br>cough',
        'symptom_abdominal_pain':'Abdominal pain',
       'symptom_cough_any': 'Cough (any)',
        'symptom_diarrhoea':'Diarrhoea',
        'symptom_fatigue':'Fatigue',
       'symptom_fever_high_temperature':'Fever',
        'symptom_headache':'Headache',
       'symptom_loss_of_taste':'Loss of taste',
        'symptom_runny_or_blocked_nose':'Runny or<br>blocked nose',
        'symptom_shortness_of_breath':'Shortness of<br>breath',
       'symptom_sore_throat':'Sore throat', 
        'symptom_other':'Other',
        'symptom_none':'None',
        'symptom_non_resp':'Non-respiratory<br>symptoms*',
        'symptom_prefer_not_to_say':'Prefer not to say',}

In [None]:
# Split by positive an negative COVID test results for plot 
pos_df = all_df[all_df['covid_test_result']=='Positive']
neg_df = all_df[all_df['covid_test_result']=='Negative']

In [None]:
# Format data for bar plot 
# One record per symptom: [name, total among COVID positives, total among COVID negatives]
symptom_counts = [
    [symptom, pos_df[symptom].sum(), neg_df[symptom].sum()]
    for symptom in symptoms
]

symptom_df = pd.DataFrame(symptom_counts, columns=['symptom', 'pos_count', 'neg_count'])

# Swap variable names for display labels, then keep the first 8 rows
# (drops the final 'symptom_prefer_not_to_say' entry of the 9-item symptoms list)
symptom_df['symptom'] = symptom_df['symptom'].map(symptom_dict)
symptom_df = symptom_df[:8]

In [None]:
# Generate plotly traces 
pos_symptom_trace = go.Bar(
x=symptom_df['symptom'],
y=100*symptom_df['pos_count']/participants, 
name='COVID<br>Positive', 
marker_color='rgba(230,159,0,1)',
)

neg_symptom_trace = go.Bar(
x=symptom_df['symptom'],
y=100*symptom_df['neg_count']/participants, 
name='COVID<br>Negative', 
marker_color='rgba(86,180,233,1)', 
)

symptom_traces = [pos_symptom_trace, neg_symptom_trace]

#### Respiratory Conditions

In [None]:
# List of respiratory condition variables
conditions=['respiratory_condition_asthma',
       'respiratory_condition_copd_or_emphysema',
       'respiratory_condition_other',
       'respiratory_condition_none',
       'respiratory_condition_prefer_not_to_say']

In [None]:
# Rename variables for plot 
condition_dict = {'respiratory_condition_asthma':'Asthma',
       'respiratory_condition_copd_or_emphysema':'COPD or<br>Emphysema',
       'respiratory_condition_other':'Other',
       'respiratory_condition_none':'None',
       'respiratory_condition_prefer_not_to_say':'Prefer not to say'}

In [None]:
# Split by positive an negative COVID test results for plot 
pos_df = all_df[all_df['covid_test_result']=='Positive']
neg_df = all_df[all_df['covid_test_result']=='Negative']

In [None]:
# Get negative and positive counts 
# One record per condition: [name, total among COVID positives, total among COVID negatives]
condition_counts = [
    [condition, pos_df[condition].sum(), neg_df[condition].sum()]
    for condition in conditions
]

In [None]:
# Format data for bar plot 
condition_df = pd.DataFrame(condition_counts)

condition_df.columns = ['condition', 'pos_count', 'neg_count']

condition_df['condition'] = condition_df['condition'].map(condition_dict)

condition_df = condition_df[:4] # Removes 'prefer not to say'

In [None]:
# Generate plotly traces
pos_condition_trace = go.Bar(
x=condition_df['condition'],
y=100*condition_df['pos_count']/participants, 
name='COVID Positive', 
showlegend=False,
marker_color='rgba(230,159,0,1)',
)

neg_condition_trace = go.Bar(
x=condition_df['condition'],
y=100*condition_df['neg_count']/participants, 
name='COVID Negative', 
showlegend=False,
marker_color='rgba(86,180,233,1)',
)


condition_traces = [pos_condition_trace, neg_condition_trace]

#### Smoker Status

In [None]:
# Group all variable options for current smokers
# The isinstance guard leaves missing (non-string) values untouched instead of raising
# TypeError on the substring test.
all_df['smoker_status']=all_df['smoker_status'].apply(
    lambda x: 'Current smoker' if isinstance(x, str) and 'Current smoker' in x else x)

In [None]:
# Format data for bar plot 
smoker_df = all_df.groupby(['smoker_status' ,'covid_test_result']).size().reset_index(name='counts')
# Removes the 'Prefer not to say' option by label rather than by row position, so the result
# does not depend on how many test-result categories each smoker status has.
# NOTE(review): assumes the option is stored exactly as 'Prefer not to say' — confirm.
smoker_df = smoker_df[smoker_df['smoker_status'] != 'Prefer not to say']

In [None]:
# Generate plotly traces 
pos_smoker_trace = go.Bar(
    x=smoker_df[smoker_df['covid_test_result']=='Positive']['smoker_status'],
    y=100*smoker_df[smoker_df['covid_test_result']=='Positive']['counts']/participants,
    name='COVID Positive',
    showlegend=False,
    marker_color='rgba(230,159,0,1)', 
    offsetgroup='Pos',
)

neg_smoker_trace = go.Bar(
    x=smoker_df[smoker_df['covid_test_result']=='Negative']['smoker_status'],
    y=100*smoker_df[smoker_df['covid_test_result']=='Negative']['counts']/participants,
    name='COVID Negative',
    showlegend=False,
    marker_color='rgba(86,180,233,1)',
    offsetgroup='Neg',
)

smoker_traces = [pos_smoker_trace, neg_smoker_trace]


#### Language

In [None]:
# Group all non-English languages
def language_simplify(language):
    """Collapse a first-language answer into the categories used in the plot.

    Parameters
    ----------
    language : object
        Raw value from the 'language' column.

    Returns
    -------
    str or None
        'English' for English, 'Other<br>Language' for any other language
        string, and None for 'Prefer not to say' or for missing / non-string
        values (e.g. NaN), so those rows drop out of the later groupby.
    """
    # Missing values are not languages. The previous bare `except` could never
    # fire, so NaN used to be counted as 'Other<br>Language'.
    if not isinstance(language, str) or language == 'Prefer not to say':
        return None
    if language == 'English':
        return 'English'
    return 'Other<br>Language'
    
# Replace each raw language answer with its simplified plot category.
all_df['language'] = all_df['language'].apply(language_simplify)

In [None]:
# Count participants for every (language category, COVID result) pair.
language_df = (
    all_df.groupby(['language', 'covid_test_result'])
    .size()
    .reset_index(name='counts')
)

In [None]:
# Bar traces for language, one per COVID test result.
# Heights are percentages of `participants`.
pos_language_trace = go.Bar(
    name='COVID Positive',
    x=language_df.loc[language_df['covid_test_result'] == 'Positive', 'language'],
    y=language_df.loc[language_df['covid_test_result'] == 'Positive', 'counts'].mul(100) / participants,
    marker=dict(color='rgba(230,159,0,1)'),
    showlegend=False,
)

neg_language_trace = go.Bar(
    name='COVID Negative',
    x=language_df.loc[language_df['covid_test_result'] == 'Negative', 'language'],
    y=language_df.loc[language_df['covid_test_result'] == 'Negative', 'counts'].mul(100) / participants,
    marker=dict(color='rgba(86,180,233,1)'),
    showlegend=False,
)

language_traces = [pos_language_trace, neg_language_trace]

#### Age

In [None]:
# Generates plotly traces for the age panel: for each COVID test result a half
# violin (left side) plus a box plot that is pushed to the right of it.

colors = {'Positive':'#E69F00','Negative':'#56B4E9'}


def _build_age_traces(result):
    """Return the (violin, hidden spacer box, visible box) traces for one test result.

    Parameters
    ----------
    result : str
        Value of 'covid_test_result' to plot; must be a key of `colors`.

    Returns
    -------
    tuple
        (go.Violin, go.Box, go.Box) in the order they are added to the figure.
    """
    subset = all_df[all_df['covid_test_result']==result]
    color = colors[result]

    violin = go.Violin(
        x=subset['covid_test_result'],
        y=subset['age'],
        name=result,  # the Negative traces were previously mislabelled 'Positive'
        box_visible=False,
        meanline_visible=False,
        points=False,
        orientation='v',
        side='negative',
        marker=dict(color=color),
        fillcolor=color,
        scalegroup='A',
        scalemode='count',
        spanmode='hard',
        showlegend=False,
        opacity=1,
    )

    spacer_box = go.Box(  # not visible, used to offset visible boxplot
        x=subset['covid_test_result'],
        y=subset['age'],
        name=result,
        orientation='v',
        offsetgroup='A',
        opacity=0,
        showlegend=False,
    )

    visible_box = go.Box(
        x=subset['covid_test_result'],
        y=subset['age'],
        orientation='v',
        boxmean=True,
        offsetgroup='B',
        boxpoints=False,
        line=dict(color=color),
        showlegend=False,
    )

    return violin, spacer_box, visible_box


# The individual trace names are kept so any later cell referring to them still works.
age_trace_1, age_trace_2, age_trace_3 = _build_age_traces('Positive')
age_trace_4, age_trace_5, age_trace_6 = _build_age_traces('Negative')

age_traces = [age_trace_1, age_trace_2, age_trace_3, age_trace_4, age_trace_5, age_trace_6]

#### Gender

In [None]:
# Relabel gender categories for display, count participants per
# (gender, COVID result) pair, and drop the rows with no recorded gender.

gender_labels = {'Female': 'Women', 'Male': 'Men', 'Unknown': 'Not Recorded'}
# Values without an entry in gender_labels are passed through unchanged.
all_df['gender'] = all_df['gender'].map(lambda value: gender_labels.get(value, value))

gender_df = (
    all_df.groupby(['gender', 'covid_test_result'])
    .size()
    .reset_index(name='counts')
)
gender_df = gender_df.loc[gender_df['gender'] != 'Not Recorded']

In [None]:
# Generates the gender bar traces, one per COVID test result.
# Heights are percentages of `participants`.
gender_pos_trace = go.Bar(
    name='COVID Positive',
    x=gender_df.loc[gender_df['covid_test_result'] == 'Positive', 'gender'],
    y=gender_df.loc[gender_df['covid_test_result'] == 'Positive', 'counts'].mul(100) / participants,
    marker=dict(color='rgba(230,159,0,1)'),
    showlegend=False,
)

gender_neg_trace = go.Bar(
    name='COVID Negative',
    x=gender_df.loc[gender_df['covid_test_result'] == 'Negative', 'gender'],
    y=gender_df.loc[gender_df['covid_test_result'] == 'Negative', 'counts'].mul(100) / participants,
    marker=dict(color='rgba(86,180,233,1)'),
    showlegend=False,
)

gender_traces = [gender_pos_trace, gender_neg_trace]

#### Ethnicity

In [None]:
# Groups all ethnicities that are not white British for plot 

def ethnicity_simplify(ethnicity):
    """Collapse an ethnicity answer into the two categories shown in the plot.

    Returns None for 'No response', 'White<br>British' for 'White British',
    and 'Other<br>Ethnicity' for every other value.
    """
    if ethnicity == 'No response':
        return None
    return 'White<br>British' if ethnicity == 'White British' else 'Other<br>Ethnicity'
    
# Replace each raw ethnicity answer with its simplified plot category.
all_df['ethnicity'] = all_df['ethnicity'].apply(ethnicity_simplify)

In [None]:
# Count participants for every (ethnicity category, COVID result) pair.

ethnicity_df = (
    all_df.groupby(['ethnicity', 'covid_test_result'])
    .size()
    .reset_index(name='counts')
)

In [None]:
# Ethnicity bar traces, one per COVID test result.
# Heights are percentages of `participants`.

ethnicity_pos_trace = go.Bar(
    name='COVID Positive',
    x=ethnicity_df.loc[ethnicity_df['covid_test_result'] == 'Positive', 'ethnicity'],
    y=ethnicity_df.loc[ethnicity_df['covid_test_result'] == 'Positive', 'counts'].mul(100) / participants,
    marker=dict(color='rgba(230,159,0,1)'),
    showlegend=False,
)

ethnicity_neg_trace = go.Bar(
    name='COVID Negative',
    x=ethnicity_df.loc[ethnicity_df['covid_test_result'] == 'Negative', 'ethnicity'],
    y=ethnicity_df.loc[ethnicity_df['covid_test_result'] == 'Negative', 'counts'].mul(100) / participants,
    marker=dict(color='rgba(86,180,233,1)'),
    showlegend=False,
)

ethnicity_traces = [ethnicity_pos_trace, ethnicity_neg_trace]

#### UK administrative region
Choropleth map plot 

In [None]:
# Work on a copy so the participant metadata itself is left untouched.
region_df = participant_df.copy(deep=True)
# Total number of participants; used as the denominator for regional percentages.
participants = len(region_df)

In [None]:
# Aggregate participants by region code and express each count as a
# percentage of all participants.
region_df = region_df.groupby('region_code').size().reset_index(name='count')
region_df = region_df.assign(percentage=region_df['count'].mul(100) / participants)

In [None]:
# Boundary files (GeoJSON) for UK countries and regions, published on the
# Office for National Statistics open geography portal.
countries_url = "https://opendata.arcgis.com/datasets/e05662741ac2452081eaf663dfea92e3_0.geojson"  # Dec 2021
regions_url = "https://opendata.arcgis.com/datasets/64286008effe44b1be10a8c76e6e731c_0.geojson"  # Dec 2021

# Each file is fetched over the network and parsed into a GeoDataFrame.
countries_gdf = gpd.read_file(filename=countries_url)
region_gdf = gpd.read_file(filename=regions_url)

In [None]:
# Combine the countries and English regions boundaries into one table.
# The countries table is reshaped to the regions schema first: the 'CTRY21NMW'
# column is dropped and the remaining columns take the region column names
# (so the country code ends up in 'RGN21CD').
countries_gdf = countries_gdf.drop(columns=['CTRY21NMW'])
# NOTE(review): row 0 is presumably England, which the regions file already
# covers - confirm against the downloaded file.
countries_gdf = countries_gdf.drop(index=0)
countries_gdf.columns = region_gdf.columns

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps the original row labels, as append did.
uk_gdf = pd.concat([region_gdf, countries_gdf])
# Inner join: only areas that have participants in region_df are kept.
uk_gdf = uk_gdf.merge(region_df, left_on='RGN21CD', right_on='region_code')

In [None]:
# Make sure the merged table is a GeoDataFrame and label its Coordinate Reference System.
uk_gdf = gpd.GeoDataFrame(uk_gdf, geometry='geometry')
# set_crs only labels the coordinates as WGS84 (EPSG:4326); it does not reproject.
# It replaces the deprecated {'init': 'epsg:4326'} syntax and the direct `.crs`
# assignment. allow_override=True keeps the old "overwrite whatever is there" behaviour.
uk_gdf = uk_gdf.set_crs('EPSG:4326', allow_override=True)

# GeoJSON-like mapping of the boundaries, passed to plotly.
geojson=uk_gdf.__geo_interface__

In [None]:
# Generates the choropleth trace: each UK region / country is shaded by its
# share of participants.
region_trace = go.Choropleth(
    locations=uk_gdf["RGN21CD"],  # area codes, matched against the GeoJSON features
    z=uk_gdf["percentage"],  # data to be color-coded
    zmin=0,
    zmax=20,  # extreme values excluded to avoid skewing colorscale
    geojson=geojson,  # boundary geometries for the codes in `locations`
    featureidkey="properties.RGN21CD",  # GeoJSON property that holds the area code
    colorscale=[[0, '#E5F5F9'], [1, '#D55E00']],
    reversescale=False,
    marker_line={"width": 0},
    # The title text and font live under colorbar.title; the flat `titlefont`
    # attribute is deprecated.
    colorbar={"title": {"text": "Participants<br>by Region<br />&nbsp;", "font": {"size": 14}},
              "tickfont": {"size": 12}, "thickness": 15, "ticksuffix": '%',
              "len": 0.25, "yanchor": "bottom", "x": 0.57, 'outlinewidth': 0},
)

#### Combined subplot for all Figure 2 traces

In [None]:
# Lay out Figure 2 on a 5x4 grid. Column 1 (spanning two columns) is reserved for
# the audio waveform panels; the remaining cells hold the participant metadata panels.
# Subplot titles are listed in the same order as the non-empty cells in `specs`.
fig = make_subplots(rows=5, cols=4, vertical_spacing=0.125,
                    subplot_titles=('<b>Sentence</b>', '<b>COVID Test Results</b>', '<b>Influenza Test Results</b>',
                                    '<b>Exhalation</b>', '<b>Symptoms</b>',
                                    '<b>One Cough</b>', '<b>Respiratory Conditions</b>', '<b>Smoker Status</b>',
                                    '<b>Three Coughs</b>', '<b>Language</b>', '<b>UK Region</b>',
                                    '<b>Age</b>', '<b>Gender</b>', '<b>Ethnicity</b>'),
                    specs=[[{"colspan":2}, None, {}, {}],
                           [{"colspan":2}, None, {"colspan":2}, None],
                           [{"colspan":2}, None, {}, {}],
                           [{"colspan":2}, None, {}, {"type":"choropleth", "rowspan":2}],
                           [{}, {}, {}, None]])

# Audio traces can be uncommented to include if they are generated above
#fig.add_trace(sentence_trace, row=1, col=1)
#fig.add_trace(exhalation_trace, row=2, col=1)
#fig.add_trace(one_cough_trace, row=3, col=1)
#fig.add_trace(three_coughs_trace, row=4, col=1)

# (traces, row, col) for every metadata panel, in the order the traces are added.
figure2_panels = [
    (covid_test_traces, 1, 3),
    ([flu_trace], 1, 4),
    (symptom_traces, 2, 3),
    (condition_traces, 3, 3),
    (smoker_traces, 3, 4),
    (language_traces, 4, 3),
    ([region_trace], 4, 4),
    (age_traces, 5, 1),
    (gender_traces, 5, 2),
    (ethnicity_traces, 5, 3),
]
for panel_traces, panel_row, panel_col in figure2_panels:
    for trace in panel_traces:
        fig.add_trace(trace, row=panel_row, col=panel_col)

# Shared per-trace styling.
for trace in fig.data:
    if trace.type == "choropleth":
        # Reposition / resize the map colorbar for the combined figure
        # (overrides the x and len set when the trace was created).
        trace.update(colorbar=dict(x=0.925, y=0.16, len=0.15, borderwidth=0))
    elif trace.type == "bar":
        trace.update(width=0.3, marker=dict(line=dict(width=0)))
        # Place the Positive and Negative bars side by side around each category.
        if 'Negative' in trace.name:
            trace.update(offset=0)
        if 'Positive' in trace.name:
            trace.update(offset=-0.3)
    # Bars in offsetgroup 'A' go to legend group 'A'; every other trace goes to 'B'.
    if trace.type == "bar" and trace.offsetgroup == 'A':
        trace.update(legendgroup='A')
    else:
        trace.update(legendgroup='B')

# Restrict the map to the UK and hide the base map.
fig.update_geos(
    lataxis_range=[49.5,61.0],
    lonaxis_range=[-7.8,1.7],
    projection={"type":"azimuthal equal area"},
    visible=False)

# Audio panels (column 1, rows 1-4): no y ticks, time on the x axis.
for audio_row in range(1, 5):
    fig.update_yaxes(showticklabels=False, ticks="", col=1, row=audio_row)
    fig.update_xaxes(title='Seconds', col=1, row=audio_row)

# Metadata panels: percentage y axes and slanted category labels.
fig.update_yaxes(ticksuffix="%", col=3)
fig.update_yaxes(ticksuffix="%", col=4)
fig.update_yaxes(ticksuffix="%", row=5, col=2)
fig.update_xaxes(tickangle=35, col=3)
fig.update_xaxes(tickangle=35, col=4)
fig.update_xaxes(tickangle=35, row=5)

fig.update_layout(template='simple_white',
                  font_family='Arial', font_size=12,
                  width=1100, height=1300,
                  legend_tracegroupgap=120,
                  boxmode='group')

fig.show()

In [None]:
# Export Figure 2 as a vector image (scale 1) and a high-resolution raster (scale 3).
for image_path, image_scale in (('DataPaperFig2.svg', 1), ('DataPaperFig2.png', 3)):
    fig.write_image(image_path, scale=image_scale)

# Figure 3 - Technical Validation

In [None]:
# Distinct audio channel counts observed (only the 'cough_channels' column is inspected here)
audio_df.loc[:, 'cough_channels'].unique()

In [None]:
# Get sentence transcript outlier scores, keeping only rows that have a transcript.
# The explicit copy lets the new column below be assigned without touching (or
# warning about) a slice of audio_df.
outlier_df = (
    audio_df[['sentence_outlier_score', 'sentence_transcript']]
    .dropna(subset=['sentence_transcript'])
    .copy()
)
# A sorted listing of the lowest-scoring transcripts used to be evaluated here,
# but its result was discarded (only a cell's last expression is displayed), so
# it has been removed.

# Min-max normalise outlier scores to [0, 1] (High score = more agreement),
# rounded to two decimals so that scores can be grouped into bars.
outlier_min = outlier_df['sentence_outlier_score'].min()
outlier_max = outlier_df['sentence_outlier_score'].max()
outlier_df['Normalised Outlier Scores'] = (
    (outlier_df['sentence_outlier_score'] - outlier_min) / (outlier_max - outlier_min)
).round(2)

# Format dataframe for bar plot: number of transcripts per normalised score
# (the count column is named 0 by reset_index), a running total from the lowest
# score upwards, and a flag for the bins whose running total is within 1000.
outlier_count_df = outlier_df.groupby('Normalised Outlier Scores').size().reset_index()
outlier_count_df['checked_count'] = outlier_count_df[0].cumsum()
outlier_count_df['checked'] = (outlier_count_df['checked_count'] <= 1000).astype(int)

In [None]:
# Get example transcripts 

In [None]:
# First transcript whose normalised outlier score is 0.99
outlier_df.loc[outlier_df['Normalised Outlier Scores'] == 0.99, 'sentence_transcript'].iloc[0]

In [None]:
# Second transcript whose normalised outlier score is 0.68
outlier_df.loc[outlier_df['Normalised Outlier Scores'] == 0.68, 'sentence_transcript'].iloc[1]

In [None]:
# Transcript at position 17 among those whose normalised outlier score is 0.37
outlier_df.loc[outlier_df['Normalised Outlier Scores'] == 0.37, 'sentence_transcript'].iloc[17]

In [None]:
# Transcript at position 14 among those whose normalised outlier score is 0.11
outlier_df.loc[outlier_df['Normalised Outlier Scores'] == 0.11, 'sentence_transcript'].iloc[14]

In [None]:
# Plot data
# Figure 3 layout: three violin panels across the top (each spanning rows 1-2)
# and one wide bar chart underneath (spanning rows 3-5 and all three columns).

fig = make_subplots(rows=5, cols=3, vertical_spacing=0.25, horizontal_spacing=0.15,
                    specs=[[{"rowspan":2}, {"rowspan":2}, {"rowspan":2}],
                           [None, None, None],
                           [{"colspan": 3, "rowspan":3}, None, None],
                           [None, None, None],
                           [None, None, None]])

# (column prefix in audio_df, label on the x axis) for the four recording types.
recording_types = [
    ('sentence', 'Sentence'),
    ('exhalation', 'Exhalation'),
    ('cough', 'One Cough'),
    ('three_cough', 'Three Coughs'),
]

# (column suffix in audio_df, violin scalegroup, subplot column) for the three
# panels: length, amplitude and the 'snr' columns.
violin_panels = [
    ('length', 'A', 1),
    ('amplitude', 'B', 2),
    ('snr', 'C', 3),
]

for metric_suffix, scale_group, panel_col in violin_panels:
    for column_prefix, recording_label in recording_types:
        metric_values = audio_df[f'{column_prefix}_{metric_suffix}']
        fig.add_trace(
            go.Violin(
                y=metric_values,
                name=recording_label,
                marker_color='#CC79A7',
                fillcolor='rgba(204, 121, 167, 0.25)',
                points=False,
                showlegend=False,
                box=dict(visible=True),
                # Violins in the same panel share one count-based width scale.
                scalegroup=scale_group,
                scalemode='count',
                # Draw the density between the 0.5th and 99.5th percentiles only.
                spanmode='manual',
                span=[metric_values.quantile(0.005), metric_values.quantile(0.995)],
            ),
            row=1, col=panel_col,
        )

# Sentence transcript outlier subplot: number of transcripts per normalised
# outlier score, split by the 'checked' flag.
for checked_flag, bar_color, bar_label in [(1, '#009E73', 'Manually checked'),
                                           (0, '#56B4E9', 'Sampled')]:
    score_bins = outlier_count_df[outlier_count_df['checked']==checked_flag]
    fig.add_trace(
        go.Bar(
            x=score_bins['Normalised Outlier Scores'],
            y=score_bins[0],
            marker_color=bar_color,
            name=bar_label,
            showlegend=True,
        ),
        row=3, col=1,
    )

# Example transcripts: each example is an arrow (empty text) followed by a text label.
# NOTE(review): the y axis of this panel is logarithmic, so these y positions are
# presumably given in log10 units - confirm against the rendered figure.
transcript_annotations = [
    # (x, y, text, showarrow)
    (0.99, 3.75, "", True),
    (0.86, 4.5, '"I love nothing more than<br>an afternoon cream tea"', False),
    (0.68, 2.95, "", True),
    (0.60, 3.78, '"Isle of nothing more than<br>an afternoon cream tea"', False),
    (0.37, 1.92, "", True),
    (0.43, 2.73, '"ah ah i look nothing more than<br>an afternoon at gravity"', False),
    (0.11, 1.4, "", True),
    (0.09, 2.2, "\"but I don't<br>think that\"", False),
]
for note_x, note_y, note_text, note_arrow in transcript_annotations:
    fig.add_annotation(x=note_x, y=note_y,
                       text=note_text,
                       showarrow=note_arrow,
                       align='left',
                       yshift=0,
                       xshift=-2,
                       arrowhead=1,
                       row=3, col=1)

# Axis and layout formatting.
fig.update_annotations(dict(ay=-15))
fig.update_xaxes(tickangle=45, row=1)
fig.update_yaxes(range=[0,20], row=1, col=1)
fig.update_yaxes(type='log', row=1, range=[-1.25, 0], col=3)
fig.update_yaxes(type='log', title='Count', title_standoff=0.05, row=3, col=1)
fig.update_xaxes(title='Normalised Outlier Score', row=3, col=1)
fig.update_yaxes(title='Length (seconds)', title_standoff=0.05, row=1, col=1)
fig.update_yaxes(title='Amplitude (AU)', title_standoff=0.05, row=1, col=2)
fig.update_yaxes(title='Mean/SD (ratio)', title_standoff=0.05, nticks=2, row=1, col=3)
fig.update_layout(template='simple_white', violingap=0, violingroupgap=0.1, legend=dict(y=0.45, x=0.05),
                  font_family='Arial', font_size=13, width=750, height=600)


In [None]:
# Export Figure 3 as a vector image (scale 1) and a high-resolution raster (scale 3).
for image_path, image_scale in (("DataPaperFig3.svg", 1), ("DataPaperFig3.png", 3)):
    fig.write_image(image_path, scale=image_scale)

## Figure 4 - Bias

In [None]:
## ONS Census 2021 data
# from https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationestimates/datasets/populationandhouseholdestimatesenglandandwalescensus2021
# The reference tables are embedded as CSV text so the notebook does not depend on
# an external download. Each table has a 'Total' (count) and a '% Total' column; the
# leading unnamed CSV column is read as an ordinary column, not as the index.
# `data` is a scratch variable that is overwritten before every read_csv call.

# Population by five-year age band (0-4 ... 90+)
data = ',Age,Total,% Total\n0,0-4,3077000,5.447010693984921\n1,5-9,3348600,5.927806308052618\n2,10-14,3413100,6.041986415222598\n3,15-19,3218900,5.698206929758876\n4,20-24,3414400,6.044287719708195\n5,25-29,3715400,6.5771282198347665\n6,30-34,3952600,6.997027776745141\n7,35-39,3795400,6.718746957409935\n8,40-44,3580400,6.338146600176669\n9,45-49,3602600,6.377445799853779\n10,50-54,3907700,6.917544260281078\n11,55-59,3806300,6.738042510404552\n12,60-64,3256100,5.76405964273133\n13,65-69,2767500,4.899123202990988\n14,70-74,2796600,4.950637018783955\n15,75-79,2038800,3.609153527103171\n16,80-84,1427900,2.52771744229479\n17,85-89,872200,1.5439982864132753\n18,90+,498200,0.8819306882493623\n'
census_age_df = pd.read_csv(StringIO(data), header=0)

# Population by gender
data = ',Gender,Total,% Total\n0,Women,28833500,51.04195801719956\n1,Men,27656300,48.95804198280044\n'
census_gender_df = pd.read_csv(StringIO(data), header=0)

# Population by region (the nine English regions)
data = ',Region,Total,% Total\n0,East Midlands,4880200,8.639081745731088\n1,East of England,6334500,11.213528814051386\n2,London,8799800,15.577679510283273\n3,North East,2647100,4.685978707660498\n4,North West,7417300,13.13033503393533\n5,South East,9278100,16.42438103870079\n6,South West,5701200,10.092441467309143\n7,West Midlands,5950800,10.534291146366247\n8,Yorkshire and The Humber,5480800,9.702282535962244\n'
census_region_df = pd.read_csv(StringIO(data), header=0)

In [None]:
# TT Data 
# from Weekly statistics for NHS Test and Trace (England) 2 to 15 June 2022. 22 https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/1085136/NHS-test-and-trace-23-june-2022.pdf (2022)
# Embedded as CSV text; each table has 'Positive' and 'Total' counts plus
# '% Total' and '% Positive' columns. `data` is a scratch variable that is
# overwritten before every read_csv call.

# Tests by ten-year age band (0-9 ... 90+)
data = ',Age,Positive,Total,% Total,% Positive\n0,0-9,1032524.0,7510511.0,6.488996382902808,0.8920890337901561\n1,10-19,2065689.0,9333016.0,8.063620047367488,1.7847318843154771\n2,20-29,1979192.0,17078289.0,14.755448137572639,1.7099994566375276\n3,30-39,1967645.0,19531060.0,16.874614482857123,1.7000229795065604\n4,40-49,1691963.0,17790251.0,15.370575236482987,1.4618368559749642\n5,50-59,1266484.0,18753357.0,16.202687905028707,1.0942278221820434\n6,60-69,643160.0,11779362.0,10.177235265470324,0.5556829506844169\n7,70-79,329558.0,6770194.0,5.849370885356575,0.28473437692277975\n8,80-89,158203.0,4873396.0,4.210558910898741,0.13668559899111699\n9,90+,56520.0,2322823.0,2.0068927460626114,0.04883263942515585\n'
tt_age_df = pd.read_csv(StringIO(data), header=0)
# Tests by gender
data = ',Gender,Positive,Total,% Total,% Positive\n0,Women,5913615.0,68650051.0,59.40748549683927,5.1174469971827286\n1,Men,5239446.0,46907863.0,40.59251450316073,4.534043423456052\n'
tt_gender_df = pd.read_csv(StringIO(data), header=0)
# Tests by region (the nine English regions)
data = ',Region,Positive,Total,% Total,% Positive\n0,East Midlands,972070.0,8855595.0,7.5848326660656475,0.8325796617508405\n1,East of England,1190555.0,13218132.0,11.321353266264735,1.019712447864631\n2,London,1784633.0,21534221.0,18.444098096071112,1.5285412979408763\n3,North East,604307.0,5366546.0,4.596456071528106,0.5175900065362219\n4,North West,1596281.0,15003841.0,12.850816160094842,1.3672174792343077\n5,South East,1742775.0,19825174.0,16.980296339843385,1.4926898474470163\n6,South West,1045968.0,11934850.0,10.222219980090959,0.8958734285002141\n7,West Midlands,1140933.0,11278614.0,9.660152693878315,0.9772111177388169\n8,Yorkshire and The Humber,1127128.0,9737020.0,8.339774726162899,0.9653871110001351\n'
tt_region_df = pd.read_csv(StringIO(data), header=0)

In [None]:
## REACT data 
# from various REACT reports listed here https://www.imperial.ac.uk/medicine/research-and-impact/groups/react-study/real-time-assessment-of-community-transmission-findings/
# Embedded as CSV text; each table has 'Positive' and 'Total' counts plus
# '% Total' and '% Positive' columns. The leading unnamed CSV column is read as an
# ordinary column, so every frame gets a default 0..n-1 row index and the
# drop(index=...) calls below remove rows by position in the text, not by the
# number written in that first CSV column.
# `data` is a scratch variable that is overwritten before every read_csv call.

# Participants by age band (5-17 ... 75+)
data = ',Age,Positive,Total,% Total,% Positive\n0,5-17,2417,66558,11.222337438984294,0.40753011794262206\n1,18-24,429,20806,3.508097490241702,0.0723336452616404\n2,25-34,1086,52657,8.878491278653144,0.18311034674625054\n3,35-44,1629,74586,12.575937681782543,0.2746655201193758\n4,45-54,1681,94364,15.910704199229453,0.28343323469654436\n5,55-64,1657,117480,19.808290548572295,0.27938659719938963\n6,65-74,1284,110912,18.700860753517624,0.2164951060977769\n7,75+,503,55722,9.395280609018943,0.08481077754453409\n'
react_age_df = pd.read_csv(StringIO(data), header=0)

# Participants by gender
data = ',Gender,Positive,Total,% Total,% Positive\n1,Women,5798,330449,55.716971429053174,0.9776001753542916\n0,Men,4888,262613,44.27915054334539,0.8241651702538422\n2,Unknown,0,23,0.0038780276014399287,0.0\n'
react_gender_df = pd.read_csv(StringIO(data), header=0)
# Row label 2 is the third row, 'Unknown'
react_gender_df = react_gender_df.drop(index=2)

# Participants by ethnicity
data = ',Ethnicity,Positive,Total,% Total,% Positive\n1,Asian,749,33464,5.642361550199381,0.12628881189036983\n2,Black,264,11528,1.9437348777999781,0.04451301246870179\n3,Mixed,260,10700,1.8041258841481407,0.04383857288584267\n4,Other,134,6182,1.042346375308767,0.022593726025780452\n5,Unknown,245,13920,2.347049748349731,0.04130942445012098\n0,White,9034,517291,87.220381564194,1.523221797887318\n'
react_ethnicity_df = pd.read_csv(StringIO(data), header=0)
# Row label 4 is the fifth row, 'Unknown' (not the row whose first CSV field is 4)
react_ethnicity_df = react_ethnicity_df.drop(index=4)

# Participants by region (the nine English regions)
data = ',Region,Positive,Total,% Total,% Positive\n4,East Midlands,932,51826,8.738376455314162,0.1571444228061745\n6,East of England,1056,68559,11.559725840309568,0.17805204987480716\n7,London,1647,89478,15.086876248767041,0.27770049824224186\n1,North East,546,26724,4.505930853081767,0.09206100306026961\n2,North West,1368,70615,11.906387785899154,0.23065833733781835\n0,South East,1701,103625,17.472200443444027,0.2868054326108399\n8,South West,1067,65180,10.989993002689328,0.17990675872766973\n5,West Midlands,1101,59791,10.081354274682381,0.18563949518197223\n3,Yorkshire and The Humber,1124,57287,9.659155095812574,0.18951752278341216\n'
react_region_df = pd.read_csv(StringIO(data), header=0)

In [None]:
# Independent copy of the participant metadata for the bias comparison (Figure 4)
bam_total_df = participant_df.copy(deep=True)

In [None]:
# Dataset gender label -> label used in the comparison tables.
gender_dict = dict(Female='Women', Male='Men', Unknown='Unknown')

# Broad ethnicity groups matching the categories of the external data sources,
# each listing the dataset's detailed answers that fall under it.
# NOTE(review): 'Arab' is grouped under 'Asian' and 'Jewish' under 'Other' -
# confirm this matches the category definitions of the external sources.
_ethnicity_groups = {
    'White': ['White British', 'Another White background', 'Irish', 'Irish Traveller or Gypsy'],
    'Asian': ['Pakistani', 'Indian', 'Chinese', 'Another Asian background', 'Bangladeshi', 'Arab'],
    'Black': ['Caribbean', 'African', 'Another Black background'],
    'Mixed': ['Black African and White', 'Another Mixed background', 'Asian and White',
              'Black Caribbean and White'],
    'Other': ['Another ethnic background', 'Jewish'],
    'Unknown': ['No response'],
}

# Flat lookup: detailed ethnicity answer -> broad group.
ethnicity_dict = {
    detailed: broad
    for broad, detailed_answers in _ethnicity_groups.items()
    for detailed in detailed_answers
}

# Recode ethnicity and gender to the external sources' categories. Values with no
# entry in the lookup become NaN, so re-running this cell on already-recoded data
# blanks the ethnicity column.
bam_total_df = bam_total_df.assign(
    ethnicity=bam_total_df['ethnicity'].map(ethnicity_dict),
    gender=bam_total_df['gender'].map(gender_dict),
)

In [None]:
# Format age, make numeric (all ages > 94 will be grouped)
# The top-coded '94+' label is replaced by 94, then the column is converted
# explicitly. Relying on replace() to downcast an object column is deprecated in
# pandas, and pd.cut below needs numeric input.
bam_total_df['age'] = pd.to_numeric(bam_total_df['age'].replace('94+', 94))

In [None]:
# COVID-positive participants across the whole dataset
bam_pos_total_df = bam_total_df[bam_total_df['covid_test_result']=='Positive']

# TT data only PCR & LAMP (filtered by testing types containing 'P')
bam_tt_df = bam_total_df[(bam_total_df['recruitment_source']=='Test and Trace') & (bam_total_df['covid_test_method'].str.contains('P', na=False))]
bam_pos_tt_df = bam_tt_df[bam_tt_df['covid_test_result']=='Positive']

# REACT participants. na=False treats a missing recruitment source as "not REACT"
# (consistent with the Test and Trace filter above) instead of raising when the
# mask contains NaN.
bam_react_df = bam_total_df[bam_total_df['recruitment_source'].str.contains('REACT', na=False)]
bam_pos_react_df = bam_react_df[bam_react_df['covid_test_result']=='Positive']

In [None]:
# Cohort sizes, used as denominators for the percentage columns below
bam_tt, bam_react, bam_total = (
    len(cohort) for cohort in (bam_tt_df, bam_react_df, bam_total_df)
)

In [None]:
# Left-closed bin edges matching the Census age bands (0-4, 5-9, ..., 90+)
census_ages = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 110]

# Format dataset to Census age categories
# observed=False keeps one row per age band even when a band is empty. The band
# labels below are attached by row position, so dropping empty bands would
# silently misalign them.
bam_total_age_df = bam_total_df.groupby(pd.cut(bam_total_df["age"], census_ages, right=False), observed=False).size().reset_index()
bam_total_age_df['Age'] = census_age_df['Age']
bam_total_age_df.columns = ['.', 'Total', 'Age']
bam_total_age_df['% Total'] = 100*bam_total_age_df['Total']/bam_total

# Same for COVID-positive participants; percentages are of ALL participants
bam_pos_total_age_df = bam_pos_total_df.groupby(pd.cut(bam_pos_total_df["age"], census_ages, right=False), observed=False).size().reset_index()
bam_pos_total_age_df['Age'] = census_age_df['Age']
bam_pos_total_age_df.columns = ['.', 'Positive', 'Age']
bam_pos_total_age_df['% Positive'] = 100*bam_pos_total_age_df['Positive']/bam_total

In [None]:
# Left-closed bin edges matching the Test and Trace age bands (0-9, ..., 90+)
tt_ages = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 110]

# Format dataset to Test and Trace report age categories
# observed=False keeps one row per age band even when a band is empty. The band
# labels below are attached by row position, so dropping empty bands would
# silently misalign them.
bam_tt_age_df = bam_tt_df.groupby(pd.cut(bam_tt_df["age"], tt_ages, right=False), observed=False).size().reset_index()
bam_tt_age_df['Age'] = tt_age_df['Age']
bam_tt_age_df.columns = ['.', 'Total', 'Age']
bam_tt_age_df['% Total'] = 100*bam_tt_age_df['Total']/bam_tt

# Same for COVID-positive participants; percentages are of the whole Test and Trace cohort
bam_pos_tt_age_df = bam_pos_tt_df.groupby(pd.cut(bam_pos_tt_df["age"], tt_ages, right=False), observed=False).size().reset_index()
bam_pos_tt_age_df['Age'] = tt_age_df['Age']
bam_pos_tt_age_df.columns = ['.', 'Positive', 'Age']
bam_pos_tt_age_df['% Positive'] = 100*bam_pos_tt_age_df['Positive']/bam_tt

In [None]:
# Left-closed bin edges matching the REACT age bands (5-17, 18-24, ..., 75+)
react_ages = [5, 18, 25, 35, 45, 55, 65, 75, 110]

# Format dataset to REACT report age categories
# observed=False keeps one row per age band even when a band is empty. The band
# labels below are attached by row position, so dropping empty bands would
# silently misalign them.
bam_react_age_df = bam_react_df.groupby(pd.cut(bam_react_df["age"], react_ages, right=False), observed=False).size().reset_index()
bam_react_age_df['Age'] = react_age_df['Age']
bam_react_age_df.columns = ['.', 'Total', 'Age']
bam_react_age_df['% Total'] = 100*bam_react_age_df['Total']/bam_react

# Same for COVID-positive participants; percentages are of the whole REACT cohort
bam_pos_react_age_df = bam_pos_react_df.groupby(pd.cut(bam_pos_react_df["age"], react_ages, right=False), observed=False).size().reset_index()
bam_pos_react_age_df['Age'] = react_age_df['Age']
bam_pos_react_age_df.columns = ['.', 'Positive', 'Age']
bam_pos_react_age_df['% Positive'] = 100*bam_pos_react_age_df['Positive']/bam_react

In [None]:
regions = [
    'South East', 'South West', 'North West', 'North East', 'London',
    'Yorkshire and The Humber', 'East of England', 'West Midlands',
    'East Midlands',
]


def _region_shares(source_df, count_label, share_label, denominator):
    """Count rows per 'region_name' and express each count as a percentage.

    source_df   : frame with a 'region_name' column
    count_label : name given to the count column
    share_label : name given to the percentage column
    denominator : value the counts are divided by (100 * count / denominator)

    Missing regions are counted (dropna=False) but, like any value not listed
    in ``regions``, are dropped from the returned table, which is sorted by
    region name.
    """
    table = source_df['region_name'].value_counts(dropna=False).reset_index()
    table.columns = ['Region', count_label]
    table[share_label] = 100*table[count_label]/denominator
    listed = table['Region'].isin(regions)
    return table[listed].sort_values(by='Region')


# Region TT
bam_tt_region_df = _region_shares(bam_tt_df, 'Total', '% Total', bam_tt)
bam_pos_tt_region_df = _region_shares(bam_pos_tt_df, 'Positive', '% Positive', bam_tt)

# Region REACT
bam_react_region_df = _region_shares(bam_react_df, 'Total', '% Total', bam_react)
bam_pos_react_region_df = _region_shares(bam_pos_react_df, 'Positive', '% Positive', bam_react)

# Region Total
bam_total_region_df = _region_shares(bam_total_df, 'Total', '% Total', bam_total)
# NOTE(review): the count column of this positive table is labelled 'Total'
# (the other positive tables use 'Positive'); left unchanged in case it is read elsewhere.
bam_pos_total_region_df = _region_shares(bam_pos_total_df, 'Total', '% Positive', bam_total)

In [None]:
# Format region data for plotting
# Every region table used by the figure gets the same abbreviation applied:
# the full name is swapped for the short label wherever it appears in the frame.
long_region_name = 'Yorkshire and The Humber'
short_region_name = 'Y&H*'

# Baseline tables (Census, Test and Trace, REACT)
census_region_df = census_region_df.replace(to_replace=long_region_name, value=short_region_name)
tt_region_df = tt_region_df.replace(to_replace=long_region_name, value=short_region_name)
react_region_df = react_region_df.replace(to_replace=long_region_name, value=short_region_name)

# This study: whole dataset
bam_total_region_df = bam_total_region_df.replace(to_replace=long_region_name, value=short_region_name)
bam_pos_total_region_df = bam_pos_total_region_df.replace(to_replace=long_region_name, value=short_region_name)

# This study: Test and Trace subset
bam_tt_region_df = bam_tt_region_df.replace(to_replace=long_region_name, value=short_region_name)
bam_pos_tt_region_df = bam_pos_tt_region_df.replace(to_replace=long_region_name, value=short_region_name)

# This study: REACT subset
bam_react_region_df = bam_react_region_df.replace(to_replace=long_region_name, value=short_region_name)
bam_pos_react_region_df = bam_pos_react_region_df.replace(to_replace=long_region_name, value=short_region_name)

In [None]:
# Format dataset to REACT report ethnicity categories
# Each table: count rows per ethnicity (missing values included), convert the
# counts to a percentage of bam_react, order by ethnicity label and drop the
# 'Unknown' category.
bam_react_ethnicity_df = (
    bam_react_df['ethnicity']
    .value_counts(dropna=False)
    .rename_axis('Ethnicity')
    .reset_index(name='Total')
    .assign(**{'% Total': lambda counts: 100*counts['Total']/bam_react})
    .sort_values(by='Ethnicity')
    .loc[lambda counts: counts['Ethnicity'] != 'Unknown']
)

# Positive subset: the percentage uses the same bam_react denominator.
bam_pos_react_ethnicity_df = (
    bam_pos_react_df['ethnicity']
    .value_counts(dropna=False)
    .rename_axis('Ethnicity')
    .reset_index(name='Positive')
    .assign(**{'% Positive': lambda counts: 100*counts['Positive']/bam_react})
    .sort_values(by='Ethnicity')
    .loc[lambda counts: counts['Ethnicity'] != 'Unknown']
)

In [None]:
# Get gender data for each dataset subset


def _top_two_gender_shares(source_df, count_label, share_label, denominator):
    """Count rows per 'gender' value and keep the two most frequent values.

    source_df   : frame with a 'gender' column
    count_label : name given to the count column
    share_label : name given to the percentage column (100 * count / denominator)
    denominator : value the counts are divided by

    value_counts orders values by descending frequency, so the first two rows
    are the two most common values (missing values are counted as well).
    NOTE(review): presumably these are the two categories also present in the
    baseline gender tables - confirm.
    """
    table = (
        source_df['gender']
        .value_counts(dropna=False)
        .rename_axis('Gender')
        .reset_index(name=count_label)
    )
    table[share_label] = 100*table[count_label]/denominator
    return table.head(2)


# TT
bam_tt_gender_df = _top_two_gender_shares(bam_tt_df, 'Total', '% Total', bam_tt)
bam_pos_tt_gender_df = _top_two_gender_shares(bam_pos_tt_df, 'Positive', '% Positive', bam_tt)

# REACT
bam_react_gender_df = _top_two_gender_shares(bam_react_df, 'Total', '% Total', bam_react)
bam_pos_react_gender_df = _top_two_gender_shares(bam_pos_react_df, 'Positive', '% Positive', bam_react)

# Total
bam_total_gender_df = _top_two_gender_shares(bam_total_df, 'Total', '% Total', bam_total)
bam_pos_total_gender_df = _top_two_gender_shares(bam_pos_total_df, 'Positive', '% Positive', bam_total)

In [None]:
# Plot
# Figure 4: a 4x3 grid of bar charts. Columns are the comparison sources
# (Census 2021, NHS Test and Trace, REACT); rows are age, gender, ethnicity
# (REACT column only) and region. In every panel the comparison source is drawn
# in blue to the right of each category tick and this study in orange/red to the
# left, with the "positive" bar overlaid on the "total" bar.


def _add_panel_bars(fig, row, col, x_col, traces):
    """Add the bar traces of one subplot to ``fig``.

    fig      : figure returned by make_subplots
    row, col : 1-based position of the subplot
    x_col    : column holding the category labels ('Age', 'Gender', ...)
    traces   : sequence of (dataframe, style, name, showlegend) tuples.
               Traces are added in the given order and bars sharing an offset
               overlap, so a 'total' trace must come before its 'positive'
               trace or it will cover it.

    ``style`` selects the y column, the bar colour and the bar offset
    (comparison source at offset 0, this study at offset -0.4, both 0.4 wide).
    """
    styles = {
        # style: (y column, colour, offset)
        'baseline_total': ('% Total', "#56B4E9", 0),
        'baseline_positive': ('% Positive', "#0072B2", 0),
        'study_total': ('% Total', "#E69F00", -0.4),
        'study_positive': ('% Positive', "#D55E00", -0.4),
    }
    for frame, style, name, showlegend in traces:
        y_col, colour, offset = styles[style]
        fig.add_trace(
            go.Bar(
                x=frame[x_col],
                y=frame[y_col],
                width=0.4,
                offset=offset,
                showlegend=showlegend,
                marker=dict(color=colour, line=dict(width=0)),
                name=name,
            ),
            row=row, col=col,
        )


fig = make_subplots(rows=4, cols=3,
                    vertical_spacing=0.175, horizontal_spacing=0.15,
                    subplot_titles=("<b>Census 2021</b>", "<b>NHS Test and Trace</b>", "<b>REACT</b>"))

# The four legend entries come from row 1: the two study traces of the Census
# panel and the two generic "Baseline" traces of the Test and Trace panel.

# Census age
_add_panel_bars(fig, 1, 1, 'Age', [
    (census_age_df, 'baseline_total', "Census Total", False),
    (bam_total_age_df, 'study_total', "This Study Total", True),
    (bam_pos_total_age_df, 'study_positive', "This Study Positive", True),
])

# TT age
_add_panel_bars(fig, 1, 2, 'Age', [
    (tt_age_df, 'baseline_total', "Baseline Total", True),
    (tt_age_df, 'baseline_positive', "Baseline Positive", True),
    (bam_tt_age_df, 'study_total', "This Study Total", False),
    (bam_pos_tt_age_df, 'study_positive', "This Study Positive", False),
])

# REACT age
_add_panel_bars(fig, 1, 3, 'Age', [
    (react_age_df, 'baseline_total', "REACT Study Total", False),
    (react_age_df, 'baseline_positive', "REACT Study Positive", False),
    (bam_react_age_df, 'study_total', "This Study Total (REACT recruited)", False),
    (bam_pos_react_age_df, 'study_positive', "This Study Positive (REACT recruited)", False),
])

# Census gender
_add_panel_bars(fig, 2, 1, 'Gender', [
    (census_gender_df, 'baseline_total', "Census Total", False),
    (bam_total_gender_df, 'study_total', "This Study Total", False),
    (bam_pos_total_gender_df, 'study_positive', "This Study Positive", False),
])

# TT gender
_add_panel_bars(fig, 2, 2, 'Gender', [
    (tt_gender_df, 'baseline_total', "NHS Test and Trace Total", False),
    (tt_gender_df, 'baseline_positive', "NHS Test and Trace Positive", False),
    (bam_tt_gender_df, 'study_total', "This Study Total (Test and Trace recruited)", False),
    (bam_pos_tt_gender_df, 'study_positive', "This Study Positive (Test and Trace recruited)", False),
])

# REACT gender
_add_panel_bars(fig, 2, 3, 'Gender', [
    (react_gender_df, 'baseline_total', "REACT Study Total", False),
    (react_gender_df, 'baseline_positive', "REACT Study Positive", False),
    (bam_react_gender_df, 'study_total', "This Study Total (REACT recruited)", False),
    (bam_pos_react_gender_df, 'study_positive', "This Study Positive (REACT recruited)", False),
])

# REACT ethnicity (total before positive, so the positive bar is not covered)
_add_panel_bars(fig, 3, 3, 'Ethnicity', [
    (react_ethnicity_df, 'baseline_total', "REACT Study Total", False),
    (react_ethnicity_df, 'baseline_positive', "REACT Study Positive", False),
    (bam_react_ethnicity_df, 'study_total', "This Study Total (REACT recruited)", False),
    (bam_pos_react_ethnicity_df, 'study_positive', "This Study Positive (REACT recruited)", False),
])

# Census region
_add_panel_bars(fig, 4, 1, 'Region', [
    (census_region_df, 'baseline_total', "Census Total", False),
    (bam_total_region_df, 'study_total', "This Study Total", False),
    (bam_pos_total_region_df, 'study_positive', "This Study Positive", False),
])

# TT region
_add_panel_bars(fig, 4, 2, 'Region', [
    (tt_region_df, 'baseline_total', "NHS Test and Trace Total", False),
    (tt_region_df, 'baseline_positive', "NHS Test and Trace Positive", False),
    (bam_tt_region_df, 'study_total', "This Study Total (Test and Trace recruited)", False),
    (bam_pos_tt_region_df, 'study_positive', "This Study Positive (Test and Trace recruited)", False),
])

# REACT region
_add_panel_bars(fig, 4, 3, 'Region', [
    (react_region_df, 'baseline_total', "REACT Study Total", False),
    (react_region_df, 'baseline_positive', "REACT Study Positive", False),
    (bam_react_region_df, 'study_total', "This Study Total (REACT recruited)", False),
    (bam_pos_react_region_df, 'study_positive', "This Study Positive (REACT recruited)", False),
])

# Axis titles and fixed y ranges per row (row 3 keeps an automatic y range)
fig.update_xaxes(title='Age Group', row=1)
fig.update_yaxes(range=[0,26], row=1)
fig.update_xaxes(title='Gender', row=2)
fig.update_yaxes(range=[0, 60], row=2)
fig.update_xaxes(title='Ethnicity', row=3)
fig.update_xaxes(title='Region', row=4)
fig.update_yaxes(range=[0, 20], row=4)
fig.update_xaxes(tickangle = 45, title_standoff = 5)
fig.update_yaxes(title='% of Total')

# Set the vertical position of the three column titles
fig.layout.annotations[0].update(y=1.07)
fig.layout.annotations[1].update(y=1.07)
fig.layout.annotations[2].update(y=1.07)
fig.update_layout(template='simple_white', font_family='Arial', font_size=14, height=850, width=1000)

In [None]:
# Export Figure 4 twice: an SVG at scale 1 and a PNG at scale 3.
for figure_path, export_scale in (("DataPaperFig4.svg", 1), ("DataPaperFig4.png", 3)):
    fig.write_image(figure_path, scale=export_scale)