# Generate Train Test Split

Import packages

In [None]:
import matplotlib
import seaborn as sns
from cycler import cycler
# Hex colour palette reused by the plots in this notebook; `ukhsa_cycler`
# wraps the same colours, in the same order, in a cycler object.
col_lst = [
    '#007C91', '#003B5C', '#582C83', '#1D57A5',
    '#8A1B61', '#E40046', '#00AB8E', '#00A5DF',
    '#84BD00', '#FF7F32', '#FFB81C', '#D5CB9F',
]
ukhsa_cycler = cycler(color=col_lst)
import pandas
from matplotlib import pyplot as plt
import pandas as pd
!pip install geopandas
import geopandas
import numpy as np
import ciab_data as uf
import sys
import pickle as pkl
import warnings
from configparser import ConfigParser
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas deprecation and chained-assignment warnings raised by the
# cells below. Consider narrowing the filter to specific categories.
warnings.filterwarnings ('ignore')

Retrieve S3 config

In [None]:
# Load the S3 bucket name and the object keys used by the upload cells below.
config_filepath = 's3_config.ini'
config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail here with a clear message instead of an
# opaque KeyError on the 'S3' section further down.
if not config.read(config_filepath):
    raise FileNotFoundError(f"Could not read S3 config file: {config_filepath!r}")

s3_config = config['S3']
bucket_name = s3_config['bucket']
original_split_path = s3_config['original_split_path']
rebalanced_split_path = s3_config['rebalanced_split_path']
splits_path = s3_config['splits_path']

Initialise CIAB data class, filter missing data from the whole dataset and save to csv for use in logistic regression modelling

In [None]:
# Build the CIAB data wrapper, then filter missing data across the whole
# dataset (type='all') without running the analysis or saving figures.
ciab_data = uf.CIAB_Data()
study_data = ciab_data.filter_missing_data(type='all', analyse=False, save_figures=False)
# Persist the filtered dataset; per the notebook text it is consumed by the
# logistic regression modelling.
study_data.to_csv('StudyData.csv')

Generate original train test split with all asymptomatics held for the test set.

In [None]:
# Original split (default arguments); per the notebook text, all asymptomatics
# are held out for the test set. print_summary=True prints a split summary.
train_test_split = ciab_data.train_test_split(print_summary=True)

Generate rebalanced test set where ~half asymptomatics are held for the test set

In [None]:
# Rebalanced split (asymptomatics_in_train=True); per the notebook text,
# roughly half of the asymptomatics remain in the test set.
train_test_split_asymp = ciab_data.train_test_split(print_summary=True, asymptomatics_in_train=True)

In [None]:
import boto3

# (local pickle file, S3 object key, split object) for each stage-1 split.
split_uploads = [
    ('original_split_stage1.pkl', original_split_path, train_test_split),
    ('rebalanced_split_stage1.pkl', rebalanced_split_path, train_test_split_asymp),
]

# Write each pickle inside a context manager so the file is flushed and closed
# before it is reopened for upload; an unclosed write handle can leave buffered
# bytes unwritten and cause a truncated object to be uploaded.
for local_path, _, split in split_uploads:
    with open(local_path, 'wb') as split_file:
        pkl.dump(split, split_file)

s3 = boto3.client('s3')
for local_path, s3_key, _ in split_uploads:
    with open(local_path, 'rb') as split_file:
        s3.put_object(Bucket=bucket_name,
                      Key=s3_key,
                      Body=split_file
                     )

In [None]:
import boto3


def _split_label(participant_id, split):
    """Return which part of `split` contains `participant_id`.

    `split` is indexed by 'train', 'test' and 'longitudinal'; membership is
    checked in that order and 'Not Used' is returned when none matches.
    """
    if participant_id in split['train']:
        return 'train'
    if participant_id in split['test']:
        return 'test'
    if participant_id in split['longitudinal']:
        return 'long'
    return 'Not Used'


# Copy the identifier column so the new columns are added to an independent
# frame instead of to a slice of raw_study_data (chained assignment).
splits_df = ciab_data.raw_study_data[['participant_identifier']].copy()
participant_ids = splits_df['participant_identifier']
splits_df['splits'] = participant_ids.apply(lambda pid: _split_label(pid, train_test_split_asymp))
splits_df['original_splits'] = participant_ids.apply(lambda pid: _split_label(pid, train_test_split))
splits_df.to_csv('train_test_splits_stage1.csv')

# Upload the per-participant split assignment alongside the pickled splits.
s3 = boto3.client('s3')
with open('train_test_splits_stage1.csv', 'rb') as f:
    s3.put_object(Bucket=bucket_name,
                  Key=splits_path,
                  Body=f
                 )

# Meta-data Analysis

Print some statistics around submission dates and submission counts within the train-test set and longitudinal set

In [None]:
def _ordinal_date(date):
    """Format `date` as e.g. '01st March 2021' with the correct ordinal suffix.

    The day keeps the zero padding of '%d'; only the suffix depends on the
    day of the month (11th-13th are always 'th').
    """
    day = date.day
    if 11 <= day % 100 <= 13:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(day % 10, 'th')
    return date.strftime('%d') + suffix + date.strftime(' %B %Y')


# Submissions up to and including this timestamp are counted as "pre Nov 29th".
TRAIN_TEST_CUTOFF = '2021-11-29'

raw_data = ciab_data.raw_study_data

print('Earliest submission: ' + _ordinal_date(raw_data['submission_date'].min()))
print('Latest Submission: ' + _ordinal_date(raw_data['submission_date'].max()))

print('Total Raw Submissions: ' + str(len(raw_data)))

filtered_raw_data = ciab_data.filter_missing_data(type='all', analyse=False, save_figures=False)
print('Total Filtered Submissions: ' + str(len(filtered_raw_data)))

# Submission timestamp = date + hour of day; lowercase 'h' is the
# non-deprecated spelling of the hour unit.
raw_submission_time = pd.to_datetime(raw_data['submission_date']) + pd.to_timedelta(raw_data['submission_hour'], unit='h')
pre_nov29_study_data = raw_data.loc[raw_submission_time <= TRAIN_TEST_CUTOFF]

print('Total Pre Nov 29th Raw Submissions: ' + str(len(pre_nov29_study_data)))

filtered_pre_nov29_study_data = ciab_data.filter_missing_data(type='train_test', analyse=False, save_figures=False)
print('Total Pre Nov 29th Filtered Submissions: ' + str(len(filtered_pre_nov29_study_data)))

Analyse the train-test dataset, print out json data summary and save plots to 'figures' folder

In [None]:
# Filter the train/test subset with analyse=True and save_figures=True; per the
# notebook text this prints a JSON data summary and saves plots to 'figures'.
# The resulting frame is the input to every analysis cell below.
train_test = ciab_data.filter_missing_data(type='train_test', analyse=True, save_figures=True)

Printing some statistics used within the Pigoli et al. paper

In [None]:
# Asymptomatic cases: "no symptoms" flag set together with a positive test.
asymptomatic_mask = (train_test['symptom_none'] == 1) & (train_test['covid_test_result'] == 'Positive')
print(f"Number of Asymptomatics: {len(train_test.loc[asymptomatic_mask])}")

In [None]:
# Median age for each group of interest, printed in a fixed order.
is_positive = train_test['covid_test_result'] == 'Positive'
is_negative = train_test['covid_test_result'] == 'Negative'
is_asymptomatic = (train_test['symptom_none'] == 1) & is_positive

for group_label, group_mask in (
    ('Median age of Covid + participants: ', is_positive),
    ('Median age of Covid - participants: ', is_negative),
    ('Median age of asymptomatics: ', is_asymptomatic),
):
    group_ages = train_test.loc[group_mask, 'age']
    print(group_label + str(np.median(group_ages)))

In [None]:
# Submission counts by gender. The second line previously printed the female
# count under the label 'Number of Males'.
print('Number of Males: ' + str(len(train_test.loc[train_test['gender'] == 'Male'])))
print('Number of Females: ' + str(len(train_test.loc[train_test['gender'] == 'Female'])))

Generating some other plots and tables used within the Pigoli et al. paper:
* Gender split by test result
* Submission over time by recruitment source
* Age by Test Result split by Gender
* Symptom combinations frequency count split by recruitment source
* Cross tables of Symptoms, Recruitment Source and Covid-19 infection status

In [None]:
# Stacked horizontal bars: test-result frequency within each gender.
covid_by_gender = train_test[['gender', 'covid_test_result']].rename(
    columns={'gender': 'Gender', 'covid_test_result': 'Covid Test Result'}
)
gender_by_result = pd.crosstab(
    covid_by_gender['Gender'],
    covid_by_gender['Covid Test Result'],
    margins=False,
)
ax = gender_by_result.plot.barh(stacked=True, color=['#E40046', '#00AB8E'])
ax.set_xlabel('Frequency')
ax.set_ylabel('Gender')
plt.savefig('figures/covid_by_gender.png')

In [None]:
# Submission timestamp = submission date + hour of day. Lowercase 'h' is used
# because the uppercase 'H' unit alias is deprecated in recent pandas releases.
train_test['submission_time'] = pd.to_datetime(train_test['submission_date']) + pd.to_timedelta(train_test['submission_hour'], unit='h')

In [None]:
import datetime

# Copy the columns of interest so the derived column below is added to an
# independent frame rather than to a slice of train_test.
recruitment_time = train_test[['recruitment_source','submission_time', 'covid_test_result']].copy()
# Collapse every source other than 'Test and Trace' into 'REACT'. The
# normalised label is stored (not the raw value), so entries that only differ
# by surrounding whitespace still match the `== 'Test and Trace'` filters
# applied downstream.
recruitment_time['recruitment_source_agg'] = recruitment_time['recruitment_source'].apply(
    lambda x: 'Test and Trace' if x.strip() == 'Test and Trace' else 'REACT'
)
base = min(recruitment_time['submission_time'])
m = max(recruitment_time['submission_time'])
# NOTE(review): num_days is hard-coded while `m` (the latest submission) is
# computed but not used here; confirm 265 still covers the range base..m.
num_days = 265
# Displayed as the cell output: the end of the plotted time window.
base + datetime.timedelta(days=num_days)

In [None]:
from matplotlib.transforms import Bbox


def _submissions_before(frame, cutoff, source, result):
    """Count rows of `frame` for `source` and `result` submitted strictly before `cutoff`."""
    selected = (
        (frame['covid_test_result'] == result)
        & (frame['submission_time'] < cutoff)
        & (frame['recruitment_source_agg'] == source)
    )
    return int(selected.sum())


# Sample every second day, starting one day before the earliest submission
# (`base`), i.e. day offsets -1, 1, 3, ... below num_days.
dates = [base + datetime.timedelta(days=offset) for offset in range(-1, num_days, 2)]

# (column / legend label, aggregated recruitment source, test result, line style)
series_specs = [
    ('Test and Trace Positives', 'Test and Trace', 'Positive', 'r--'),
    ('Test and Trace Negatives', 'Test and Trace', 'Negative', 'r'),
    ('REACT Positives', 'REACT', 'Positive', 'g--'),
    ('REACT Negatives', 'REACT', 'Negative', 'g'),
]

# One cumulative-count column per series, evaluated at every sampled date.
plotting_df = pd.DataFrame({'Date': dates})
for label, source, result, _ in series_specs:
    plotting_df[label] = [
        _submissions_before(recruitment_time, date, source, result) for date in dates
    ]

# Only three x ticks: start, midpoint and end of the sampled window.
small = min(plotting_df['Date'])
med = base + datetime.timedelta(days = int(num_days/2))
large = max(plotting_df['Date'])

for label, _, _, line_style in series_specs:
    plt.plot(plotting_df['Date'], plotting_df[label], line_style, label = label)
plt.xticks([small, med, large])
plt.title('Submission count over time')
plt.xlabel('Time')
plt.ylabel('Number of submissions')
plt.legend(loc='upper center', bbox_to_anchor=(1.3, 1.03), shadow=True, ncol=1)
# Explicit bounding box (in inches) so the legend placed outside the axes is
# included in the saved image.
plt.savefig('figures/submission_over_time.png', bbox_inches=Bbox([[0, 0], [8, 4]]))

In [None]:
# Age distribution per test result, split by gender.
age_plot = (
    train_test[['covid_test_result', 'age','gender']]
    .rename(columns={'covid_test_result': 'Test Result', 'age': 'Age', 'gender': 'Gender'})
    .sort_values(['Test Result', 'Gender'], ascending=[False, True])
)
sns.boxplot(
    data=age_plot,
    x='Age',
    y='Test Result',
    hue='Gender',
    palette=['#007C91', '#582C83', '#8A1B61'],
)
plt.savefig('figures/age_by_gender_by_test_result.png')

In [None]:
# Symptom flag columns (1 = symptom reported) combined into one list per row.
symptom_columns = [
    'symptom_cough_any', 'symptom_new_continuous_cough',
    'symptom_runny_or_blocked_nose', 'symptom_shortness_of_breath',
    'symptom_sore_throat', 'symptom_abdominal_pain', 'symptom_diarrhoea',
    'symptom_fatigue', 'symptom_fever_high_temperature', 'symptom_headache',
    'symptom_change_to_sense_of_smell_or_taste', 'symptom_loss_of_taste',
    'symptom_none', 'symptom_other', 'symptom_prefer_not_to_say',
]
# Human-readable label per column, e.g. 'symptom_sore_throat' -> 'Sore throat'.
symptom_labels = [x.replace('symptom_', '').replace('_', ' ').capitalize() for x in symptom_columns]

# Build each row's symptom list straight from the flags, in column order.
# Going through set() made the order vary between runs, and overwriting the
# flag columns wrote into a slice of train_test.
train_test['symptoms'] = [
    [label for label, flag in zip(symptom_labels, row) if flag == 1]
    for row in train_test[symptom_columns].values.tolist()
]
symptom_df = train_test[['covid_test_result', 'symptoms']]

In [None]:
!pip install upsetplot
from upsetplot import UpSet, from_memberships

pos = symptom_df.loc[symptom_df['covid_test_result'] == 'Positive']
neg = symptom_df.loc[symptom_df['covid_test_result'] == 'Negative']

# One UpSet plot of symptom combinations per test result; each case has its
# own minimum subset size and output file.
for cases, min_size, figure_path in (
    (pos, 80, 'figures/PositiveCasesSymptoms.png'),
    (neg, 30, 'figures/NegativeCasesSymptoms.png'),
):
    # The join/split round trip is kept as-is: an empty symptom list becomes
    # [''] and is therefore counted under an empty-named category.
    memberships = cases.symptoms.apply(lambda x: ','.join(x)).str.split(',')
    covid_by_symptoms = from_memberships(memberships, data=cases)
    UpSet(
        covid_by_symptoms,
        min_subset_size=min_size,
        sort_by='cardinality',
        facecolor='#007C91',
        show_counts=True,
    ).plot()
    plt.savefig(figure_path)

In [None]:
def _aggregate_recruitment_source(source):
    """Map a raw recruitment source to 'Test and Trace' or 'REACT'.

    The comparison ignores surrounding whitespace and the normalised label is
    returned, so whitespace variants do not become separate categories.
    """
    return 'Test and Trace' if source.strip() == 'Test and Trace' else 'REACT'


train_test['recruitment_source_agg'] = train_test['recruitment_source'].apply(_aggregate_recruitment_source)
# Cross table: aggregated recruitment source (rows) against symptom_any (columns).
pd.crosstab(train_test['recruitment_source_agg'], train_test['symptom_any'])

In [None]:
# Cross table: Covid test result (rows) against the symptom_any flag (columns).
pd.crosstab(train_test['covid_test_result'], train_test['symptom_any'])

In [None]:
# Cross table: Covid test result (rows) against the aggregated recruitment
# source (columns); relies on the 'recruitment_source_agg' column added to
# train_test in an earlier cell.
pd.crosstab(train_test['covid_test_result'], train_test['recruitment_source_agg'])

## Other Analysis

#### Covid-19 infection status vs Any Symptoms

In [None]:
# Cross table: symptom_any flag (rows) against Covid test result (columns);
# margins=True appends the row and column totals.
pd.crosstab(train_test['symptom_any'], train_test['covid_test_result'], margins = True)

#### Breakdown of smoker status

In [None]:
# Smoker status by test result: stacked bar chart plus the underlying table.
crosstab_subset = train_test[['smoker_status', 'covid_test_result']].rename(
    columns={'smoker_status': 'Smoker Status', 'covid_test_result': 'Test Result'}
)
smoker_by_result = pd.crosstab(
    crosstab_subset['Smoker Status'],
    crosstab_subset['Test Result'],
    margins=False,
)
smoker_by_result.plot.barh(stacked=True, color=['#E40046', '#00AB8E'])
# Last expression of the cell: display the table under the chart.
smoker_by_result

#### Breakdown of other respiratory conditions

In [None]:
# Respiratory-condition flag columns (1 = condition reported).
condition_columns = [
    'respiratory_condition_none', 'respiratory_condition_asthma',
    'respiratory_condition_copd_or_emphysema',
    'respiratory_condition_other',
    'respiratory_condition_prefer_not_to_say',
]
# Human-readable label per column, e.g. 'respiratory_condition_asthma' -> 'Asthma'.
condition_labels = [x.replace('respiratory_condition_', '').replace('_', ' ').capitalize() for x in condition_columns]

# Work on a copy so the relabelled columns below are not written into a slice
# of train_test.
respiratory_plot = train_test[condition_columns + ['covid_test_result']].copy()
respiratory_plot.columns = condition_labels + ['covid_test_result']
for label in condition_labels:
    # Bind `label` as a default argument so the lambda does not depend on the
    # loop variable at call time.
    respiratory_plot[label] = respiratory_plot[label].apply(lambda flag, label=label: label if flag == 1 else '')
# Keep the reported conditions per row in column order (labels are unique, so
# no set() de-duplication is needed and the order is deterministic).
respiratory_plot['conditions'] = [
    [name for name in row if name != '']
    for row in respiratory_plot[condition_labels].values.tolist()
]

# One row per (submission, condition) pair, then tabulate against test result.
crosstab_subset = respiratory_plot[['conditions', 'covid_test_result']].explode('conditions')
crosstab_subset.columns = ['Other Respiratory Conditions', 'Test Result']
respiratory_by_result = pd.crosstab(crosstab_subset['Other Respiratory Conditions'], crosstab_subset['Test Result'], margins=False)
respiratory_by_result.plot.barh(stacked=True, color = ['#E40046', '#00AB8E'])
print(respiratory_by_result)

#### Breakdown of mask wearing

In [None]:
# Frequency of each mask-wearing answer, sorted by answer so both the printed
# lines and the bars have a deterministic order. value_counts() counts in one
# pass and skips missing values, which would otherwise break the string
# concatenation below.
mask_count_dict = train_test['wearing_mask'].value_counts().sort_index().to_dict()
for option, count in mask_count_dict.items():
    print(str(option) + ': ' + str(count))
plt.barh(list(mask_count_dict.keys()), list(mask_count_dict.values()), color = col_lst[0])
plt.show()

#### Breakdown of ethnicity

In [None]:
# Frequency of each ethnicity answer. Entries whose string form is 'nan'
# (missing values) are excluded; counting is done in one pass with
# value_counts() and printed in sorted order.
ethnicity = train_test['ethnicity']
exploded_data = ethnicity[ethnicity.astype(str) != 'nan']
eth_count_dict = exploded_data.value_counts().sort_index().to_dict()
for option, count in eth_count_dict.items():
    print(str(option) + ': ' + str(count))

#### Vaccine Status

In [None]:
# Display order of the dose categories on the y axis.
# NOTE(review): the categories mix the string '>2' with integers; values that
# match none of them become NaN in the categorical - confirm the column's
# value types.
dose_order = ['>2', 2, 1, 0]
vaccine_by_age = (
    train_test[['age', 'covid_vaccine_doses']]
    .dropna()
    .assign(covid_vaccine_doses=lambda frame: pd.Categorical(frame['covid_vaccine_doses'], categories=dose_order))
    .rename(columns={'age': 'Age', 'covid_vaccine_doses': 'Vaccination Doses'})
)
sns.boxplot(data=vaccine_by_age, x='Age', y='Vaccination Doses', palette=['#007C91'])


#### Breakdown of Local Authority

In [None]:
# Submission count per pseudonymised local authority code. Entries whose string
# form is 'nan' (missing values) are excluded. The table is built in one step
# from value_counts(): DataFrame.append was removed in pandas 2.0, and the old
# loop re-scanned the whole column for every code.
la_codes = train_test['pseudonymised_local_authority_code']
la_counts = la_codes[la_codes.astype(str) != 'nan'].value_counts()
loc_count_df = pd.DataFrame({
    'Pseudonymised Local Authority Code': la_counts.index,
    'Frequency': la_counts.to_numpy(),
})
# Displayed as the cell output, most frequent codes first.
loc_count_df.sort_values(by = 'Frequency', ascending = False)

##### Geography Analysis - these are strategically selected for inclusion in the test set for high submission counts

In [None]:
# Positive / negative submission counts for one local authority code.
lad_code = 'LAD00262'
lad_results = train_test.loc[train_test['pseudonymised_local_authority_code'] == lad_code, 'covid_test_result']
print(f'In {lad_code}:')
for outcome in ('Positive', 'Negative'):
    print(f'Num. {outcome} Cases: {(lad_results == outcome).sum()}')

In [None]:
# Positive / negative submission counts for one local authority code.
lad_code = 'LAD00272'
lad_results = train_test.loc[train_test['pseudonymised_local_authority_code'] == lad_code, 'covid_test_result']
print(f'In {lad_code}:')
for outcome in ('Positive', 'Negative'):
    print(f'Num. {outcome} Cases: {(lad_results == outcome).sum()}')

In [None]:
# Positive / negative submission counts for one local authority code.
lad_code = 'LAD00048'
lad_results = train_test.loc[train_test['pseudonymised_local_authority_code'] == lad_code, 'covid_test_result']
print(f'In {lad_code}:')
for outcome in ('Positive', 'Negative'):
    print(f'Num. {outcome} Cases: {(lad_results == outcome).sum()}')

In [None]:
# Positive / negative submission counts for one local authority code.
lad_code = 'LAD00257'
lad_results = train_test.loc[train_test['pseudonymised_local_authority_code'] == lad_code, 'covid_test_result']
print(f'In {lad_code}:')
for outcome in ('Positive', 'Negative'):
    print(f'Num. {outcome} Cases: {(lad_results == outcome).sum()}')

#### Breakdown of Recruitment Source

In [None]:
# Recruitment channel by test result: stacked bars from the relabelled frame,
# followed by the table with totals (margins=True) as the cell output.
crosstab_rec = train_test[['recruitment_source', 'covid_test_result']].rename(
    columns={'recruitment_source': 'Recruitment Channel', 'covid_test_result': 'Test Result'}
)
channel_by_result = pd.crosstab(
    crosstab_rec['Recruitment Channel'],
    crosstab_rec['Test Result'],
    margins=False,
)
channel_by_result.plot.barh(stacked=True, color=['#E40046', '#00AB8E'])
pd.crosstab(train_test['recruitment_source'], train_test['covid_test_result'], margins=True)

#### Breakdown of Submission Delay

Firstly, we change these time-delta objects to numerical values representing the amount of time in days.

Plot this against covid status

In [None]:
# Summary statistics for the submission delay (in days).
submission_delay = train_test['submission_delay']
print('The minimum submission delay is ' + str(submission_delay.min()) + ' days')
print('The maximum submission delay is ' + str(submission_delay.max()) + ' days')
for threshold in (2, 3, 4, 5):
    print('The number of submissions with a delay >' + str(threshold) + ' days is ' + str((submission_delay > threshold).sum()))

# Frequency per whole day. Delays are truncated to integers before counting,
# so fractional delays on the same day are summed rather than overwriting one
# another. Missing values are skipped and the bars are ordered by delay.
delay_counts = submission_delay.dropna().astype(int).value_counts().sort_index().to_dict()
plot_df = pd.DataFrame({'Submission Delay (days)': list(delay_counts.keys()), 'Frequency': list(delay_counts.values())})
plot_df.plot.bar(x='Submission Delay (days)', y='Frequency', color='#007C91', width=0.98)

#### Viral Load Analysis

In [None]:
# How many submissions carry a viral load category, as a count and a
# percentage of the train/test set.
has_viral_load = train_test['covid_viral_load_category'].notna()
viral_load_count = has_viral_load.sum()
viral_load_pct = np.round(100 * viral_load_count / len(train_test), 2)
print(f'There are {viral_load_count} ({viral_load_pct}%) submissions with an associated viral load category')

In [None]:
# Frequency of each viral load category, least frequent first. value_counts()
# replaces the per-category list.count() scan over a set, whose iteration
# order (and therefore the order of tied bars) was not deterministic.
viral_load_filt = train_test.loc[~train_test['covid_viral_load_category'].isna()]
sorted_dct = viral_load_filt['covid_viral_load_category'].value_counts().sort_values().to_dict()
# Break multi-word category names over several lines so the tick labels fit.
plt.bar([x.replace(' ', '\n') for x in sorted_dct.keys()], list(sorted_dct.values()), color = '#007C91')
plt.savefig('figures/viral_load_frequency.png', dpi=600)

Check that viral load category is not confounded with age, as this would affect the age distribution in the train-test split

In [None]:
# Age distribution per viral load category (rows missing either value are
# dropped); spaces in category names become line breaks for the tick labels.
viral_load_df = (
    train_test[['covid_viral_load_category', 'age']]
    .dropna()
    .assign(covid_viral_load_category=lambda frame: frame['covid_viral_load_category'].apply(lambda x: x.replace(' ', '\n')))
    .rename(columns={'covid_viral_load_category': 'Covid viral load category', 'age': 'Age'})
)
sns.boxplot(data=viral_load_df, x='Covid viral load category', y='Age')

#### Vaccination Status

In [None]:
# Cross table: number of vaccine doses (rows) against Covid test result (columns).
pd.crosstab(train_test['covid_vaccine_doses'], train_test['covid_test_result'])

In [None]:
# Age distribution for each vaccine-dose value, taken directly from train_test.
sns.boxplot(data=train_test, x='covid_vaccine_doses', y='age')