# Experimental Results

## Functions for statistics

In [1]:
import os
import numpy as np
import random
import pandas as pd
from sklearn import metrics
from operator import itemgetter
import json

map_dict = {'Male': 0,
            'Female': 1,
            'Sex':2,
            '0-19':3,
            '20-29':4,
            '30-39':5,
            '40-49':6,
            '50-59':7,
            '60-69':8,
            '70-79':9,
            '80-89':10,
            '90-':11,
            'Age': 12,
            'ecig':13,  
            'ex': 14, 
            'never':15,
            'ltOnce':16,
            '1to10':17,
            '21+':18, 
            'Smoke':19,
            'it':20,
            'en':21,
            'es':22,
            'de':23,
            'Others':24,
            'Hospital_yes':25,
            'Hospital_no':26,
            }
#Uid;Age;Sex;Medhistory;Smoking;Language;Date;Folder Name;Symptoms;Covid-Tested;Hospitalized;Location;Voice filename;Cough filename;Breath filename 
def get_covid(temp):
    cot, sym, med, smo, hos = temp[9], temp[8], temp[3], temp[4], temp[10]
    #print(cot, sym, med, smo)
    sym_dict = {'drycough':0.0, 'smelltasteloss':0.0, 'headache':0.0,'sorethroat':0.0,
            'muscleache':0.0,'wetcough':0.0,'shortbreath':0.0,'tightness':0.0,
            'fever':0.0,'dizziness':0.0,'chills':0.0,'runnyblockednose':0.0, 'None': 0.0}
    syms = sym.split(',')
    for s in syms:
        if s == 'tighness':
            s = 'tightness'
        if s == 'drycoough':
            s = 'drycough'
        if s == 'runny':
            s = 'runnyblockednose'
        if s == 'none' or s == '':
            s = 'None'
        if s in sym_dict:
            sym_dict[s] = 1
    sym_feature = [sym_dict[s] for s in sorted(sym_dict)]  
    
    if cot == 'last14' or cot == 'yes' or cot == 'positiveLast14': #or cot == 'positiveOver14' or cot == 'over14' :
       
        if sym in ['None','','none']: #'pnts'
            label = 'covidnosym'
        else:
            label = 'covidsym'

    elif cot == 'negativeNever':
       
        if sym in ['None','','none']:
            label = 'healthnosym'
        else:
            label = 'healthsym'
  
    else:
        label = 'negativeLast14_over14'
    return label,sym_feature

def get_demo(temp):
    dis = [0]*27
    
    uid, age, sex, smo, lan, hos = temp[0], temp[1], temp[2], temp[4], temp[5], temp[10]
    #print(uid, age, sex, smo, lan)

    if age in map_dict:
        dis[map_dict[age]] = 1.0
    else:
        dis[map_dict['Age']] = 1.0
        
    if sex in map_dict:
        dis[map_dict[sex]] = 1.0
    else:
        dis[map_dict['Sex']] = 1.0               
        
    if smo in map_dict:
        dis[map_dict[smo]] = 1.0
    else:
        dis[map_dict['Smoke']] = 1.0                
        
    lan = 'en'
    if lan in map_dict:
        dis[map_dict[lan]] = 1.0
    else:
        dis[map_dict['Others']] = 1.0  
    
    if hos == 'no':
        dis[map_dict['Hospital_no']] = 1.0
    elif hos == 'yes':
        dis[map_dict['Hospital_yes']] = 1.0
        
    return dis

demo_dict = {}

with open('../COVID19_prediction/data/preprocess/result_data_0426_en_all.csv') as f:
    for index, line in enumerate(f):
        if index>0:
            temp = line.strip().split(';')
            demo_dict[temp[0] + '/' + temp[7]] = temp
demo_label = sorted(map_dict.items(),key = lambda x:x[1],reverse = False) 
            
def get_metrics(probs,label):      
    predicted = []
    for i in range(len(probs)):
        if probs[i]> 0.5:       
            predicted.append(1)
        else:
            predicted.append(0)     

    pre = metrics.precision_score(label, predicted)
    acc = metrics.accuracy_score(label, predicted)
    auc = metrics.roc_auc_score(label, probs)
    precision, recall, _ = metrics.precision_recall_curve(label, probs)
    rec = metrics.recall_score(label, predicted)

    TN, FP, FN, TP = metrics.confusion_matrix(label,predicted).ravel()   
    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP*1.0/(TP+FN)    
    # Specificity or true negative rate
    TNR = TN*1.0/(TN+FP) 

    return auc, TPR, TNR, 0

def get_CI(data, AUC, Sen, Spe):
    AUCs = []
    TPRs = []
    TNRs = []
    for s in range(1000):
        np.random.seed(s) #Para2
        sample = np.random.choice(range(len(data)), len(data), replace=True)
        samples = [data[i] for i in sample]
        sample_pro = [x[0] for x in samples]
        sample_label = [x[1] for x in samples]
        try:
            get_metrics(sample_pro,sample_label)
        except ValueError:
            np.random.seed(1001) #Para2
            sample = np.random.choice(range(len(data)), len(data), replace=True)
            samples = [data[i] for i in sample]
            sample_pro = [x[0] for x in samples]
            sample_label = [x[1] for x in samples]
        else:
            auc, TPR, TNR, _ = get_metrics(sample_pro,sample_label)
        AUCs.append(auc)
        TPRs.append(TPR)
        TNRs.append(TNR)
    
    
    q_0 = pd.DataFrame(np.array(AUCs)).quantile(0.025)[0] #2.5% percentile
    q_1 = pd.DataFrame(np.array(AUCs)).quantile(0.975)[0] #97.5% percentile
    
    q_2 = pd.DataFrame(np.array(TPRs)).quantile(0.025)[0] #2.5% percentile
    q_3 = pd.DataFrame(np.array(TPRs)).quantile(0.975)[0] #97.5% percentile

    q_4 = pd.DataFrame(np.array(TNRs)).quantile(0.025)[0] #2.5% percentile
    q_5 = pd.DataFrame(np.array(TNRs)).quantile(0.975)[0] #97.5% percentile
   
    return('&' + str(AUC.round(2)) + '(' + str(q_0.round(2)) + '-' + str(q_1.round(2)) + ')'
           + '&' + str(Sen.round(2)) + '(' + str(q_2.round(2)) + '-' + str(q_3.round(2)) + ')'
             '&' + str(Spe.round(2)) + '(' + str(q_4.round(2)) + '-' + str(q_5.round(2)) + ')' )


## Figure 2(b): Performance of three modalities (Breathing + Cough + Voice)

In [2]:
File = 'output/main_task_0.5.txt'    

#main results
user = []   
probs = []
labels = []
data = []
negative_user = []
with open(File) as f:
    for line in f:        
        uid, pro, label = line.split()
        UID, date = uid.split('/')
        
        temp = demo_dict[uid]
        demo = get_demo(temp)
        if True:
            user.append(UID)
            pro = float(pro)
            label = float(label)
            label = 1 if label > 0 else 0
            probs.append(pro)
            data.append([pro,label])
            labels.append(label)
            if label == 0:
                negative_user.append(UID)

    auc, TPR, TNR, _   = get_metrics(probs,labels)
ss = get_CI(data, auc, TPR, TNR)
print('user(samplse)&auc&sensitivity&specificity')
print('&'+str(len(set(user))-len(set(negative_user))) + '(' + str(len((user))-len((negative_user))) + ')'
      '/' + str(len(set(negative_user))) + '(' + str(len((negative_user))) + ')'+ ss ) 

user(samplse)&auc&sensitivity&specificity
&100(162)/100(162)&0.71(0.65-0.77)&0.65(0.58-0.72)&0.69(0.62-0.76)


## Figure 2(c): Performance of subgroups  

In [3]:
# demographic groups    
for d in [0,1,4,5,6,7]: 
    print(demo_label[d],'-------------------------------------------------------')
    user = []   
    probs = []
    labels = []
    data = []
    negative_user = []
    with open(File) as f:
        for line in f:        
            uid, pro, label = line.split()
            UID, date = uid.split('/')
            
            temp = demo_dict[uid]
            demo = get_demo(temp)
            if demo[d] == 1:
                user.append(UID)
                pro = float(pro)
                label = float(label)
                label = 1 if label > 0 else 0
                probs.append(pro)
                data.append([pro,label])
                labels.append(label)
                if label == 0:
                    negative_user.append(UID)
        auc, TPR, TNR,_  = get_metrics(probs,labels)


    ss = get_CI(data,auc, TPR, TNR)
    print('&'+str(len(set(user))-len(set(negative_user))) + '(' + str(len((user))-len((negative_user))) + ')'
          '/' + str(len(set(negative_user))) + '(' + str(len((negative_user))) + ')'
          + ss + '//') 

    
for d in [9]: 
    print('16-39','-------------------------------------------------------')
    user = []   
    probs = []
    labels = []
    data = []
    negative_user = []
    with open(File) as f:
        for line in f:        
            uid, pro, label = line.split()
            UID, date = uid.split('/')
            
            temp = demo_dict[uid]
            demo = get_demo(temp)
            if sum(demo[3:6]) >= 1:
                user.append(UID)
                pro = float(pro)
                label = float(label)
                label = 1 if label > 0 else 0
                probs.append(pro)
                data.append([pro,label])
                labels.append(label)
                if label == 0:
                    negative_user.append(UID)

        auc, TPR, TNR,_  = get_metrics(probs,labels)


    ss = get_CI(data,auc, TPR, TNR)
    print('&'+str(len(set(user))-len(set(negative_user))) + '(' + str(len((user))-len((negative_user))) + ')'
          '/' + str(len(set(negative_user))) + '(' + str(len((negative_user))) + ')'
          + ss + '//') 
         
for d in [9]: 
    print('40-59','-------------------------------------------------------')
    user = []   
    probs = []
    labels = []
    data = []
    negative_user = []
    with open(File) as f:
        for line in f:        
            uid, pro, label = line.split()
            UID, date = uid.split('/')
            
            temp = demo_dict[uid]
            demo = get_demo(temp)
            if sum(demo[6:8]) >= 1:
                user.append(UID)
                pro = float(pro)
                label = float(label)
                label = 1 if label > 0 else 0
                probs.append(pro)
                data.append([pro,label])
                labels.append(label)
                if label == 0:
                    negative_user.append(UID)

        auc, TPR, TNR,_  = get_metrics(probs,labels)


    ss = get_CI(data,auc, TPR, TNR)
    print('&'+str(len(set(user))-len(set(negative_user))) + '(' + str(len((user))-len((negative_user))) + ')'
          '/' + str(len(set(negative_user))) + '(' + str(len((negative_user))) + ')'
          + ss + '//') 

for d in [7]: 
    print('60-','-------------------------------------------------------')
    user = []   
    probs = []
    labels = []
    data = []
    negative_user = []
    with open(File) as f:
        for line in f:        
            uid, pro, label = line.split()
            UID, date = uid.split('/')
            
            temp = demo_dict[uid]
            demo = get_demo(temp)
            if sum(demo[8:12]) >= 1:
                user.append(UID)
                pro = float(pro)
                label = float(label)
                label = 1 if label > 0 else 0
                probs.append(pro)
                data.append([pro,label])
                labels.append(label)
                if label == 0:
                    negative_user.append(UID)
            
        auc, TPR, TNR,_  = get_metrics(probs,labels)

    ss = get_CI(data,auc, TPR, TNR)
    print('&'+str(len(set(user))-len(set(negative_user))) + '(' + str(len((user))-len((negative_user))) + ')'
          '/' + str(len(set(negative_user))) + '(' + str(len((negative_user))) + ')'
          + ss + '//')  
#Symptoms
for d in [' ']: 
    user = []   
    probs = []
    labels = []
    data = []
    negative_user = []
    with open(File) as f:
        for line in f:        
            uid, pro, label = line.split()
            UID, date = uid.split('/')           
            temp = demo_dict[uid]
            demo = get_demo(temp)
            covid,sym = get_covid(temp)

            if True:
                pro = float(pro)
                label = float(label)
                label = 1 if label > 0 else 0
                if covid == 'covidsym' or covid == 'healthsym':
                    user.append(UID)
                    probs.append(pro)
                    data.append([pro,label])
                    labels.append(label)
                    if label == 0:
                        negative_user.append(UID)

        auc, TPR, TNR,_  = get_metrics(probs,labels)
        
print('Symptomatic:--------------------------------------------')
ss = get_CI(data,auc, TPR, TNR)
print('&'+str(len(set(user))-len(set(negative_user))) + '(' + str(len((user))-len((negative_user))) + ')'
      '/' + str(len(set(negative_user))) + '(' + str(len((negative_user))) + ')'
      + ss + '//')     
for d in [' ']: 
    user = []   
    probs = []
    labels = []
    data = []
    negative_user = []
    with open(File) as f:
        for line in f:        
            uid, pro, label = line.split()
            UID, date = uid.split('/')           
            temp = demo_dict[uid]
            demo = get_demo(temp)
            covid,sym = get_covid(temp)

            if True:
                pro = float(pro)
                label = float(label)
                label = 1 if label > 0 else 0
                if covid == 'covidnosym' or covid == 'healthnosym':
                    user.append(UID)
                    probs.append(pro)
                    data.append([pro,label])
                    labels.append(label)
                    if label == 0:
                        negative_user.append(UID)

        auc, TPR, TNR,_  = get_metrics(probs,labels)
print('Asymptomatic:--------------------------------------------')
ss = get_CI(data,auc, TPR, TNR)
print('&'+str(len(set(user))-len(set(negative_user))) + '(' + str(len((user))-len((negative_user))) + ')'
      '/' + str(len(set(negative_user))) + '(' + str(len((negative_user))) + ')'
      + ss + '//')  

('Male', 0) -------------------------------------------------------
&58(85)/52(76)&0.71(0.63-0.78)&0.59(0.49-0.68)&0.74(0.63-0.83)//
('Female', 1) -------------------------------------------------------
&42(77)/46(84)&0.73(0.65-0.8)&0.71(0.61-0.81)&0.65(0.55-0.75)//
('20-29', 4) -------------------------------------------------------
&21(30)/21(30)&0.68(0.55-0.81)&0.57(0.38-0.74)&0.7(0.52-0.85)//
('30-39', 5) -------------------------------------------------------
&33(50)/33(54)&0.62(0.51-0.73)&0.56(0.42-0.7)&0.63(0.5-0.76)//
('40-49', 6) -------------------------------------------------------
&23(41)/24(38)&0.82(0.72-0.91)&0.8(0.68-0.92)&0.74(0.59-0.87)//
('50-59', 7) -------------------------------------------------------
&13(27)/10(12)&0.58(0.39-0.76)&0.59(0.39-0.78)&0.5(0.2-0.8)//
16-39 -------------------------------------------------------
&55(81)/54(84)&0.65(0.56-0.73)&0.57(0.46-0.68)&0.65(0.55-0.75)//
40-59 -------------------------------------------------------
&36(68)/34(50)&

## Figure 2(b):  Single Modality 

In [4]:
print('user(samplse)&auc&sensitivity&specificity')
for m in ['B','C','V']:
  
    File = 'output/main_test_' + m + '.txt'    

    #main results
    user = []   
    probs = []
    labels = []
    data = []
    negative_user = []
    with open(File) as f:
        for line in f:        
            uid, pro, label = line.split()
            UID, date = uid.split('/')

            temp = demo_dict[uid]
            demo = get_demo(temp)
            if True:
                user.append(UID)
                pro = float(pro)
                label = float(label)
                label = 1 if label > 0 else 0
                probs.append(pro)
                data.append([pro,label])
                labels.append(label)
                if label == 0:
                    negative_user.append(UID)

        auc, TPR, TNR, _   = get_metrics(probs,labels)
    ss = get_CI(data, auc, TPR, TNR)

    print( m + '&'+str(len(set(user))-len(set(negative_user))) + '(' + str(len((user))-len((negative_user))) + ')'
          '/' + str(len(set(negative_user))) + '(' + str(len((negative_user))) + ')'+ ss ) 

user(samplse)&auc&sensitivity&specificity
B&100(162)/100(162)&0.62(0.56-0.68)&0.64(0.56-0.71)&0.56(0.48-0.63)
C&100(162)/100(162)&0.66(0.6-0.71)&0.59(0.51-0.66)&0.66(0.58-0.73)
V&100(162)/100(162)&0.61(0.55-0.67)&0.57(0.49-0.64)&0.6(0.52-0.67)


## Figure 3(a):  Results of different prevalence levels.

In [5]:
print('user(samplse)&auc&sensitivity&specificity')
for p in [0.05,0.1,0.2]:
    print(p)
    File = 'output/main_pre_' + str(p) + '.txt'    

    #main results
    user = []   
    probs = []
    labels = []
    data = []
    negative_user = []
    with open(File) as f:
        for line in f:        
            uid, pro, label = line.split()
            UID, date = uid.split('/')

            temp = demo_dict[uid]
            demo = get_demo(temp)
            if True:
                user.append(UID)
                pro = float(pro)
                label = float(label)
                label = 1 if label > 0 else 0
                probs.append(pro)
                data.append([pro,label])
                labels.append(label)
                if label == 0:
                    negative_user.append(UID)

        auc, TPR, TNR, _   = get_metrics(probs,labels)
    ss = get_CI(data, auc, TPR, TNR)

    print(str(p) + '&'+str(len(set(user))-len(set(negative_user))) + '(' + str(len((user))-len((negative_user))) + ')'
          '/' + str(len(set(negative_user))) + '(' + str(len((negative_user))) + ')'+ ss ) 

user(samplse)&auc&sensitivity&specificity
0.05
0.05&80(120)/1520(2372)&0.71(0.66-0.75)&0.65(0.57-0.73)&0.65(0.63-0.67)
0.1
0.1&100(162)/900(1439)&0.69(0.65-0.74)&0.65(0.57-0.72)&0.65(0.63-0.68)
0.2
0.2&100(162)/400(635)&0.69(0.65-0.74)&0.65(0.58-0.72)&0.63(0.59-0.67)


## Figure 3(a):  Results of different health conditions.

In [6]:
print('user(samplse)&auc&sensitivity&specificity')
File = 'output/main_test_all.txt'
#main results

###########################################################################################################################
user = []   
probs = []
labels = []
data = []
negative_user = []
with open(File) as f:
    for line in f:        
        uid, pro, label = line.split()
        UID, date = uid.split('/')
        temp = demo_dict[uid]
        demo = get_demo(temp)
        if 'cough' in temp[8] or 'coough' in temp[8]:
            user.append(UID)
            pro = float(pro)
            label = float(label)
            label = 1 if label > 0 else 0
            probs.append(pro)
            data.append([pro,label])
            labels.append(label)
            if label == 0:
                negative_user.append(UID)
    auc, TPR, TNR, _   = get_metrics(probs,labels)
ss = get_CI(data, auc, TPR, TNR)
print('Cough&'+str(len(set(user))-len(set(negative_user))) + '(' + str(len((user))-len((negative_user))) + ')'
          '/' + str(len(set(negative_user))) + '(' + str(len((negative_user))) + ')'+ ss )   
###########################################################################################################################
user = []   
probs = []
labels = []
data = []
negative_user = []
with open(File) as f:
    for line in f:        
        uid, pro, label = line.split()
        UID, date = uid.split('/')
        temp = demo_dict[uid]
        demo = get_demo(temp)
        if 'asthma' in temp: 
            user.append(UID)
            pro = float(pro)
            label = float(label)
            label = 1 if label > 0 else 0
            probs.append(pro)
            data.append([pro,label])
            labels.append(label)
            if label == 0:
                negative_user.append(UID)
    auc, TPR, TNR, _   = get_metrics(probs,labels)
ss = get_CI(data, auc, TPR, TNR)
print('Asthma&'+str(len(set(user))-len(set(negative_user))) + '(' + str(len((user))-len((negative_user))) + ')'
          '/' + str(len(set(negative_user))) + '(' + str(len((negative_user))) + ')'+ ss )   

user = []   
probs = []
labels = []
data = []
negative_user = []
with open(File) as f:
    for line in f:        
        uid, pro, label = line.split()
        UID, date = uid.split('/')
        temp = demo_dict[uid]
        demo = get_demo(temp)
        if 'hbp' in temp:
            user.append(UID)
            pro = float(pro)
            label = float(label)
            label = 1 if label > 0 else 0
            probs.append(pro)
            data.append([pro,label])
            labels.append(label)
            if label == 0:
                negative_user.append(UID)
    auc, TPR, TNR, _   = get_metrics(probs,labels)
ss = get_CI(data, auc, TPR, TNR)
print('Hbp&'+str(len(set(user))-len(set(negative_user))) + '(' + str(len((user))-len((negative_user))) + ')'
          '/' + str(len(set(negative_user))) + '(' + str(len((negative_user))) + ')'+ ss )   


user = []   
probs = []
labels = []
data = []
negative_user = []
with open(File) as f:
    for line in f:        
        uid, pro, label = line.split()
        UID, date = uid.split('/')
        temp = demo_dict[uid]
        demo = get_demo(temp)
        if temp[3] == '' or temp[3] == 'none' or temp[3] == 'None':
            user.append(UID)
            pro = float(pro)
            label = float(label)
            label = 1 if label > 0 else 0
            probs.append(pro)
            data.append([pro,label])
            labels.append(label)
            if label == 0:
                negative_user.append(UID)
    auc, TPR, TNR, _   = get_metrics(probs,labels)
ss = get_CI(data, auc, TPR, TNR)
print('Non-&'+str(len(set(user))-len(set(negative_user))) + '(' + str(len((user))-len((negative_user))) + ')'
          '/' + str(len(set(negative_user))) + '(' + str(len((negative_user))) + ')'+ ss )   


###############################################################################################################################
user = []   
probs = []
labels = []
data = []
negative_user = []
with open(File) as f:
    for line in f:        
        uid, pro, label = line.split()
        UID, date = uid.split('/')
        temp = demo_dict[uid]
        demo = get_demo(temp)
        if temp[4] == 'never':
            user.append(UID)
            pro = float(pro)
            label = float(label)
            label = 1 if label > 0 else 0
            probs.append(pro)
            data.append([pro,label])
            labels.append(label)
            if label == 0:
                negative_user.append(UID)
    auc, TPR, TNR, _   = get_metrics(probs,labels)
ss = get_CI(data, auc, TPR, TNR)
print('Never-&'+str(len(set(user))-len(set(negative_user))) + '(' + str(len((user))-len((negative_user))) + ')'
          '/' + str(len(set(negative_user))) + '(' + str(len((negative_user))) + ')'+ ss )   
user = []   
probs = []
labels = []
data = []
negative_user = []
with open(File) as f:
    for line in f:        
        uid, pro, label = line.split()
        UID, date = uid.split('/')
        temp = demo_dict[uid]
        demo = get_demo(temp)
        if temp[4] == 'ex':
            user.append(UID)
            pro = float(pro)
            label = float(label)
            label = 1 if label > 0 else 0
            probs.append(pro)
            data.append([pro,label])
            labels.append(label)
            if label == 0:
                negative_user.append(UID)
    auc, TPR, TNR, _   = get_metrics(probs,labels)
ss = get_CI(data, auc, TPR, TNR)
print('Ex-&'+str(len(set(user))-len(set(negative_user))) + '(' + str(len((user))-len((negative_user))) + ')'
          '/' + str(len(set(negative_user))) + '(' + str(len((negative_user))) + ')'+ ss )   



user = []   
probs = []
labels = []
data = []
negative_user = []
with open(File) as f:
    for line in f:        
        uid, pro, label = line.split()
        UID, date = uid.split('/')
        temp = demo_dict[uid]
        demo = get_demo(temp)
        if temp[4] == 'ltOnce' or temp[4] == '1to10' or temp[4] == '10to20' or temp[4] == ' 21+' or temp[4] == 'ecig' :
            
            user.append(UID)
            pro = float(pro)
            label = float(label)
            label = 1 if label > 0 else 0
            probs.append(pro)
            data.append([pro,label])
            labels.append(label)
            if label == 0:
                negative_user.append(UID)
                
    auc, TPR, TNR, _   = get_metrics(probs,labels)
ss = get_CI(data, auc, TPR, TNR)
print('Smoking-&'+str(len(set(user))-len(set(negative_user))) + '(' + str(len((user))-len((negative_user))) + ')'
          '/' + str(len(set(negative_user))) + '(' + str(len((negative_user))) + ')'+ ss )   

###############################################################################################################################
user = []   
probs = []
labels = []
data = []
negative_user = []
with open(File) as f:
    for line in f:        
        uid, pro, label = line.split()
        UID, date = uid.split('/')
        temp = demo_dict[uid]
        demo = get_demo(temp)
        if temp[10] == 'yes':
            user.append(UID)
            pro = float(pro)
            label = float(label)
            label = 1 if label > 0 else 0
            probs.append(pro)
            data.append([pro,label])
            labels.append(label)
            if label == 0:
                negative_user.append(UID)
    auc, TPR, TNR, _   = get_metrics(probs,labels)
ss = get_CI(data, auc, TPR, TNR)
print('Hospitalisation_yes&'+str(len(set(user))-len(set(negative_user))) + '(' + str(len((user))-len((negative_user))) + ')'
          '/' + str(len(set(negative_user))) + '(' + str(len((negative_user))) + ')'+ ss )   



user = []   
probs = []
labels = []
data = []
negative_user = []
with open(File) as f:
    for line in f:        
        uid, pro, label = line.split()
        UID, date = uid.split('/')
        temp = demo_dict[uid]
        demo = get_demo(temp)
        if temp[10] == 'no' :
            
            user.append(UID)
            pro = float(pro)
            label = float(label)
            label = 1 if label > 0 else 0
            probs.append(pro)
            data.append([pro,label])
            labels.append(label)
            if label == 0:
                negative_user.append(UID)
                
    auc, TPR, TNR, _   = get_metrics(probs,labels)
ss = get_CI(data, auc, TPR, TNR)
print('Hospitalisation_no&'+str(len(set(user))-len(set(negative_user))) + '(' + str(len((user))-len((negative_user))) + ')'
          '/' + str(len(set(negative_user))) + '(' + str(len((negative_user))) + ')'+ ss ) 

user(samplse)&auc&sensitivity&specificity
Cough&69(100)/705(891)&0.65(0.59-0.71)&0.65(0.55-0.74)&0.58(0.54-0.61)
Asthma&10(12)/142(236)&0.59(0.42-0.77)&0.33(0.07-0.64)&0.62(0.55-0.68)
Hbp&6(13)/90(159)&0.71(0.56-0.85)&0.54(0.25-0.82)&0.69(0.62-0.76)
Non-&73(112)/1082(1605)&0.71(0.66-0.76)&0.7(0.61-0.78)&0.65(0.63-0.68)
Never-&55(94)/840(1330)&0.73(0.67-0.79)&0.7(0.61-0.8)&0.66(0.63-0.68)
Ex-&17(28)/285(481)&0.71(0.62-0.8)&0.64(0.47-0.82)&0.67(0.62-0.71)
Smoking-&22(34)/257(357)&0.57(0.46-0.68)&0.47(0.31-0.66)&0.63(0.58-0.68)
Hospitalisation_yes&3(3)/6(6)&0.83(0.45-1.0)&1.0(1.0-1.0)&0.67(0.25-1.0)
Hospitalisation_no&97(159)/1517(2365)&0.69(0.65-0.74)&0.64(0.57-0.72)&0.65(0.63-0.67)


## Supplementary Figure 1(b) and 2(b): Impact of thresholds

In [7]:

def get_metrics2(probs,label,th):      
    predicted = []
    for i in range(len(probs)):
        if probs[i]> th:       
            predicted.append(1)
        else:
            predicted.append(0)     

    pre = metrics.precision_score(label, predicted)
    acc = metrics.accuracy_score(label, predicted)
    auc = metrics.roc_auc_score(label, probs)
    precision, recall, _ = metrics.precision_recall_curve(label, probs)
    rec = metrics.recall_score(label, predicted)

    TN, FP, FN, TP = metrics.confusion_matrix(label,predicted).ravel()   
    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP*1.0/(TP+FN)    
    # Specificity or true negative rate
    TNR = TN*1.0/(TN+FP) 

    return TPR+TNR-1, TPR, TNR, 0
def get_CI2(data, AUC, Sen, Spe,th):
    AUCs = []
    TPRs = []
    TNRs = []
    for s in range(1000):
        np.random.seed(s) #Para2
        sample = np.random.choice(range(len(data)), len(data), replace=True)
        samples = [data[i] for i in sample]
        sample_pro = [x[0] for x in samples]
        sample_label = [x[1] for x in samples]
        try:
            get_metrics2(sample_pro,sample_label,th)
        except ValueError:
            np.random.seed(1001) #Para2
            sample = np.random.choice(range(len(data)), len(data), replace=True)
            samples = [data[i] for i in sample]
            sample_pro = [x[0] for x in samples]
            sample_label = [x[1] for x in samples]
        else:
            auc, TPR, TNR, _ = get_metrics2(sample_pro,sample_label,th)
        AUCs.append(TPR + TNR -1)
        TPRs.append(TPR)
        TNRs.append(TNR)
    
    
    q_0 = pd.DataFrame(np.array(AUCs)).quantile(0.025)[0] #2.5% percentile
    q_1 = pd.DataFrame(np.array(AUCs)).quantile(0.975)[0] #97.5% percentile
    
    q_2 = pd.DataFrame(np.array(TPRs)).quantile(0.025)[0] #2.5% percentile
    q_3 = pd.DataFrame(np.array(TPRs)).quantile(0.975)[0] #97.5% percentile

    q_4 = pd.DataFrame(np.array(TNRs)).quantile(0.025)[0] #2.5% percentile
    q_5 = pd.DataFrame(np.array(TNRs)).quantile(0.975)[0] #97.5% percentile
   
    return('&' + str(Sen.round(2)) + '(' + str(q_2.round(2)) + '-' + str(q_3.round(2)) + ')' +
           '&' + str(Spe.round(2)) + '(' + str(q_4.round(2)) + '-' + str(q_5.round(2)) + ')' +
           '&' + str(AUC.round(2)) + '(' + str(q_0.round(2)) + '-' + str(q_1.round(2)) + ')')


File = 'output/main_task_0.5.txt'  
print('Asymptomatic:')
for t in [0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8]:
    print(t)
    user = []   
    probs = []
    labels = []
    data = []
    negative_user = []
    with open(File) as f:
        for line in f:        
            uid, pro, label = line.split()
            UID, date = uid.split('/')           
            temp = demo_dict[uid]
            demo = get_demo(temp)
            covid,sym = get_covid(temp)

            if True:
                pro = float(pro)
                label = float(label)
                label = 1 if label > 0 else 0
                if covid == 'covidnosym' or covid == 'healthnosym':
                    user.append(UID)
                    probs.append(pro)
                    data.append([pro,label])
                    labels.append(label)
                    if label == 0:
                        negative_user.append(UID)
    auc, TPR, TNR,_  = get_metrics2(probs,labels, t)
    ss = get_CI2(data,auc, TPR, TNR,t)
    print('&'+str(len(set(user))-len(set(negative_user))) + '(' + str(len((user))-len((negative_user))) + ')'
          '/' + str(len(set(negative_user))) + '(' + str(len((negative_user))) + ')' + ss + '//')  
    
print('Symptomatic:')
for t in [0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8]:
    print(t)
    # sympotom
    user = []   
    probs = []
    labels = []
    data = []
    negative_user = []
    with open(File) as f:
        for line in f:        
            uid, pro, label = line.split()
            UID, date = uid.split('/')           
            temp = demo_dict[uid]
            demo = get_demo(temp)
            covid,sym = get_covid(temp)

            if True:
                pro = float(pro)
                label = float(label)
                label = 1 if label > 0 else 0
                if covid == 'covidsym' or covid == 'healthsym':
                    user.append(UID)
                    probs.append(pro)
                    data.append([pro,label])
                    labels.append(label)
                    if label == 0:
                        negative_user.append(UID)
    auc, TPR, TNR,_  = get_metrics2(probs,labels, t)
    ss = get_CI2(data, auc, TPR, TNR,t)
    print('&'+str(len(set(user))-len(set(negative_user))) + '(' + str(len((user))-len((negative_user))) + ')'
          '/' + str(len(set(negative_user))) + '(' + str(len((negative_user))) + ')' + ss + '//')   
    
  
    

Asymptomatic:
0.1
&14(18)/45(73)&0.89(0.71-1.0)&0.56(0.46-0.67)&0.45(0.24-0.63)//
0.15
&14(18)/45(73)&0.78(0.57-0.95)&0.6(0.49-0.71)&0.38(0.15-0.6)//
0.2
&14(18)/45(73)&0.72(0.5-0.93)&0.62(0.51-0.72)&0.34(0.09-0.58)//
0.25
&14(18)/45(73)&0.67(0.42-0.9)&0.63(0.52-0.74)&0.3(0.04-0.56)//
0.3
&14(18)/45(73)&0.5(0.25-0.76)&0.7(0.59-0.8)&0.2(-0.08-0.48)//
0.35
&14(18)/45(73)&0.5(0.25-0.76)&0.74(0.63-0.84)&0.24(-0.04-0.52)//
0.4
&14(18)/45(73)&0.5(0.25-0.76)&0.79(0.7-0.88)&0.29(0.02-0.57)//
0.45
&14(18)/45(73)&0.5(0.25-0.76)&0.81(0.72-0.89)&0.31(0.03-0.58)//
0.5
&14(18)/45(73)&0.5(0.25-0.76)&0.85(0.77-0.92)&0.35(0.07-0.63)//
0.55
&14(18)/45(73)&0.44(0.19-0.69)&0.85(0.77-0.92)&0.29(0.03-0.56)//
0.6
&14(18)/45(73)&0.44(0.19-0.69)&0.86(0.78-0.94)&0.31(0.05-0.57)//
0.65
&14(18)/45(73)&0.44(0.19-0.69)&0.88(0.8-0.95)&0.32(0.06-0.58)//
0.7
&14(18)/45(73)&0.44(0.19-0.69)&0.88(0.8-0.95)&0.32(0.06-0.58)//
0.75
&14(18)/45(73)&0.39(0.14-0.64)&0.89(0.81-0.96)&0.28(0.03-0.53)//
0.8
&14(18)/45(73)&0.39(0.14