# Map semantic category to PIO

In [21]:
from os import listdir
from os.path import isfile, join
import pandas as pd
import ast
import json

from sklearn.metrics import f1_score, classification_report, recall_score, confusion_matrix

In [39]:
# Directory whose files are listed and read (tab-separated) by the cells below.
# The commented-out line is an alternative input directory; swap the two lines to switch.
#indir_path = '/mnt/nas2/results/Results/systematicReview/distant_pico/semantic_to_target/training_ebm_candidate_generation'
indir_path = '/mnt/nas2/results/Results/systematicReview/distant_pico/semantic_to_target/val_studytype_candidate_generation'

In [46]:
# Names of the regular files (not sub-directories) found directly inside indir_path.
data_files = list(filter(lambda entry: isfile(join(indir_path, entry)), listdir(indir_path)))

In [47]:
# Sanity check: number of files found in indir_path.
len(data_files)

127

In [48]:
# Maps a name derived from each file name -> flat list of integer labels read from that file.
semantic_type_labels = dict()

for eachSemantic in data_files:
    read_umls = f'{indir_path}/{eachSemantic}'
    sem_lf_data = pd.read_csv(read_umls, sep='\t')

    # Each cell of the `labels` column is the string form of a list; parse every row and flatten.
    labels_f = [ int(lab) for value in sem_lf_data['labels'] for lab in ast.literal_eval(value) ]

    # Strip only the trailing '.tsv' and the leading 'lf_'. A plain str.replace would also
    # delete those substrings from the middle of a name (e.g. '..Self_..' contains 'lf_').
    sty_name = str(eachSemantic)
    if sty_name.endswith('.tsv'):
        sty_name = sty_name[:-len('.tsv')]
    if sty_name.startswith('lf_'):
        sty_name = sty_name[len('lf_'):]

    semantic_type_labels[sty_name] = labels_f

In [49]:
# Report how many entries ended up in semantic_type_labels (one key per distinct derived name).
print( 'Number of semantic type LFs loaded: ', len(semantic_type_labels) )

Number of semantic type LFs loaded:  127


### What semantic types are suitable for an entity?

In [10]:
# Peek at the first listed file name; getHandLabelled() builds its input path from this entry.
data_files[0]

'lf_Language.tsv'

In [11]:
# Echo the input directory currently bound to indir_path.
indir_path

'/mnt/nas2/results/Results/systematicReview/distant_pico/semantic_to_target/training_ebm_candidate_generation'

In [42]:
def getHandLabelled(picos, reverse:bool, infile=None):
    """Return the hand-labelled token labels of one column as a flat binary list.

    Parameters
    ----------
    picos : str
        Column to read: one of 'p', 'i', 'o', 's'.
    reverse : bool
        If true, emit 1 where the stored label equals 0 and 0 elsewhere.
        Otherwise emit 1 where the stored label is greater than 0 and 0 elsewhere.
    infile : str or file-like, optional
        Tab-separated input accepted by ``pd.read_csv``. When omitted, the first
        entry of the module-level ``data_files`` inside ``indir_path`` is read.

    Returns
    -------
    list of int
        One 0/1 value per label, rows concatenated in file order.

    Raises
    ------
    ValueError
        If ``picos`` is not one of the supported column names.
    """
    if picos not in ('p', 'i', 'o', 's'):
        raise ValueError(f"picos must be one of 'p', 'i', 'o', 's'; got {picos!r}")

    if infile is None:
        infile = f'{indir_path}/{data_files[0]}'
    data_df = pd.read_csv(infile, sep='\t')

    # Read from the frame loaded above rather than from whatever frame a previous
    # cell happened to leave in the global namespace.
    # Each cell is the string form of a list of labels; parse every row and flatten.
    labels_f = [ int(lab) for value in data_df[picos] for lab in ast.literal_eval(value) ]

    if reverse:
        # Positive class = tokens whose stored label is exactly 0.
        labels_f = [ 1 if l == 0 else 0 for l in labels_f ]
    else:
        # Positive class = tokens with any label greater than 0.
        labels_f = [ 1 if l > 0 else 0 for l in labels_f ]

    return labels_f

In [43]:
# `reverse` selects the binarisation applied by getHandLabelled (False: label > 0 -> 1).
reverse = False
#entity = 'p'
#labels_gt_p = getHandLabelled('p', reverse = reverse)
#labels_gt_i = getHandLabelled('i', reverse = reverse)
#labels_gt_o = getHandLabelled('o', reverse = reverse)
# NOTE(review): labels_gt_p / labels_gt_i / labels_gt_o are referenced by later cells but are
# only defined when the commented-out lines above are enabled; on a fresh kernel those cells
# would raise NameError.
labels_gt_s = getHandLabelled('s', reverse = reverse)

In [31]:
def getHandLabelled_studytype(reverse:bool, infile=None):
    """Load study-type tokens and binarised labels from a JSON file as flat lists.

    Parameters
    ----------
    reverse : bool
        If true, emit 1 where the stored label equals 0 and 0 elsewhere.
        Otherwise emit 1 where the stored label is greater than 0 and 0 elsewhere.
    infile : str, optional
        Path of the JSON file. It must hold a mapping whose values each carry a
        'tokens' list and a 'studytype_fine' list. Defaults to the validation file
        used by this notebook.

    Returns
    -------
    tuple(list, list of int)
        ``(tokens, labels)``, concatenated over all entries in file order.
    """
    if infile is None:
        infile = '/mnt/nas2/data/systematicReview/PICO_datasets/StudyType/val_studytype.json'

    # Only the load needs the file handle; release it before processing.
    with open(infile, 'r') as rf:
        data_s = json.load(rf)

    flattened_labs = []
    flattened_toks = []

    for data in data_s.values():
        flattened_labs.extend( data['studytype_fine'] )
        flattened_toks.extend( data['tokens'] )

    if reverse:
        flattened_labs = [ 1 if l == 0 else 0 for l in flattened_labs ]
    else:
        flattened_labs = [ 1 if l > 0 else 0 for l in flattened_labs ]

    return flattened_toks, flattened_labs

In [32]:
#tokens_gt_s , labels_gt_s = getHandLabelled_studytype(reverse = reverse)

In [44]:
# Number of 1s in each binarised ground-truth label list.
# NOTE(review): labels_gt_p / labels_gt_i / labels_gt_o are only defined if their
# getHandLabelled calls (commented out in an earlier cell) have been run.
print(sum(labels_gt_p))
print(sum(labels_gt_i))
print(sum(labels_gt_o))
print(sum(labels_gt_s))

164591
125960
155722
4650


In [45]:
# Length of the flattened label list stored under the 'Language' key.
len(semantic_type_labels['Language'])

211870

In [50]:
def getMetrics(sem_labs, gt):
    """Score every label list in ``sem_labs`` against ``gt``.

    For each key the binary F1 and recall (positive label 1) are computed and
    scaled to percentages; one ``key  - -  recall`` line is printed per key.

    Returns a DataFrame with columns 'STY' (the keys), 'f1' and 'R'.
    """
    sty_names = list(sem_labs.keys())
    f1_pct = []
    recall_pct = []

    for sty_name in sty_names:
        predicted = sem_labs[sty_name]

        # F1 as a percentage
        f1_pct.append( f1_score(gt, predicted, average='binary', pos_label=1) * 100 )

        # Recall as a percentage
        sty_recall = recall_score(gt, predicted, average='binary', pos_label=1) * 100
        recall_pct.append( sty_recall )

        print(sty_name, ' - - ', sty_recall)

    # One score of each kind per key.
    assert len(recall_pct) == len(f1_pct) == len(sty_names)

    return pd.DataFrame({'STY': sty_names, 'f1': f1_pct, 'R': recall_pct})

In [122]:
# Per-key F1 / recall against labels_gt_p; getMetrics also prints one recall line per key.
metrics_df_p = getMetrics(semantic_type_labels, labels_gt_p)

Language  - -  0.1458159923689631
Anatomical_Structure  - -  0.07047772964499881
Patient_or_Disabled_Group  - -  4.510574697279924
Organism_Function  - -  0.5376964718605514
Body_Location_or_Region  - -  1.1890079044419197
Archaeon  - -  0.007898366253318833
Chemical_Viewed_Functionally  - -  0.057111263677843874
Mammal  - -  0.2849487517543487
Nucleic_Acid_Nucleoside_or_Nucleotide  - -  0.047390197519913
Disease_or_Syndrome  - -  6.606679587583768
Organism_Attribute  - -  0.8366192562169256
Spatial_Concept  - -  2.8185016191650822
Medical_Device  - -  0.9405131507798118
Element_Ion_or_Isotope  - -  0.05285829723374911
Injury_or_Poisoning  - -  0.5091408400216294
Inorganic_Chemical  - -  0.12272846024387724
Food  - -  0.20961048903038443
Social_Behavior  - -  0.3062135839748224
Occupational_Activity  - -  0.23512828769495295
Fully_Formed_Anatomical_Structure  - -  0.06075666348706795
Laboratory_Procedure  - -  1.014028713599164
Group  - -  0.8894775534506747
Clinical_Drug  - -  0.03584

In [123]:
# Per-key F1 / recall against labels_gt_i; getMetrics also prints one recall line per key.
metrics_df_i = getMetrics(semantic_type_labels, labels_gt_i)

Language  - -  0.055573197840584315
Anatomical_Structure  - -  0.046046363925055574
Patient_or_Disabled_Group  - -  0.1635439822165767
Organism_Function  - -  0.5517624642743728
Body_Location_or_Region  - -  0.6176563988567799
Archaeon  - -  0.009526833915528738
Chemical_Viewed_Functionally  - -  0.2683391552873928
Mammal  - -  0.10717688154969832
Nucleic_Acid_Nucleoside_or_Nucleotide  - -  0.3493172435693871
Disease_or_Syndrome  - -  1.13369323594792
Organism_Attribute  - -  0.16275007939028263
Spatial_Concept  - -  2.8318513813909174
Medical_Device  - -  2.5103207367418228
Element_Ion_or_Isotope  - -  0.747856462369006
Injury_or_Poisoning  - -  0.4660209590346142
Inorganic_Chemical  - -  1.0741505239758653
Food  - -  1.1051127342013336
Social_Behavior  - -  0.49857097491267066
Occupational_Activity  - -  0.39933312162591295
Fully_Formed_Anatomical_Structure  - -  0.034137821530644645
Laboratory_Procedure  - -  4.693553509050492
Group  - -  0.7280088917116545
Clinical_Drug  - -  0.451

In [124]:
# Per-key F1 / recall against labels_gt_o; getMetrics also prints one recall line per key.
metrics_df_o = getMetrics(semantic_type_labels, labels_gt_o)

Language  - -  0.0841242727424513
Anatomical_Structure  - -  0.13614004443816546
Patient_or_Disabled_Group  - -  0.2607210284995055
Organism_Function  - -  2.1474165500057794
Body_Location_or_Region  - -  0.9915105123232426
Archaeon  - -  0.0012843400418694854
Chemical_Viewed_Functionally  - -  0.046878411528236213
Mammal  - -  0.041098881339823534
Nucleic_Acid_Nucleoside_or_Nucleotide  - -  0.08990380293086397
Disease_or_Syndrome  - -  4.957552561616214
Organism_Attribute  - -  1.4410295269775626
Spatial_Concept  - -  3.510743504450238
Medical_Device  - -  1.467358497835887
Element_Ion_or_Isotope  - -  0.2780596190647436
Injury_or_Poisoning  - -  0.701249662860739
Inorganic_Chemical  - -  0.28191263919035203
Food  - -  0.38530201256084556
Social_Behavior  - -  0.9870153221766995
Occupational_Activity  - -  0.391723712770193
Fully_Formed_Anatomical_Structure  - -  0.1367822144591002
Laboratory_Procedure  - -  4.8926933895018045
Group  - -  0.19008232619668383
Clinical_Drug  - -  0.0321

In [51]:
# Per-key F1 / recall against labels_gt_s; getMetrics also prints one recall line per key.
metrics_df_s = getMetrics(semantic_type_labels, labels_gt_s)

Language  - -  0.0
Anatomical_Structure  - -  0.0
Patient_or_Disabled_Group  - -  4.537634408602151
Organism_Function  - -  0.06451612903225806
Body_Location_or_Region  - -  0.0
Archaeon  - -  0.0
Chemical_Viewed_Functionally  - -  0.0
Mammal  - -  0.0
Nucleic_Acid_Nucleoside_or_Nucleotide  - -  0.0
Disease_or_Syndrome  - -  4.602150537634408
Organism_Attribute  - -  0.0
Spatial_Concept  - -  1.3548387096774193
Medical_Device  - -  0.0
Element_Ion_or_Isotope  - -  0.0
Injury_or_Poisoning  - -  0.0
Inorganic_Chemical  - -  0.021505376344086023
Food  - -  0.0
Social_Behavior  - -  0.27956989247311825
Occupational_Activity  - -  0.043010752688172046
Fully_Formed_Anatomical_Structure  - -  0.0
Laboratory_Procedure  - -  0.27956989247311825
Group  - -  4.881720430107527
Clinical_Drug  - -  0.0
Nucleotide_Sequence  - -  0.0
Age_Group  - -  0.0
Amino_Acid_Peptide_or_Protein  - -  0.021505376344086023
Hazardous_or_Poisonous_Substance  - -  0.0
Tissue  - -  0.0
Manufactured_Object  - -  0.96774

In [125]:
# New frame ordered by recall, highest first; metrics_df_p itself is left untouched.
metrics_df_p_sorted = metrics_df_p.sort_values('R', ascending=False)

In [126]:
# New frame ordered by recall, highest first; metrics_df_i itself is left untouched.
metrics_df_i_sorted = metrics_df_i.sort_values('R', ascending=False)

In [127]:
# New frame ordered by recall, highest first; metrics_df_o itself is left untouched.
metrics_df_o_sorted = metrics_df_o.sort_values('R', ascending=False)

In [52]:
# New frame ordered by recall, highest first; metrics_df_s itself is left untouched.
metrics_df_s_sorted = metrics_df_s.sort_values('R', ascending=False)

In [53]:
def writeMetrics(picos, df, filename, writedir='/mnt/nas2/results/Results/systematicReview/distant_pico/semantic_to_target'):
    """Write ``df`` as a tab-separated file named ``<filename>_<picos>.tsv``.

    Parameters
    ----------
    picos : str
        Tag appended to the file name (e.g. 'p', 'i', 'o', 's').
    df : pandas.DataFrame
        Table to write; the index is written as the first column.
    filename : str
        File name stem, without the tag or extension.
    writedir : str, optional
        Target directory; defaults to the results directory used by this notebook.
    """
    # The output name never depended on the global `reverse` flag (both branches of the
    # former conditional were identical), so the conditional and the global lookup are gone.
    out_name = f'{filename}_{picos}.tsv'
    df.to_csv(f'{writedir}/{out_name}', sep='\t')

    
# Persist the sorted 's' metric table; the p/i/o variants below are currently disabled.
#writeMetrics('p', metrics_df_p_sorted, 'sty_metrics')
#writeMetrics('i', metrics_df_i_sorted, 'sty_metrics')
#writeMetrics('o', metrics_df_o_sorted, 'sty_metrics')
writeMetrics('s', metrics_df_s_sorted, 'sty_metrics')

In [None]:
# Measure the cumulative gain in recall as semantic types are added one at a time, in descending order of their individual recall. Merging more semantic types increases the number of entity tokens "seen" by the label model.

In [128]:
# Display the 'o' metrics, highest recall first.
metrics_df_o_sorted

Unnamed: 0,STY,f1,R
81,Finding,21.636211,25.403604
120,Intellectual_Product,19.879014,20.427428
83,Qualitative_Concept,13.950256,12.244256
92,Functional_Concept,13.105904,12.207010
55,Quantitative_Concept,15.012789,12.004726
...,...,...,...
115,Carbohydrate_Sequence,0.000000,0.000000
96,Experimental_Model_of_Disease,0.000000,0.000000
79,Vertebrate,0.000000,0.000000
75,Professional_Society,0.000000,0.000000


In [129]:
# Display the 'i' metrics, highest recall first.
metrics_df_i_sorted

Unnamed: 0,STY,f1,R
32,Pharmacologic_Substance,23.947819,16.876786
66,Organic_Chemical,20.487305,13.337567
54,Therapeutic_or_Preventive_Procedure,17.467852,12.650048
81,Finding,9.401867,12.536520
92,Functional_Concept,10.275775,10.618450
...,...,...,...
50,Reptile,0.004762,0.002382
115,Carbohydrate_Sequence,0.000000,0.000000
96,Experimental_Model_of_Disease,0.000000,0.000000
79,Vertebrate,0.000000,0.000000


In [160]:
# Display the 'p' metrics, highest recall first.
metrics_df_p_sorted

Unnamed: 0,STY,f1,R
81,Finding,19.323282,21.986014
120,Intellectual_Product,16.151168,16.137577
92,Functional_Concept,14.063127,12.771658
71,Idea_or_Concept,12.750300,10.021812
55,Quantitative_Concept,10.664961,8.355864
...,...,...,...
99,Environmental_Effect_of_Humans,0.002430,0.001215
41,Drug_Delivery_Device,0.002430,0.001215
115,Carbohydrate_Sequence,0.000000,0.000000
79,Vertebrate,0.000000,0.000000


In [54]:
# Display the 's' metrics, highest recall first.
metrics_df_s_sorted

Unnamed: 0,STY,f1,R
42,Research_Activity,36.648415,30.946237
83,Qualitative_Concept,9.806978,17.591398
120,Intellectual_Product,6.889232,15.354839
92,Functional_Concept,5.676588,11.935484
54,Therapeutic_or_Preventive_Procedure,8.761252,10.989247
...,...,...,...
47,Receptor,0.000000,0.000000
46,Cell_Component,0.000000,0.000000
45,Diagnostic_Procedure,0.000000,0.000000
43,Amphibian,0.000000,0.000000


In [155]:
# Cumulative recall / F1 on the 'p' labels as label lists are OR-ed in,
# following the row order of metrics_df_p_sorted.
merged_score_p_r = {}
merged_score_p_f1 = {}

# Running union of the positive predictions merged so far.
merged_p = [0] * len(labels_gt_p)

for sty in list(metrics_df_p_sorted.STY):

    sty_labels = semantic_type_labels[sty]

    # In-place OR: a position flips to 1 once running value + new label reaches 1.
    for position, lf_label in zip(range(len(merged_p)), sty_labels):
        if merged_p[position] + lf_label >= 1:
            merged_p[position] = 1

    merged_score_p_r[sty] = recall_score(labels_gt_p, merged_p, average='binary', pos_label=1) * 100
    print( sty, merged_score_p_r[sty] )

    merged_score_p_f1[sty] = f1_score(labels_gt_p, merged_p, average='binary', pos_label=1) * 100

Finding 21.986013816065277
Intellectual_Product 26.817383696556917
Functional_Concept 28.923209653018695
Idea_or_Concept 29.997995030104928
Quantitative_Concept 32.1287312185964
Clinical_Attribute 32.67493362334514
Qualitative_Concept 35.361593282743286
Disease_or_Syndrome 37.16181322186511
Body_Substance 37.266922249697735
Therapeutic_or_Preventive_Procedure 38.39578105728746
Patient_or_Disabled_Group 38.50089008512009
Temporal_Concept 39.186225249254214
Population_Group 40.6784089044966
Organism 40.68448457084531
Neoplastic_Process 41.216712942992025
Health_Care_Activity 41.36981973497944
Conceptual_Entity 41.442120164529044
Pharmacologic_Substance 42.38081061540424
Body_Part_Organ_or_Organ_Component 42.77512136143532
Spatial_Concept 43.2769714018385
Mental_or_Behavioral_Dysfunction 43.5856152523528
Manufactured_Object 43.76120200983043
Organic_Chemical 43.91673906835732
Research_Activity 44.03156916234788
Age_Group 44.07592152669344
Sign_or_Symptom 44.195004587128096
Classification 

In [156]:
# Cumulative recall / F1 on the 'i' labels as label lists are OR-ed in,
# following the row order of metrics_df_i_sorted.
merged_score_i_r = {}
merged_score_i_f1 = {}

# Running union of the positive predictions merged so far.
merged_i = [0] * len(labels_gt_i)

for sty in list(metrics_df_i_sorted.STY):

    sty_labels = semantic_type_labels[sty]

    # In-place OR: a position flips to 1 once running value + new label reaches 1.
    for position, lf_label in zip(range(len(merged_i)), sty_labels):
        if merged_i[position] + lf_label >= 1:
            merged_i[position] = 1

    merged_score_i_r[sty] = recall_score(labels_gt_i, merged_i, average='binary', pos_label=1) * 100
    print( sty, merged_score_i_r[sty] )

    merged_score_i_f1[sty] = f1_score(labels_gt_i, merged_i, average='binary', pos_label=1) * 100

Pharmacologic_Substance 16.876786281359163
Organic_Chemical 18.159733248650365
Therapeutic_or_Preventive_Procedure 29.509368053350272
Finding 35.963798031120994
Functional_Concept 39.085423944109245
Intellectual_Product 41.64734836456018
Qualitative_Concept 43.12956494125119
Health_Care_Activity 43.59241028898062
Laboratory_Procedure 43.967132422991426
Idea_or_Concept 44.53318513813909
Temporal_Concept 45.07780247697682
Quantitative_Concept 45.784375992378536
Biomedical_or_Dental_Material 46.076532232454745
Conceptual_Entity 46.18450301683074
Amino_Acid_Peptide_or_Protein 46.36233724992061
Research_Activity 46.46157510320737
Spatial_Concept 47.16656081295649
Manufactured_Object 47.62623054938076
Medical_Device 48.017624642743726
Clinical_Attribute 48.12003810733566
Biologically_Active_Substance 48.214512543664654
Gene_or_Genome 48.28040647824706
Diagnostic_Procedure 48.52016513178787
Body_Part_Organ_or_Organ_Component 48.75516036837091
Educational_Activity 48.79247380120673
Body_Substa

In [158]:
# Cumulative recall / F1 on the 'o' labels as label lists are OR-ed in,
# following the row order of metrics_df_o_sorted.
merged_score_o_r = {}
merged_score_o_f1 = {}

# Running union of the positive predictions merged so far.
merged_o = [0] * len(labels_gt_o)

for sty in list(metrics_df_o_sorted.STY):

    sty_labels = semantic_type_labels[sty]

    # In-place OR: a position flips to 1 once running value + new label reaches 1.
    for position, lf_label in zip(range(len(merged_o)), sty_labels):
        if merged_o[position] + lf_label >= 1:
            merged_o[position] = 1

    merged_score_o_r[sty] = recall_score(labels_gt_o, merged_o, average='binary', pos_label=1) * 100
    print( sty, merged_score_o_r[sty] )

    merged_score_o_f1[sty] = f1_score(labels_gt_o, merged_o, average='binary', pos_label=1) * 100

Finding 25.403603858157485
Intellectual_Product 34.31435506864797
Qualitative_Concept 38.22645483618243
Functional_Concept 42.08846534208397
Quantitative_Concept 44.82924699143345
Idea_or_Concept 45.83552741423819
Clinical_Attribute 46.714016002876924
Pharmacologic_Substance 48.61355492480189
Temporal_Concept 49.57552561616214
Disease_or_Syndrome 50.474563645470774
Laboratory_Procedure 51.34213534375361
Health_Care_Activity 51.61505760265088
Mental_Process 52.10760200870782
Therapeutic_or_Preventive_Procedure 52.62454887556029
Sign_or_Symptom 52.87178433362016
Organic_Chemical 53.04517023927255
Pathologic_Function 53.28534182710214
Body_Part_Organ_or_Organ_Component 54.04567113188888
Activity 54.42455144424038
Spatial_Concept 55.00314663310258
Diagnostic_Procedure 55.21955793015759
Body_Substance 55.36276184482604
Conceptual_Entity 55.48284763874083
Organism_Function 55.61641900309526
Physiologic_Function 55.66907694481191
Amino_Acid_Peptide_or_Protein 55.87264484144823
Manufactured_Ob

In [55]:
# Cumulative recall / F1 on the 's' labels as label lists are OR-ed in,
# following the row order of metrics_df_s_sorted.
merged_score_s_r = {}
merged_score_s_f1 = {}

# Running union of the positive predictions merged so far.
merged_s = [0] * len(labels_gt_s)

for sty in list(metrics_df_s_sorted.STY):

    sty_labels = semantic_type_labels[sty]

    # In-place OR: a position flips to 1 once running value + new label reaches 1.
    for position, lf_label in zip(range(len(merged_s)), sty_labels):
        if merged_s[position] + lf_label >= 1:
            merged_s[position] = 1

    merged_score_s_r[sty] = recall_score(labels_gt_s, merged_s, average='binary', pos_label=1) * 100
    print( sty, merged_score_s_r[sty] )

    merged_score_s_f1[sty] = f1_score(labels_gt_s, merged_s, average='binary', pos_label=1) * 100

Research_Activity 30.946236559139784
Qualitative_Concept 40.58064516129033
Intellectual_Product 49.01075268817204
Functional_Concept 54.236559139784944
Therapeutic_or_Preventive_Procedure 64.9247311827957
Biomedical_or_Dental_Material 64.9247311827957
Idea_or_Concept 64.98924731182795
Classification 65.46236559139786
Group 65.46236559139786
Disease_or_Syndrome 69.3763440860215
Conceptual_Entity 69.44086021505377
Patient_or_Disabled_Group 69.44086021505377
Regulation_or_Law 69.44086021505377
Population_Group 69.48387096774194
Finding 70.64516129032258
Temporal_Concept 70.75268817204301
Spatial_Concept 71.78494623655915
Quantitative_Concept 72.10752688172043
Cell_Function 72.10752688172043
Manufactured_Object 72.10752688172043
Professional_or_Occupational_Group 72.34408602150538
Health_Care_Activity 72.34408602150538
Genetic_Function 72.6236559139785
Laboratory_Procedure 72.6236559139785
Social_Behavior 72.88172043010752
Occupation_or_Discipline 73.01075268817205
Substance 73.11827956989

In [159]:
# Write the cumulative recall and F1 gain tables for 'p', 'i' and 'o', in that order.
for picos_tag, recall_gain, f1_gain in (
    ('p', merged_score_p_r, merged_score_p_f1),
    ('i', merged_score_i_r, merged_score_i_f1),
    ('o', merged_score_o_r, merged_score_o_f1),
):
    writeMetrics(picos_tag, pd.DataFrame(recall_gain.items()), 'metrics_gain_r')
    writeMetrics(picos_tag, pd.DataFrame(f1_gain.items()), 'metrics_gain_f1')

In [56]:
# Write the cumulative recall table, then the F1 table, for 's'.
for metric_tag, gain_scores in (('r', merged_score_s_r), ('f1', merged_score_s_f1)):
    writeMetrics('s', pd.DataFrame(gain_scores.items()), f'metrics_gain_{metric_tag}')