# Evaluate UMLS

In [2]:
import pandas as pd
from pathlib import Path

In [116]:
annotations = pd.read_csv(f'../annotations/combined_pd.csv')
patients = annotations[annotations.group_name == 'patients']
# Only keep those where `umls` annotation is not empty
patients = patients[patients.umls.notnull()]
fields_keep = ['pmcid', 'group_name', 'subgroup_name', 'count', 'umls', 'diagnosis']
patients = patients[fields_keep]
outputs = Path('../outputs')

In [117]:
# Add index for later
patients = patients.reset_index()

In [118]:
all_predictions = []
for pred in list(outputs.glob('*umls*')):
    md = pred.stem.split('_')
    if md[0] == 'chunked':
        chunk = True
        prompt = md[1]
        model_name = md[2]
    else:
        chunk = False
        source = md[1]
        prompt = md[2]
        model_name = md[3]
    pred_df = pd.read_csv(pred)

    pred_df['chunk'] = chunk    
    pred_df['source'] = source
    pred_df['prompt'] = prompt
    pred_df['model_name'] = model_name

    all_predictions.append(pred_df)



In [103]:
# For each PMCID, get the best match for each UMLS code, without replacement (i.e. only one match per UMLS code)
# If no match is found, then the UMLS code is not matched

def match_umls_codes(umls_predicitons, annot_patients):
    annot_patients = annot_patients.copy()


    for pmcid, group in annot_patients.groupby('pmcid'):
        group_preds = umls_predicitons[umls_predicitons.pmcid == pmcid]

        if len(group_preds) == 0:
            continue

        used_group_ixs = []
        for row in group.itertuples():
            if pd.isnull(row.umls):
                continue

            correct = [c.strip() for c in row.umls.split(', ')]
            
            matches = group_preds[(group_preds['umls_cui'].isin(correct)) & (group_preds['group_ix'].isin(group) == False)]
            if len(matches) > 0:
                m =  matches[matches['umls_prob'] == matches['umls_prob'].max()].iloc[0]
                used_group_ixs.append(m['group_ix'])
                
                annot_patients.at[row.Index, 'umls_match'] = 1
                annot_patients.at[row.Index, 'umls_prob'] = m['umls_prob']
                annot_patients.at[row.Index, 'diagnosis_pred'] = m['diagnosis']
            else:
                annot_patients.at[row.Index, 'umls_match'] = 0


    annot_patients['source'] = umls_predicitons['source'].iloc[0]
    annot_patients['prompt'] = umls_predicitons['prompt'].iloc[0]
    annot_patients['model_name'] = umls_predicitons['model_name'].iloc[0]
    annot_patients['chunk'] = umls_predicitons['chunk'].iloc[0]

    return annot_patients



In [104]:
# For each prediction file, get the best match for each UMLS code, without replacement (i.e. only one match per UMLS code)
matches = []
for pred_df in all_predictions:
    matches.append(match_umls_codes(pred_df, patients))

matches = pd.concat(matches)

summary = matches.groupby(['source', 'prompt', 'model_name', 'chunk']).agg(match_n=('umls_match','sum'), 
                                            match_mean_accuracy= ('umls_match','mean'),
                                            mean_umls_prob=('umls_prob','mean')).reset_index()
    

In [165]:
summary.sort_values(['chunk', 'source'])

Unnamed: 0,source,prompt,model_name,chunk,match_n,match_mean_accuracy,mean_umls_prob
0,html,demographics-zeroshot,gpt-4o-mini-2024-07-18,False,43.0,0.623188,0.937813
2,md,demographics-zeroshot,gpt-4o-2024-05-13,False,94.0,0.657343,0.934063
4,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False,96.0,0.680851,0.930933
1,md,demographics-fewshot,gpt-4o-2024-05-13,True,88.0,0.765217,0.954726
3,md,demographics-zeroshot,gpt-4o-2024-05-13,True,90.0,0.756303,0.955219


In summary, HTML extraction did the worst, as it had a high rate of no prediction at all for diagnosis type.
One culprit is that abbreviation / parsing of HTML is not working in spacy.

For Markdown extraction, all approaches did reasonable well
Chunked approaches had a slightly higher rate of null_prediction, but within the ones that were extracted (perhaps easier cases), accuracy was higher
However, using the full text resulted in the highest total *number of matches, suprirsingly using GPT 4o mini model!

### Dischordant examples

Excluding HTML and few shot prompts

In [134]:

matches_ex = matches[matches.source != 'html']
matches_ex = matches_ex[matches_ex.prompt != 'demographics-fewshot']

all_predictions_ex = pd.concat(all_predictions)
all_predictions_ex = all_predictions_ex[all_predictions_ex.source != 'html']
all_predictions_ex = all_predictions_ex[all_predictions_ex.prompt != 'demographics-fewshot']

In [141]:
# Groupby pmcid and only show examples where umls_match is not the same for all rows

for pmcid, group in matches_ex.groupby('pmcid'):
    if len(group.umls_match.unique()) > 1:
        display(group[['pmcid', 'umls', 'umls_match', 'umls_prob', 'diagnosis', 'diagnosis_pred', 'source', 'prompt', 'model_name', 'chunk']])

Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
38,3742334,C4518790,0.0,,chronic marijuana use,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
38,3742334,C4518790,,,chronic marijuana use,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
38,3742334,C4518790,0.0,,chronic marijuana use,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
142,4215530,C3897192,0.0,,Very preterm birth\n,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
142,4215530,C3897192,,,Very preterm birth\n,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
142,4215530,C3897192,0.0,,Very preterm birth\n,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
36,4473263,C0586325,0.0,,chronic left-hemisphere stroke,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
36,4473263,C0586325,0.0,,chronic left-hemisphere stroke,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
36,4473263,C0586325,,,chronic left-hemisphere stroke,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
0,4522562,C4049190,0.0,,thermo-coagulation lesions in the right parah...,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
37,4522562,C0457949,0.0,,chronic lumbar back-pain,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
53,4522562,C0014544,1.0,1.0,epilepsy,epilepsy,md,demographics-zeroshot,gpt-4o-2024-05-13,False
110,4522562,C1096063,0.0,,refractory epilepsy,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
131,4522562,C4049190,0.0,,thermo-coagulation lesions in the left hippoca...,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
132,4522562,C4049190,0.0,,thermo-coagulation lesions in the left parahip...,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
133,4522562,C4049190,0.0,,thermo-coagulation lesions in the right hippoc...,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
0,4522562,C4049190,0.0,,thermo-coagulation lesions in the right parah...,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
37,4522562,C0457949,0.0,,chronic lumbar back-pain,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
53,4522562,C0014544,1.0,0.816453,epilepsy,Epilepsy control,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
47,4589842,"C2362914, C1269683, C0006012",0.0,,Depression,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
47,4589842,"C2362914, C1269683, C0006012",0.0,,Depression,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
47,4589842,"C2362914, C1269683, C0006012",1.0,0.822259,Depression,"depression (2 major depressive disorder, 9 bip...",md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
54,4732188,C0270736,1.0,1.0,Essential tremor,Essential Tremor,md,demographics-zeroshot,gpt-4o-2024-05-13,False
54,4732188,C0270736,1.0,1.0,Essential tremor,Essential tremor,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
54,4732188,C0270736,,,Essential tremor,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
76,4983635,C1269683,0.0,,major depressive disorder,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
76,4983635,C1269683,0.0,,major depressive disorder,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
76,4983635,C1269683,1.0,1.0,major depressive disorder,major depressive disorder,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
61,4990879,C0019337,0.0,,heroin addiction,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
61,4990879,C0019337,1.0,1.0,heroin addiction,heroin addiction,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
61,4990879,C0019337,0.0,,heroin addiction,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
57,5339238,C0162316),0.0,,former iron deficiency anemia,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
57,5339238,C0162316),0.0,,former iron deficiency anemia,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
57,5339238,C0162316),,,former iron deficiency anemia,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
44,5371603,C0011053,,,deafness,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
44,5371603,C0011053,0.0,,deafness,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
44,5371603,C0011053,,,deafness,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
139,5416685,C0041696,1.0,0.824893,unipolar major depression,unipolar major depression,md,demographics-zeroshot,gpt-4o-2024-05-13,False
140,5416685,C0041696,1.0,0.824893,unipolar major depression,unipolar major depression,md,demographics-zeroshot,gpt-4o-2024-05-13,False
139,5416685,C0041696,1.0,0.824893,unipolar major depression,unipolar major depression,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
140,5416685,C0041696,1.0,0.824893,unipolar major depression,unipolar major depression,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
139,5416685,C0041696,,,unipolar major depression,,md,demographics-zeroshot,gpt-4o-2024-05-13,True
140,5416685,C0041696,,,unipolar major depression,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
52,5598991,C1269683,0.0,,early adult onset depression,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
71,5598991,C1269683,0.0,,later adult onset depression,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
52,5598991,C1269683,0.0,,early adult onset depression,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
71,5598991,C1269683,0.0,,later adult onset depression,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
52,5598991,C1269683,1.0,1.0,early adult onset depression,Major depressive disorder,md,demographics-zeroshot,gpt-4o-2024-05-13,True
71,5598991,C1269683,1.0,1.0,later adult onset depression,Major depressive disorder,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
68,5665859,C0948008,0.0,,ischemic stroke,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
68,5665859,C0948008,0.0,,ischemic stroke,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
68,5665859,C0948008,1.0,1.0,ischemic stroke,ischemic stroke,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
109,6290711,C0948238,0.0,,Psychogenic erectile dysfunction,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
109,6290711,C0948238,0.0,,Psychogenic erectile dysfunction,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
109,6290711,C0948238,,,Psychogenic erectile dysfunction,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
59,6509414,C3887873,1.0,1.0,hearing loss,hearing loss,md,demographics-zeroshot,gpt-4o-2024-05-13,False
59,6509414,C3887873,0.0,,hearing loss,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
59,6509414,C3887873,0.0,,hearing loss,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
40,6667657,C0040264,0.0,,chronic tinnitus,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
40,6667657,C0040264,1.0,1.0,chronic tinnitus,tinnitus,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
40,6667657,C0040264,1.0,0.689254,chronic tinnitus,problematic tinnitus,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
99,6678781,C0524400,0.0,,Pedophilic Child Sexual Offenders,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
99,6678781,C0524400,,,Pedophilic Child Sexual Offenders,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
99,6678781,C0524400,0.0,,Pedophilic Child Sexual Offenders,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
128,6699415,C0017638,0.0,,supratentorial gliomas,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
128,6699415,C0017638,0.0,,supratentorial gliomas,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
128,6699415,C0017638,,,supratentorial gliomas,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
2,7275020,C3508472,1.0,1.0,acute mild traumatic brain injury,mild traumatic brain injury,md,demographics-zeroshot,gpt-4o-2024-05-13,False
2,7275020,C3508472,1.0,1.0,acute mild traumatic brain injury,mild traumatic brain injury,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
2,7275020,C3508472,,,acute mild traumatic brain injury,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
103,7518235,"C0267167, C1269683",1.0,0.58426,post-prandial distress subtype functional dysp...,Functional Dyspepsia-PDS with comorbid Major D...,md,demographics-zeroshot,gpt-4o-2024-05-13,False
104,7518235,C0267167,0.0,,post-prandial distress subtype functional dysp...,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
103,7518235,"C0267167, C1269683",1.0,0.60415,post-prandial distress subtype functional dysp...,Functional Dyspepsia-PDS without Major Depress...,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
104,7518235,C0267167,0.0,,post-prandial distress subtype functional dysp...,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
103,7518235,"C0267167, C1269683",0.0,,post-prandial distress subtype functional dysp...,,md,demographics-zeroshot,gpt-4o-2024-05-13,True
104,7518235,C0267167,0.0,,post-prandial distress subtype functional dysp...,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
8,7838677,C5394908,0.0,,amnestic mild cognitive impairment,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
83,7838677,C0002395,1.0,0.617026,mild Alzheimer's dementia,amnestic mild cognitive impairment (amnestic m...,md,demographics-zeroshot,gpt-4o-2024-05-13,False
8,7838677,C5394908,0.0,,amnestic mild cognitive impairment,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
83,7838677,C0002395,1.0,0.836751,mild Alzheimer's dementia,Alzheimer's dementia of mild degree,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
8,7838677,C5394908,0.0,,amnestic mild cognitive impairment,,md,demographics-zeroshot,gpt-4o-2024-05-13,True
83,7838677,C0002395,1.0,0.611661,mild Alzheimer's dementia,amnestic mild cognitive impairment/mild Alzhei...,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
119,8550949,C0036341,1.0,0.792832,schizophrenia,schizophrenia or schizoaffective disorder,md,demographics-zeroshot,gpt-4o-2024-05-13,False
119,8550949,C0036341,1.0,0.792832,schizophrenia,schizophrenia or schizoaffective disorder,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
119,8550949,C0036341,0.0,,schizophrenia,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
12,8785614,C5680049,1.0,0.801475,arterial ischemic stroke,arterial ischemic stroke (arterial ischemic st...,md,demographics-zeroshot,gpt-4o-2024-05-13,False
100,8785614,C1300444,0.0,,periventricular venous infarction,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
12,8785614,C5680049,1.0,0.860808,arterial ischemic stroke,arterial ischemic stroke,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
100,8785614,C1300444,0.0,,periventricular venous infarction,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
12,8785614,C5680049,,,arterial ischemic stroke,,md,demographics-zeroshot,gpt-4o-2024-05-13,True
100,8785614,C1300444,,,periventricular venous infarction,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
130,8933317,C0014556,1.0,0.608328,Temporal lobe epilepsy,Temporal lobe epilepsy with memory intact (mem...,md,demographics-zeroshot,gpt-4o-2024-05-13,False
130,8933317,C0014556,0.0,,Temporal lobe epilepsy,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
130,8933317,C0014556,0.0,,Temporal lobe epilepsy,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
73,8933759,C3266633,0.0,,Left Mesial Temporal Lobe Epilepsy,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
112,8933759,C3266633,0.0,,Right Mesial Temporal Lobe Epilepsy,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
73,8933759,C3266633,0.0,,Left Mesial Temporal Lobe Epilepsy,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
112,8933759,C3266633,0.0,,Right Mesial Temporal Lobe Epilepsy,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
73,8933759,C3266633,,,Left Mesial Temporal Lobe Epilepsy,,md,demographics-zeroshot,gpt-4o-2024-05-13,True
112,8933759,C3266633,,,Right Mesial Temporal Lobe Epilepsy,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
58,8978988,C1853926,0.0,,GNE myopathy with congenital thrombocytopenia,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
58,8978988,C1853926,0.0,,GNE myopathy with congenital thrombocytopenia,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
58,8978988,C1853926,1.0,1.0,GNE myopathy with congenital thrombocytopenia,GNE myopathy,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
91,9230060,C0699726,0.0,,offenders,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
91,9230060,C0699726,0.0,,offenders,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
91,9230060,C0699726,,,offenders,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
27,9407088,C1510586,0.0,,autism spectrum disorder,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
28,9407088,C1510586,0.0,,autism spectrum disorder,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
27,9407088,C1510586,1.0,0.680409,autism spectrum disorder,autism spectrum disorder patients and normal c...,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
28,9407088,C1510586,1.0,0.680409,autism spectrum disorder,autism spectrum disorder patients and normal c...,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
27,9407088,C1510586,1.0,1.0,autism spectrum disorder,autism spectrum disorder,md,demographics-zeroshot,gpt-4o-2024-05-13,True
28,9407088,C1510586,1.0,1.0,autism spectrum disorder,autism spectrum disorder,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
122,9435010,C0036341,0.0,,schizophrenia,,md,demographics-zeroshot,gpt-4o-2024-05-13,False
122,9435010,C0036341,0.0,,schizophrenia,,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
122,9435010,C0036341,,,schizophrenia,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
32,10870473,C0006012,1.0,1.0,BPD,borderline personality disorder,md,demographics-zeroshot,gpt-4o-2024-05-13,False
32,10870473,C0006012,1.0,1.0,BPD,borderline personality disorder,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
32,10870473,C0006012,,,BPD,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
87,10958407,C2349426,1.0,0.948853,NDPH,new daily persistent headache (new daily persi...,md,demographics-zeroshot,gpt-4o-2024-05-13,False
87,10958407,C2349426,1.0,1.0,NDPH,new daily persistent headache,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
87,10958407,C2349426,,,NDPH,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


Unnamed: 0,pmcid,umls,umls_match,umls_prob,diagnosis,diagnosis_pred,source,prompt,model_name,chunk
4,11024046,C0002395,1.0,1.0,AD,Alzheimer's disease,md,demographics-zeroshot,gpt-4o-2024-05-13,False
4,11024046,C0002395,1.0,1.0,AD,Alzheimer's disease,md,demographics-zeroshot,gpt-4o-mini-2024-07-18,False
4,11024046,C0002395,,,AD,,md,demographics-zeroshot,gpt-4o-2024-05-13,True


In [163]:
all_predictions_ex[all_predictions_ex.pmcid == 9435010]

Unnamed: 0,pmcid,diagnosis,umls_cui,umls_name,umls_prob,count,group_ix,start_char,end_char,chunk,source,prompt,model_name
169,9435010,schizophrenia with auditory verbal hallucinati...,C0233762,auditory hallucinations,0.752388,50.0,68,,,False,md,demographics-zeroshot,gpt-4o-2024-05-13
163,9435010,auditory verbal hallucinations (auditory verba...,C0233762,auditory hallucinations,0.80907,50.0,68,,,False,md,demographics-zeroshot,gpt-4o-mini-2024-07-18
164,9435010,without auditory verbal hallucination (NAVH),C0233762,AUDITORY HALLUCINATION,0.629468,50.0,69,,,False,md,demographics-zeroshot,gpt-4o-mini-2024-07-18


In [158]:
# This is an example of chunked approach doing better
all_predictions_ex[all_predictions_ex.pmcid == 5416685]

Unnamed: 0,pmcid,diagnosis,umls_cui,umls_name,umls_prob,count,group_ix,start_char,end_char,chunk,source,prompt,model_name
102,5416685,unipolar major depression,C0041696,Unipolar Depression,0.824893,20.0,37,,,False,md,demographics-zeroshot,gpt-4o-2024-05-13
103,5416685,unipolar major depression,C1269683,major depression,0.803354,20.0,37,,,False,md,demographics-zeroshot,gpt-4o-2024-05-13
104,5416685,unipolar major depression,C0005587,depression bipolar,0.658109,20.0,37,,,False,md,demographics-zeroshot,gpt-4o-2024-05-13
105,5416685,unipolar major depression,C0041696,Unipolar Depression,0.824893,19.0,39,,,False,md,demographics-zeroshot,gpt-4o-2024-05-13
106,5416685,unipolar major depression,C1269683,major depression,0.803354,19.0,39,,,False,md,demographics-zeroshot,gpt-4o-2024-05-13
107,5416685,unipolar major depression,C0005587,depression bipolar,0.658109,19.0,39,,,False,md,demographics-zeroshot,gpt-4o-2024-05-13
100,5416685,unipolar major depression,C0041696,Unipolar Depression,0.824893,20.0,39,,,False,md,demographics-zeroshot,gpt-4o-mini-2024-07-18
101,5416685,unipolar major depression,C1269683,major depression,0.803354,20.0,39,,,False,md,demographics-zeroshot,gpt-4o-mini-2024-07-18
102,5416685,unipolar major depression,C0005587,depression bipolar,0.658109,20.0,39,,,False,md,demographics-zeroshot,gpt-4o-mini-2024-07-18
103,5416685,unipolar major depression,C0041696,Unipolar Depression,0.824893,19.0,40,,,False,md,demographics-zeroshot,gpt-4o-mini-2024-07-18


## Summary
Sometimes, the chunked approach outperforms the full text approach by chance, and sometimes it does so because it's more likely to use the specific terminology used in the paper
(e.g. Major Depressive Disorded (MDD) vs a more general term (severe depression)). Or a more specific description of the population (e.g. ischemic stroke) vs the topic studies (depression in stroke patients)


However, the chunked approach is more likely to not make a diagnosis prediction overall, and sometimes miss abbreviations.

One additional approach we should try is Abstract and Title + Chunking. This might yield the best overall performance. 
We should also evaluate GPT 4o mini on chunked version.

Or to run full text extraction only when no response is given for diagnosis?