# Qualitative Assessment of Models



In [19]:
import  pandas as pd
from IPython.display import display, HTML

In [20]:
# Hand-made annotations used as the reference; the 'Unnamed: 0' column
# (presumably an index saved with the CSV) is dropped.
annotations = pd.read_csv('annotations/jerome_pd.csv').drop(columns=['Unnamed: 0'])

# Text chunks the models were prompted with; read only the columns used below
# instead of the whole parquet file.
embeddings = pd.read_parquet(
    'outputs/eval_embeddings_minc-40_maxc-4000.parquet',
    columns=['content', 'start_char', 'end_char', 'pmcid', 'section_0', 'section_1'],
)

# Few-shot demographic predictions, one CSV per model
ff = pd.read_csv(
    'outputs/eval_demographics-fewshot_firefunction-v1_minc-40_maxc-4000_clean.csv'
)
gpt_4_o = pd.read_csv(
    'outputs/eval_demographics-fewshot_gpt-4o-2024-05-13_minc-40_maxc-4000_clean.csv'
)
gpt_4 = pd.read_csv(
    'outputs/eval_demographics-fewshot_gpt-4-0125-preview_minc-40_maxc-4000_clean.csv'
)

In [21]:
def _get_corr_groups(annotations, predictions):
    annotations = annotations.copy()
    annotations = annotations[annotations.pmcid.isin(predictions.pmcid.unique())]
    pred_n_groups = predictions.groupby('pmcid').size()
    n_groups = annotations.groupby('pmcid').size()
    return (n_groups == pred_n_groups)

In [22]:
# Per article: did the model extract as many groups as were annotated?
ff_corr_groups = _get_corr_groups(annotations, ff)
gpt_4_corr_groups = _get_corr_groups(annotations, gpt_4)

# PMC IDs for which both models extracted the annotated number of groups
both_right = (gpt_4_corr_groups & ff_corr_groups).loc[lambda flags: flags].index


In [58]:
def _print_context(pmcid, annotation, predictions, embeddings):
    """Show one article's source text, model predictions and annotation together.

    For every distinct ``start_char`` among the article's predictions, prints
    the matching text chunk from ``embeddings`` followed by the predictions
    made from that chunk; finally displays the annotation rows for the article.

    Parameters
    ----------
    pmcid : int
        Article identifier to look up in all three frames.
    annotation : pandas.DataFrame
        Reference annotations with a ``pmcid`` column and the demographic columns.
    predictions : pandas.DataFrame
        Model predictions with ``pmcid``, ``start_char``, the demographic
        columns and ``imaging_sample``; ``assesment_type`` is shown if present.
    embeddings : pandas.DataFrame
        Text chunks with ``pmcid``, ``start_char`` and ``content`` columns.

    Returns
    -------
    None
        Output is printed / displayed only.
    """
    print("PMC ID: ", pmcid)
    demographic_cols = ['count', 'diagnosis', 'group_name', 'subgroup_name',
       'male_count', 'female_count', 'age_mean', 'age_minimum', 'age_maximum',
       'age_median']

    pred_cols = demographic_cols + ['imaging_sample']

    # 'assesment_type' (sic, spelled as in the data) is not in every predictions frame.
    if 'assesment_type' in predictions.columns:
        pred_cols.append('assesment_type')

    article_annotation = annotation.loc[annotation.pmcid == pmcid, demographic_cols]
    article_preds = predictions[predictions.pmcid == pmcid]
    # Filter the chunks once per article instead of once per context.
    article_chunks = embeddings[embeddings.pmcid == pmcid]

    for start_char, chunk_preds in article_preds.groupby('start_char'):
        print("Context: ")
        chunk_text = article_chunks.loc[article_chunks.start_char == start_char, 'content']
        if chunk_text.empty:
            # A missing chunk should not abort the whole report (e.g. when
            # looping over many articles); show the predictions regardless.
            print(f"<no text chunk found for pmcid={pmcid}, start_char={start_char}>")
        else:
            print(chunk_text.iloc[0])

        print("Predictions: ")
        display(chunk_preds[pred_cols])

    print("Annotation: ")
    display(article_annotation)

In [24]:
# Stack the three models' predictions; `keys`/`names` add an outer index
# level called 'model' that labels which model each row came from.
combined = pd.concat(
    [gpt_4_o, gpt_4, ff],
    keys=['gpt-4o', 'gpt4-turbo', 'firefunction-v1'],
    names=['model'],
)

In [25]:
combined

Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_range,age_minimum,age_maximum,age_median,imaging_sample,pmcid,rank,start_char,end_char
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
gpt-4o,0,15.0,,healthy,,5.0,10.0,31.60,22-54,22.0,54.0,,yes,7981326,0.0,9579,10553
gpt-4o,1,23.0,,healthy,participants,18.0,5.0,222.00,,,,,no,4330553,0.0,10707,11145
gpt-4o,2,16.0,,healthy,,9.0,7.0,,18-35,18.0,35.0,,no,3183226,0.0,28053,28467
gpt-4o,3,15.0,,healthy,fMRI,8.0,7.0,22.21,,,,,yes,6195265,0.0,13097,13950
gpt-4o,4,45.0,,healthy,participants,18.0,27.0,24.49,,,,,yes,5428836,0.0,26550,27302
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
firefunction-v1,529,15.0,TSC,patients,reduction,7.0,8.0,43.50,20-60,20.0,60.0,43.0,yes,9308181,,5201,8942
firefunction-v1,530,15.0,TSC,patients,total,7.0,8.0,44.50,20-60,20.0,60.0,44.0,yes,9308181,,5201,8942
firefunction-v1,531,15.0,TSC,patients,volume,7.0,8.0,45.50,20-60,20.0,60.0,45.0,yes,9308181,,5201,8942
firefunction-v1,532,15.0,TSC,patients,target AML,7.0,8.0,46.50,20-60,20.0,60.0,46.0,yes,9308181,,5201,8942


In [26]:
_print_context(both_right[2], annotations, combined, embeddings)

PMC ID:  2525845
Context: 

## Material and methods 
  
### Subjects 
  
Thirty-four healthy, right-handed, heterosexual German subjects (age-range 23–45 years) participated in the study after giving their informed consent. The study was approved by the local Institutional Ethical Review Board and was performed in accordance with the ethical standards laid down in the 1964 Declaration of Helsinki. Men were examined once, women twice in pseudo-randomized order—once 1–3 days after onset of menses (early follicular phase) and once in the midluteal phase adjusted for cycle length (individual cycle length −7 days with a range of ±2 days). Only pre-menopausal women with stable cycle length were included and any form of hormonal treatment was excluded. The phase of the menstrual cycle was confirmed by sex steroid hormone analysis (for details see below). None of the subjects had a history of a serious medical disease, or any neurological or psychiatric illness. Twelve females and 12 males wer

Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
gpt-4o,316,24.0,,healthy,final analysis,12.0,12.0,,23.0,45.0,,yes
gpt4-turbo,312,24.0,,healthy,final analysis,12.0,12.0,,23.0,45.0,,yes
firefunction-v1,359,34.0,,healthy,all,12.0,12.0,2345.0,23.0,45.0,2345.0,no


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
4,24,,healthy,_,12.0,12.0,,,,


In this example, FireFunction does well, but it is not able to recognize that the final sample was 12 male + 12 female subjects = 24. 
GPT-4, impressively, is able to extract this complex annotation.

In [27]:
_print_context(both_right[3], annotations, combined, embeddings)

PMC ID:  2648877
Context: 

## Methods 
  
### Participants 
  
Thirty-three individuals participated in the study: 15 participants with Autism Spectrum Disorder (12 males; 3 females) and 18 non-autistic control participants (13 males; 5 females). Groups were matched on age (ASD   M  : 38 years, SD: 13; control   M  : 32 years, SD: 8;   t  (31) = 1.6,   p   = .13), and IQ (ASD   M  : 119, SD: 14; control   M  : 119, SD: 11;   t  (31) = 0.1,   p   = .93). Full-scale IQ was measured using the Wechsler Adult Intelligence Scale 3rd UK Edition ( ), apart from one control participant for whom IQ was estimated from the National Adult Reading Test (NART;  ). All participants in the ASD group had previously received a diagnosis from an independent clinician according to standard criteria. The Autism Diagnostic Observational Schedule-Generic (ADOS-G,  ), was used to characterise the participants’ level of current functioning. This measure was chosen because all participants were adults; it was t

Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
gpt-4o,133,15.0,Autism Spectrum Disorder,patients,ASD,12.0,3.0,38.0,,,,no
gpt-4o,134,18.0,,healthy,control,13.0,5.0,32.0,,,,no
gpt4-turbo,128,15.0,Autism Spectrum Disorder,patients,ASD,12.0,3.0,38.0,,,,no
gpt4-turbo,129,18.0,,healthy,control,13.0,5.0,32.0,,,,no
firefunction-v1,140,33.0,Autism Spectrum Disorder,patients,ASD,12.0,3.0,38.0,38.0,38.0,38.0,yes
firefunction-v1,141,18.0,,healthy,control,13.0,5.0,32.0,32.0,32.0,32.0,yes


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
7,18,,healthy,_,13.0,5.0,32.0,,,
8,15,Autism Spectrum Disorder,patients,_,12.0,3.0,38.0,,,


Here, FireFunction mistakes the total sample size (33) for the size of the Autism Spectrum Disorder subgroup (15).

In [28]:
_print_context(both_right[4], annotations, combined, embeddings)

PMC ID:  2775905
Context: 

## Materials and methods 
  
### Participants 
  
Twenty-two participants (13M, 9F) took part in the study although four (2M, 2F) were subsequently excluded due to: i) excessive head motion (> 10 mm), ii) an unexpected brain abnormality, iii) chance level performance in the scanner, and iv) an error acquiring the scanning data. The 18 remaining participants were right-handed, native speakers of British English (mean 26.7 years, median 22.5 years, range 18–60 years) without any history of neurological or psychiatric disease. The behavioural and neuroimaging data from the older participant (the one 60 year-old) did not differ qualitatively from the younger participants and therefore was included in all analyses. None had any form of oral or written language impairment or any previous experience with time-compressed speech. None of the participants reported any hearing difficulties, but were not audiometrically screened. In-scanner preliminary testing revealed 

Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
gpt-4o,73,18.0,,healthy,,11.0,7.0,26.7,18.0,60.0,22.5,yes
gpt4-turbo,68,18.0,,healthy,,11.0,7.0,26.7,18.0,60.0,22.5,yes
firefunction-v1,119,22.0,,healthy,participants,13.0,9.0,26.7,18.0,60.0,22.0,no


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
10,18,,healthy,_,,,26.7,18.0,60.0,22.5


Here, FireFunction gets the correct *total* sample size, but not the imaging sample size. 

In [29]:
_print_context(both_right[7], annotations, combined, embeddings)

PMC ID:  3002948
Context: 

## Materials and Methods 
  
### Participants 
  
Fifteen participants were phobic prone (PP) (6 females; mean age 39.2; standard deviation [SD] 7.4) and 15 were eating disorders prone (EDP) (5 females; mean age 34.4; standard deviation [SD] 8.65). The couples enrolled had been together in a committed relationship for the last three years and had been living together for at least one year. To assign the participants to a group, they were assessed with a semi-structured interview  –  and the Personality Meaning Questionnaire (PMQ)   one month before the scanning session. Concordance between the two investigators was 100%. As in our previous study  – , the semi-structured interview was administered independently by two trained investigators who were blind to each other's results. The aim of the semi-structured interview was to assess the key themes characterizing different affective-cognitive styles in the matter of emotional activation, duration and regulatio

Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
gpt-4o,372,15.0,phobic prone,patients,PP,9.0,6.0,39.2,,,,yes
gpt-4o,373,15.0,eating disorders prone,patients,EDP,10.0,5.0,34.4,,,,yes
gpt4-turbo,375,15.0,phobic prone,patients,PP,9.0,6.0,39.2,,,,yes
gpt4-turbo,376,15.0,eating disorders prone,patients,EDP,10.0,5.0,34.4,,,,yes
firefunction-v1,438,15.0,phobic prone,patients,phobic prone,6.0,9.0,39.2,32.0,46.0,39.0,no
firefunction-v1,439,15.0,eating disorders prone,patients,eating disorders prone,5.0,10.0,34.4,26.0,42.0,34.0,no


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
15,15,,healthy,phobic prone,9.0,6.0,39.2,,,
16,15,,healthy,eating disorders prone,10.0,5.0,34.4,,,


This is a fairly impressive result for both models. They got the correct sample size for each subgroup.

In general, in a large number of examples, both models extract the correct information.

The column `imaging_sample` is often wrong for short contexts. It only seems correct when there is both a behavioral and a non-behavioral sample.

In [30]:
_print_context(both_right[17], annotations, combined, embeddings)

PMC ID:  3422286
Context: 

## Methods 
  
### Participants 
  
Twelve subjects (six females) with a mean age of 26 years (range 23–39 years) participated in the first experiment, 14 in the second (seven females, mean age 23 years, range 24–30 years). The participants gave informed written consent to the study, which adhered to the Declaration of Helsinki and was approved by the human subjects committee of the Otto-von-Guericke University Magdeburg. 

All subjects were right-handed as assessed by the Edinburgh Handedness Inventory  . 


Predictions: 


Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
gpt-4o,5,12.0,,healthy,first experiment,6.0,6.0,26.0,23.0,39.0,,no
gpt-4o,6,14.0,,healthy,second experiment,7.0,7.0,23.0,24.0,30.0,,no
gpt4-turbo,9,12.0,,healthy,first experiment,6.0,6.0,26.0,23.0,39.0,,no
gpt4-turbo,10,14.0,,healthy,second experiment,7.0,7.0,23.0,24.0,30.0,,no
firefunction-v1,8,12.0,,healthy,first experiment,6.0,6.0,26.0,23.0,39.0,26.0,no
firefunction-v1,9,14.0,,healthy,second experiment,7.0,7.0,23.0,24.0,30.0,23.0,no


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
30,14,,healthy,study 2,7.0,7.0,23.0,24.0,30.0,
31,12,,healthy,study 1,6.0,6.0,26.0,23.0,39.0,


In [31]:
_print_context(9564100, annotations, combined, embeddings)

PMC ID:  9564100
Context: 

## Materials and Methods 
  
### Participants. 
  
We recruited healthy, young participants (872 females, 546 males, mean age = 22.39 y, SD = 3.27). Advertising was done mainly at the University of Basel and in local newspapers. The participants were free of any neurological or psychiatric illness, did not take any medication at the time of the experiment (except hormonal contraceptives), and were between ages 18 and 35 y. Physical and mental health was assessed based on standard questionnaires. The experiment was approved by the ethics committee of the Canton of Basel, Switzerland. All participants gave written informed consent before participating in the study. Prior to the analysis, the sample was divided into a discovery sample (  n   = 945, 2/3 of all participants) and a replication sample (  n   = 473, 1/3 of all participants) by randomly assigning participants to one of the samples. Randomization was performed using the Matlab function randperm. There

Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
gpt-4o,39,1418.0,,healthy,overall,546.0,872.0,22.39,18.0,35.0,,no
gpt-4o,40,945.0,,healthy,discovery,,,,,,,no
gpt-4o,41,473.0,,healthy,replication,,,,,,,no
gpt4-turbo,78,1418.0,,healthy,total,546.0,872.0,22.39,18.0,35.0,,no
gpt4-turbo,79,945.0,,healthy,discovery,,,,,,,no
gpt4-turbo,80,473.0,,healthy,replication,,,,,,,no
firefunction-v1,34,1418.0,,healthy,discovery and replication samples,546.0,872.0,22.39,18.0,35.0,22.0,no


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
423,1418,,healthy,_,546.0,872.0,22.39,18.0,35.0,


In this example, FF actually got closer, because GPT additionally extracted the discovery sample and the replication sample, both of which are subsamples of the total. 
Perhaps I can modify the instruction for "imaging_sample" to say "FINAL total".

Alternatively, both extractions are relatively useful, if incomplete.

## Worst examples

For FireFunction model

In [32]:
# Mean group size per article, for FireFunction and for the annotations.
# Selecting the column before aggregating avoids a Python-level
# `groupby.apply` lambda, which is slower and emits a DeprecationWarning
# on recent pandas.
ff_mean = ff.groupby('pmcid')['count'].mean()
annot_mean = annotations.groupby('pmcid')['count'].mean()

# Absolute relative error of the mean group size, largest first.
# Articles missing from either side give NaN, which sort_values puts last.
((ff_mean - annot_mean) / annot_mean).abs().sort_values(ascending=False).head(10)

pmcid
5832413    57.687500
3423412     2.000000
7004957     1.941176
5598991     1.387931
3701149     1.221277
3913832     1.129032
7395771     1.000000
8782893     1.000000
3554651     1.000000
4048172     1.000000
dtype: float64

In [33]:
_print_context(5832413, annotations, combined, embeddings)

PMC ID:  5832413
Context: 

## Materials and methods 
  
### Participants 
  
#### Behavior-based visual similarity rating task and conceptual feature generation task 
  
A total of 2846 individuals completed online behavioral tasks using Amazon’s Mechanical Turk ( ). Data from 61 participants were discarded due to technical errors, incomplete submissions, or missed catch trials. Of the remaining 2785 participants, 1185 completed the visual similarity rating task (616 males, 569 females; age range = 18–53; mean age = 30.1), and 1600 completed the semantic feature generation task (852 males, 748 females; age range = 18–58 years; mean age = 31.7). These sample sizes are proportionally in line with those reported by  . Individuals who completed the visual similarity rating task were excluded from completing the feature generation task, and vice versa. All participants provided informed consent and were compensated for their time. Both online tasks were approved by the University of Toront

Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
gpt-4o,201,1185.0,,healthy,visual similarity rating task,616.0,569.0,30.1,18.0,53.0,,no
gpt-4o,202,1600.0,,healthy,semantic feature generation task,852.0,748.0,31.7,18.0,58.0,,no
gpt-4o,203,16.0,,healthy,fMRI task,6.0,10.0,23.1,19.0,29.0,,yes
gpt4-turbo,201,2785.0,,healthy,behavior-based visual similarity rating task a...,1468.0,1317.0,30.9,18.0,58.0,,no
gpt4-turbo,202,16.0,,healthy,brain-based fMRI task,6.0,10.0,23.1,19.0,29.0,,yes
firefunction-v1,226,2785.0,,healthy,behavioral,616.0,569.0,30.1,18.0,53.0,30.0,no
firefunction-v1,227,16.0,,healthy,fMRI,10.0,6.0,23.1,19.0,29.0,23.0,yes
firefunction-v1,228,16.0,,healthy,fMRI,10.0,6.0,23.1,19.0,29.0,23.0,yes


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
214,16,,healthy,_,6.0,10.0,23.1,19.0,29.0,


Actually, in this example, both models do a good job of separating the `imaging_sample` from the behavioral sample. FF double-extracts the sample size, however. 

Could add a rule that if two extracted rows are identical across all columns, the duplicate is removed. This could accidentally remove genuinely distinct groups that happen to have identical values, however.

In [34]:
_print_context(3423412, annotations, combined, embeddings)

PMC ID:  3423412
Context: 

## Materials and Methods 
  
### Subjects 
  
Thirty-six right-handed subjects participated in this study after giving written informed consent, including 14 patients with AD, 8 patients with MCI and 14 healthy controls. This study was approved by the Medical Research Ethics Committee of Xuanwu Hospital. The AD and MCI subjects were recruited from patients who had consulted the memory clinic at Xuanwu Hospital for memory complaints. The healthy elderly controls were recruited from the local community. 

All AD patients underwent a complete physical and neurological examination, standard laboratory tests and an extensive battery of neuropsychological assessments. The diagnosis of AD fulfilled the Diagnostic and Statistical Manual of Mental Disorders 4th Edition criteria for dementia (American Psychiatric Association, 1994), and the National Institute of Neurological and Communicative Disorders and Stroke/Alzheimer Disease and Related Disorders Association (NI

Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
gpt-4o,324,14.0,AD,patients,AD,,,,,,,yes
gpt-4o,325,8.0,MCI,patients,MCI,,,,,,,yes
gpt-4o,326,14.0,,healthy,healthy controls,,,,,,,yes
gpt4-turbo,347,36.0,,healthy,total participants,,,,,,,yes
gpt4-turbo,348,14.0,AD,patients,AD patients,,,,,,,yes
gpt4-turbo,349,8.0,MCI,patients,MCI patients,,,,,,,yes
gpt4-turbo,350,14.0,,healthy,healthy controls,,,,,,,yes
firefunction-v1,380,36.0,"AD, MCI, healthy",patients,AD,14.0,22.0,70.5,60.0,80.0,70.0,yes
firefunction-v1,381,36.0,"AD, MCI, healthy",patients,MCI,8.0,12.0,72.5,65.0,85.0,72.0,yes
firefunction-v1,382,36.0,,healthy,healthy,14.0,22.0,68.5,60.0,80.0,68.0,yes


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
32,8,mild cognitive impairment,patients,MCI,,,,,,
33,14,Alzheimer disease,patients,AD,,,,,,
34,14,,healthy,_,,,,,,


GPT-4 handled this better than FireFunction, by identifying both the overall group (total participants) and the subgroups. 

In [35]:
_print_context(7004957, annotations, combined, embeddings)

PMC ID:  7004957
Context: 

## Procedure and Methods 
  
### Open fMRI Dataset and Preprocessing 
  
The dataset used in the current study, from the open-fMRI repository , consists of 28 healthy younger adults (YA: M  = 24.39 years, age range = 18–34; nine females) and 24 healthy older adults (OA: M  = 66.95 years, age range = 55–75; nine females). There were no significant differences between groups in terms of intellectual ability, as measured by either education or the Wechsler Test of Adult Reading (Wechsler,  ):   M  : YA = 16.85 vs. OA = 16.38 years;   M  : YA = 43.96/50 vs. OA = 39.75/50. 

We used high-resolution structural images for our analyses (MPRAGE; TR = 1,950 ms; TE = 2.26 ms; FA = 7°; 1-mm isotropic voxel; FOV = 256 mm) and five identical task runs of the dataset (142 volumes for each EPI; 41 interleaved 4-mm slices with no gap; TR = 2,000 ms; TE = 25 ms; FA = 90°; matrix size = 64 × 64; FOV = 256), in which participants were exposed to different levels of auditory and

Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
gpt-4o,382,28.0,,healthy,younger adults,19.0,9.0,24.39,18.0,34.0,,yes
gpt-4o,383,23.0,,healthy,older adults,14.0,9.0,66.95,55.0,75.0,,yes
gpt4-turbo,411,28.0,,healthy,younger adults,19.0,9.0,24.39,18.0,34.0,,yes
gpt4-turbo,412,23.0,,healthy,older adults,15.0,9.0,66.95,55.0,75.0,,yes


Context: 

## Ethics Statement 
  
The dataset and code used in this study is from a public data repository. The original data collection procedure involving human participants was reviewed and approved by IRB at the University of Southern California. 


Predictions: 


Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
firefunction-v1,460,100.0,,healthy,subgroup1,50.0,50.0,25.5,20.0,30.0,25.0,yes
firefunction-v1,461,50.0,depression,patients,subgroup2,25.0,25.0,30.5,25.0,35.0,30.0,no
firefunction-v1,462,75.0,anxiety,patients,subgroup3,37.0,38.0,28.5,22.0,32.0,28.0,yes


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
294,28,,healthy,young,19.0,9.0,24.39,18.0,34.0,
295,23,,healthy,old,,,,,,


FF hallucinated results from the wrong context

In [36]:
_print_context(5598991, annotations, combined, embeddings)

PMC ID:  5598991
Context: 

### 1.2 Participants 
  
All patients were recruited from the outpatient clinic or inpatient wards of the Psychiatry Department of the First Affiliated Hospital of Kunming Medical University. Right-handed, 147 MDD patients (50 males and 97 females) met the criteria were recruited in the depression group. The inclusion criteria were as follows: ① The diagnosis of MDD was independently made by two experienced psychiatrists in accordance with the Diagnostic and Statistical Manual of Mental Disorders, fourth edition (DSM-IV, American Psychiatry Association, 1994), ② first episode without a history of antidepressants treatment, ③ be aged between 18–45 years, ④ the total score of 17-item Hamilton Depression Rating Scale (HDRS) was not less than 17, ⑤ right handedness, ⑥ the patients or their legal guardian signed the informed consent form. The exclusion criteria included the following items: ① having a history of Axis I psychiatric disorders. ② having a history of

Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
gpt-4o,159,147.0,MDD,patients,depression,50.0,97.0,,18.0,45.0,,yes
gpt-4o,160,130.0,,healthy,control,49.0,81.0,,,,,yes
gpt4-turbo,190,147.0,MDD,patients,depression,50.0,97.0,,18.0,45.0,,yes
gpt4-turbo,191,130.0,,healthy,control,49.0,81.0,,,,,yes
firefunction-v1,177,147.0,MDD,patients,depression,50.0,97.0,30.5,18.0,45.0,30.0,yes
firefunction-v1,178,130.0,,healthy,control,49.0,81.0,31.5,18.0,45.0,31.0,yes


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
195,52,,healthy,old hc,,,,,,
196,60,,healthy,young hc,,,,,,
197,58,early adult onset depression,patients,eod,,,,,,
198,62,later adult onset depression,patients,lod,,,,,,


Both models did fine, just didn't break groups into finer subsections

In [37]:
_print_context(3701149, annotations, combined, embeddings)


PMC ID:  3701149
Context: 

## Materials and methods 
  
### Participants 
  
Eighty adolescents aged from 12 to 20 years participated in the study. Exclusion criteria included age, the presence of any neurological problem or a diagnosis of schizophrenia or schizoaffective disorder according to DSM-IV-TR criteria. Thirty-two subjects were excluded for head movement exceeding 4.7 mm in any of the 6 directions during the scan sessions (Control group:   N   = 9, AH group:   N   = 6, 22q11.2DS group:   N   = 17). In the Control group, we excluded subjects with maladaptive functioning above the clinical cut-off of the Internalizing and Externalizing scales (  t  -score >64) in the Youth Self-Report and Adult Behavior Checklist (Achenbach,  ,  ) (  N   = 1). After excluding these 33 subjects, the 47 remaining youths were distributed in the following three groups: typically developing adolescents (Control group:   N   = 22), adolescents with transient AHs (AH group:   N   = 12) and adolescent

Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
gpt-4o,335,22.0,,healthy,Control,16.0,6.0,16.0,14.0,18.0,,yes
gpt-4o,336,12.0,transient AHs,patients,AH,5.0,7.0,15.97,13.0,18.0,,yes
gpt-4o,337,13.0,22q11.2 Deletion Syndrome,patients,22q11.2DS,9.0,4.0,16.14,13.0,19.0,,yes
gpt4-turbo,343,22.0,,healthy,Control group,16.0,6.0,16.0,,,,yes
gpt4-turbo,344,12.0,subclinical AHs,patients,AH group,5.0,7.0,15.97,,,,yes
gpt4-turbo,345,13.0,22q11.2 Deletion Syndrome,patients,22q11.2DS group,9.0,4.0,16.14,,,,yes
firefunction-v1,394,80.0,,healthy,all,32.0,48.0,16.0,12.0,20.0,16.0,no
firefunction-v1,395,47.0,,healthy,final,16.0,31.0,16.0,12.0,20.0,16.0,no
firefunction-v1,396,22.0,,healthy,control,16.0,6.0,16.0,12.0,20.0,16.0,no
firefunction-v1,397,12.0,,healthy,ah,5.0,7.0,16.0,12.0,20.0,16.0,no


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
55,12,auditory hallucination,patients,ah,5.0,7.0,15.97,,,
56,22,,healthy,_,16.0,6.0,16.0,,,
57,13,22q11.2 Deletion Syndrome,patients,22q11.2DS,9.0,4.0,16.14,,,


FF hallucinated several groups, and did not annotate diagnosis using the correct column

In [38]:
_print_context(3913832, annotations, combined, embeddings)


PMC ID:  3913832
Context: 

## Methods 
  
### Participants 
  
A total of sixty-six female participants took part in this study. Thirty-one individuals with a current diagnosis of AN according to DSM-IV criteria were recruited from the hospital and community services of the South London and Maudsley (SLaM) National Health Service Trust and from an online advertisement on the b-eat website (Beating Eating Disorders — ), the UK's largest eating disorder charity (inpatients = 9, outpatients = 8, daycare patients = 7, community = 7). Twenty-five (81%) were diagnosed as restrictive (AN-R) and six (19%) as binge-purging (AN-BP). Fourteen (45%) reported taking antidepressant (SSRI = 12, SNRI = 1) or anti-anxiety medication. Thirty-five age-matched healthy individuals with no personal or family history of eating disorders were recruited from the community, staff and students of the Institute of Psychiatry, King's College London. Two healthy participants were excluded from further analysis due

Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
gpt-4o,146,31.0,anorexia nervosa,patients,AN,0.0,31.0,,,,,no
gpt-4o,147,31.0,anorexia nervosa restrictive,patients,AN-R,0.0,25.0,,,,,no
gpt-4o,148,6.0,anorexia nervosa binge-purging,patients,AN-BP,0.0,6.0,,,,,no
gpt-4o,149,31.0,,healthy,HC,0.0,31.0,,,,,no
gpt4-turbo,140,31.0,AN (Anorexia Nervosa),patients,AN patients,0.0,31.0,,,,,no
gpt4-turbo,141,33.0,,healthy,Healthy controls,0.0,33.0,,,,,no
firefunction-v1,153,66.0,AN,patients,female,0.0,66.0,,,,,no


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
79,31,,healthy,_,,31.0,,,,
80,31,anorexia nervosa,patients,_,,31.0,,,,


FF combined the healthy and patient groups into a single group of 66 participants.

# Few Shot 2

In [43]:

# Predictions generated with the second few-shot prompt ("fewshot2"):
# FireFunction-v1 ...
ff2 = pd.read_csv('outputs/eval_demographics-fewshot2_firefunction-v1_minc-40_maxc-4000_clean.csv')
# ... and GPT-4o (2024-05-13)
gpt4o2 = pd.read_csv('outputs/eval_demographics-fewshot2_gpt-4o-2024-05-13_minc-40_maxc-4000_clean.csv')

In [44]:
# Mean reported group size per article: fewshot2 FireFunction predictions vs. manual
# annotations. Selecting the column before aggregating replaces groupby.apply with the
# built-in reduction (same values, no warning about applying over the grouping column).
ff2_mean = ff2.groupby('pmcid')['count'].mean()
annot_mean = annotations.groupby('pmcid')['count'].mean()

# Relative error of the mean group size. The subtraction aligns on pmcid, so an article
# present in only one of the two tables yields NaN; drop those so an undefined error can
# never be listed among the ten worst articles.
rel_error = ((ff2_mean - annot_mean) / annot_mean).abs()
worst = rel_error.dropna().sort_values(ascending=False).head(10)

# Stack both prompts' predictions for both models, labelled by a 'model' index level.
combined2 = pd.concat([gpt_4_o, gpt4o2, ff, ff2], keys=['gpt-4o', 'gpt-4o_fs2', 'ff', 'ff_fs2'], names=['model'])

In [59]:
# Walk through the articles with the largest count error and print, for each one, the
# source passage, every model's predictions and the manual annotation.
for pmcid in worst.index:
    _print_context(pmcid, annotations, combined2, embeddings)


PMC ID:  5832413
Context: 

## Materials and methods 
  
### Participants 
  
#### Behavior-based visual similarity rating task and conceptual feature generation task 
  
A total of 2846 individuals completed online behavioral tasks using Amazon’s Mechanical Turk ( ). Data from 61 participants were discarded due to technical errors, incomplete submissions, or missed catch trials. Of the remaining 2785 participants, 1185 completed the visual similarity rating task (616 males, 569 females; age range = 18–53; mean age = 30.1), and 1600 completed the semantic feature generation task (852 males, 748 females; age range = 18–58 years; mean age = 31.7). These sample sizes are proportionally in line with those reported by  . Individuals who completed the visual similarity rating task were excluded from completing the feature generation task, and vice versa. All participants provided informed consent and were compensated for their time. Both online tasks were approved by the University of Toront

Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample,assesment_type
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
gpt-4o,201,1185.0,,healthy,visual similarity rating task,616.0,569.0,30.1,18.0,53.0,,no,
gpt-4o,202,1600.0,,healthy,semantic feature generation task,852.0,748.0,31.7,18.0,58.0,,no,
gpt-4o,203,16.0,,healthy,fMRI task,6.0,10.0,23.1,19.0,29.0,,yes,
gpt-4o_fs2,196,1185.0,,healthy,visual similarity rating task,616.0,569.0,30.1,18.0,53.0,,,behavioral
gpt-4o_fs2,197,1600.0,,healthy,semantic feature generation task,852.0,748.0,31.7,18.0,58.0,,,behavioral
gpt-4o_fs2,198,16.0,,healthy,,6.0,10.0,23.1,19.0,29.0,,,imaging
ff,226,2785.0,,healthy,behavioral,616.0,569.0,30.1,18.0,53.0,30.0,no,
ff,227,16.0,,healthy,fMRI,10.0,6.0,23.1,19.0,29.0,23.0,yes,
ff,228,16.0,,healthy,fMRI,10.0,6.0,23.1,19.0,29.0,23.0,yes,
ff_fs2,185,2785.0,healthy,healthy,behavioral,616.0,569.0,30.1,18.0,53.0,30.0,,behavioral


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
214,16,,healthy,_,6.0,10.0,23.1,19.0,29.0,


PMC ID:  3423412
Context: 

## Materials and Methods 
  
### Subjects 
  
Thirty-six right-handed subjects participated in this study after giving written informed consent, including 14 patients with AD, 8 patients with MCI and 14 healthy controls. This study was approved by the Medical Research Ethics Committee of Xuanwu Hospital. The AD and MCI subjects were recruited from patients who had consulted the memory clinic at Xuanwu Hospital for memory complaints. The healthy elderly controls were recruited from the local community. 

All AD patients underwent a complete physical and neurological examination, standard laboratory tests and an extensive battery of neuropsychological assessments. The diagnosis of AD fulfilled the Diagnostic and Statistical Manual of Mental Disorders 4th Edition criteria for dementia (American Psychiatric Association, 1994), and the National Institute of Neurological and Communicative Disorders and Stroke/Alzheimer Disease and Related Disorders Association (NI

Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample,assesment_type
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
gpt-4o,324,14.0,AD,patients,AD,,,,,,,yes,
gpt-4o,325,8.0,MCI,patients,MCI,,,,,,,yes,
gpt-4o,326,14.0,,healthy,healthy controls,,,,,,,yes,
gpt-4o_fs2,318,14.0,AD,patients,,,,,,,,,behavioral
gpt-4o_fs2,319,8.0,MCI,patients,,,,,,,,,behavioral
gpt-4o_fs2,320,14.0,,healthy,,,,,,,,,behavioral
ff,380,36.0,"AD, MCI, healthy",patients,AD,14.0,22.0,70.5,60.0,80.0,70.0,yes,
ff,381,36.0,"AD, MCI, healthy",patients,MCI,8.0,12.0,72.5,65.0,85.0,72.0,yes,
ff,382,36.0,,healthy,healthy,14.0,22.0,68.5,60.0,80.0,68.0,yes,
ff_fs2,300,36.0,"AD, MCI, healthy",patients,AD,14.0,0.0,70.5,60.0,80.0,70.0,,imaging


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
32,8,mild cognitive impairment,patients,MCI,,,,,,
33,14,Alzheimer disease,patients,AD,,,,,,
34,14,,healthy,_,,,,,,


PMC ID:  6024199
Context: 

## Materials & methods 
  
### Participants 
  
Fifty-seven participants (35 women, mean age = 21.9 years, range = 18–37 years) completed the scan session. Participants gave written informed consent for a protocol approved by the Duke University Institutional Review Board and the Duke University Medical Center Institutional Review Board. All participants had an A1 trauma in which they were exposed to death, threatened death, actual or threatened serious injury, or actual or threatened sexual violence ( ). After excluding participants who did not meet inclusion criteria (see Supplemental materials) there were 21 participants in the PTSD group (15 women, mean age = 21.5 years, range = 18–31 years) and 21 in the trauma-exposed control group (14 women, mean age = 22.1 years, range = 18–37 years). Additional demographic and clinical data are shown in   and described in the Supplemental materials.   
Participant characteristics. 
  Table 1   


Predictions: 


Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample,assesment_type
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
gpt-4o,162,21.0,PTSD,patients,PTSD,6.0,15.0,21.5,18.0,31.0,,yes,
gpt-4o,163,21.0,,healthy,trauma-exposed control,7.0,14.0,22.1,18.0,37.0,,yes,
gpt-4o_fs2,158,21.0,PTSD,patients,,6.0,15.0,21.5,18.0,31.0,,,imaging
gpt-4o_fs2,159,21.0,,healthy,trauma-exposed control,7.0,14.0,22.1,18.0,37.0,,,imaging
ff,182,57.0,,healthy,scan session,22.0,35.0,21.9,18.0,37.0,22.0,yes,
ff,183,21.0,PTSD,patients,scan session,6.0,15.0,21.5,18.0,31.0,21.0,yes,
ff,184,21.0,,healthy,scan session,8.0,13.0,22.1,18.0,37.0,22.0,yes,
ff_fs2,148,57.0,PTSD,patients,PTSD,22.0,35.0,21.9,18.0,37.0,21.0,,imaging


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
227,21,posttraumatic stress disorder,patients,_,6.0,15.0,21.5,18.0,31.0,
228,21,,healthy,_,7.0,14.0,22.1,18.0,37.0,


PMC ID:  7260173
Context: 

## Methods 
  
### Participants 
  
A total of 70 healthy adults (age range 21–50 years, 41 males) participated in a 5-day and 4-night in-laboratory controlled sleep deprivation experiment , including 54 adults in the experimental group with one night of TSD (Fig.  ) and 16 adults in the control group without sleep loss (Fig.  ). Fifteen participants in the TSD group and one participant in the control group were excluded due to head motion, MR hardware problem, falling asleep during the scans, and/or missing behavioral data. Thus, fifty-four participants (age range 21–50 yrs, 28 males) were included in the present study, including 39 in the TSD group (mean age = 33.5 ± 8.8 yrs, 22 males) and 15 in the control group (mean age = 34.5 ± 9.0 yrs, 7 males). The sleep-wake schedule of each participant was assessed by at least one week actigraphy, sleep-wake diaries, sleep and circadian rhythm questionnaires, and a night of laboratory polysomnography and oximetry m

Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample,assesment_type
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
gpt-4o,141,39.0,,healthy,TSD,22.0,17.0,33.5,21.0,50.0,,yes,
gpt-4o,142,15.0,,healthy,control,7.0,8.0,34.5,21.0,50.0,,yes,
gpt-4o_fs2,139,39.0,,healthy,TSD,22.0,17.0,33.5,21.0,50.0,,,behavioral
gpt-4o_fs2,140,15.0,,healthy,control,7.0,8.0,34.5,21.0,50.0,,,behavioral
ff,157,70.0,,healthy,all,41.0,29.0,33.5,21.0,50.0,33.0,no,
ff,158,54.0,,healthy,TSD,22.0,12.0,33.58,21.0,50.0,33.0,yes,
ff,159,15.0,,healthy,control,7.0,8.0,34.5,21.0,50.0,34.0,yes,
ff_fs2,132,70.0,healthy,healthy,all,41.0,29.0,33.58,21.0,50.0,34.0,,other


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
309,39,,healthy,tsd,22.0,17.0,33.5,,,
310,15,,healthy,control,7.0,8.0,34.5,,,


PMC ID:  5598991
Context: 

### 1.2 Participants 
  
All patients were recruited from the outpatient clinic or inpatient wards of the Psychiatry Department of the First Affiliated Hospital of Kunming Medical University. Right-handed, 147 MDD patients (50 males and 97 females) met the criteria were recruited in the depression group. The inclusion criteria were as follows: ① The diagnosis of MDD was independently made by two experienced psychiatrists in accordance with the Diagnostic and Statistical Manual of Mental Disorders, fourth edition (DSM-IV, American Psychiatry Association, 1994), ② first episode without a history of antidepressants treatment, ③ be aged between 18–45 years, ④ the total score of 17-item Hamilton Depression Rating Scale (HDRS) was not less than 17, ⑤ right handedness, ⑥ the patients or their legal guardian signed the informed consent form. The exclusion criteria included the following items: ① having a history of Axis I psychiatric disorders. ② having a history of

Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample,assesment_type
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
gpt-4o,159,147.0,MDD,patients,depression,50.0,97.0,,18.0,45.0,,yes,
gpt-4o,160,130.0,,healthy,control,49.0,81.0,,,,,yes,
gpt-4o_fs2,155,147.0,MDD,patients,,50.0,97.0,,18.0,45.0,,,imaging
gpt-4o_fs2,156,130.0,,healthy,,49.0,81.0,,,,,,imaging
ff,177,147.0,MDD,patients,depression,50.0,97.0,30.5,18.0,45.0,30.0,yes,
ff,178,130.0,,healthy,control,49.0,81.0,31.5,18.0,45.0,31.0,yes,
ff_fs2,145,147.0,MDD,patients,depression,50.0,97.0,26.1,18.0,45.0,26.0,,imaging
ff_fs2,146,130.0,healthy,healthy,control,49.0,81.0,24.7,18.0,45.0,24.0,,imaging


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
195,52,,healthy,old hc,,,,,,
196,60,,healthy,young hc,,,,,,
197,58,early adult onset depression,patients,eod,,,,,,
198,62,later adult onset depression,patients,lod,,,,,,


PMC ID:  3660406
Context: 

### Participants 
  
A group of 38 participants, comprising younger and older adults, took part in the fMRI experiment. Three participants (one younger adult and two older adults) were excluded from statistical analyses after medical examination of their anatomical scans in which structural abnormalities were diagnosed that might have an influence on their functional images. In addition, one younger adult was excluded due to experience in professional modern dance for six years in adolescence. The final sample consisted of 19 younger (14 women, mean age = 22.6±2.27 years, range 18–27) and 15 older adults (10 women, mean age = 61.1±5.68 years, range 51–71),   t  (32) = 24.7,   p  <0.001. The majority of the participants already took part in the behavioral action prediction experiment reported in Diersch et al.  . One younger adult and four older adults were additionally recruited from the participant database of the MPI for Human Cognitive and Brain Sciences,

Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample,assesment_type
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
gpt-4o,152,19.0,,healthy,younger adults,5.0,14.0,22.6,18.0,27.0,,yes,
gpt-4o,153,15.0,,healthy,older adults,5.0,10.0,61.1,51.0,71.0,,yes,
gpt-4o_fs2,148,34.0,,healthy,younger adults,5.0,14.0,22.62,18.0,27.0,,,imaging
gpt-4o_fs2,149,34.0,,healthy,older adults,5.0,10.0,61.15,51.0,71.0,,,imaging
ff,171,38.0,,healthy,fMRI,14.0,24.0,22.62,18.0,27.0,22.0,yes,
ff,172,19.0,,healthy,behavioral,14.0,5.0,22.62,18.0,27.0,22.0,yes,
ff,173,15.0,,healthy,behavioral,10.0,5.0,61.15,51.0,71.0,61.0,yes,
ff_fs2,136,38.0,healthy,healthy,younger adults,5.0,14.0,22.6,18.0,27.0,22.0,,imaging
ff_fs2,137,38.0,healthy,healthy,older adults,5.0,10.0,61.2,51.0,71.0,61.0,,imaging


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
51,19,,healthy,young,5.0,14.0,22.6,18.0,27.0,
52,15,,healthy,old,5.0,10.0,61.1,51.0,71.0,


PMC ID:  7406917
Context: 

## 2. Materials 
  
### 2.1. Subjects 
  
Seven healthy male right-handed subjects aged from 21 to 28 years (23.5 yo ± 2.5) were involved in this study. All subjects gave written informed consent before participating in the study. We have maintained the homogeneity of the population in order to limit the influence of factors, such as gender or age. 


### 2.2. MR Acquisitions 
  
The subjects were scanned on a 3.0T whole body Siemens MR scanner (Magnetom Verio, Siemens Healthcare, Erlangen, Germany) with a 32-channel head coil. A 3D anatomical T1-weighted MP2RAGE image was acquired for each subject. The resting-state ASL imaging was performed using a 2D EPI pseudo-continuous (pCASL) sequence. Subjects were asked to keep their eyes closed, to relax (mind-wandering), to lie still and to not fall asleep. We used common parameters reported in the literature: TR = 3,500 ms, FoV = 224 × 224 mm , TE = 12 ms, LD = 1,500 ms and a 1250 ms post-labeling delay (PLD) at 

Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample,assesment_type
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
gpt-4o,343,7.0,,healthy,,7.0,0.0,23.5,21.0,28.0,,yes,
gpt-4o_fs2,336,7.0,,healthy,right handed,7.0,0.0,23.5,21.0,28.0,,,imaging


Context: 

## Ethics Statement 
  
The studies involving human participants were reviewed and approved by Ethics Committee of the University Hospital of Rennes CHU Rennes Hôtel-Dieu, 2 rue de l'Hôtel-Dieu, CS 26419, 35064 Rennes Cedex, France. The patients/participants provided their written informed consent to participate in this study. 


Predictions: 


Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample,assesment_type
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ff,390,10.0,,healthy,subgroup1,5.0,5.0,25.5,20.0,30.0,25.0,yes,
ff,391,15.0,patients,patients,subgroup2,7.0,8.0,30.5,25.0,35.0,30.0,yes,
ff_fs2,318,15.0,schizophrenia,patients,schizophrenia,5.0,10.0,26.1,21.0,35.0,26.0,,imaging


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
320,7,,healthy,_,7.0,,23.5,21.0,28.0,


PMC ID:  3913832
Context: 

## Methods 
  
### Participants 
  
A total of sixty-six female participants took part in this study. Thirty-one individuals with a current diagnosis of AN according to DSM-IV criteria were recruited from the hospital and community services of the South London and Maudsley (SLaM) National Health Service Trust and from an online advertisement on the b-eat website (Beating Eating Disorders — ), the UK's largest eating disorder charity (inpatients = 9, outpatients = 8, daycare patients = 7, community = 7). Twenty-five (81%) were diagnosed as restrictive (AN-R) and six (19%) as binge-purging (AN-BP). Fourteen (45%) reported taking antidepressant (SSRI = 12, SNRI = 1) or anti-anxiety medication. Thirty-five age-matched healthy individuals with no personal or family history of eating disorders were recruited from the community, staff and students of the Institute of Psychiatry, King's College London. Two healthy participants were excluded from further analysis due

Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample,assesment_type
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
gpt-4o,146,31.0,anorexia nervosa,patients,AN,0.0,31.0,,,,,no,
gpt-4o,147,31.0,anorexia nervosa restrictive,patients,AN-R,0.0,25.0,,,,,no,
gpt-4o,148,6.0,anorexia nervosa binge-purging,patients,AN-BP,0.0,6.0,,,,,no,
gpt-4o,149,31.0,,healthy,HC,0.0,31.0,,,,,no,
gpt-4o_fs2,144,31.0,anorexia nervosa,patients,AN-R,0.0,31.0,,,,,,behavioral
gpt-4o_fs2,145,35.0,,healthy,,0.0,35.0,,,,,,behavioral
ff,153,66.0,AN,patients,female,0.0,66.0,,,,,no,
ff_fs2,134,66.0,AN,patients,female,0.0,66.0,0.0,0.0,0.0,0.0,,other


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
79,31,,healthy,_,,31.0,,,,
80,31,anorexia nervosa,patients,_,,31.0,,,,


PMC ID:  5104469
Context: 

## Materials and Methods 
  
### Participants 
  
Seventy (23 young people with a history of childhood abuse, 20 psychiatric controls, and 27 healthy controls) right-handed, medication-naïve, drug-free and age-matched young people came to the laboratory for two sessions, and those below the age of 18 were accompanied by their guardians on both visits. In the first visit, demographic, clinical and abuse measures and IQ data were collected and the participants were acclimated to the scanner environment with an MRI replica. The fMRI sustained attention task was administered in the MRI scanner on the second visit. All participants were assessed by a child psychiatrist (KM) using the Development and Well-Being Assessment (DAWBA) [ ] designed to generate ICD-10 and DSM-IV psychiatric diagnoses. The Strengths and Difficulties Questionnaires (SDQ) [ ] and Beck’s Depression Inventory (BDI) [ ] were used to provide psychopathology symptom scores. IQ was assessed using

Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample,assesment_type
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
gpt-4o,355,21.0,"PTSD, depression, anxiety, conduct disorder an...",patients,childhood abuse,15.0,6.0,17.5,,,,yes,
gpt-4o,356,20.0,,healthy,psychiatric controls,,,,,,,yes,
gpt-4o,357,27.0,,healthy,healthy controls,,,,,,,yes,
gpt-4o_fs2,348,21.0,"PTSD, depression, anxiety, conduct disorder an...",patients,history of childhood abuse,15.0,6.0,17.5,,,,,imaging
gpt-4o_fs2,349,20.0,,healthy,psychiatric controls,,,,,,,,imaging
gpt-4o_fs2,350,27.0,,healthy,healthy controls,,,,,,,,imaging
ff,422,70.0,,healthy,young people,23.0,20.0,25.5,18.0,30.0,25.0,no,
ff,423,23.0,"PTSD, depression, anxiety, conduct disorder an...",patients,young people,15.0,10.0,17.5,12.0,18.0,17.0,yes,
ff,424,20.0,psychiatric controls,patients,young people,10.0,10.0,25.5,18.0,30.0,25.0,no,
ff_fs2,328,70.0,healthy,healthy,young people,23.0,20.0,24.7,18.0,60.0,24.0,,imaging


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
163,21,Severe Childhood Abuse,patients,severe childhood abuse,15.0,6.0,17.5,,,
164,19,,patients,psychiatric controls,9.0,10.0,16.9,,,
165,27,,healthy,_,21.0,6.0,17.5,,,


PMC ID:  9407088
Context: 

### 3.2. Experiment Results 
  
Taking an 8-year-old girl with autism, numbered 50,795 in the Autism Brain Imaging Data Exchange (ABIDE) dataset, as an example, this paper analyzes the custom template in the form of slices. The fMRI data size of the subject is  . After discarding the data images of the first four time points and eliminating the invalid data whose head movement exceeds the limit at two time points, the final effective data are  , and the size of the final built custom template is  , as shown in  . 

The custom template has individual pertinence, that is, it is constructed by using the subjects’ fMRI sequence. Therefore, the constructed custom template is related to the subjects’ age and brain structure, and the subjects are different from each other.   shows the fMRI data of four example subjects with autism at five different time points and the custom built template in which sub50795 is an 8-year-old girl, sub50625 is a 7-year-old boy, sub51

Unnamed: 0_level_0,Unnamed: 1_level_0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median,imaging_sample,assesment_type
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
gpt-4o,426,4.0,autism,patients,example subjects,3.0,1.0,32.25,7.0,64.0,,yes,
gpt-4o_fs2,425,1.0,autism,patients,sub50795,0.0,1.0,8.0,8.0,8.0,8.0,,imaging
gpt-4o_fs2,426,1.0,autism,patients,sub50625,1.0,0.0,7.0,7.0,7.0,7.0,,imaging
gpt-4o_fs2,427,1.0,autism,patients,sub51581,1.0,0.0,64.0,64.0,64.0,64.0,,imaging
gpt-4o_fs2,428,1.0,autism,patients,sub50526,1.0,0.0,50.0,50.0,50.0,50.0,,imaging
ff,507,1114.0,autism,patients,ABIDE dataset,557.0,557.0,10.5,,18.0,10.0,yes,
ff,508,1.0,autism,patients,custom template,1.0,0.0,8.5,8.0,8.0,8.0,yes,
ff,509,1.0,autism,patients,custom template,0.0,1.0,7.5,7.0,7.0,7.0,yes,
ff_fs2,402,1114.0,autism,patients,autism,557.0,557.0,8.0,8.0,8.0,8.0,,imaging


Annotation: 


Unnamed: 0,count,diagnosis,group_name,subgroup_name,male_count,female_count,age_mean,age_minimum,age_maximum,age_median
415,539,autism spectrum disorder,patients,abide 1,,,,,,
416,521,autism spectrum disorder,patients,abide 2,,,,,,
417,573,,healthy,abide 1,,,,,,
418,593,,healthy,abide 2,,,,,,


In several examples, the fewshot2 prompt combined groups that should be separate, by either repeating the total count for several groups, or just reporting a single group.

Not sure why this happened. We should try changing the prompt so that it only identifies which assessment type the original groups refer to, in order to exclude behavioral samples, and nothing more.

### Summary

In general, GPT-4 does a much better job of reporting sub-groups rather than the total sample size, which is the output we want.
FF sometimes combines diseased and healthy groups together, either repeating the total sample size for each group or reporting only a single total sample size.

It's possible that better prompting could help here, but it's also worth looking at and assessing other models that are better than FF, perhaps models based on Llama 3 instead of Llama 2 or Mixtral. We should also look at gpt-4-turbo and see if there's a way to motivate it to prevent laziness. Perhaps it can close the gap.

At the hackathon we should also have a discussion about what a reasonable schema is for information we want to extract. 