# Prototype Sample Size extraction using GPT

Goal: Test prompt engineering on the easiest use case: single-group studies (papers annotated with exactly one participant group).

In [20]:
import pandas as pd
from labelrepo.projects.participant_demographics import (
    get_participant_demographics,
)

# Fetch the participant-demographics annotations through the labelrepo helper.
# Later cells filter this table on `project_name` / `annotator_name` and read
# its `pmcid` and `count` columns (full schema is not visible here — TODO confirm).
subgroups = get_participant_demographics()

In [21]:
# Table of abstracts, keyed by `pmcid` (presumably also holding the `abstract`
# text column used later — verify against the CSV). The path is relative to the
# notebook's working directory.
abstracts = pd.read_csv('data/abstracts.csv')

In [22]:
# Restrict the annotations to a single project and a single annotator.
from_pd_project = subgroups['project_name'] == 'participant_demographics'
by_jerome = subgroups['annotator_name'] == 'Jerome_Dockes'
jerome_pd = subgroups[from_pd_project & by_jerome]

In [23]:
# groupby(...).count() tallies the non-null entries of every column per pmcid,
# so the resulting `count` column is the number of annotated rows for that
# paper that carry a non-null `count` value.
counts = jerome_pd.groupby('pmcid').count().reset_index()

# Papers with exactly one such row are the "single group" studies.
has_one_group = counts['count'] == 1
single_group_pmcids = counts.loc[has_one_group, 'pmcid']

### Single group with Sample Size in Abstract

In [24]:
# Keep only the annotation rows that belong to single-group papers.
in_single_group = jerome_pd['pmcid'].isin(single_group_pmcids)
single_group = jerome_pd[in_single_group]

In [25]:
# Attach the abstract data. Abstracts are first narrowed to the papers of
# interest, then merged with pd.merge's defaults (inner join on the columns
# the two frames have in common).
matching_abstracts = abstracts[abstracts['pmcid'].isin(single_group['pmcid'])]
single_group = pd.merge(single_group, matching_abstracts)

In [26]:
# Heuristic for finding studies whose sample size appears in the abstract.
# Superseded by the manually checked list below; kept for reference only.
# single_group['in_abstract'] = \
#  single_group.apply(lambda x: (re.search(f"\D{x['count']}", x.abstract) != None), axis = 1)

In [27]:
# Manually curated PMCIDs, used below to mark studies as NOT having the
# sample size in their abstract.
not_in_abstract = [
    5548834, 4029023, 4318429, 5324609, 8752963, 5218407, 3147157,
    2775905, 6344389, 8837589, 7430162, 7563756, 7156375, 4330553,
    6989437, 6328158, 3409150, 3775427, 3483694, 6787094, 6528067,
    3869649, 3183226, 6868994, 7002496, 6492297, 3780305,
]

In [28]:
# A study counts as "in abstract" when its pmcid is absent from the manual list.
listed_as_missing = single_group['pmcid'].isin(not_in_abstract)
single_group['in_abstract'] = ~listed_as_missing

## Extract simple sample size

In [32]:
import openai
from templates import ZERO_SHOT_SAMPLE_SIZE_FUNCTION
from extract import extract_from_multiple

# Load the OpenAI API key.
# Resolution order:
#   1. the OPENAI_API_KEY environment variable, if set and non-empty;
#   2. otherwise the key file named by OPENAI_KEY_FILE, which defaults to the
#      location this notebook has always used.
# Path.read_text() opens and closes the file itself, so no handle is leaked
# (the previous bare open(...).read() never closed it).
import os
from pathlib import Path

_key_file = Path(os.environ.get('OPENAI_KEY_FILE', '/home/zorro/.keys/open_ai.key'))
openai.api_key = os.environ.get('OPENAI_API_KEY') or _key_file.read_text().strip()

In [33]:
# Run the extraction over every abstract, using the settings bundled in
# ZERO_SHOT_SAMPLE_SIZE_FUNCTION and two workers. Later cells treat the
# result as a table with a `count` column.
abstract_texts = single_group['abstract'].to_list()
predictions = extract_from_multiple(
    abstract_texts,
    num_workers=2,
    **ZERO_SHOT_SAMPLE_SIZE_FUNCTION,
)

100%|███████████████████████████████████████████| 75/75 [00:45<00:00,  1.64it/s]


In [37]:
# Preview the predicted counts. With errors='ignore', astype hands back the
# original Series when the cast cannot be done, so the displayed result can
# remain float (NaN has no plain-int representation).
predictions['count'].astype('int', errors='ignore')

0      29.0
1      21.0
2       NaN
3       NaN
4     161.0
      ...  
70     12.0
71      NaN
72     33.0
73      NaN
74     27.0
Name: count, Length: 75, dtype: float64

In [39]:
# Preview the annotated counts cast to float, for side-by-side reading with
# the predicted counts.
single_group['count'].astype('float')

0      29.0
1      21.0
2      10.0
3      18.0
4     173.0
      ...  
70     12.0
71     20.0
72     33.0
73    820.0
74     26.0
Name: count, Length: 75, dtype: float64

In [40]:
# Normalize values
predictions['pmcid'] = single_group['pmcid']
single_group['count'] = single_group['count'].astype('float')

predictions['correct'] = single_group['count'] == predictions['count']

### Only look at those with sample size in abstract

In [41]:
# Predictions for the studies flagged as reporting the sample size in the abstract.
reported_in_abstract = single_group['in_abstract']
in_abstract_predictions = predictions[reported_in_abstract]

In [43]:
# Share of correct predictions, as a fraction in [0, 1] rather than a
# percentage: the mean of the boolean `correct` column.
in_abstract_predictions['correct'].mean()

0.8333333333333334

In [44]:
# Rows where the prediction did not match the annotated count.
missed = ~in_abstract_predictions['correct']
in_abstract_predictions[missed]

Unnamed: 0,count,pmcid,correct
4,161.0,9308181,False
10,149.0,4115625,False
12,41.0,2144768,False
14,69.0,5460048,False
26,0.0,8978988,False
41,,3182403,False
56,79.0,6678781,False
74,27.0,6509414,False


Notes: First error is due to pulling out sample size for behavioral task, not fMRI task. 

Second error is due to study being a single subject study.

### Any hallucinations?

In all these cases where the information is not in the abstract, the model should have predicted either null, or 0

In [55]:
# Predictions for the studies flagged as NOT reporting the sample size in the abstract.
lacks_abstract_n = ~single_group['in_abstract']
no_abstract_predictions = predictions[lacks_abstract_n]

In [56]:
# Share of not-in-abstract studies the model handled acceptably: it either
# abstained (`n/a`, stored as NaN) or still returned the annotated count
# (i.e. the study was misclassified as not having the info in its abstract).
# Previously only the NaN case was counted, which did not match this
# description; including `correct` makes this the exact complement of the
# "wrong, and made a prediction" filter.
abstained = no_abstract_predictions['count'].isna()
(abstained | no_abstract_predictions['correct']).mean()

0.7777777777777778

In [57]:
# Hallucination candidates: the model returned a value and it does not match
# the annotation.
made_prediction = no_abstract_predictions['count'].notna()
was_wrong = ~no_abstract_predictions['correct']
no_abstract_predictions[made_prediction & was_wrong]

Unnamed: 0,count,pmcid,correct
7,0.0,5218407,False
18,0.0,8837589,False
19,0.0,7430162,False
40,0.0,5548834,False
43,0.0,3483694,False
52,0.0,3869649,False


Overall notes: 

Errors:
- 2 errors due to a discrepancy between the methods and abstract counts (final n vs. starting n)
- 1 n/a should be = 1
- 1 was for behavioral count, not fMRI count

False positives:
- Age was extracted instead of sample size (fixed after improving prompt)
- Occasionally returns 0 instead of `n/a` (could be fixed in serialization).


Excluding abstract/full-text discrepancies, error rate is 2/75