# Prototype Sample Size extraction using GPT

Goal: Test prompt engineering on the easiest use case: single-group studies that report their sample size in the abstract.

In [1]:
import pandas as pd
from labelrepo.projects.participant_demographics import (
    get_participant_demographics,
)

# Load the annotation table exposed by labelrepo's participant_demographics project.
# Later cells read its project_name, annotator_name, pmcid and count columns.
subgroups = get_participant_demographics()

In [2]:
# Article abstracts; later cells use the pmcid and abstract columns.
# The path is relative, so the notebook must be run from the directory that contains data/.
abstracts = pd.read_csv('data/abstracts.csv')

In [3]:
# Restrict the annotations to one project and one annotator so that every
# article is described by a single, consistent set of labels.
jerome_pd = subgroups.loc[
    subgroups['project_name'].eq('participant_demographics')
    & subgroups['annotator_name'].eq('Jerome_Dockes')
]

In [4]:
# groupby().count() tallies the non-null entries of every column per article, so
# the `count` column of `counts` is the number of annotated rows (with a non-null
# sample size) for that pmcid -- not the sample size itself.
counts = jerome_pd.groupby('pmcid').count().reset_index()

# Articles with exactly one such row are treated as single-group studies.
single_group_pmcids = counts.loc[counts['count'] == 1, 'pmcid']

### Single group with Sample Size in Abstract

In [5]:
# Annotation rows belonging to the single-group articles selected above.
single_group = jerome_pd.loc[jerome_pd['pmcid'].isin(single_group_pmcids)]

In [6]:
# Attach the abstract text to each single-group annotation. No join key is given,
# so pandas joins on every column the two frames have in common.
single_group = single_group.merge(
    abstracts.loc[abstracts['pmcid'].isin(single_group['pmcid'])]
)

In [7]:
# Heuristic for flagging studies whose sample size appears in the abstract.
# Superseded by the fixed, manually checked list in the next cell.
# single_group['in_abstract'] = \
#  single_group.apply(lambda x: (re.search(f"\D{x['count']}", x.abstract) != None), axis = 1)

In [8]:
# Manually curated PMCIDs whose abstract does not state the sample size.
not_in_abstract = [
    5548834,
    4029023,
    4318429,
    5324609,
    8752963,
    5218407,
    3147157,
    2775905,
    6344389,
    8837589,
    7430162,
    7563756,
    7156375,
    4330553,
    6989437,
    6328158,
    3409150,
    3775427,
    3483694,
    6787094,
    6528067,
    3869649,
    3183226,
    6868994,
    7002496,
    6492297,
    3780305,
]

In [9]:
# True for every article that is NOT on the manual "not in abstract" list.
single_group['in_abstract'] = ~single_group['pmcid'].isin(not_in_abstract)

## Extract simple sample size

In [10]:
import openai
from templates import ZERO_SHOT_SINGLE_GROUP_DEMOGRAPHICS
from extract import extract_from_multiple

# Read the OpenAI API key from a local key file. The context manager closes the
# file handle as soon as the key has been read instead of leaving it open.
with open('/home/zorro/.keys/open_ai.key') as key_file:
    openai.api_key = key_file.read().strip()

In [11]:
# Run the zero-shot single-group prompt over every abstract in single_group,
# using two workers. The template and expected keys come from
# ZERO_SHOT_SINGLE_GROUP_DEMOGRAPHICS.
predictions = extract_from_multiple(
    single_group['abstract'].to_list(),
    num_workers=2,
    **ZERO_SHOT_SINGLE_GROUP_DEMOGRAPHICS,
)

100%|███████████████████████████████████████████| 75/75 [00:49<00:00,  1.51it/s]


In [12]:
# Put predictions and annotations on a common footing before scoring.
# NOTE(review): assumes `predictions` and `single_group` share the same row index -- confirm.
predictions['pmcid'] = single_group['pmcid']

# Compare counts as strings so that a textual answer such as `n/a` can be scored too.
predictions['count'] = predictions['count'].astype(str)
single_group['count'] = single_group['count'].astype(str)

# A prediction is correct only on an exact string match with the annotated count.
predictions['correct'] = predictions['count'] == single_group['count']

In [13]:
# Display the prompt template and expected keys that were passed to extract_from_multiple.
ZERO_SHOT_SINGLE_GROUP_DEMOGRAPHICS

{'template': '\n                You will be provided with a text sample from a scientific journal. \n                The sample is delimited with triple backticks.\n\n                Perform the following tasks:\n                1. Identify the total number of participants that underwent fMRI or neuroimaging in the study, if any. \n                2. Provide your response in a JSON format containing a single key `count` and a integer value corresponding to the number of participants. \n                Do not provide any additional information except the JSON. If the number of participants is not mentioned in the text, provide `n/a` as the value. If the number of participants is 0 return `n/a`.\n\n                Text sample: ```{text}```\n\n                Your JSON response:\n                ',
 'expected_keys': ['count']}

### Only look at those with sample size in abstract

In [14]:
# Keep only the predictions for studies flagged as reporting the sample size in the abstract.
# The boolean mask comes from single_group, so this relies on both frames sharing a row index.
in_abstract_predictions = predictions[single_group['in_abstract']]

In [15]:
# Fraction (0-1, not a percentage) of in-abstract predictions that exactly match the annotation
in_abstract_predictions['correct'].mean()

0.9166666666666666

In [16]:
# Rows where the predicted count differs from the annotated one
in_abstract_predictions[~in_abstract_predictions['correct']]

Unnamed: 0,count,pmcid,correct
14,69.0,5460048,False
26,,8978988,False
56,79.0,6678781,False
74,27.0,6509414,False


Notes: First error is due to pulling out sample size for behavioral task, not fMRI task. 

Second error is due to study being a single subject study.

### Any hallucinations?

In [17]:
# Predictions for studies whose abstract does not state the sample size; any
# numeric answer here that is not the annotated count is a hallucination candidate.
no_abstract_predictions = predictions[~single_group['in_abstract']]

In [18]:
# Share of not-in-abstract studies where the model either predicted `n/a` or still
# returned the annotated count (i.e. the study was misclassified as not having the
# info in the abstract). This is the exact complement of the "wrong and not `n/a`"
# filter used to list hallucinations.
(
    (no_abstract_predictions['count'] == 'n/a')
    | no_abstract_predictions['correct']
).mean()

0.9259259259259259

In [19]:
# Hallucination candidates: the model returned something other than `n/a`,
# and that value does not match the annotated count.
no_abstract_predictions[
    (no_abstract_predictions['count'] != 'n/a')
    & ~no_abstract_predictions['correct']
]

Unnamed: 0,count,pmcid,correct
2,0,4029023,False
40,0,5548834,False


Overall notes: 

Errors:
- 2 errors due to a discrepancy between the methods and abstract counts (final n vs. starting n)
- 1 n/a should be = 1
- 1 was for behavioral count, not fMRI count

False positives:
- Age was extracted instead of sample size (fixed after improving prompt)
- Occasionally returns 0 instead of `n/a` (could be fixed in serialization).


Excluding abstract/full-text discrepancies, error rate is 2/75

### View incorrect abstracts

In [25]:
# Article to inspect by hand.
pmcid = 6678781

# Take the first abstract recorded for that article and drop any non-ASCII characters.
text = (
    abstracts.loc[abstracts['pmcid'] == pmcid, 'abstract']
    .iloc[0]
    .encode("ascii", "ignore")
    .decode()
)

In [26]:
# Show the ASCII-only abstract text for the selected article.
text

' \nHigh prevalence of child sexual offending stand in contradiction to low conviction rates (one-tenth at most) of child sexual offenders (CSOs). Little is known about possible differences between convicted and non-convicted pedophilic CSOs and why only some become known to the judicial system. This investigation takes a closer look at the two sides of child sexual offending by focusing on clinical and neurobiological characteristics of convicted and non-convicted pedophilic CSOs as presented in the Neural Mechanisms Underlying Pedophilia and sexual offending against children (NeMUP)*-study. Seventy-nine male pedophilic CSOs were examined, 48 of them convicted. All participants received a thorough clinical examination including the structured clinical interview (SCID), intelligence, empathy, impulsivity, and criminal history. Sixty-one participants (38 convicted) underwent an inhibition performance task (Go/No-go paradigm) combined with functional magnetic resonance imaging (fMRI). Co

In [22]:
# Annotated demographics for the same article, to compare against the abstract text.
single_group.loc[single_group['pmcid'] == pmcid]

Unnamed: 0,group_name,subgroup_name,project_name,annotator_name,pmcid,diagnosis,count,male count,age mean,female count,age minimum,age maximum,age median,abstract,in_abstract
56,patients,_,participant_demographics,Jerome_Dockes,6678781,Pedophilic Child Sexual Offenders,61,61.0,,,,,,\nHigh prevalence of child sexual offending s...,True
