In [174]:
import json
import pandas as pd

#### Load GPT Extracted Data

In [175]:
# Read the raw GPT extractions; the loop below expects a list of per-article
# records (dicts) that each carry a 'pmcid' key.
with open('../../outputs/nv_task/extractions/full_lb_nv_task-zeroshot_gpt-4o-mini-2024-07-18.json') as f:
    nv_task_gpt_json = json.load(f)

# Convert to a dictionary keyed by pmcid and clean every field
nv_task_gpt = {}
for item in nv_task_gpt_json:
    pmcid = item['pmcid']
    nv_task_gpt[pmcid] = {}
    for key, value in item.items():
        # The literal string 'null' and empty lists both mean "missing"
        value = None if value in ['null', []] else value

        # Normalise the key spelling so both spellings end up under 'DesignType'
        if key == 'Designtype':
            key = 'DesignType'

        # Strip spaces inside modality names. The value may have just been
        # cleaned to None above, in which case there is nothing to iterate.
        if key == 'Modality' and value is not None:
            value = [v.replace(' ', '') for v in value]

        nv_task_gpt[pmcid][key] = value



#### Load Annotations

In [176]:
from labelrepo.projects.nv_task import load_annotations

# Load every annotation, then keep only rows not made by the 'alice_chen' annotator
all_annotations = load_annotations()
annotations = all_annotations.loc[all_annotations['annotator_name'] != 'alice_chen']


Inserting documents from /home/zorro/repos/labelbuddy-annotations/documents
Inserting labels from /home/zorro/repos/labelbuddy-annotations/projects/NER_biomedical/labels
Inserting annotations from /home/zorro/repos/labelbuddy-annotations/projects/NER_biomedical/annotations
Inserting labels from /home/zorro/repos/labelbuddy-annotations/projects/autism_mri/labels
Inserting annotations from /home/zorro/repos/labelbuddy-annotations/projects/autism_mri/annotations
Inserting labels from /home/zorro/repos/labelbuddy-annotations/projects/cluster_inference/labels
Inserting annotations from /home/zorro/repos/labelbuddy-annotations/projects/cluster_inference/annotations
Inserting labels from /home/zorro/repos/labelbuddy-annotations/projects/cobidas/labels
Inserting annotations from /home/zorro/repos/labelbuddy-annotations/projects/cobidas/annotations
Inserting labels from /home/zorro/repos/labelbuddy-annotations/projects/dynamic_functional_connectivity/labels
Inserting annotations from /home/zorr

In [177]:
def _get_task_name(rows):
    # Add TaskName, replacing 'None' and 'Unsure' with 'n/a'
    rows = rows[rows.label_name == 'TaskName']
    task_names = []
    for _, row in rows.iterrows():
        if row['None'] or row['Unsure']:
            task_names.append('n/a')
        else:
            task_names.append(row['selected_text'])
    return task_names

# Build one comparable summary record per article, keyed by pmcid
annotations_summary = {}
for pmcid, article_df in annotations.groupby('pmcid'):
    labels = article_df.label_name

    # Design type is inferred from which labels are present for the article
    design_type = [
        design
        for marker, design in [('DesignType-RestingState', 'RestingState'), ('TaskName', 'Task-based')]
        if marker in labels.values
    ]

    # Text after the first '-' of the first 'Exclude...' label, or None if there is none
    exclude_reason = None
    for label in labels:
        if label.startswith('Exclude'):
            exclude_reason = label.split('-', 1)[1]
            break

    # Text after the first '-' of every 'Modality...' label
    modalities = [label.split('-', 1)[1] for label in labels if label.startswith('Modality')]

    abstract_df = article_df[article_df.section == 'abstract']
    body_df = article_df[article_df.section == 'body']
    abstract_tasks = _get_task_name(abstract_df)
    body_tasks = _get_task_name(body_df)

    summary = {
        'pmcid': pmcid,
        'DesignType': design_type,
        'annotator_name': article_df.annotator_name.iloc[0],
        'Exclude': exclude_reason,
        'Modality': modalities or None,
        # Use body tasks if available, otherwise use abstract tasks
        'TaskName': body_tasks or abstract_tasks,
    }

    # Free-text fields are taken from the body only; an empty result becomes None
    for field in ['TaskDescription', 'Condition', 'ContrastDefinition']:
        is_field = body_df.label_name == field
        summary[field] = body_df.loc[is_field, 'selected_text'].tolist() or None

    annotations_summary[pmcid] = summary

### Compare Extractions to Annotations

In [229]:
import re
from thefuzz import process, fuzz

def _clean(x):
    _li =  [x.lower().replace('paradigm', '').replace('task', '').replace('‐', '').strip() for x in (x or [])]

    # Remove abbreviations
    return [re.sub(r'\s*\([^)]*\)', '', x) for x in _li]

def _compare(x, y):
    x = x or ''
    y = y or ''
    return x.replace('-', '').lower() == y.replace('-', '').lower()

def score_fuzzy(correct_labels, extracted_labels):
    """Score how well the extracted labels match the annotated labels.

    Both lists are normalised with ``_clean`` and then paired greedily: on each
    round every remaining annotated label is compared with every remaining
    extracted label using ``fuzz.token_set_ratio``, the single highest-scoring
    pair is recorded, and both of its labels are removed. Pairing stops as soon
    as either list is exhausted, so surplus labels on either side do not lower
    the score.

    Parameters
    ----------
    correct_labels : list of str or None
        Annotated (reference) labels.
    extracted_labels : list of str or None
        Labels produced by the extraction.

    Returns
    -------
    float or int
        Mean score of the matched pairs divided by 100, or ``0`` when no pair
        could be formed (either input is empty or ``None``).
    """
    remaining_correct = _clean(correct_labels)
    remaining_extracted = _clean(extracted_labels)

    pair_scores = []

    while remaining_correct and remaining_extracted:
        # Collect every (annotated, extracted, score) combination still available
        candidates = []
        for correct_label in remaining_correct:
            matches = process.extract(
                correct_label, remaining_extracted, limit=None, scorer=fuzz.token_set_ratio
            )
            for matched_label, score in matches:
                candidates.append((correct_label, matched_label, score))

        # Take the highest-scoring pair; on ties the first one collected wins
        correct_label, matched_label, score = max(candidates, key=lambda c: c[2])

        # Record the score and retire both labels so they cannot be paired again
        pair_scores.append(score)
        remaining_correct.remove(correct_label)
        remaining_extracted.remove(matched_label)

    if not pair_scores:
        return 0
    return (sum(pair_scores) / len(pair_scores)) / 100

In [230]:
# Score each annotated article against the GPT extraction with the same pmcid
all_scores = {}

for pmcid, ann in annotations_summary.items():
    extracted = nv_task_gpt[pmcid]

    # Exclusion label: equality ignoring case and hyphens
    article_scores = {'Exclude': _compare(ann['Exclude'], extracted['Exclude'])}

    # List-valued fields: fuzzy match score
    for field in ['TaskName', 'Condition', 'ContrastDefinition', 'DesignType', 'Modality']:
        article_scores[field] = score_fuzzy(ann[field], extracted[field])

    all_scores[pmcid] = article_scores

# Number of annotated articles
n_articles = len(annotations_summary.keys())

# One row per article, one column per scored field
all_scores_df = pd.DataFrame(all_scores).T



In [231]:
# Overall scores: column-wise mean over all scored articles (one value per field)
all_scores_df.mean()

Exclude               0.980769
TaskName              0.601971
Condition             0.542547
ContrastDefinition    0.412943
DesignType            0.867692
Modality              0.913462
dtype: object

In [232]:
# Mean scores after dropping every article whose annotation carries an 'Exclude' label
exclude_idx = [
    pmcid
    for pmcid, summary in annotations_summary.items()
    if summary['Exclude'] is not None
]
is_excluded = all_scores_df.index.isin(exclude_idx)
all_scores_df.loc[~is_excluded].mean()

Exclude                    1.0
TaskName                 0.659
Condition             0.593946
ContrastDefinition    0.452064
DesignType            0.949895
Modality                   1.0
dtype: object

In [233]:
# Mean scores for task-based papers in which no annotated task name is 'n/a'
has_task_name = [
    pmcid
    for pmcid, summary in annotations_summary.items()
    if 'Task-based' in summary['DesignType'] and 'n/a' not in summary['TaskName']
]
is_named_task = all_scores_df.index.isin(has_task_name)
all_scores_df.loc[is_named_task].mean()

Exclude                    1.0
TaskName              0.925078
Condition             0.719061
ContrastDefinition    0.571641
DesignType                 1.0
Modality                   1.0
dtype: object

In [234]:
# Mean scores for task-based papers in which at least one annotated task name is 'n/a'
has_task_noname = [
    pmcid
    for pmcid, summary in annotations_summary.items()
    if 'Task-based' in summary['DesignType'] and 'n/a' in summary['TaskName']
]
is_unnamed_task = all_scores_df.index.isin(has_task_noname)
all_scores_df.loc[is_unnamed_task].mean()

Exclude                    1.0
TaskName              0.188889
Condition             0.578056
ContrastDefinition    0.353395
DesignType                 1.0
Modality                   1.0
dtype: object

### Manual comparison

In [204]:
# Stack the annotation summaries on top of the GPT extractions (rows indexed by pmcid)
annotations_df = pd.DataFrame(annotations_summary).T
extractions_df = pd.DataFrame(nv_task_gpt).T
combined_df = pd.concat([annotations_df, extractions_df])

In [205]:
import pandas as pd
from IPython.display import display

# Let text columns show up to 300 characters before pandas truncates them
pd.set_option('max_colwidth', 300)
    

def _display(df, pmcids=None):
    cols = list(set(df.keys()) - set(['pmcid']))
    if pmcids:
        df = df[df.pmcid.isin(pmcids)]
    for _, _df in df.groupby('pmcid'):
        display(_df[cols])

In [206]:
# Diff on Exclude: show articles whose annotated and extracted exclusion labels disagree

exclude_mismatch = all_scores_df['Exclude'] == False
diff_exclude = all_scores_df.loc[exclude_mismatch].index.tolist()

if diff_exclude:
    _display(combined_df, diff_exclude)

Unnamed: 0,Condition,Exclude,annotator_name,ContrastDefinition,TaskDescription,Modality,TaskName,DesignType
3555187,,MetaAnalysis,delavega_nv,,,,[],[]
3555187,"[sensorimotor tasks, SRTT variants]",,,"[activity in the basal ganglia and cerebellum was significantly stronger for sensorimotor tasks, activity in cortical structures and the thalamus was significantly stronger for SRTT variants]",[The study identified consistent activations across 70 motor learning experiments using activation likelihood estimation (ALE) meta-analysis. A global analysis of all tasks revealed a bilateral cortical-subcortical network consistently underlying motor learning across tasks.],[fMRI-BOLD],"[motor learning, serial response time task]",[Task-based]


Unnamed: 0,Condition,Exclude,annotator_name,ContrastDefinition,TaskDescription,Modality,TaskName,DesignType
10634720,,MetaAnalysis,delavega-aliceoverlap,,,,[],[]
10634720,"[Exploration, Exploitation]",,,"[exploration > exploitation, exploitation > exploration]","[Participants engage in tasks that require them to make decisions about whether to explore new options or exploit known ones, using various task designs such as n-armed bandit and foraging tasks.]",[fMRI-BOLD],[Explore-Exploit Decision Making],[Task-based]


In [207]:
# From here on, drop the articles annotated as excluded from the combined table
keep_rows = ~combined_df.index.isin(exclude_idx)
combined_df = combined_df.loc[keep_rows]

In [235]:
# Diff on DesignType: show articles whose design-type score is below a perfect 1

imperfect_design_type = all_scores_df['DesignType'] < 1
diff_dt = all_scores_df.loc[imperfect_design_type].index.tolist()

if diff_dt:
    _display(combined_df, diff_dt)

Unnamed: 0,Condition,Exclude,annotator_name,ContrastDefinition,TaskDescription,Modality,TaskName,DesignType
6331309,,,delavega_nv,,,[StructuralMRI],[],[]
6331309,"[Migraine, Restless Legs Syndrome, Comorbid Migraine and RLS, Healthy Controls]",,,"[Migraine vs Healthy Controls, RLS vs Healthy Controls, Migraine vs RLS, Comorbid Migraine and RLS vs Healthy Controls]","[High-resolution T1-weighted images were acquired from 116 subjects: 27 RLS patients, 22 migraine patients, 22 patients with comorbid migraine and RLS, and 45 healthy controls.]",[StructuralMRI],[MRI data acquisition],[Task-based]


Unnamed: 0,Condition,Exclude,annotator_name,ContrastDefinition,TaskDescription,Modality,TaskName,DesignType
7426775,,,delavega-aliceoverlap,,,[StructuralMRI],[],[]
7426775,[Age prediction based on structural MRI.],,,[Age-related structural changes in the brain.],[Predicting chronological age from structural MRI scans using a deep learning model.],[StructuralMRI],[Brain Age Prediction],[Task-based]


Unnamed: 0,Condition,Exclude,annotator_name,ContrastDefinition,TaskDescription,Modality,TaskName,DesignType
7582181,,,delavega-aliceoverlap,,,[StructuralMRI],[],[]
7582181,"[Social reward expectancy, Social threat expectancy]",,,[Social reward expectancy vs. social threat expectancy],[Participants were asked to vividly imagine themselves in a novel self-relevant event that was ambiguous with regards to possible social acceptance or rejection.],[StructuralMRI],[Imagining social interactions],[Task-based]


Unnamed: 0,Condition,Exclude,annotator_name,ContrastDefinition,TaskDescription,Modality,TaskName,DesignType
8564184,,,delavega-aliceoverlap,,,[StructuralMRI],[],[]
8564184,"[Chronic visceral pain, Healthy controls]",,,"[GMV in ulcerative colitis vs. healthy controls, GMV in irritable bowel syndrome vs. healthy controls, Correlations between GMV and symptom severity, Correlations between GMV and chronic stress]",[Parallelized whole-brain voxel-based morphometry analyses in two patient cohorts with chronic visceral pain (ulcerative colitis in remission and irritable bowel syndrome) and healthy individuals.],[StructuralMRI],[Voxel-based morphometry],[Task-based]


Unnamed: 0,Condition,Exclude,annotator_name,ContrastDefinition,TaskDescription,Modality,TaskName,DesignType
9202476,,,delavega-aliceoverlap,,,"[fMRI-BOLD, DiffusionMRI]",[],[RestingState]
9202476,"[poor sleepers (PSs), good sleepers (GSs)]",,,"[PS > GS, GS > PS]","[In this computerized task, subjects were presented with a complex figure in the middle of the screen. Then, a few patterns were shown in the periphery, from which one was matched with the presented pattern. In the first trials, two patterns were presented in the periphery, and it was increased ...","[fMRI-BOLD, DiffusionMRI]",[match-to-sample (MTS) task],[Task-based]


In [236]:
# Inspect the per-field scores of a single article (row label 2686646)
all_scores_df.loc[2686646]

Exclude                   True
TaskName                   0.8
Condition             0.913333
ContrastDefinition         1.0
DesignType                 1.0
Modality                   1.0
Name: 2686646, dtype: object

It seems most misses by GPT were due to tasks performed *outside the scanner* being listed as a Task-based design, which in this instance should refer only to task-based fMRI designs.

In [237]:
# Diff on TaskName: among papers with an explicit task name, show those scoring below 0.8

has_taskname_df = combined_df.loc[combined_df.index.isin(has_task_name)]

low_taskname_score = all_scores_df['TaskName'] < 0.8
diff_taskname = all_scores_df.loc[low_taskname_score].index.tolist()

if diff_taskname:
    _display(has_taskname_df, diff_taskname)

Unnamed: 0,Condition,Exclude,annotator_name,ContrastDefinition,TaskDescription,Modality,TaskName,DesignType
4374765,"[spiders, control animals, IAPSnegative, IAPSneutral]",,delavega_nv,"[SPIDERS > ANIMALS, IAPSnegative > IAPSneutral]",,"[MRS, fMRI-BOLD]",[fear inducing paradigm],[Task-based]
4374765,"[SPIDERS, ANIMALS, IAPSnegative, IAPSneutral]",,,"[SPIDERS > ANIMALS, IAPSnegative > IAPSneutral]","[Participants were presented with still pictures of spiders, control animals (birds, caterpillars, snails, and lizards), negative pictures from the International Affective Picture System (IAPS), and neutral pictures from IAPS. They performed a covert task of detecting the presence of a human in ...","[fMRI-BOLD, MRS]",[Fear provocation paradigm],[Task-based]


Unnamed: 0,Condition,Exclude,annotator_name,ContrastDefinition,TaskDescription,Modality,TaskName,DesignType
5324609,,,delavega-aliceoverlap,"[linear increases in activity across the four learning stage, HRF amplitude estimates relating to generalization test trials were correlated with generalization performance at the second‐level]","[participants learned a set of visual discriminations via trial‐and‐error (learning phase), and were subsequently tested on their ability to generalize what they had learned (generalization phase). Both learning and generalization occurred within a single scanning session and took place in the c...","[fMRI-BOLD, fMRI-BOLD]",[acquired equivalence task],[Task-based]
5324609,"[rewarded, unrewarded]",,,"[generalization performance, hippocampal activity during the test, BOLD activity during the final stage of learning]","[During scanning participants learned a set of visual discriminations via trial-and-error (learning phase), and were subsequently tested on their ability to generalize what they had learned (generalization phase). Both learning and generalization occurred within a single scanning session and too...",[fMRI-BOLD],"[learning phase, generalization phase]",[Task-based]


Unnamed: 0,Condition,Exclude,annotator_name,ContrastDefinition,TaskDescription,Modality,TaskName,DesignType
5776089,"[Task, Task+AgonistStim, Task+ControlStim]",,delavega-other,[“Task+AgonistStim minus Task”],[he fMRI session was composed of nine rest–task cycles with 30 s for each period. Eyes were kept closed during scanning. The motor task consisted of repetitive alternating dorsiflexion and relaxation of the right foot (with range reaching 15°). Foot movements were paced following an audio cue th...,[fMRI-BOLD],"[Motor Task, Somatosensory Stimulation]",[Task-based]
5776089,"[Task, Task+AgonistStim, Task+ControlStim]",,,"[Task+AgonistStim vs. Task, Task+ControlStim vs. Task, Task+AgonistStim vs. Task+ControlStim]",[The motor task consisted of repetitive alternating dorsiflexion and relaxation of the right foot (with range reaching 15). Foot movements were paced following an audio cue that was sounded every 1.5 s.],[fMRI-BOLD],"[right ankle dorsiflexion, ankle dorsiflexion coupled with simultaneous stimulation to the agonist muscle, ankle dorsiflexion coupled with simultaneous stimulation to a control area]",[Task-based]


Unnamed: 0,Condition,Exclude,annotator_name,ContrastDefinition,TaskDescription,Modality,TaskName,DesignType
6219793,"[self-distanced, self-immersed]",,delavega_nv,,"[Following previous studies [ – ], negative social feedback was used to induce emotions for two reasons: (a) in daily life emotions are often caused by social stimuli [ , ] and (b) social feedback elicits emotional responses that are long enough to study emotion dynamics [ ]. The social feedback...",[fMRI-BOLD],[Social feedback paradigm],[Task-based]
6219793,"[Self-distanced perspective, Self-immersed perspective]",,,"[Self-distanced > Self-immersed, Self-immersed > Self-distanced]",[Participants were asked to adopt a self-immersed or self-distanced perspective while reading and thinking about negative social feedback.],[fMRI-BOLD],[Self-distanced vs. self-immersed perspective],[Task-based]


Unnamed: 0,Condition,Exclude,annotator_name,ContrastDefinition,TaskDescription,Modality,TaskName,DesignType
6699247,,,delavega_nv,,"[In this task, a simple line drawing of an object is presented (250 ms), and following a short delay (75 ms), a word appears. Subjects are instructed to respond by indicating whether the word was a semantic match (50% of trials) or non-match (unmatched, 50% of trials) to the preceding picture. P...","[EEG, fMRI-BOLD, MRS]",[picture–word verification task],[Task-based]
6699247,"[matched, in-category, out of category]",,,"[IC vs. matched, OC vs. matched]","[Participants performed a picture-word matching task, in which words were either matched by preceding pictures, or were unmatched by semantically related or unrelated pictures.]",[fMRI-BOLD],[picture-word matching task],[Task-based]


Unnamed: 0,Condition,Exclude,annotator_name,ContrastDefinition,TaskDescription,Modality,TaskName,DesignType
7018765,"[“both correct”, “costly error”, participant responded incorrectly, both the participant and their partner responded incorrectly,]",,delavega-aliceoverlap,"[“costly error” – “both correct”, “costly error”, “both correct”]","[his task builds on past studies that have utilized error processing paradigms in the study of ER (e.g., ; ; ) by specifically examining dyadic error processing. Parent-adolescent dyads completed the TEAM task while simultaneously undergoing fMRI scanning. This task was developed to examine b...",[fMRI-BOLD],[Testing Emotional Attunement and Mutuality (TEAM)],[Task-based]
7018765,"[costly error condition, both correct condition]",,,[costly error vs. both correct],"[The TEAM task is an event-related design and consists of 17 trials during which participants first see a pattern of colored arrows presented sequentially on the screen for 3 s, twice in a row (totaling 6 s). They are then given 4 s to reconstruct the sequence by pressing colored buttons on a re...",[fMRI-BOLD],[TEAM task],[Task-based]


Unnamed: 0,Condition,Exclude,annotator_name,ContrastDefinition,TaskDescription,Modality,TaskName,DesignType
11063816,"[open-loop, delay, open-loop, no-delay, closed-loop, delay, closed-loop, no-delay]",,delavega-other,"[people, locations, objects, nontarget reinstatement effect, collapsing across the delay and no-delay condition, nontarget reinstatement effect differed between the delay and no-delay conditions , differences between closed- and open-loops in the cue and target regions in the delay and no-delay ...","[On each trial, a cue and six potential targets were presented simultaneously on the screen. The cue was presented in the middle of the screen with the six possible targets; one target and five foils form the same category (e.g., if the target was hammer, the five foils would be other randomly s...","[fMRI-BOLD, fMRI-BOLD]",[six-alternative forced-choice cued-recognition task],[Task-based]
11063816,"[Delay, No-delay]",,,"[closed-loop vs open-loop, immediate vs delayed retrieval]","[Participants learned events that comprised multiple overlapping pairs of event elements (e.g., person-location, object-location, location-person). Encoding occurred either immediately before or 24h before retrieval. Using fMRI during the retrieval of events, the study assessed whether episodic ...",[fMRI-BOLD],[Episodic memory retrieval],[Task-based]


In [238]:
# (number of scored articles, number of scored fields)
all_scores_df.shape

(104, 6)

Many misses are due to "Resting-state" being listed as a task name for resting-state paradigms, whereas annotators coded the task name as None in those cases.
Sometimes GPT also lists a task name for a task that was performed outside the scanner.

The vast majority of the remaining tasks could be classified as "correct".

Using fuzzy matching, only 7 / 96 studies had poor matching; in 3-4 of these, the extraction could be argued to be semantically correct.
Only in 1-2 of these cases would I call the extraction outright incorrect.