In [None]:
import pandas as pd
# Lift pandas' row display limit so frames shown below are rendered without row truncation
pd.set_option("display.max_rows", None)

In [None]:
# For sentence tokenization
from nltk import tokenize
import pandas as pd

def read_paper(pid, section, source):
    """Load one EBM-NLP document and return it as a two-row DataFrame.

    The document text is read from ``data/ebm_nlp_2_00/documents/{pid}.txt`` and
    sentence-tokenized. The first sentence is stored as the 'Title' row; all
    remaining sentences, joined with single spaces, form the 'Abstract' row.

    Parameters:
        pid: document identifier, used for both the file name and the PubMed URL.
        section: value copied into the 'ann_section' column of both rows.
        source: value copied into the 'ann_source' column of both rows.

    Returns:
        pandas.DataFrame with columns URL, ID, Type, Text, ann_section, ann_source.
    """
    with open(f'data/ebm_nlp_2_00/documents/{pid}.txt', 'r') as handle:
        raw_text = handle.read()
        parts = tokenize.sent_tokenize(raw_text)
        title_text = parts[0]
        abstract_text = ' '.join(parts[1:])
        pubmed_url = f'https://pubmed.ncbi.nlm.nih.gov/{pid}'
        columns = {
            'URL': [pubmed_url, pubmed_url],
            'ID': [pid, pid],
            'Type': ['Title', 'Abstract'],
            'Text': [title_text, abstract_text],
            'ann_section': [section, section],
            'ann_source': [source, source],
        }
        return pd.DataFrame(columns)

# Example call for document 6989377; as the cell's last expression the resulting frame is displayed
read_paper(6989377, 'test', 'gold')

In [None]:
# Build up the entire list of paper abstracts we have
import os

# directory = f'data/ebm_nlp_2_00/annotations/aggregated/hierarchical_labels/interventions/test/crowd'
# directory = f'data/ebm_nlp_2_00/annotations/aggregated/starting_spans/participants/test/crowd'
# directory = f'data/ebm_nlp_2_00/annotations/aggregated/starting_spans/outcomes/test/crowd'
# directory = f'data/ebm_nlp_2_00/annotations/aggregated/starting_spans/interventions/train/crowd'
# directory = f'data/ebm_nlp_2_00/annotations/aggregated/starting_spans/interventions/test/crowd'
directory = 'data/ebm_nlp_2_00/annotations/aggregated/hierarchical_labels/interventions/test/gold'

# Collect the paper IDs that have an annotation file in `directory`.
# Assumes the file name contains no '.' before the ID ends, i.e. the ID is
# everything up to the first '.'.
# os.scandir yields entries in arbitrary order, so the IDs are sorted to make the
# row order of `df` reproducible; the context manager closes the directory handle.
with os.scandir(directory) as entries:
    paper_ids = sorted(entry.name.split('.')[0] for entry in entries)

# Build every per-paper frame first and concatenate once: DataFrame.append was
# removed in pandas 2.0, and appending inside a loop copied the frame each time.
# Note: pd.concat raises ValueError if the directory contained no files.
df = pd.concat(
    [read_paper(paper_id, 'test', 'gold') for paper_id in paper_ids],
    ignore_index=True,
)
df

In [None]:
# Split the full-abstract CSV into a CSV containing individual sentences instead
def sentence_tokenize(group):
    """Expand one (ID, Type, ann_section, ann_source) group into one row per sentence.

    Only the first row of ``group`` is used; its 'Text' is sentence-tokenized and
    every resulting sentence becomes a row carrying the same URL, ID, Type,
    ann_section and ann_source, plus its position in the 'Index' column.

    Parameters:
        group: DataFrame holding at least the columns URL, ID, Type, Text,
            ann_section and ann_source.

    Returns:
        pandas.DataFrame with columns URL, ID, Type, Index, Text, ann_section,
        ann_source (one row per sentence; empty if no sentence was found).
    """
    row = group.iloc[0]
    sentences = tokenize.sent_tokenize(row['Text'])
    count = len(sentences)
    return pd.DataFrame({
        'URL': [row['URL']] * count,
        'ID': [row['ID']] * count,
        'Type': [row['Type']] * count,
        'Index': list(range(count)),
        'Text': sentences,
        # Propagate the row's own provenance instead of hard-coding 'test'/'gold',
        # so frames built from other sections/sources keep the correct labels.
        'ann_section': [row['ann_section']] * count,
        'ann_source': [row['ann_source']] * count,
    })

# One group per (paper, Title/Abstract, section, source); each is expanded to sentence rows
sentence_groups = df.groupby(['ID', 'Type', 'ann_section', 'ann_source'], group_keys=False)
df_sentences = sentence_groups.apply(sentence_tokenize).reset_index(drop=True)

df_sentences

In [None]:
def extract_labeled_terms(tokens, tags):
    """Group consecutive labeled tokens into runs.

    A token counts as labeled when ``int(tags[i]) != 0``. Adjacent labeled
    tokens are merged into a single run regardless of the specific non-zero
    tag value.

    Parameters:
        tokens: sequence of tokens.
        tags: sequence of int-convertible tags, indexed in parallel with ``tokens``.

    Returns:
        (hits, hit_indices): ``hits`` is a list of token lists, one per run;
        ``hit_indices`` holds the index of each run's first token.
    """
    hits = []
    hit_indices = []
    current_run = []
    for position, token in enumerate(tokens):
        if int(tags[position]) != 0:
            current_run.append(token)
            continue
        # Unlabeled token: close the run that was open, if any
        if current_run:
            hits.append(current_run)
            hit_indices.append(position - len(current_run))
            current_run = []
    # A run that reaches the end of the sentence is still open here
    if current_run:
        hits.append(current_run)
        hit_indices.append(len(tokens) - len(current_run))
    return hits, hit_indices

# Toy example: tokens tagged '1' are labeled, tokens tagged '0' are not
test_extract_labeled_terms = extract_labeled_terms(
    ['This', 'is', 'an', 'example', 'sentence', '.'],
    ['1',    '0',  '0',  '1',       '1',        '0']
)
# Bare last expression so the notebook displays the returned (hits, hit_indices) pair
test_extract_labeled_terms

In [None]:
# Columns describing one split of a sentence around an anchor:
#   split_0              - the text literally preceding the anchor
#   split_1              - the text that the anchor consists of
#   split_2              - the text literally following the anchor
#   split_tokens         - the list of tokens that the split identifies
#   split_anchor_span    - tuple of anchor token indices, or null if there is no anchor
#   split_anchor_indices - tuple of anchor character indices, or null if there is no anchor
#   within_anchor_index  - (not described here; see where the column is filled in)
splitting_headers = [
    'split_0',
    'split_1',
    'split_2',
    'split_tokens',
    'split_anchor_span',
    'split_anchor_indices',
    'within_anchor_index',
]
# Key columns used to join split/metadata frames back onto df_sentences
join_headers = [
    'ID',
    'Type',
    'Index',
    'ann_section',
    'ann_source',
]

In [None]:
# Extract tokenization, part-of-speech tags from EBMNLP dataset
def ebmnlp_addtdata(group):
    """Attach EBM-NLP tokens, POS tags and P/I/O annotations to one paper's sentences.

    The paper's ``.tokens``, ``.pos`` and aggregated ``.ann`` files are flat,
    one-entry-per-token listings. Sentences are walked in document order
    (Title rows first, then by 'Index') and each sentence greedily consumes the
    tokens that match the start of its remaining text; the POS tags and
    annotations are then cut into per-sentence slices of the same lengths.

    Parameters:
        group: rows of df_sentences belonging to a single paper. The ID,
            ann_section and ann_source are taken from the first row.

    Returns:
        pandas.DataFrame with one row per sentence and the columns
        split_tokens, split_tokens_pos, {ss,hl}_{p,i,o},
        {ss,hl}_{p,i,o}_clumps, plus the columns listed in join_headers.

    Raises:
        FileNotFoundError (from open) if any of the paper's token, POS or
        annotation files is missing.
    """
    # Operate on the group that was passed in. (Previously the argument was
    # overwritten with the whole df_sentences frame, so every paper was aligned
    # against the first paper's files.)
    # 'Title' sorts after 'Abstract', hence descending Type puts the title first.
    # A single multi-key sort is used because two consecutive sorts with the
    # default (unstable) algorithm do not guarantee the Index order is preserved.
    group = group.sort_values(by=['Type', 'Index'], ascending=[False, True])
    group = group.reset_index(drop=True)
    first_row = group.iloc[0]
    id_num = first_row['ID']
    ann_section = first_row['ann_section']
    ann_source = first_row['ann_source']

    # extract split_tokens data from .tokens file
    with open(f'data/ebm_nlp_2_00/documents/{id_num}.tokens', 'r') as f:
        tokens = [s.strip() for s in f.readlines()]
    split_tokens = []
    cursor = 0  # index of the next unconsumed token (avoids re-slicing the list)
    for text in group['Text']:
        remaining = text.strip()
        sentence_tokens = []
        # Consume tokens while the next one is a prefix of what is left of the sentence
        while cursor < len(tokens) and remaining.startswith(tokens[cursor]):
            token = tokens[cursor]
            sentence_tokens.append(token)
            remaining = remaining[len(token):].strip()
            cursor += 1
        split_tokens.append(sentence_tokens)
    metadata = pd.DataFrame({'split_tokens': split_tokens})

    # extract POS data from .pos file, sliced to the same per-sentence lengths
    with open(f'data/ebm_nlp_2_00/documents/{id_num}.pos', 'r') as f:
        pos = [s.strip() for s in f.readlines()]
    pos_labels = []
    offset = 0
    for sentence_tokens in split_tokens:
        pos_labels.append(pos[offset:offset + len(sentence_tokens)])
        offset += len(sentence_tokens)
    metadata['split_tokens_pos'] = pos_labels

    # Extract the direct annotations: every combination of (ss/hl) x (p/i/o),
    # together with the runs ("clumps") of consecutively labeled tokens.
    for level_dir, level_key in [('starting_spans', 'ss'), ('hierarchical_labels', 'hl')]:
        for type_dir, type_key in [('participants', 'p'), ('interventions', 'i'), ('outcomes', 'o')]:
            with open(f'data/ebm_nlp_2_00/annotations/aggregated/{level_dir}/{type_dir}/{ann_section}/{ann_source}/{id_num}.AGGREGATED.ann', 'r') as f:
                annotations = [s.strip() for s in f.readlines()]
            ann_col = []
            ann_col_clumps = []
            offset = 0
            for sentence_tokens in split_tokens:
                sentence_ann = annotations[offset:offset + len(sentence_tokens)]
                ann_col.append(sentence_ann)
                ann_col_clumps.append(extract_labeled_terms(sentence_tokens, sentence_ann))
                offset += len(sentence_tokens)
            metadata[f'{level_key}_{type_key}'] = ann_col
            metadata[f'{level_key}_{type_key}_clumps'] = ann_col_clumps

    # Copy the join keys; both frames are indexed 0..n-1 so rows line up
    for c in join_headers:
        metadata[c] = group[c]
    return metadata

# Process each (paper, annotation source) group and stack the per-paper results
paper_groups = df_sentences.groupby(['ID', 'ann_source'], group_keys=False)
df_metadata = paper_groups.apply(ebmnlp_addtdata).reset_index(drop=True)
df_metadata

In [None]:
# For now, I'm just going to pretend that EBM-NLP is directly "handling" the issue of splitting
# for us, because we can use the P/I/O annotations as splits!

def pio_split(group):
    """Turn the annotation clumps of one sentence into split rows.

    Only the first row of ``group`` is used. For every clump of the 'hl_i'
    annotation, one output row is produced with the sentence split into the
    tokens before the clump, the clump itself and the tokens after it (each
    joined with single spaces), followed by the remaining splitting_headers
    fields and the row's join_headers values.

    Parameters:
        group: DataFrame whose first row provides 'split_tokens',
            'hl_i_clumps' and the join_headers columns.

    Returns:
        pandas.DataFrame with columns splitting_headers + join_headers; an
        empty frame without columns when the sentence has no clumps.
    """
    record = group.iloc[0]
    rows = []
    for ann_key in ['hl_i']: # ['ss_p', 'ss_i', 'ss_o', 'hl_p', 'hl_i', 'hl_o']:
        spans, span_starts = record[ann_key + '_clumps']
        for span_tokens, start in zip(spans, span_starts):
            sentence_tokens = record['split_tokens']
            stop = start + len(span_tokens)
            # split_0 / split_1 / split_2
            before = ' '.join(sentence_tokens[:start])
            anchor = ' '.join(span_tokens)
            after = ' '.join(sentence_tokens[stop:])
            rows.append([
                before,
                anchor,
                after,
                # split_tokens: the sentence tokens, passed through unchanged
                sentence_tokens,
                # split_anchor_span: token start index and start + clump length
                (start, stop),
                # split_anchor_indices: offsets derived from the joined split_0/split_1 lengths
                (len(before), len(before) + len(anchor)),
                # within_anchor_index
                # TODO - perhaps we could assign an actual anchorindex eventually
                # but the known entity names in PIO annotations are ignored for now
                -1,
                # join headers
                *list(record[join_headers]),
            ])
    # Transpose rows into columns so they can be zipped with the header names
    column_values = [list(column) for column in zip(*rows)]
    return pd.DataFrame(dict(zip(splitting_headers + join_headers, column_values)))

# Group by the row index so that pio_split receives one sentence at a time
sentence_rows = df_metadata.groupby(df_metadata.index, group_keys=False)
pio_output = sentence_rows.apply(pio_split).reset_index(drop=True)
pio_output

In [None]:
# Parse the join of df_sentences and df_metadata into something that includes all of the annotations IDed
# Outer join keeps sentences without any split as well; both sides share the key column names
df_sentences.merge(pio_output, how='outer', on=join_headers)

In [None]:
# Persist every intermediate frame under temp/.
# to_pickle does not create missing parent directories, so make sure the
# output directory exists first (`os` is imported in an earlier cell).
os.makedirs('temp', exist_ok=True)
df.to_pickle('temp/ebm-df.pkl')
df_sentences.to_pickle('temp/ebm-df_sentences.pkl')
df_metadata.to_pickle('temp/ebm-df_metadata.pkl')
pio_output.to_pickle('temp/ebm-pio_output.pkl')