In [None]:
import pandas as pd
# Display every row of a DataFrame instead of truncating long frames in cell output
pd.set_option("display.max_rows", None)

In [None]:
# Do one-time large imports

# For sentence tokenization
from nltk import tokenize

# For coreference resolution
from allennlp.predictors.predictor import Predictor
import allennlp_models.coref
coref_predictor = Predictor.from_path(
    "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
)

# For part-of-speech tagging
import nltk
# tokenize.sent_tokenize / nltk.word_tokenize (used further down) load the Punkt tokenizer
# data, so fetch it here as well; without it a fresh environment fails on the first call.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# For dependency parsing
from allennlp.predictors.predictor import Predictor
import allennlp_models.structured_prediction
dependency_predictor = Predictor.from_path(
    "https://storage.googleapis.com/allennlp-public-models/biaffine-dependency-parser-ptb-2020.04.06.tar.gz"
)


In [None]:
import os.path

### Basic input flags for the notebook / pipeline

# Identify the term we are splitting on (the "anchor")
# This also serves as the name for this entire cluster of results / practical pipeline run name
# (it is interpolated into the data/nlp-align_<search_word>.csv and outputs/<search_word>/ paths)
search_word = 'BERT'

# Other common names for this term that we should also consider as anchors
# e.g. [search_word, 'GPT', 'GPT-2', 'GPT-3']
# e.g. [search_word, 'Transformers', 'Transformer', 'transfer learning', 'transfer']
anchor_synonyms = [search_word]

# Flags
# Coreference resolution is re-run when the leading literal is switched to True, or
# automatically when no cached partial-coreference.pkl exists yet for this search word
flag_rerun_coreference = False or (not os.path.isfile(f'outputs/{search_word}/partial-coreference.pkl'))

In [None]:
import csv
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Read in the dataframe containing entire paper abstracts (NOT pre-split into sentences)
# The cells below read the 'ID', 'URL', 'Title' and 'Abstract' columns of this file
df = pd.read_csv(f'data/nlp-align_{search_word}.csv')

# Create the outputs directory for this search word
# (parents=True / exist_ok=True make this safe to re-run)
Path(f"outputs/{search_word}").mkdir(parents=True, exist_ok=True)

# Split apart the 'Title' and 'Abstract' columns, add period to 'Title' if not present
def separate_title_abstract(group):
    """Turn one paper's record into two rows: one for its title, one for its abstract.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows that share one 'ID' (as handed over by ``groupby('ID').apply``).
        Only the first row is used; it must provide 'URL', 'ID', 'Title' and 'Abstract'.

    Returns
    -------
    pandas.DataFrame
        Two rows with the columns 'URL', 'ID', 'Type' and 'Text'. 'Type' is 'Title' for
        the first row and 'Abstract' for the second; a '.' is appended to the title when
        it does not already end with one.
    """
    # groupby keeps the original index labels on every group, so take the first row by
    # position: a label lookup (.loc[0]) only works for the group that owns label 0.
    row = group.iloc[0]
    title = row['Title']
    if not title.endswith('.'):
        title = title + '.'
    return pd.DataFrame({
        'URL': [row['URL']] * 2,
        'ID': [row['ID']] * 2,
        'Type': ['Title', 'Abstract'],
        'Text': [title, row['Abstract']]
    })

# Restructure the dataframe to be more usable...
# One group per paper ID; each group is replaced by a 'Title' row and an 'Abstract' row
df = df.groupby('ID', group_keys=False).apply(
    lambda row: separate_title_abstract(row)
).reset_index(drop=True)

df

In [None]:
# Split the full-abstract CSV into a CSV containing individual sentences instead
def sentence_tokenize(group):
    """Explode one title/abstract record into one row per sentence.

    Only the first row of ``group`` is used. Its 'Text' is split with
    ``tokenize.sent_tokenize``; 'URL', 'ID' and 'Type' are repeated for every sentence and
    'Index' numbers the sentences from 0 within that text.
    """
    first = group.reset_index(drop=True).loc[0]
    sentence_list = tokenize.sent_tokenize(first['Text'])
    count = len(sentence_list)
    # Same column order as before: URL, ID, Type, Index, Text
    columns = {key: [first[key]] * count for key in ('URL', 'ID', 'Type')}
    columns['Index'] = list(range(count))
    columns['Text'] = sentence_list
    return pd.DataFrame(columns)

# One group per (paper ID, Title/Abstract) text; each group expands into its sentences
df_sentences = df.groupby(['ID', 'Type'], group_keys=False).apply(
    lambda row: sentence_tokenize(row)
).reset_index(drop=True)

df_sentences

In [None]:
# # Create a test dataframe so we can run models without taking impractically long
# # TODO: this is causing some type inconsistencies, fix those?

# temp_df = pd.DataFrame.from_dict(
#     {'URL': 'abc', 
#      'ID': '0', 
#      'Title': 'Paper Title',
#      'Abstract': 'The BERT language model (LM) (Devlin et al., 2019) is surprisingly good at answering cloze-style questions about relational facts. Petroni et al. (2019) take this as evidence that it memorizes factual knowledge during pre-training. We take issue with this interpretation and argue that the performance of BERT is partly due to reasoning about (the surface form of) entity names, e.g., guessing that a person with an Italian-sounding name speaks Italian.'
#     }
# )

# Splitting functions

Assume the input dataframe has, among its columns, one titled `Text` that holds a single sentence of the title or abstract per row.

In [None]:
# split_0 is the text literally preceding the anchor
# split_1 is the text that the anchor consists of
# split_2 is the text literally following the anchor
# split_tokens is the list of tokens that split identifies
# split_anchor_span is a tuple (anchor_first_token_index, index_just_past_anchor_last_token)
# into split_tokens, or None if there is no anchor
splitting_headers = ['split_0','split_1','split_2', 'split_tokens', 'split_anchor_span']
# Include ID, Type, Index in the split output to be able to join with df_sentences
join_headers = ['ID', 'Type', 'Index']
# The headers used for checking if rows should be eliminated as duplicate
# (only the three text columns plus the join keys; the token list and span are not compared)
duplicate_check_headers = splitting_headers[:3]+join_headers

In [None]:
import re

# Split on the search word wherever it occurs, even inside a longer word (e.g. with anchor A, the word CAR is split into C / A / R)
# Splits on ALL instances of the search word
def split_term_literal(group, search_word):
    """Split one sentence around every literal occurrence of an anchor.

    Parameters
    ----------
    group : pandas.DataFrame
        A single-sentence group; only its first row is used. The row must provide 'Text'
        and the ``join_headers`` columns.
    search_word : str
        Unused; the anchors come from the module-level ``anchor_synonyms``.

    Returns
    -------
    pandas.DataFrame
        One row per anchor occurrence with the ``splitting_headers`` and ``join_headers``
        columns, de-duplicated on ``duplicate_check_headers``. When no anchor occurs, a
        single row with the whole sentence in 'split_0', empty 'split_1' / 'split_2' and
        a 'split_anchor_span' of None.

    Notes
    -----
    For each synonym the first case-insensitive match fixes the exact text to split on,
    so occurrences written with a different casing than that first match are not split.
    """
    row = group.iloc[0]
    text = row['Text']
    # Synonyms are literal names, so escape them: characters such as '+', '(' or '.'
    # must not be read as regular-expression syntax. Empty synonyms are skipped because
    # str.split('') raises.
    found = [re.search(re.escape(a), text, flags=re.IGNORECASE) for a in anchor_synonyms if a]
    anchors = [m.group(0) for m in found if m is not None]
    output = []
    for anchor in anchors:
        splits = text.split(anchor)
        for i in range(len(splits) - 1):
            # Re-join around the i-th occurrence: everything before it / it / everything after
            pieces = [anchor.join(splits[:i+1]), anchor, anchor.join(splits[i+1:])]
            output_i = [piece.strip() for piece in pieces]
            # if the string is empty don't include it in the tokens list
            pre_split = output_i[0].split(' ') if output_i[0] != '' else []
            mid_split = anchor.split(' ')
            post_split = output_i[2].split(' ') if output_i[2] != '' else []
            output_i.append(pre_split + mid_split + post_split)
            # End-exclusive token span of the anchor inside the token list
            output_i.append((len(pre_split), len(pre_split)+len(mid_split)))
            output_i += list(row[join_headers])
            output.append(output_i)
    if output == []:
        output = [[text.strip(),'','',text.strip().split(' '),None]+list(row[join_headers])]
    # Transpose the output format so we can use it in zip for dataframe generation
    output_t = [list(t) for t in zip(*output)]
    return pd.DataFrame(
        dict(zip(splitting_headers+join_headers,output_t))
    ).drop_duplicates(duplicate_check_headers)

# Grouping on the index hands every sentence row to the splitter as its own one-row group
literal_output = df_sentences.groupby(df_sentences.index, group_keys=False).apply(
    lambda group: split_term_literal(group, search_word)).reset_index(drop=True)

literal_output

In [None]:
import re

# Split on the search word, taking care to only split on whitespace
# Splits on ALL instances of the search word
def split_term_whitespace(group, search_word):
    """Split one sentence around whole words that contain an anchor.

    Parameters
    ----------
    group : pandas.DataFrame
        A single-sentence group; only its first row is used. The row must provide 'Text'
        and the ``join_headers`` columns.
    search_word : str
        Unused; the anchors come from the module-level ``anchor_synonyms``.

    Returns
    -------
    pandas.DataFrame
        One row per occurrence of the matched word with the ``splitting_headers`` and
        ``join_headers`` columns, de-duplicated on ``duplicate_check_headers``. When no
        anchor occurs, a single row with the whole sentence in 'split_0', empty
        'split_1' / 'split_2' and a 'split_anchor_span' of None.

    Notes
    -----
    For each synonym only the first word containing it (case-insensitively) is located;
    the sentence is then split on every exact occurrence of that word's text.
    """
    row = group.iloc[0]
    text = row['Text']
    # Synonyms are literal names, so escape them before embedding them in the pattern;
    # the surrounding \w* widens the match to the whole word that contains the synonym.
    # Empty synonyms are skipped.
    found = [
        re.search(rf'(^|\W)(\w*{re.escape(a)}\w*)($|\W)', text, flags=re.IGNORECASE)
        for a in anchor_synonyms if a
    ]
    anchors = [m.group(2) for m in found if m is not None]
    output = []
    for anchor in anchors:
        splits = text.split(anchor)
        for i in range(len(splits) - 1):
            # Re-join around the i-th occurrence: everything before it / it / everything after
            pieces = [anchor.join(splits[:i+1]), anchor, anchor.join(splits[i+1:])]
            output_i = [piece.strip() for piece in pieces]
            # if the string is empty don't include it in the tokens list
            pre_split = output_i[0].split(' ') if output_i[0] != '' else []
            mid_split = anchor.split(' ')
            post_split = output_i[2].split(' ') if output_i[2] != '' else []
            output_i.append(pre_split + mid_split + post_split)
            # End-exclusive token span of the anchor inside the token list
            output_i.append((len(pre_split), len(pre_split)+len(mid_split)))
            output_i += list(row[join_headers])
            output.append(output_i)
    if output == []:
        output = [[text.strip(),'','',text.strip().split(' '),None]+list(row[join_headers])]
    # Transpose the output format so we can use it in zip for dataframe generation
    output_t = [list(t) for t in zip(*output)]
    return pd.DataFrame(
        dict(zip(splitting_headers+join_headers,output_t))
    ).drop_duplicates(duplicate_check_headers)

# Grouping on the index hands every sentence row to the splitter as its own one-row group
whitespace_output = df_sentences.groupby(df_sentences.index, group_keys=False).apply(
    lambda group: split_term_whitespace(group, search_word)).reset_index(drop=True)

whitespace_output

In [None]:
# Run coreference resolution over the entire abstract, not individual sentences
# result_type='expand' spreads each prediction's entries into columns; the cells below
# read the 'document' and 'clusters' columns from the joined frame.
# When the flag is False this cell defines nothing and df_merged is loaded from the cache later.
if flag_rerun_coreference:
    output = df.apply(
        lambda row: coref_predictor.predict(row['Text']), axis=1, result_type='expand')
    df_merged = df.join(output)

In [None]:
import re

# transform the output of coreference resolution into something that is more easily manipulated
# split it across multiple sentences so each indiv sentence row can still work
def reinterpret_coref_clusters(row, search_word, sentences):
    """Map document-level coreference output onto the individual sentences.

    Parameters
    ----------
    row : mapping (e.g. pandas.Series)
        Must provide 'ID', 'Type', 'document' (the list of tokens) and 'clusters'
        (a list of clusters, each a list of inclusive ``[first_token, last_token]`` spans).
    search_word : str
        Unused; the anchors come from the module-level ``anchor_synonyms``.
    sentences : pandas.DataFrame
        Sentence table with 'ID', 'Type' and 'Text' columns. The rows matching ``row``
        are consumed positionally, so they are assumed to be in sentence order.

    Returns
    -------
    dict
        'sent_mapping'    : token index -> sentence number
        'sent_content'    : sentence number -> list of token indexes in that sentence
        'doct_mapping'    : token index -> offset from the first token of its sentence
        'selcluster_idxs' : sorted indexes of the clusters in which at least one mention
                            contains an anchor synonym (case-insensitive)

    Raises
    ------
    IndexError, ValueError
        When a token cannot be found in the current or the following sentence, i.e. the
        tokens and the sentence texts are out of step.
    """
    document = row['document']
    same_text = (sentences['ID'] == row['ID']) & (sentences['Type'] == row['Type'])
    src = sentences.loc[same_text, 'Text']
    curr_sentence = 0
    consumed = 0                # characters of the current sentence already matched
    sentence_first_token = 0    # token index at which the current sentence started
    sent_mapping = {}
    sent_content = {}
    doct_mapping = {}
    for i, token in enumerate(document):
        if token.strip() != '':
            remaining = src.iloc[curr_sentence][consumed:]
            if token not in remaining:
                # The token is not in what is left of this sentence: move to the next one
                sentence_first_token = i
                curr_sentence += 1
                consumed = 0
                remaining = src.iloc[curr_sentence]
            consumed += remaining.index(token) + len(token)
        # Whitespace-only tokens are attributed to the current sentence without consuming
        # text. setdefault also covers a whitespace token that opens the document.
        sent_mapping[i] = curr_sentence
        sent_content.setdefault(curr_sentence, []).append(i)
        doct_mapping[i] = i - sentence_first_token
    # Select the clusters that contain search words
    selcluster_idxs = set()
    for cluster_idx, cluster in enumerate(row['clusters']):
        for c in cluster:
            # Tokens are concatenated without separators before matching
            mention = ''.join(document[c[0]:c[1]+1])
            if any(re.search(re.escape(anchor), mention, flags=re.IGNORECASE)
                   for anchor in anchor_synonyms):
                selcluster_idxs.add(cluster_idx)
                break
    # Build the output row
    return {
        'sent_mapping': sent_mapping,
        'sent_content': sent_content,
        'doct_mapping': doct_mapping,
        'selcluster_idxs': sorted(selcluster_idxs),
    }

# Only when coreference was (re)computed above: add the sentence mappings and cache the result
if flag_rerun_coreference:
    output = df_merged.apply(
        lambda row: reinterpret_coref_clusters(row, search_word, df_sentences), 
        axis=1, result_type='expand')
    df_merged = df_merged.join(output)
    
    # Cache the expensive result so later runs can skip coreference resolution
    df_merged.to_pickle(f'outputs/{search_word}/partial-coreference.pkl')

In [None]:
# Always continue from the cached copy, whether it was just written or comes from an earlier run
# NOTE: unpickling can execute arbitrary code; only load cache files produced by this notebook
df_merged = pd.read_pickle(f'outputs/{search_word}/partial-coreference.pkl')
df_merged

In [None]:
# Split based on co-references to any phrase containing search term, using allennlp coreference resolution
# This does NOT preserve the original sentence spacing
# REQUIRES THAT WE ALREADY RAN THE COREFERENCE PREDICTOR - this func does NOT do all of the work!
# Splits on ALL instances of references to the search word
def split_term_coreference(group, search_word, lookup, fallback):
    """Split one sentence around every coreference mention of the anchor.

    Parameters
    ----------
    group : pandas.DataFrame
        A single-sentence group; only its first row is used. The row must provide the
        ``join_headers`` columns ('Index' is the sentence number within its text).
    search_word : str
        Passed through to ``fallback`` only.
    lookup : pandas.DataFrame
        Coreference results with 'ID', 'Type', 'document', 'clusters', 'sent_mapping',
        'sent_content' and 'selcluster_idxs' columns (one row per title/abstract).
    fallback : callable
        ``fallback(group, search_word)`` is returned when the text has no anchor cluster
        or when no mention lies inside this sentence.

    Returns
    -------
    pandas.DataFrame
        One row per mention with the ``splitting_headers`` and ``join_headers`` columns.
        The text columns are the coreference tokens joined with single spaces, so the
        original spacing of the sentence is not preserved.

    Raises
    ------
    IndexError
        When ``lookup`` has no row for this sentence's 'ID' / 'Type'.
    """
    row = group.iloc[0]
    same_text = (lookup['ID'] == row['ID']) & (lookup['Type'] == row['Type'])
    lookup_row = lookup.loc[same_text].to_dict(orient='records')[0]
    if len(lookup_row['selcluster_idxs']) == 0:
        # if we didn't identify any clusters that match the search term, use our fallback method
        return fallback(group, search_word)
    document = lookup_row['document']
    sent_mapping = lookup_row['sent_mapping']
    sentence_index = row['Index']
    output = []
    for cluster_id in lookup_row['selcluster_idxs']:
        for c in lookup_row['clusters'][cluster_id]:
            # Both ends of the mention must lie in this sentence. Checking only the start
            # would accept mentions that run into a later sentence and yield an anchor
            # that extends past the sentence's tokens.
            if sent_mapping[c[0]] == sent_mapping[c[1]] == sentence_index:
                sentence_tokens = lookup_row['sent_content'][sentence_index]
                sentence_start = sentence_tokens[0]
                sentence_end = sentence_tokens[-1]
                pre_split = document[sentence_start:c[0]]
                anchor = document[c[0]:c[1]+1]
                post_split = document[c[1]+1:sentence_end+1]
                output_i = [' '.join(pre_split),
                            ' '.join(anchor),
                            ' '.join(post_split)]
                output_i.append(document[sentence_start:sentence_end+1])
                # End-exclusive token span of the anchor inside the sentence's tokens
                output_i.append((len(pre_split), len(pre_split)+len(anchor)))
                output_i += list(row[join_headers])
                output.append(output_i)
    if output == []:
        # if there wasn't any reference in the sentence found, use our fallback method
        return fallback(group, search_word)
    # Transpose the output format so we can use it in zip for dataframe generation
    output_t = [list(t) for t in zip(*output)]
    return pd.DataFrame(dict(zip(splitting_headers+join_headers,output_t)))

# Split every sentence on coreference mentions; sentences without one fall back to the
# whitespace split
coreference_output = df_sentences.groupby(df_sentences.index, group_keys=False).apply(
    lambda group: split_term_coreference(group, search_word, df_merged, split_term_whitespace)
).reset_index(drop=True)

coreference_output

# Grouping functions

Assume we have an input dataframe with column headers `['split_0','split_1','split_2', 'split_tokens', 'split_anchor_span']`

`'split_1'` is the column that contains our search term / anchor point

`'split_0'` and `'split_2'` are the columns that contain text before and after the search terms respectively

In [None]:
# group is the text uniquely identifying a group
# group2 / group3 hold the additional values some grouping functions return; each function
# zips its values against this list, so functions returning fewer values fill fewer columns
grouping_headers = ['group', 'group2', 'group3']

In [None]:
# Attach the whitespace split to the sentence table, joining on the ID / Type / Index keys
sample_input = df_sentences.merge(whitespace_output,
                                  how='outer',
                                  left_on=join_headers,
                                  right_on=join_headers)

In [None]:
# Group on the literal first word that comes after the anchor point
# skips things that are punctuation
def group_first_word(row):
    """Group on the first word-like token that follows the anchor.

    Parameters
    ----------
    row : mapping (e.g. pandas.Series)
        Must provide 'split_tokens' (list of str) and 'split_anchor_span', an
        end-exclusive ``(start, end)`` token span or None when there is no anchor.

    Returns
    -------
    dict
        ``{'group': word}`` (keyed by the first entry of ``grouping_headers``). ``word``
        is the first token after the anchor that, once leading/trailing ``.,():-`` are
        removed, starts and ends with a word character; '' when there is no anchor or no
        such token.
    """
    first_word = ''
    span = row['split_anchor_span']
    if span is not None:
        for token in row['split_tokens'][span[1]:]:
            # Accepts the same tokens and captures the same text as the earlier
            # `\w+(.+\w+)*` form, but without nested quantifiers, which backtrack
            # exponentially on long tokens that do not match.
            stripped = re.search(r'^[.,():-]*(\w(?:.*\w)?)[.,():-]*$', token)
            if stripped is not None:
                first_word = stripped.group(1)
                break
    return dict(zip(grouping_headers, [first_word]))

# Compute the grouping for every split row
output = sample_input.apply(
    lambda row: group_first_word(row), 
    axis=1, result_type='expand').sort_values(by=['group'])

# join aligns on the index, so the sort order above does not carry over to the joined frame
output = sample_input.join(output)
output

In [None]:
# Group on the first verb that comes after the anchor point, using NLTK part-of-speech tagging
def group_first_verb(row):
    """Group on the first verb that follows the anchor.

    The three split parts are tokenized separately and tagged together with NLTK; the
    first token after the anchor whose tag starts with 'V' is returned under the first
    grouping header, or '' when there is none.
    """
    before = nltk.word_tokenize(row['split_0'])
    anchor = nltk.word_tokenize(row['split_1'])
    after = nltk.word_tokenize(row['split_2'])
    # Tag the whole sentence at once so the tagger sees the full context
    tagged = nltk.pos_tag(before + anchor + after)
    after_anchor = tagged[len(before) + len(anchor):]
    first_verb = next((word for word, tag in after_anchor if tag.startswith('V')), '')
    return dict(zip(grouping_headers, [first_verb]))

# Compute the grouping for every split row
output = sample_input.apply(
    lambda row: group_first_verb(row), 
    axis=1, result_type='expand').sort_values(by=['group'])

# join aligns on the index, so the sort order above does not carry over to the joined frame
output = sample_input.join(output)
output

In [None]:
# Do dependency parsing once for the entire sample_input to save processing time
# for groupings that require dependency parsing later
def parse_dependency(row):
    """Run the dependency predictor on one split row.

    The row's 'split_tokens' are joined with single spaces (and stripped) to form the
    sentence; the prediction is returned under the key 'dependency_parse'.
    """
    sentence_text = ' '.join(row['split_tokens']).strip()
    parse = dependency_predictor.predict(sentence=sentence_text)
    return {'dependency_parse': parse}

# One dependency parse per split row, stored in a single 'dependency_parse' column
sample_input_dep = sample_input.apply(
    lambda row: parse_dependency(row), 
    axis=1, result_type='expand')

# Show the parse of the first row as a sanity check
sample_input_dep.iloc[0][0]

In [None]:
# Group on the main verb in the sentence, using allennlp dependency parsing (based on demo code)
def group_main_verb(row):
    """Group on the word at the root of the row's dependency tree."""
    root_word = row['dependency_parse']['hierplane_tree']['root']['word']
    return dict(zip(grouping_headers, [root_word]))

# The grouping needs the parse, so apply it to the rows joined with their dependency parses
output = sample_input.join(sample_input_dep).apply(
    lambda row: group_main_verb(row), 
    axis=1, result_type='expand').sort_values(by=['group'])

# join aligns on the index, so the sort order above does not carry over to the joined frame
output = sample_input.join(output)
output

In [None]:
# Overlaps two strings, filling in any whitespace characters in s1 with the non-whitespace char in s2
# If len(s2) > len(s1), the extra is added to the end
def string_union(s1, s2):
    """Overlay ``s2`` onto ``s1``.

    Wherever ``s1`` has a whitespace character, the character of ``s2`` at the same
    position is used instead; all other characters of ``s1`` are kept. Whatever extends
    beyond the shorter string is appended unchanged.
    """
    overlap = [b if a.isspace() else a for a, b in zip(s1, s2)]
    # At most one of the two tails is non-empty
    return ''.join(overlap) + s1[len(s2):] + s2[len(s1):]

In [None]:
# Helper function
# Returns the entire phrase that comprises a dependency tree node and its children
def unroll_dependency_node(node, allowed_links=None, allowed_types=None):
    """Render a dependency node and its (filtered) descendants as one padded string.

    Each word is placed at the character offset given by its first span's 'start', so
    the result keeps the words at their original positions with spaces in between.
    A child is only followed when its 'link' is in ``allowed_links`` and its first
    attribute is in ``allowed_types``; a filter left as None accepts everything.
    """
    phrase = ' ' * node['spans'][0]['start'] + node['word']
    for child in node.get('children', []):
        if allowed_links is not None and child['link'] not in allowed_links:
            continue
        if allowed_types is not None and child['attributes'][0] not in allowed_types:
            continue
        child_phrase = unroll_dependency_node(child,
                                              allowed_links=allowed_links,
                                              allowed_types=allowed_types)
        phrase = string_union(phrase, child_phrase)
    return phrase

# print(unroll_dependency_node(sample_input_dep.iloc[0][0]['hierplane_tree']['root']))

In [None]:
def get_node_from_index(p, i):
    """Return the hierplane tree node that corresponds to token index ``i``.

    ``p['predicted_heads'][k]`` is the 1-based index of token k's head (0 for the root).
    The walk goes from token ``i`` up to the root, recording at each step how many
    earlier tokens share the same head; those ranks are then used as child positions
    when descending from ``p['hierplane_tree']['root']``.
    """
    heads = p['predicted_heads']
    ranks_bottom_up = []
    while heads[i] != 0:
        # Position among the siblings = number of earlier tokens with the same head
        ranks_bottom_up.append(sum(1 for earlier in heads[:i] if earlier == heads[i]))
        i = heads[i] - 1
    node = p['hierplane_tree']['root']
    for rank in reversed(ranks_bottom_up):
        node = node['children'][rank]
    return node

In [None]:
def get_node_children(node, types):
    """List the phrases of ``node``'s direct children whose 'nodeType' is in ``types``.

    Each matching child is unrolled with all of its descendants and its whitespace is
    collapsed to single spaces. A node without children yields an empty list.
    """
    return [
        ' '.join(unroll_dependency_node(child).strip().split())
        for child in node.get('children', [])
        if child['nodeType'] in types
    ]

In [None]:
# Group on the main verb in the sentence, using allennlp dependency parsing (based on demo code)
# Expanded to include subject, object, expand upon verb form
def group_main_verb_expanded(row):
    """Group on the root of the dependency tree, with its auxiliaries, subjects and objects.

    Returns a dict keyed by ``grouping_headers``: the root word together with its
    aux / auxpass / cop children of type VERB, AUX or PART (whitespace collapsed), the
    list of nsubj / nsubjpass child phrases, and the list of dobj child phrases.
    """
    root = row['dependency_parse']['hierplane_tree']['root']
    unrolled = unroll_dependency_node(
        root,
        allowed_links=['aux', 'auxpass', 'cop'],
        allowed_types=['VERB', 'AUX', 'PART']
    )
    verb_phrase = ' '.join(unrolled.strip().split())
    subject_phrases = get_node_children(root, ['nsubj', 'nsubjpass'])
    object_phrases = get_node_children(root, ['dobj'])
    return dict(zip(grouping_headers, [verb_phrase, subject_phrases, object_phrases]))

# The grouping needs the parse, so apply it to the rows joined with their dependency parses
output = sample_input.join(sample_input_dep).apply(
    lambda row: group_main_verb_expanded(row), 
    axis=1, result_type='expand').sort_values(by=['group'])

# join aligns on the index, so the sort order above does not carry over to the joined frame
output = sample_input.join(output)
output

In [None]:
# Helper function for phrase POS
def list_elements_match(list1, list2, i1, i2, size1=None, size2=None):
    """Find where two token lists spell out the same text, ignoring token boundaries.

    The elements are compared by their concatenated text, so one element of one list
    may correspond to several elements of the other (1:some or some:1).

    Parameters
    ----------
    list1, list2 : list of str
        The two token lists.
    i1, i2 : int
        Index in ``list1`` / ``list2`` at which the comparison starts.
    size1, size2 : int, optional
        Number of elements to take from ``list1`` / ``list2``. When only one size is
        given, the text of that many elements must be a prefix of the other list's
        remaining text. When neither is given, the longest common prefix of the two
        remaining texts is used.

    Returns
    -------
    list of tuple or None
        ``[(i1, end1), (i2, end2)]`` with end-exclusive bounds such that the elements
        inside each bound concatenate to the matched text, or None when the texts do
        not match (including an empty common prefix when no size is given). If the
        matched text does not end exactly on an element boundary of a list, that
        list's bound covers one element.
    """
    tail1 = ''.join(list1[i1:])
    tail2 = ''.join(list2[i2:])
    if size1 is not None and size2 is not None:
        # check for exact text match
        phrase1 = ''.join(list1[i1:i1+size1])
        if phrase1 != ''.join(list2[i2:i2+size2]):
            return None
        matchlen = len(phrase1)
    elif size1 is not None:
        # and size2 is none
        phrase1 = ''.join(list1[i1:i1+size1])
        matchlen = len(phrase1)
        if phrase1 != tail2[:matchlen]:
            return None
    elif size2 is not None:
        # and size1 is none
        phrase2 = ''.join(list2[i2:i2+size2])
        matchlen = len(phrase2)
        if phrase2 != tail1[:matchlen]:
            return None
    else:
        # both are none; the match is the longest common prefix of the remaining texts
        matchlen = 0
        limit = min(len(tail1), len(tail2))
        while matchlen < limit and tail1[matchlen] == tail2[matchlen]:
            matchlen += 1
        if matchlen == 0:
            return None
    matchphrase = tail1[:matchlen]

    def element_count(items, start):
        # Smallest number of elements from `start` whose text equals the matched phrase;
        # 1 when the phrase does not end on an element boundary
        for count in range(len(items) - start + 1):
            if ''.join(items[start:start+count]) == matchphrase:
                return count
        return 1

    return [(i1, i1 + element_count(list1, i1)), (i2, i2 + element_count(list2, i2))]

In [None]:
# Group on the verb closest to the anchor point, using allennlp dependency parsing (based on demo code)
# Include SVO information and expanded verbs
def group_anchor_verb(row):
    """Group on the verb that governs the anchor in the dependency tree.

    Parameters
    ----------
    row : mapping (e.g. pandas.Series)
        Must provide 'split_tokens', 'split_anchor_span' (end-exclusive token span or
        None) and 'dependency_parse' (with 'predicted_heads', 'words', 'pos' and
        'hierplane_tree').

    Returns
    -------
    dict
        Keyed by ``grouping_headers``: the verb phrase (verb plus its aux / auxpass /
        cop children), the list of its subject phrases and the list of its direct
        object phrases. ``{'group': ''}`` when the row has no anchor, when the anchor
        cannot be located among the parsed words, or when neither the anchor node nor
        any of its ancestors is tagged VERB or AUX.
    """
    if row['split_anchor_span'] is None:
        return dict(zip(grouping_headers, ['']))
    p = row['dependency_parse']
    # build out a more usable version of the dependency tree with information about tree level!
    # tree_array[n] = (parent, level) so the root node is (0, 0)
    tree_array = [(h, -1) for h in p['predicted_heads']]
    need_connection = [0]
    level = 0
    while len(need_connection) > 0:
        need_connection_update = []
        for i in range(len(tree_array)):
            if tree_array[i][0] in need_connection:
                tree_array[i] = (tree_array[i][0], level)
                need_connection_update.append(i+1)
        need_connection = need_connection_update
        level += 1
    # Figure out what indexes our anchor fits into
    # Assume that the anchor is contiguous text
    # TODO: is this always true?
    anchor_start = row['split_anchor_span'][0]
    anchor_size = row['split_anchor_span'][1] - anchor_start
    prefix_length = len(''.join(row['split_tokens'][:anchor_start]))
    match = None
    for i in range(len(p['words'])):
        # Only try parsed words that start at or after the anchor's character offset
        if len(''.join(p['words'][:i])) >= prefix_length:
            match = list_elements_match(
                p['words'], row['split_tokens'], i, anchor_start,
                size2=anchor_size)
            if match is not None:
                break
    if match is None or match[0][0] >= match[0][1]:
        # The anchor text could not be aligned with the parsed words; report no group
        # instead of failing on the missing match.
        return dict(zip(grouping_headers, ['']))
    # Find the smallest containing dependency node
    matching_nodes = [(i, tree_array[i][0], tree_array[i][1]) for i in range(match[0][0], match[0][1])]
    matching_nodes = list(set(matching_nodes))
    while len(matching_nodes) > 1:
        # Replace the deepest node by its parent until a single common ancestor is left
        matching_nodes.sort(key=lambda x: x[2])
        parent = matching_nodes.pop()[1]
        matching_nodes.append((parent-1, tree_array[parent-1][0], tree_array[parent-1][1]))
        matching_nodes = list(set(matching_nodes))
    # this is the anchor node
    node = matching_nodes[0]
    # Find the closest parent that is a verb
    parent = node[1]
    while p['pos'][node[0]] not in ['VERB', 'AUX'] and parent != 0:
        node = (parent-1, tree_array[parent-1][0], tree_array[parent-1][1])
        parent = node[1]
    if p['pos'][node[0]] not in ['VERB', 'AUX']:
        # we've landed in a root node that isn't a verb, wheeeeee
        return dict(zip(grouping_headers, ['']))
    # node is definitely in the anchor verb (type 'VERB' or 'AUX') now
    # NOTE(review): an earlier step meant to climb from an AUX to its governing VERB was
    # guarded by a condition that could never hold and therefore never ran; it has been
    # dropped, so an AUX found here is still reported as-is. Confirm whether the climb is wanted.
    node = get_node_from_index(row['dependency_parse'], node[0])
    anchor_verb = ' '.join(unroll_dependency_node(
        node,
        allowed_links=['aux', 'auxpass', 'cop'],
        allowed_types=['VERB', 'AUX', 'PART']
    ).strip().split())
    subjects = get_node_children(node, ['nsubj', 'nsubjpass'])
    objects = get_node_children(node, ['dobj'])
    output = [anchor_verb, subjects, objects]
    return dict(zip(grouping_headers, output))

# The grouping needs the parse, so apply it to the rows joined with their dependency parses
output = sample_input.join(sample_input_dep).apply(
    lambda row: group_anchor_verb(row), 
    axis=1, result_type='expand')

# Attach the grouping columns to the split rows (aligned on the index)
output = sample_input.join(output)
output

In [None]:
# Group on the POS of the anchor point, using allennlp dependency parsing (based on demo code)
# I'm defining the "POS of a phrase" as the POS of the lowest node that contains the entire phrase
def group_anchor_pos(row, context=1):
    """Group on the part of speech of the anchor and of its closest ancestors.

    The "POS of a phrase" is taken to be the POS of the lowest dependency node that
    contains the entire phrase.

    Parameters
    ----------
    row : mapping (e.g. pandas.Series)
        Must provide 'split_tokens', 'split_anchor_span' (end-exclusive token span or
        None) and 'dependency_parse' (with 'predicted_heads', 'words' and 'pos').
    context : int, default 1
        Maximum number of nodes to report: the anchor node followed by its parents,
        stopping early at the root.

    Returns
    -------
    dict
        ``{'group': [pos, ...], 'group2': [word, ...]}`` (keyed by ``grouping_headers``),
        listed from the anchor node upwards. ``{'group': ''}`` when the row has no
        anchor or when the anchor cannot be located among the parsed words.
    """
    if row['split_anchor_span'] is None:
        return dict(zip(grouping_headers, ['']))
    p = row['dependency_parse']
    # build out a more usable version of the dependency tree with information about tree level!
    # tree_array[n] = (parent, level) so the root node is (0, 0)
    tree_array = [(h, -1) for h in p['predicted_heads']]
    need_connection = [0]
    level = 0
    while len(need_connection) > 0:
        need_connection_update = []
        for i in range(len(tree_array)):
            if tree_array[i][0] in need_connection:
                tree_array[i] = (tree_array[i][0], level)
                need_connection_update.append(i+1)
        need_connection = need_connection_update
        level += 1
    # Figure out what indexes our anchor fits into
    # Assume that the anchor is contiguous text
    # TODO: is this always true?
    anchor_start = row['split_anchor_span'][0]
    anchor_size = row['split_anchor_span'][1] - anchor_start
    prefix_length = len(''.join(row['split_tokens'][:anchor_start]))
    match = None
    for i in range(len(p['words'])):
        # Only try parsed words that start at or after the anchor's character offset
        if len(''.join(p['words'][:i])) >= prefix_length:
            match = list_elements_match(
                p['words'], row['split_tokens'], i, anchor_start,
                size2=anchor_size)
            if match is not None:
                break
    if match is None or match[0][0] >= match[0][1]:
        # The anchor text could not be aligned with the parsed words; report no group
        # instead of failing on the missing match.
        return dict(zip(grouping_headers, ['']))
    # Find the smallest containing dependency node
    matching_nodes = [(i, tree_array[i][0], tree_array[i][1]) for i in range(match[0][0], match[0][1])]
    matching_nodes = list(set(matching_nodes))
    while len(matching_nodes) > 1:
        # Replace the deepest node by its parent until a single common ancestor is left
        matching_nodes.sort(key=lambda x: x[2])
        parent = matching_nodes.pop()[1]
        matching_nodes.append((parent-1, tree_array[parent-1][0], tree_array[parent-1][1]))
        matching_nodes = list(set(matching_nodes))
    # this is the anchor node
    node = matching_nodes[0]
    # And get the POS and words corresponding to that node and its {context} parents
    labeltiers = []
    labelwords = []
    while len(labeltiers) < context:
        labeltiers.append(p['pos'][node[0]])
        labelwords.append(p['words'][node[0]])
        parent = node[1]
        if parent == 0:
            break
        node = (parent-1, tree_array[parent-1][0], tree_array[parent-1][1])
    return dict(zip(grouping_headers, [labeltiers, labelwords]))

# The grouping needs the parse, so apply it to the rows joined with their dependency parses
# context=3: report the anchor node plus up to two of its ancestors
output = sample_input.join(sample_input_dep).apply(
    lambda row: group_anchor_pos(row, context=3), 
    axis=1, result_type='expand')

# Attach the grouping columns to the split rows (aligned on the index)
output = sample_input.join(output)
output

# Export CSVs

This section can be run on its own, provided that all of the functions above have been defined and the large imports have already been performed.

In [None]:
import os.path
import pickle

def _cached_dependency_parse(splitted_sentences, cache_path, rerun):
    """Return the dependency parses for the split rows, rebuilding the pickle cache when ``rerun`` is true."""
    if rerun:
        splitted_sentences.apply(
            parse_dependency, axis=1, result_type='expand'
        ).to_pickle(cache_path)
    return pd.read_pickle(cache_path)


def _export_groupings(prefix, search_word, splitted_sentences, splitted_sentences_dep):
    """Apply every grouping function to one split and write one CSV per grouping."""
    with_dependencies = splitted_sentences.join(splitted_sentences_dep)
    exports = [
        ('firstword', splitted_sentences, group_first_word),
        ('firstverb', splitted_sentences, group_first_verb),
        ('mainverb', with_dependencies, group_main_verb),
        ('mainverb_svo', with_dependencies, group_main_verb_expanded),
        ('anchorpos', with_dependencies, lambda row: group_anchor_pos(row, context=3)),
        ('anchorverb', with_dependencies, group_anchor_verb),
    ]
    for suffix, frame, grouping in exports:
        output = frame.apply(grouping, axis=1, result_type='expand')
        splitted_sentences.join(output).to_csv(f'outputs/{search_word}/{prefix}_{suffix}.csv')


def pipeline(search_word, 
             anchor_synonyms, 
             try_rerun=False):
    """
    Run the entire analysis pipeline for a given search phrase and set of synonyms.

    Reads ``data/nlp-align_{search_word}.csv``, splits every sentence on the anchors
    (once by whitespace, once by coreference) and writes, for each of the two splits,
    one CSV per grouping function to ``outputs/{search_word}/``.

    Parameters
    ----------
    search_word : str
        Identify the term we are splitting on (the "anchor").
        This also serves as the name for this entire cluster of results / practical pipeline run name.
    anchor_synonyms : [str]
        Other common names for this term that we should also consider as anchors.
    try_rerun : Boolean
        Forcibly recreate intermediate files such as coreference or dependency parses.
        Use this when cached files were produced with a different set of synonyms.

    """
    output_dir = f'outputs/{search_word}'
    coreference_cache = f'{output_dir}/partial-coreference.pkl'
    dependency_ws_cache = f'{output_dir}/partial-dependency-whtsp.pkl'
    dependency_cf_cache = f'{output_dir}/partial-dependency-coref.pkl'

    # Flags
    flag_rerun_coreference = try_rerun or (not os.path.isfile(coreference_cache))
    flag_rerun_dependency_ws = try_rerun or (not os.path.isfile(dependency_ws_cache))
    flag_rerun_dependency_cf = try_rerun or (not os.path.isfile(dependency_cf_cache))

    # The splitting helpers read the module-level `anchor_synonyms` rather than an
    # argument, so the synonyms passed in here would otherwise be ignored. Publish them
    # for the duration of the run and put the previous value back afterwards.
    module_scope = globals()
    had_previous = 'anchor_synonyms' in module_scope
    previous_synonyms = module_scope.get('anchor_synonyms')
    module_scope['anchor_synonyms'] = list(anchor_synonyms)
    try:
        # Perform basic df loading

        # Read in the dataframe containing entire paper abstracts (NOT pre-split into sentences)
        df = pd.read_csv(f'data/nlp-align_{search_word}.csv')

        # Create the outputs directory for this search word
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        # Restructure the dataframe to be more usable
        df = df.groupby('ID', group_keys=False).apply(
            separate_title_abstract
        ).reset_index(drop=True)

        # Split the full-abstract CSV into a CSV containing individual sentences instead
        df_sentences = df.groupby(['ID', 'Type'], group_keys=False).apply(
            sentence_tokenize
        ).reset_index(drop=True)

        # Whitespace-based split data
        whitespace_output = df_sentences.groupby(df_sentences.index, group_keys=False).apply(
            lambda group: split_term_whitespace(group, search_word)).reset_index(drop=True)

        # Coreference-based split data
        if flag_rerun_coreference:
            output = df.apply(
                lambda row: coref_predictor.predict(row['Text']), axis=1, result_type='expand')
            df_merged = df.join(output)

            output = df_merged.apply(
                lambda row: reinterpret_coref_clusters(row, search_word, df_sentences),
                axis=1, result_type='expand')
            df_merged = df_merged.join(output)

            df_merged.to_pickle(coreference_cache)

        df_merged = pd.read_pickle(coreference_cache)
        coreference_output = df_sentences.groupby(df_sentences.index, group_keys=False).apply(
            lambda group: split_term_coreference(group, search_word, df_merged, split_term_whitespace)
        ).reset_index(drop=True)

        # Group and export both splits: whitespace first, then coreference
        runs = [
            ('whitespace', whitespace_output, dependency_ws_cache, flag_rerun_dependency_ws),
            ('coreference', coreference_output, dependency_cf_cache, flag_rerun_dependency_cf),
        ]
        for prefix, split_output, dependency_cache, rerun_dependency in runs:
            splitted_sentences = df_sentences.merge(split_output,
                                                    how='outer',
                                                    left_on=join_headers,
                                                    right_on=join_headers)
            # Retrieve dependencies
            splitted_sentences_dep = _cached_dependency_parse(
                splitted_sentences, dependency_cache, rerun_dependency)
            _export_groupings(prefix, search_word, splitted_sentences, splitted_sentences_dep)
    finally:
        if had_previous:
            module_scope['anchor_synonyms'] = previous_synonyms
        else:
            del module_scope['anchor_synonyms']

In [None]:
# Each entry is (search_word, anchor_synonyms): the search word names the input CSV and the
# output directory, the list holds the anchor names handed to the pipeline
all_targets = [
    ('BERT', ['BERT']),
    ('SQuAD', ['SQuAD']),
    ('DROP', ['DROP']),
    ('GPT-2', ['GPT', 'GPT-2', 'GPT-3']),
    ('Transformers', ['Transformers', 'Transformer', 'transfer learning', 'transfer'])
]

# Run the full pipeline once per target, reusing cached intermediate files (try_rerun=False)
for dataset in all_targets:
    pipeline(dataset[0], dataset[1], try_rerun=False)
    print(dataset[0], 'done running')

# Misc utility functions

In [None]:
from pathlib import Path

import pandas as pd

pd.set_option("display.max_rows", None)

search_word = 'BERT'
anchor_type = 'coreference'


def _load_grouping(grouping, renames, keep_all_columns=False):
    """Read one grouping CSV written by the pipeline and rename its group columns.

    Reads `outputs/{search_word}/{anchor_type}_{grouping}.csv`, using the
    `search_word` and `anchor_type` values set in this cell.

    Parameters
    ----------
    grouping : str
        File-name suffix identifying the grouping (e.g. 'mainverb').
    renames : dict
        Maps the generic column names in the file ('group', 'group2', ...)
        to the names used in the merged table.
    keep_all_columns : bool, default False
        When False only the columns named in `renames` are kept; when True
        every column in the file is kept and only the renaming is applied.

    Returns
    -------
    pandas.DataFrame
        The loaded (and possibly narrowed) frame with renamed columns.
    """
    frame = pd.read_csv(f'outputs/{search_word}/{anchor_type}_{grouping}.csv')
    if not keep_all_columns:
        # Column order follows the insertion order of `renames`.
        frame = frame[list(renames)]
    return frame.rename(columns=renames)


# Restructure the outputs into something that can be neatly imported into a spreadsheet to play with.
# The anchor-verb file is loaded in full: it supplies the shared sentence columns
# (split_*, URL, ID, ...) that are selected from the merged table below.
coreference_anchorverb = _load_grouping(
    'anchorverb',
    {"group": "averb", "group2": "averb-s", "group3": "averb-o"},
    keep_all_columns=True)
coreference_mainverb = _load_grouping('mainverb', {"group": "root"})
coreference_mainverb_svo = _load_grouping(
    'mainverb_svo',
    {"group": "root-full", "group2": "root-s", "group3": "root-o"})
coreference_firstverb = _load_grouping('firstverb', {"group": "fverb"})
coreference_anchorpos = _load_grouping(
    'anchorpos', {"group": "apos", "group2": "apos-w"})
coreference_firstword = _load_grouping('firstword', {"group": "fword"})

# Join on the row index, one frame at a time; the renamed group columns do
# not overlap, so no suffixes are needed.
outputs_merged = coreference_anchorverb
for grouping_frame in (
    coreference_mainverb,
    coreference_mainverb_svo,
    coreference_firstverb,
    coreference_anchorpos,
    coreference_firstword,
):
    outputs_merged = outputs_merged.join(grouping_frame)

# Fix the column order for the spreadsheet: splits first, then every
# grouping, then the sentence metadata.
outputs_merged = outputs_merged[[
    'split_0', 'split_1', 'split_2',
    'averb', 'averb-s', 'averb-o',
    'root', 'root-full', 'root-s', 'root-o',
    'fverb', 'fword', 'apos', 'apos-w',
    'URL', 'ID', 'Type', 'Index', 'Text', 'split_tokens', 'split_anchor_span'
]]

# Make sure the destination folder exists; to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
Path('temp').mkdir(parents=True, exist_ok=True)
outputs_merged.to_csv(f'temp/{search_word}_{anchor_type}.csv')
outputs_merged

In [None]:
# Scratch check (left disabled): run the dependency parser on a sample
# sentence and display the raw prediction it returns.
# p = dependency_predictor.predict(
#     sentence='the quick red fox jumped over the lazy dog.'
# )
# p