In [None]:
import pandas as pd
# Show every row when a DataFrame is displayed instead of truncating long frames
pd.set_option("display.max_rows", None)

In [None]:
# Do one-time large imports
# (both AllenNLP predictors below are built from a remote model archive URL)

# For sentence tokenization
from nltk import tokenize

# For coreference resolution
from allennlp.predictors.predictor import Predictor
import allennlp_models.coref
coref_predictor = Predictor.from_path(
    "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
)

# For part-of-speech tagging
# NOTE(review): only the POS-tagger data is downloaded here; the sentence/word
# tokenizers used in later cells presumably need NLTK tokenizer data as well --
# confirm it is already installed in the target environment.
import nltk
nltk.download('averaged_perceptron_tagger')

# For dependency parsing
# (Predictor was already imported above; this re-import is redundant but harmless)
from allennlp.predictors.predictor import Predictor
import allennlp_models.structured_prediction
dependency_predictor = Predictor.from_path(
    "https://storage.googleapis.com/allennlp-public-models/biaffine-dependency-parser-ptb-2020.04.06.tar.gz"
)


In [None]:
### Basic input flags for the notebook / pipeline

# Flag for whether to generate test outputs
# (the `if flag_debug:` sections in later cells only run when this is True)
flag_debug = False

# Identify the term we are splitting on
# (also used to build the input CSV path and the outputs/ sub-directory name)
search_word = "DROP"

# TODO: what if there are other common names for this term that are unlikely to be coref-matched?
# e.g. "GPT-2" vs "GPT" vs "GPT-3" etc

In [None]:
import csv
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Read in the dataframe containing entire paper abstracts (NOT pre-split into sentences)
# The following cells read its 'URL', 'ID', 'Title' and 'Abstract' columns.
df = pd.read_csv(f'data/nlp-align_{search_word}.csv')

# Create the outputs directory for this search word
# (parents=True / exist_ok=True make this safe to re-run)
Path(f"outputs/{search_word}").mkdir(parents=True, exist_ok=True)

# Split apart the 'Title' and 'Abstract' columns
def separate_title_abstract(group):
    """Turn one paper's record into two rows: one for its title, one for its abstract.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows sharing one 'ID' (as passed by ``groupby('ID').apply``). Only
        the first row is used; it must provide the 'URL', 'ID', 'Title' and
        'Abstract' columns.

    Returns
    -------
    pandas.DataFrame
        Two rows with columns 'URL', 'ID', 'Type' and 'Text', where 'Type' is
        'Title' / 'Abstract' and 'Text' holds the corresponding text.
    """
    # Groups keep the index labels of the original frame, so take the first row
    # by position; a label lookup (.loc[0]) only succeeds for the single group
    # that happens to contain label 0.
    row = group.iloc[0]
    return pd.DataFrame({
        'URL': [row['URL']] * 2,
        'ID': [row['ID']] * 2,
        'Type': ['Title', 'Abstract'],
        'Text': [row['Title'], row['Abstract']]
    })

# Restructure the dataframe to be more usable: each paper ID contributes one
# 'Title' row and one 'Abstract' row, and the result is renumbered from 0
df = (
    df.groupby('ID', group_keys=False)
    .apply(separate_title_abstract)
    .reset_index(drop=True)
)

df

In [None]:
# Split the full-abstract CSV into a CSV containing individual sentences instead
def sentence_tokenize(group):
    """Explode the text of one (ID, Type) group into one row per sentence.

    Only the first row of ``group`` is read. The returned frame repeats that
    row's 'URL', 'ID' and 'Type' for every sentence, numbers the sentences from
    0 in 'Index' and stores each sentence in 'Text'.
    """
    record = group.reset_index(drop=True).loc[0]
    sentences = tokenize.sent_tokenize(record['Text'])
    count = len(sentences)

    def repeated(column):
        # Same value once per sentence
        return [record[column] for _ in range(count)]

    return pd.DataFrame({
        'URL': repeated('URL'),
        'ID': repeated('ID'),
        'Type': repeated('Type'),
        'Index': list(range(count)),
        'Text': sentences
    })

# One row per sentence for every (paper ID, Title/Abstract) text, renumbered from 0
df_sentences = (
    df.groupby(['ID', 'Type'], group_keys=False)
    .apply(sentence_tokenize)
    .reset_index(drop=True)
)

df_sentences

In [None]:
# # Create a test dataframe so we can run models without taking impractically long
# # TODO: this is causing some type inconsistencies, fix those?

# temp_df = pd.DataFrame.from_dict(
#     {'URL': 'abc', 
#      'ID': '0', 
#      'Title': 'Paper Title',
#      'Abstract': 'The BERT language model (LM) (Devlin et al., 2019) is surprisingly good at answering cloze-style questions about relational facts. Petroni et al. (2019) take this as evidence that it memorizes factual knowledge during pre-training. We take issue with this interpretation and argue that the performance of BERT is partly due to reasoning about (the surface form of) entity names, e.g., guessing that a person with an Italian-sounding name speaks Italian.'
#     }
# )

# Splitting functions

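Assume we have an input dataframe with some number of columns, one of which is titled `Text` and contains one sentence of the title or abstract per row. The splitting functions also copy the `ID`, `Type` and `Index` columns into their output so that it can be joined back onto the sentence dataframe.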

In [None]:
# split_0 is the text literally preceding the anchor
# split_1 is the text that the anchor consists of
# split_2 is the text literally following the anchor
# split_tokens is the list of tokens that split identifies
# split_anchor_span is a tuple (anchor_first_token_index, index_just_past_the_anchor's_last_token)
#   into split_tokens -- i.e. the end index is exclusive -- or None if there is no anchor
splitting_headers = ['split_0','split_1','split_2', 'split_tokens', 'split_anchor_span']
# Include ID, Type, Index in the split output to be able to join with df_sentences
join_headers = ['ID', 'Type', 'Index']

In [None]:
# Split on the search word, regardless of whitespace (if search word is A and we have word CAR, it slices it up)
# Only splits on the first instance
def split_term_literal_firstonly(row, search_word):
    """Split a sentence around the first literal occurrence of ``search_word``.

    The match is a plain substring match, so it may cut through a longer word.
    Returns a dict keyed by ``splitting_headers + join_headers``: the stripped
    text before / of / after the anchor, the space-separated token list, the
    anchor's (start, end-exclusive) token span, and the row's join columns.
    When the term is absent the whole sentence goes into 'split_0' and the span
    is None.
    """
    text = row['Text']
    if search_word not in text:
        whole = text.strip()
        fields = [whole, '', '', whole.split(' '), None]
    else:
        before, _, after = text.partition(search_word)
        before = before.strip()
        anchor = search_word.strip()
        after = after.strip()
        # An empty side contributes no tokens at all (not a single '' token)
        lead = before.split(' ') if before != '' else []
        trail = after.split(' ') if after != '' else []
        fields = [
            before,
            anchor,
            after,
            lead + [anchor] + trail,
            (len(lead), len(lead) + 1),
        ]
    return dict(zip(splitting_headers + join_headers, fields + list(row[join_headers])))

if flag_debug:
    literal_firstonly_output = df_sentences.apply(
        lambda row: split_term_literal_firstonly(row, search_word),
        axis=1, result_type='expand')

    # Jupyter only auto-renders the last *top-level* expression of a cell, so a
    # bare name inside this `if` would show nothing; display it explicitly.
    display(literal_firstonly_output)

In [None]:
# Split on the search word, regardless of whitespace (if search word is A and we have word CAR, it slices it up)
# Splits on ALL instances of the search word
def split_term_literal(group, search_word):
    """Split a sentence around EVERY literal occurrence of ``search_word``.

    Parameters
    ----------
    group : pandas.DataFrame
        A group whose first row provides 'Text' and the ``join_headers`` columns.
    search_word : str
        Term to split on (plain substring match; may cut through a longer word).

    Returns
    -------
    pandas.DataFrame
        One row per occurrence with columns ``splitting_headers + join_headers``.
        If the term does not occur, a single row with the whole sentence in
        'split_0' and a None anchor span.
    """
    row = group.iloc[0]
    text = row['Text']
    keys = list(row[join_headers])
    # Use the stripped term everywhere so 'split_1' and the token list agree
    # (and match split_term_literal_firstonly) even if search_word is padded.
    anchor = search_word.strip()
    records = []
    if search_word in text:
        pieces = text.split(search_word)
        for cut in range(1, len(pieces)):
            # Re-join the pieces on either side of this occurrence
            before = search_word.join(pieces[:cut]).strip()
            after = search_word.join(pieces[cut:]).strip()
            # An empty side contributes no tokens at all (not a single '' token)
            pre_split = before.split(' ') if before != '' else []
            post_split = after.split(' ') if after != '' else []
            records.append([
                before,
                anchor,
                after,
                pre_split + [anchor] + post_split,
                (len(pre_split), len(pre_split) + 1),
            ] + keys)
    else:
        whole = text.strip()
        records.append([whole, '', '', whole.split(' '), None] + keys)
    # Transpose rows -> columns so each header maps to its list of values
    columns = [list(column) for column in zip(*records)]
    return pd.DataFrame(dict(zip(splitting_headers + join_headers, columns)))

if flag_debug:
    literal_output = df_sentences.groupby(df_sentences.index, group_keys=False).apply(
        lambda group: split_term_literal(group, search_word)).reset_index(drop=True)

    # Jupyter only auto-renders the last *top-level* expression of a cell, so a
    # bare name inside this `if` would show nothing; display it explicitly.
    display(literal_output)

In [None]:
# Split on the search word, taking care to only split on whitespace
# Only splits on the first instance
def split_term_whitespace_firstonly(row, search_word):
    """Split a sentence at the first space-separated token containing ``search_word``.

    The whole token (including attached punctuation) becomes the anchor.
    Returns a dict keyed by ``splitting_headers + join_headers``. When no single
    token contains the term -- including the case where the term itself spans
    several tokens -- the whole sentence is returned in 'split_0' with a None
    anchor span.
    """
    text = row['Text']
    tokens = text.split(' ')
    # Index of the first token containing the term, or None. Checking tokens
    # (not the raw text) avoids indexing an empty result when the term only
    # matches across a space.
    anchor_idx = next(
        (idx for idx, token in enumerate(tokens) if search_word in token), None)
    if anchor_idx is None:
        whole = text.strip()
        output = [whole, '', '', whole.split(' '), None]
    else:
        before = ' '.join(tokens[:anchor_idx]).strip()
        anchor = tokens[anchor_idx].strip()
        after = ' '.join(tokens[anchor_idx + 1:]).strip()
        # An empty side contributes no tokens at all (not a single '' token)
        pre_split = before.split(' ') if before != '' else []
        post_split = after.split(' ') if after != '' else []
        output = [
            before,
            anchor,
            after,
            pre_split + [anchor] + post_split,
            (len(pre_split), len(pre_split) + 1),
        ]
    return dict(zip(splitting_headers + join_headers, output + list(row[join_headers])))

if flag_debug:
    whitespace_firstonly_output = df_sentences.apply(
        lambda row: split_term_whitespace_firstonly(row, search_word),
        axis=1, result_type='expand')

    # Jupyter only auto-renders the last *top-level* expression of a cell, so a
    # bare name inside this `if` would show nothing; display it explicitly.
    display(whitespace_firstonly_output)

In [None]:
# Split on the search word, taking care to only split on whitespace
# Splits on ALL instances of the search word
def split_term_whitespace(group, search_word):
    """Split a sentence at EVERY space-separated token containing ``search_word``.

    Parameters
    ----------
    group : pandas.DataFrame
        A group whose first row provides 'Text' and the ``join_headers`` columns.
    search_word : str
        Term to look for inside individual space-separated tokens.

    Returns
    -------
    pandas.DataFrame
        One row per matching token with columns ``splitting_headers +
        join_headers``. If no single token contains the term (including a term
        that spans several tokens) a single row with the whole sentence in
        'split_0' and a None anchor span is returned, so the sentence is never
        dropped from the output.
    """
    row = group.iloc[0]
    text = row['Text']
    keys = list(row[join_headers])
    tokens = text.split(' ')
    records = []
    for idx, token in enumerate(tokens):
        if search_word not in token:
            continue
        before = ' '.join(tokens[:idx]).strip()
        anchor = token.strip()
        after = ' '.join(tokens[idx + 1:]).strip()
        # An empty side contributes no tokens at all (not a single '' token)
        pre_split = before.split(' ') if before != '' else []
        post_split = after.split(' ') if after != '' else []
        records.append([
            before,
            anchor,
            after,
            pre_split + [anchor] + post_split,
            (len(pre_split), len(pre_split) + 1),
        ] + keys)
    if not records:
        whole = text.strip()
        records.append([whole, '', '', whole.split(' '), None] + keys)
    # Transpose rows -> columns so each header maps to its list of values
    columns = [list(column) for column in zip(*records)]
    return pd.DataFrame(dict(zip(splitting_headers + join_headers, columns)))

if flag_debug:
    whitespace_output = df_sentences.groupby(df_sentences.index, group_keys=False).apply(
        lambda group: split_term_whitespace(group, search_word)).reset_index(drop=True)

    # Jupyter only auto-renders the last *top-level* expression of a cell, so a
    # bare name inside this `if` would show nothing; display it explicitly.
    display(whitespace_output)

In [None]:
# Run coreference resolution over the entire abstract, not individual sentences
# (one predictor call per row of df; the prediction's fields become columns)
output = df.apply(
    lambda row: coref_predictor.predict(row['Text']), axis=1, result_type='expand')
df_merged = df.join(output)

if flag_debug:
    # A bare name inside an `if` is not auto-rendered by Jupyter; display explicitly.
    display(df_merged)

In [None]:
import re

# transform the output of coreference resolution into something that is more easily manipulated
# split it across multiple sentences so each indiv sentence row can still work
def reinterpret_coref_clusters(row, search_word, sentences):
    """Map document-level coreference tokens onto the per-sentence rows.

    Parameters
    ----------
    row : mapping
        Provides 'ID', 'Type', 'document' (list of tokens for the whole text)
        and 'clusters' (list of clusters, each a list of inclusive
        [start, end] token spans into 'document').
    search_word : str
        Term used to pick the most relevant cluster (matched literally).
    sentences : pandas.DataFrame
        Sentence rows with 'ID', 'Type' and 'Text'; the rows for this document
        must appear in sentence order.

    Returns
    -------
    dict
        'sent_mapping'  : token index -> sentence number
        'sent_content'  : sentence number -> list of token indices
        'doct_mapping'  : token index -> position of the token within its sentence
        'selcluster_idx': index of the cluster whose mentions contain the most
                          occurrences of ``search_word`` (-1 if none contain it;
                          the earliest cluster wins ties)

    Raises
    ------
    ValueError / IndexError
        If a token cannot be located in the current or the next sentence, i.e.
        the token stream and the sentence texts are out of sync.
    """
    # Sentences of this document (same ID and Type), in their original order
    same_doc = (sentences['ID'] == row['ID']) & (sentences['Type'] == row['Type'])
    src = sentences.loc[same_doc, 'Text']
    document = row['document']

    curr_sentence = 0   # sentence currently being consumed
    consumed = 0        # characters of that sentence already matched
    last_sent_end = 0   # token index at which the current sentence starts
    sent_mapping = {}
    sent_content = {}
    doct_mapping = {}
    for i, token in enumerate(document):
        if token.strip() != '':
            if token not in src.iloc[curr_sentence][consumed:]:
                # Token is not in the rest of this sentence: move to the next one
                last_sent_end = i
                curr_sentence += 1
                consumed = 0
            offset = src.iloc[curr_sentence][consumed:].index(token)
            consumed += offset + len(token)
        # Whitespace-only tokens are simply attributed to the current sentence.
        # setdefault keeps this safe when such a token is the first one seen
        # for a sentence.
        sent_mapping[i] = curr_sentence
        sent_content.setdefault(curr_sentence, []).append(i)
        doct_mapping[i] = i - last_sent_end

    # Select the cluster that best matches the search word. The term is escaped
    # so regex metacharacters in it (e.g. '+', '(', '.') are matched literally.
    pattern = re.compile(re.escape(search_word))
    selcluster_idx = -1
    selcluster_ct = 0
    for i, cluster in enumerate(row['clusters']):
        currcluster_ct = sum(
            len(pattern.findall(' '.join(document[start:end + 1])))
            for start, end in cluster)
        if currcluster_ct > selcluster_ct:
            selcluster_idx = i
            selcluster_ct = currcluster_ct

    # Build the output row
    return {
        'sent_mapping': sent_mapping,
        'sent_content': sent_content,
        'doct_mapping': doct_mapping,
        'selcluster_idx': selcluster_idx,
    }

# Add the sentence/token mappings and the selected cluster index to every document row
output = df_merged.apply(
    lambda row: reinterpret_coref_clusters(row, search_word, df_sentences),
    axis=1, result_type='expand')
df_merged = df_merged.join(output)

df_merged.to_csv(f'outputs/{search_word}/coreference-partial.csv')
if flag_debug:
    # A bare name inside an `if` is not auto-rendered by Jupyter; display explicitly.
    display(df_merged)

In [None]:
# Split based on co-references to any phrase containing search term, using allennlp coreference resolution
# This does NOT preserve the original sentence spacing
# REQUIRES THAT WE ALREADY RAN THE COREFERENCE PREDICTOR - this func does NOT do all of the work!
# Only splits on the first instance
def split_term_coreference_firstonly(row, search_word, lookup, fallback):
    """Split a sentence at the first mention of the selected coreference cluster.

    Parameters
    ----------
    row : pandas.Series
        Sentence row providing 'ID', 'Type' and 'Index' (plus whatever
        ``fallback`` needs).
    search_word : str
        Only forwarded to ``fallback``.
    lookup : pandas.DataFrame
        Per-document coreference data with 'ID', 'Type', 'document',
        'clusters', 'sent_mapping', 'sent_content' and 'selcluster_idx'.
    fallback : callable
        ``fallback(row, search_word)``; used when the document has no selected
        cluster or no mention lies entirely inside this sentence.

    Returns
    -------
    dict
        Keyed by ``splitting_headers + join_headers``. Texts are the document
        tokens joined with single spaces (original spacing is not preserved).
    """
    same_doc = (lookup['ID'] == row['ID']) & (lookup['Type'] == row['Type'])
    lookup_row = lookup.loc[same_doc].to_dict(orient='records')[0]
    if lookup_row['selcluster_idx'] == -1:
        # if we didn't identify any clusters that match the search term, use our fallback method
        return fallback(row, search_word)
    document = lookup_row['document']
    sent_mapping = lookup_row['sent_mapping']
    for start, end in lookup_row['clusters'][lookup_row['selcluster_idx']]:
        # Both ends of the mention must lie in this sentence; checking only the
        # start would let a mention that runs into the next sentence leak tokens
        # from that sentence into the anchor.
        if not (sent_mapping[start] == sent_mapping[end] == row['Index']):
            continue
        sentence_tokens = lookup_row['sent_content'][row['Index']]
        sentence_start, sentence_end = sentence_tokens[0], sentence_tokens[-1]
        pre_split = document[sentence_start:start]
        anchor = document[start:end + 1]
        post_split = document[end + 1:sentence_end + 1]
        output = [
            ' '.join(pre_split),
            ' '.join(anchor),
            ' '.join(post_split),
            document[sentence_start:sentence_end + 1],
            (len(pre_split), len(pre_split) + len(anchor)),
        ]
        return dict(zip(splitting_headers + join_headers, output + list(row[join_headers])))
    # if there wasn't any reference in the sentence found, use our fallback method
    return fallback(row, search_word)

if flag_debug:
    coreference_firstonly_output = df_sentences.apply(
        lambda row: split_term_coreference_firstonly(row, search_word, df_merged, split_term_whitespace_firstonly),
        axis=1, result_type='expand')

    # Jupyter only auto-renders the last *top-level* expression of a cell, so a
    # bare name inside this `if` would show nothing; display it explicitly.
    display(coreference_firstonly_output)

In [None]:
# Split based on co-references to any phrase containing search term, using allennlp coreference resolution
# This does NOT preserve the original sentence spacing
# REQUIRES THAT WE ALREADY RAN THE COREFERENCE PREDICTOR - this func does NOT do all of the work!
# Splits on ALL instances of references to the search word
def split_term_coreference(group, search_word, lookup, fallback):
    """Split a sentence at EVERY mention of the selected coreference cluster.

    Parameters
    ----------
    group : pandas.DataFrame
        A group whose first row provides 'ID', 'Type' and 'Index'.
    search_word : str
        Only forwarded to ``fallback``.
    lookup : pandas.DataFrame
        Per-document coreference data with 'ID', 'Type', 'document',
        'clusters', 'sent_mapping', 'sent_content' and 'selcluster_idx'.
    fallback : callable
        ``fallback(group, search_word)``; used when the document has no
        selected cluster or no mention lies entirely inside this sentence.

    Returns
    -------
    pandas.DataFrame
        One row per mention with columns ``splitting_headers + join_headers``.
        Texts are the document tokens joined with single spaces (original
        spacing is not preserved).
    """
    row = group.iloc[0]
    same_doc = (lookup['ID'] == row['ID']) & (lookup['Type'] == row['Type'])
    lookup_row = lookup.loc[same_doc].to_dict(orient='records')[0]
    if lookup_row['selcluster_idx'] == -1:
        # if we didn't identify any clusters that match the search term, use our fallback method
        return fallback(group, search_word)
    document = lookup_row['document']
    sent_mapping = lookup_row['sent_mapping']
    keys = list(row[join_headers])
    output = []
    for start, end in lookup_row['clusters'][lookup_row['selcluster_idx']]:
        # Both ends of the mention must lie in this sentence; checking only the
        # start would let a mention that runs into the next sentence leak tokens
        # from that sentence into the anchor.
        if not (sent_mapping[start] == sent_mapping[end] == row['Index']):
            continue
        sentence_tokens = lookup_row['sent_content'][row['Index']]
        sentence_start, sentence_end = sentence_tokens[0], sentence_tokens[-1]
        pre_split = document[sentence_start:start]
        anchor = document[start:end + 1]
        post_split = document[end + 1:sentence_end + 1]
        output.append([
            ' '.join(pre_split),
            ' '.join(anchor),
            ' '.join(post_split),
            document[sentence_start:sentence_end + 1],
            (len(pre_split), len(pre_split) + len(anchor)),
        ] + keys)
    if not output:
        # if there wasn't any reference in the sentence found, use our fallback method
        return fallback(group, search_word)
    # Transpose rows -> columns so each header maps to its list of values
    output_t = [list(column) for column in zip(*output)]
    return pd.DataFrame(dict(zip(splitting_headers + join_headers, output_t)))

if flag_debug:
    coreference_output = df_sentences.groupby(df_sentences.index, group_keys=False).apply(
        lambda group: split_term_coreference(group, search_word, df_merged, split_term_whitespace)
    ).reset_index(drop=True)

    # Jupyter only auto-renders the last *top-level* expression of a cell, so a
    # bare name inside this `if` would show nothing; display it explicitly.
    display(coreference_output)

# Grouping functions

Assume we have an input dataframe with column headers `['split_0','split_1','split_2', 'split_tokens', 'split_anchor_span']`

`'split_1'` is the column that contains our search term / anchor point

`'split_0'` and `'split_2'` are the columns that contain text before and after the search terms respectively

In [None]:
# group is the text uniquely identifying a group
# (each group_* function returns a dict keyed by these headers)
grouping_headers = ['group']

In [None]:
# Group on the literal first word that comes after the anchor point
def group_first_word(row):
    """Group by the token immediately following the anchor.

    Returns ``{'group': token}``; the token is '' when the row has no anchor
    (span is None) or the anchor is the last token of the sentence.
    """
    span = row['split_anchor_span']
    if span is None:
        return dict(zip(grouping_headers, ['']))
    tokens = row['split_tokens']
    # span[1] is the index just past the anchor, i.e. the first following token
    following = tokens[span[1]] if span[1] < len(tokens) else ''
    return dict(zip(grouping_headers, [following]))

if flag_debug:
    # NOTE(review): `sample_input` is not defined anywhere in the visible notebook;
    # presumably it should be one of the split outputs -- confirm before enabling.
    output = sample_input.apply(
        lambda row: group_first_word(row),
        axis=1, result_type='expand')

    # Sort after joining: join() re-aligns on sample_input's index, which would
    # discard an ordering applied to the group column beforehand.
    output = sample_input.join(output).sort_values(by=['group'])
    # A bare name inside an `if` is not auto-rendered by Jupyter; display explicitly.
    display(output)

In [None]:
# Group on the first verb that comes after the anchor point, using NLTK part-of-speech tagging
def group_first_verb(row):
    """Group by the first verb that follows the anchor.

    The three split texts are word-tokenized separately, POS-tagged together as
    one sequence, and the first token after the anchor whose tag starts with
    'V' is returned as ``{'group': verb}`` ('' if there is none).
    """
    before = nltk.word_tokenize(row['split_0'])
    anchor = nltk.word_tokenize(row['split_1'])
    after = nltk.word_tokenize(row['split_2'])
    tagged = nltk.pos_tag(before + anchor + after)
    # Only tokens after the anchor are candidates
    first_after_anchor = len(before) + len(anchor)
    verb = next(
        (word for word, tag in tagged[first_after_anchor:] if tag.startswith('V')),
        '')
    return dict(zip(grouping_headers, [verb]))

if flag_debug:
    # NOTE(review): `sample_input` is not defined anywhere in the visible notebook;
    # presumably it should be one of the split outputs -- confirm before enabling.
    output = sample_input.apply(
        lambda row: group_first_verb(row),
        axis=1, result_type='expand')

    # Sort after joining: join() re-aligns on sample_input's index, which would
    # discard an ordering applied to the group column beforehand.
    output = sample_input.join(output).sort_values(by=['group'])
    # A bare name inside an `if` is not auto-rendered by Jupyter; display explicitly.
    display(output)

In [None]:
# Do dependency parsing once for the entire sample_input to save processing time
# for groupings that require dependency parsing later
def parse_dependency(row):
    """Run the dependency predictor on the row's tokens joined by single spaces.

    Returns ``{'dependency_parse': prediction}`` so the result can be expanded
    into a 'dependency_parse' column.
    """
    sentence = ' '.join(row['split_tokens']).strip()
    parse = dependency_predictor.predict(sentence=sentence)
    return {'dependency_parse': parse}

if flag_debug:
    # NOTE(review): `sample_input` is not defined anywhere in the visible notebook;
    # presumably it should be one of the split outputs -- confirm before enabling.
    sample_input_dep = sample_input.apply(
        parse_dependency, axis=1, result_type='expand')

In [None]:
# Group on the main verb in the sentence, using allennlp dependency parsing (based on demo code)
def group_main_verb(row):
    """Group by the word at the root of the row's dependency parse.

    Reads ``row['dependency_parse']['hierplane_tree']['root']['word']`` and
    returns it as ``{'group': word}``.
    """
    root_word = row['dependency_parse']['hierplane_tree']['root']['word']
    return dict(zip(grouping_headers, [root_word]))

if flag_debug:
    # NOTE(review): `sample_input` is not defined anywhere in the visible notebook;
    # presumably it should be one of the split outputs -- confirm before enabling.
    output = sample_input.join(sample_input_dep).apply(
        lambda row: group_main_verb(row),
        axis=1, result_type='expand')

    # Sort after joining: join() re-aligns on sample_input's index, which would
    # discard an ordering applied to the group column beforehand.
    output = sample_input.join(output).sort_values(by=['group'])
    # A bare name inside an `if` is not auto-rendered by Jupyter; display explicitly.
    display(output)

In [None]:
# Helper function for phrase POS
# Return the match bounds of the sequence of elements of given sizes starting at list1[i1] and list2[i2] 
# that match
# If no given size is returned, returns max matching sequence length
# (ratio of element matches must be 1:some or some:1 between l1 and l2)
# Returns [(l1 bounds), (l2 bounds)] or None if they do not match
def list_elements_match(list1, list2, i1, i2, size1=None, size2=None):
    """Align a run of tokens in ``list1`` with a run of tokens in ``list2``.

    Tokens are compared by their concatenated text, so one token on one side
    may correspond to several tokens on the other.

    Parameters
    ----------
    list1, list2 : list of str
        The two token lists.
    i1, i2 : int
        Start index of the candidate run in ``list1`` / ``list2``.
    size1, size2 : int or None
        Number of tokens of the run in each list. If only one size is given the
        other run is whatever prefix of the remaining tokens spells the same
        text. If neither is given, the longest common text prefix of the two
        remainders is used.

    Returns
    -------
    list of tuple or None
        ``[(i1, end1), (i2, end2)]`` with exclusive ends, or None when the runs
        do not match (for the no-size case: when there is no common prefix).
        If the matched text does not end on a token boundary in a list, that
        list's run defaults to a single token.
    """
    rest1 = ''.join(list1[i1:])
    rest2 = ''.join(list2[i2:])
    if size1 is not None and size2 is not None:
        # check for exact text match
        matchphrase = ''.join(list1[i1:i1+size1])
        if matchphrase != ''.join(list2[i2:i2+size2]):
            return None
    elif size1 is not None:
        # and size2 is none
        matchphrase = ''.join(list1[i1:i1+size1])
        if not rest2.startswith(matchphrase):
            return None
    elif size2 is not None:
        # and size1 is none
        matchphrase = ''.join(list2[i2:i2+size2])
        if not rest1.startswith(matchphrase):
            return None
    else:
        # both are none; the match is the longest common prefix of the remainders
        matchlen = 0
        for char1, char2 in zip(rest1, rest2):
            if char1 != char2:
                break
            matchlen += 1
        if matchlen == 0:
            return None
        matchphrase = rest1[:matchlen]

    def run_length(tokens, start):
        # Smallest number of tokens from `start` spelling exactly matchphrase;
        # defaults to 1 when the phrase does not end on a token boundary.
        for count in range(len(tokens) - start + 1):
            if ''.join(tokens[start:start+count]) == matchphrase:
                return count
        return 1

    bound1 = run_length(list1, i1)
    bound2 = run_length(list2, i2)
    return [(i1, i1+bound1), (i2, i2+bound2)]

In [None]:
# Group on the POS of the anchor point, using allennlp dependency parsing (based on demo code)
# I'm defining the "POS of a phrase" as the POS of the lowest node that contains the entire phrase
def group_pos_anchor(row, context=1):
    """Group by the POS of the anchor, optionally followed by its ancestors' POS.

    The "POS of a phrase" is taken to be the POS of the lowest dependency node
    that contains the entire phrase.

    Parameters
    ----------
    row : mapping
        Provides 'split_anchor_span' ((start, end-exclusive) into
        'split_tokens', or None), 'split_tokens' and 'dependency_parse' (with
        'predicted_heads' as 1-based parent positions where 0 marks the root,
        plus 'words' and 'pos').
    context : int
        Maximum number of tiers to report: the anchor node first, then its
        parent, grandparent, ... (stops early at the root).

    Returns
    -------
    dict
        ``{'group': [pos, ...]}``, or ``{'group': ''}`` when the row has no
        anchor or the anchor cannot be located among the parser's words.
    """
    if row['split_anchor_span'] is None:
        return dict(zip(grouping_headers, ['']))
    p = row['dependency_parse']
    words = p['words']
    span_start, span_end = row['split_anchor_span']
    # build out a more usable version of the dependency tree with information about tree level!
    # tree_array[n] = (parent, level) so the root node is (0, 0)
    tree_array = [(h, -1) for h in p['predicted_heads']]
    need_connection = [0]
    level = 0
    while len(need_connection) > 0:
        need_connection_update = []
        for i in range(len(tree_array)):
            if tree_array[i][0] in need_connection:
                tree_array[i] = (tree_array[i][0], level)
                need_connection_update.append(i+1)
        need_connection = need_connection_update
        level += 1
    # Figure out what indexes our anchor fits into
    # Assume that the anchor is contiguous text
    # TODO: is this always true?
    # Start looking at the parser word whose (whitespace-free) character offset
    # equals that of the anchor, so a sentence containing the anchor text more
    # than once resolves to THIS occurrence rather than always the first one.
    chars_before = len(''.join(''.join(row['split_tokens'][:span_start]).split()))
    first_candidate = len(words)
    seen = 0
    for idx, word in enumerate(words):
        if seen >= chars_before:
            first_candidate = idx
            break
        seen += len(''.join(word.split()))
    # Fall back to the earlier positions if nothing matches from there on
    candidates = list(range(first_candidate, len(words))) + list(range(first_candidate))
    match = None
    for i in candidates:
        match = list_elements_match(
            words, row['split_tokens'], i, span_start,
            size2=span_end - span_start)
        if match is not None:
            break
    if match is None:
        # The anchor text could not be aligned with the parser's tokens
        return dict(zip(grouping_headers, ['']))
    # Find the smallest containing dependency node
    matching_nodes = [(i, tree_array[i][0], tree_array[i][1]) for i in range(match[0][0], match[0][1])]
    matching_nodes = list(set(matching_nodes))
    if not matching_nodes:
        return dict(zip(grouping_headers, ['']))
    while len(matching_nodes) > 1:
        # Repeatedly replace the deepest node by its parent until one node is left
        matching_nodes.sort(key=lambda x: x[2])
        parent = matching_nodes.pop()[1]
        matching_nodes.append((parent-1, tree_array[parent-1][0], tree_array[parent-1][1]))
        matching_nodes = list(set(matching_nodes))
    labeltiers = []
    while len(labeltiers) < context:
        # TODO include the actual word of the node too?
        labeltiers.append(p['pos'][matching_nodes[0][0]])
        parent = matching_nodes[0][1]
        if parent == 0:
            break
        matching_nodes[0] = (parent-1, tree_array[parent-1][0], tree_array[parent-1][1])
    return dict(zip(grouping_headers, [labeltiers]))

if flag_debug:
    # NOTE(review): `sample_input` is not defined anywhere in the visible notebook;
    # presumably it should be one of the split outputs -- confirm before enabling.
    output = sample_input.join(sample_input_dep).apply(
        lambda row: group_pos_anchor(row, context=3),
        axis=1, result_type='expand')

    output = sample_input.join(output)
    # A bare name inside an `if` is not auto-rendered by Jupyter; display explicitly.
    display(output)

# Export CSVs

In [None]:
# Export whitespace-based split data
# Compute the split here instead of relying on the debug-only cell, so that the
# export also works on a fresh kernel with flag_debug = False.
whitespace_output = df_sentences.groupby(df_sentences.index, group_keys=False).apply(
    lambda group: split_term_whitespace(group, search_word)).reset_index(drop=True)

splitted_sentences = df_sentences.merge(whitespace_output,
                                  how='outer',
                                  on=join_headers)

# Dependency parsing is expensive, so do it once and reuse it for every grouping that needs it
splitted_sentences_dep = splitted_sentences.apply(
    lambda row: parse_dependency(row),
    axis=1, result_type='expand')
splitted_sentences_parsed = splitted_sentences.join(splitted_sentences_dep)

# (file suffix, frame the grouping reads, grouping function, sort the export by group?)
# The anchor-POS groups are lists, so that export is left in sentence order.
for export_suffix, export_source, export_grouper, export_sorted in [
    ('firstword', splitted_sentences, group_first_word, True),
    ('firstverb', splitted_sentences, group_first_verb, True),
    ('mainverb', splitted_sentences_parsed, group_main_verb, True),
    ('anchorpos', splitted_sentences_parsed, lambda row: group_pos_anchor(row, context=3), False),
]:
    output = export_source.apply(export_grouper, axis=1, result_type='expand')
    export_frame = splitted_sentences.join(output)
    if export_sorted:
        # Sort AFTER the join: join() re-aligns on the sentence index and would
        # discard an ordering applied to the group column beforehand.
        export_frame = export_frame.sort_values(by=['group'], kind='mergesort')
    export_frame.to_csv(f'outputs/{search_word}/whitespace_{export_suffix}.csv')

In [None]:
# Export coreference-based split data
# Compute the split here instead of relying on the debug-only cell, so that the
# export also works on a fresh kernel with flag_debug = False.
coreference_output = df_sentences.groupby(df_sentences.index, group_keys=False).apply(
    lambda group: split_term_coreference(group, search_word, df_merged, split_term_whitespace)
).reset_index(drop=True)

splitted_sentences = df_sentences.merge(coreference_output,
                                  how='outer',
                                  on=join_headers)

# Dependency parsing is expensive, so do it once and reuse it for every grouping that needs it
splitted_sentences_dep = splitted_sentences.apply(
    lambda row: parse_dependency(row),
    axis=1, result_type='expand')
splitted_sentences_parsed = splitted_sentences.join(splitted_sentences_dep)

# (file suffix, frame the grouping reads, grouping function, sort the export by group?)
# The anchor-POS groups are lists, so that export is left in sentence order.
for export_suffix, export_source, export_grouper, export_sorted in [
    ('firstword', splitted_sentences, group_first_word, True),
    ('firstverb', splitted_sentences, group_first_verb, True),
    ('mainverb', splitted_sentences_parsed, group_main_verb, True),
    ('anchorpos', splitted_sentences_parsed, lambda row: group_pos_anchor(row, context=3), False),
]:
    output = export_source.apply(export_grouper, axis=1, result_type='expand')
    export_frame = splitted_sentences.join(output)
    if export_sorted:
        # Sort AFTER the join: join() re-aligns on the sentence index and would
        # discard an ordering applied to the group column beforehand.
        export_frame = export_frame.sort_values(by=['group'], kind='mergesort')
    export_frame.to_csv(f'outputs/{search_word}/coreference_{export_suffix}.csv')

# Misc utility functions

In [None]:
# (pandas is already imported and configured at the top of the notebook; repeated
# here so this utility cell can be run on its own once search_word is set)
import pandas as pd
pd.set_option("display.max_rows", None)

# Reload one exported grouping and show it ordered by its 'group' column
pd.read_csv(f'outputs/{search_word}/coreference_mainverb.csv').sort_values(by=['group'])