In [None]:
import pandas as pd
# Lift the row limit on displayed DataFrames so frames are shown in full.
pd.set_option("display.max_rows", None)

In [None]:
# Do one-time large imports and model downloads

# For sentence tokenization and part-of-speech tagging
import nltk
from nltk import tokenize
# sent_tokenize / word_tokenize rely on the Punkt models and pos_tag on the
# perceptron tagger; fetch both so a fresh environment can run the notebook.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# For coreference resolution
from allennlp.predictors.predictor import Predictor
import allennlp_models.coref
coref_predictor = Predictor.from_path(
    "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
)

# For dependency parsing (Predictor is already imported above)
import allennlp_models.structured_prediction
dependency_predictor = Predictor.from_path(
    "https://storage.googleapis.com/allennlp-public-models/biaffine-dependency-parser-ptb-2020.04.06.tar.gz"
)


In [None]:
import csv
import pandas as pd
import matplotlib.pyplot as plt

# Identify the term we are splitting on
search_word = "BERT"

# Read in the dataframe containing entire paper abstracts (NOT pre-split into sentences)
df = pd.read_csv('data/nlp-align_BERT.csv')

# Restructure the dataframe to be more usable: one 'Title' row and one 'Abstract' row per paper.
# NOTE: separate_title_abstract is defined in a later cell of this notebook, so on a fresh
# kernel this line raises NameError until that definition has been executed.
# Every group comes back indexed 0..1, so the combined frame would carry repeated index
# labels; reset them (as is done for df_sentences) so that the index-based join() calls
# further down pair each row with exactly one row instead of with every row sharing its label.
df = (
    df.groupby('ID', group_keys=False)
      .apply(separate_title_abstract)
      .reset_index(drop=True)
)

df

In [None]:
# Turn one (paper, Type) group into one row per sentence of its text
def sentence_tokenize(group):
    """Explode the first row of `group` into one row per sentence.

    Only the first row of the group is read. Its 'Text' is split with
    tokenize.sent_tokenize; 'URL', 'ID' and 'Type' are repeated on every
    sentence row and 'Index' is the 0-based position of the sentence.
    """
    first_row = group.reset_index(drop=True).loc[0]
    sentences = tokenize.sent_tokenize(first_row['Text'])
    n_sentences = len(sentences)

    # Metadata columns first, in the same order as the output frame's columns
    columns = {key: [first_row[key]] * n_sentences for key in ('URL', 'ID', 'Type')}
    columns['Index'] = list(range(n_sentences))
    columns['Text'] = sentences
    return pd.DataFrame(columns)

# One row per sentence across every (ID, Type) group, renumbered 0..n-1
df_sentences = (
    df.groupby(['ID', 'Type'], group_keys=False)
      .apply(sentence_tokenize)
      .reset_index(drop=True)
)

df_sentences

In [None]:
# # Create a test dataframe so we can run models without taking impractically long
# # TODO: this is causing some type inconsistencies, fix those?

# temp_df = pd.DataFrame.from_dict(
#     {'URL': 'abc', 
#      'ID': '0', 
#      'Title': 'Paper Title',
#      'Abstract': 'The BERT language model (LM) (Devlin et al., 2019) is surprisingly good at answering cloze-style questions about relational facts. Petroni et al. (2019) take this as evidence that it memorizes factual knowledge during pre-training. We take issue with this interpretation and argue that the performance of BERT is partly due to reasoning about (the surface form of) entity names, e.g., guessing that a person with an Italian-sounding name speaks Italian.'
#     }
# )

# Splitting functions

Assume the input dataframe has a column named `Text` that holds one sentence per row; any other columns are allowed. The coreference-based splitter additionally reads the `ID`, `Type` and `Index` columns of each row.

In [None]:
# Keys of the dict returned by every split_search_term_* function; they are zipped,
# in order, with that function's three-element output list.
splitting_headers = ['text_split_0','text_split_1','text_split_2']

In [None]:
# Literal substring split: the search word is matched anywhere, even inside a longer
# word (searching for A inside CAR cuts the word apart)
def split_search_term_literal(row, search_word):
    """Cut row['Text'] around the first occurrence of `search_word`.

    Returns a dict keyed by `splitting_headers` holding the text before the
    match, the search word itself, and the text after it. When the word does
    not occur, the whole text goes in the first slot and the others are ''.
    """
    # str.partition splits on the first occurrence only and yields
    # (text, '', '') when the separator is absent.
    before, anchor, after = row['Text'].partition(search_word)
    return dict(zip(splitting_headers, [before, anchor, after]))

# Run the literal split on every sentence; each returned dict expands into columns
output = df_sentences.apply(
    split_search_term_literal,
    axis=1,
    result_type='expand',
    args=(search_word,),
)

output

In [None]:
# Split on the search word, taking care to only split on whitespace
def split_search_term_whitespace(row, search_word):
    """Split row['Text'] around the first space-delimited token containing `search_word`.

    Returns a dict keyed by `splitting_headers`:
      * text before the matching token,
      * the whole matching token (which may be longer than the search word),
      * text after the matching token.
    If no single token contains the search word, the whole text is returned
    in the first slot and the other two slots are ''.
    """
    text = row['Text']
    # Splits on first instance ONLY
    if search_word in text:
        tokens = text.split(' ')
        for i, token in enumerate(tokens):
            if search_word in token:
                output = [' '.join(tokens[:i]), token, ' '.join(tokens[i+1:])]
                return dict(zip(splitting_headers, output))
    # Reached when the word is absent, and also when it occurs in the text but in no
    # single token (e.g. a search term containing a space). The latter case previously
    # produced an empty dict, i.e. a row with no split columns at all.
    return dict(zip(splitting_headers, [text, '', '']))

# Run the whitespace-aware split on every sentence; each returned dict expands into columns
output = df_sentences.apply(
    split_search_term_whitespace,
    axis=1,
    result_type='expand',
    args=(search_word,),
)

output

In [None]:
import re

# Split apart the 'Title' and 'Abstract' columns
def separate_title_abstract(group):
    """Turn one paper's row into two rows: its title and its abstract.

    Args:
        group: DataFrame with 'URL', 'ID', 'Title' and 'Abstract' columns.
            Only its first row is read.

    Returns:
        A two-row DataFrame with columns 'URL', 'ID', 'Type' ('Title' then
        'Abstract') and 'Text' (the corresponding text).
    """
    # Positional lookup: groups keep the index labels of the source frame, so
    # label 0 exists only in whichever group happens to contain the first row.
    row = group.iloc[0]
    return pd.DataFrame({
        'URL': [row['URL']] * 2,
        'ID': [row['ID']] * 2,
        'Type': ['Title', 'Abstract'],
        'Text': [row['Title'], row['Abstract']]
    })

# transform the output of coreference resolution into something that is more easily manipulated
# split it across multiple sentences so each indiv sentence row can still work
def reinterpret_coref_clusters(row, search_word, sentences):
    """Map document-level coreference tokens onto sentences and pick the search-word cluster.

    Args:
        row: mapping with 'ID', 'Type', 'document' (list of token strings) and
            'clusters' (list of clusters, each a list of inclusive
            [start, end] token-index spans).
        search_word: literal text looked for inside each cluster's mentions.
        sentences: DataFrame with 'ID', 'Type' and 'Text' columns, one row per
            sentence. Rows for a document are assumed to be in sentence
            order -- TODO confirm against the caller.

    Returns:
        dict with
          'sent_mapping':   token index -> sentence number,
          'sent_content':   sentence number -> list of token indices,
          'doct_mapping':   token index -> offset from the first token of its sentence,
          'selcluster_idx': index of the cluster whose mentions contain
                            `search_word` most often, or -1 if none does.
    """
    document = row['document']
    # Sentence texts belonging to this document
    src = sentences.loc[sentences['ID'] == row['ID']].loc[sentences['Type'] == row['Type']]['Text']
    curr_sentence = 0
    consumed = 0        # characters of the current sentence already matched
    last_sent_end = 0   # token index at which the current sentence starts
    sent_mapping = {}
    sent_content = {}
    doct_mapping = {}
    for i, token in enumerate(document):
        if token.strip() != '':
            if token not in src.iloc[curr_sentence][consumed:]:
                # Token is not in the rest of this sentence, so it opens the next one
                last_sent_end = i
                curr_sentence += 1
                consumed = 0
            offset = src.iloc[curr_sentence][consumed:].index(token)
            consumed += offset + len(token)
        # Whitespace-only tokens are not searched for; they stay with the current
        # sentence. setdefault also covers a whitespace token opening the document.
        sent_mapping[i] = curr_sentence
        sent_content.setdefault(curr_sentence, []).append(i)
        doct_mapping[i] = i - last_sent_end
    # Select the cluster that best matches the search word
    selcluster_idx = -1
    selcluster_ct = 0
    for idx, cluster in enumerate(row['clusters']):
        # Count literal occurrences (str.count) so that regex metacharacters in the
        # search word are not interpreted as a pattern.
        currcluster_ct = sum(
            ' '.join(document[start:end + 1]).count(search_word)
            for start, end in cluster
        )
        if currcluster_ct > selcluster_ct:
            selcluster_idx = idx
            selcluster_ct = currcluster_ct
    # Build the output row
    return {
        'sent_mapping': sent_mapping,
        'sent_content': sent_content,
        'doct_mapping': doct_mapping,
        'selcluster_idx': selcluster_idx,
    }

# Run coreference resolution over the entire abstract, not individual sentences
output = df.apply(
    lambda row: coref_predictor.predict(row['Text']), axis=1, result_type='expand')
# Pair the two frames row by row. join() matches on index labels, so repeated labels
# in df would pair every row with every other row carrying the same label; giving
# both sides a fresh 0..n-1 index makes the match strictly one-to-one.
df_merged = df.reset_index(drop=True).join(output.reset_index(drop=True))

output = df_merged.apply(
    lambda row: reinterpret_coref_clusters(row, search_word, df_sentences),
    axis=1, result_type='expand')

# Export predictions plus sentence-mapping metadata. df_merged itself is left without
# the metadata columns; the next cell recomputes them and joins them on again.
df_merged.join(output).to_csv('outputs/coreference-partial.csv')

df_merged

In [None]:
# Split based on co-references to any phrase containing BERT, using allennlp coreference resolution
# This does NOT preserve the original sentence spacing
def split_search_term_coreference(row, search_word, lookup, fallback):
    """Split a sentence around the first mention of the selected coreference cluster.

    Args:
        row: sentence row providing 'ID', 'Type' and 'Index' (sentence number).
        search_word: passed through to `fallback`.
        lookup: DataFrame with one row per (ID, Type) carrying 'document',
            'clusters', 'selcluster_idx', 'sent_mapping' and 'sent_content'.
        fallback: callable(row, search_word) used when no cluster was selected
            or no mention of it lies inside this sentence.

    Returns:
        A dict keyed by `splitting_headers` with the space-joined tokens
        before, inside and after the mention, or whatever `fallback` returns.
    """
    # Splits on first coref instance ONLY
    # there's probably a cleaner way to do this...
    lookup_row = lookup.loc[lookup['ID']==row['ID']].loc[lookup['Type']==row['Type']].to_dict(orient='records')[0]
    if lookup_row['selcluster_idx'] == -1:
        # if we didn't identify any clusters that match the search term, use our fallback method
        return fallback(row, search_word)
    document = lookup_row['document']
    sent_mapping = lookup_row['sent_mapping']
    for start, end in lookup_row['clusters'][lookup_row['selcluster_idx']]:
        # The mention must both start AND end inside this sentence
        if sent_mapping[start] == sent_mapping[end] == row['Index']:
            sentence_tokens = lookup_row['sent_content'][row['Index']]
            sentence_start = sentence_tokens[0]
            sentence_end = sentence_tokens[-1]
            output = [' '.join(document[sentence_start:start]),
                      ' '.join(document[start:end+1]),
                      # sentence_end is the index of the last token and slices are
                      # end-exclusive, hence +1 to keep that final token
                      ' '.join(document[end+1:sentence_end+1])]
            return dict(zip(splitting_headers, output))
    # if there wasn't any reference in the sentence found, use our fallback method
    return fallback(row, search_word)

metadata = df_merged.apply(
    lambda row: reinterpret_coref_clusters(row, search_word, df_sentences), 
    axis=1, result_type='expand')
# Build the lookup table once; joining inside the lambda would rebuild the identical
# frame for every single sentence.
coref_lookup = df_merged.join(metadata)
output = df_sentences.apply(
    lambda row: split_search_term_coreference(row, search_word, coref_lookup, split_search_term_whitespace), 
    axis=1, result_type='expand')

output

# Grouping functions

Assume we have an input dataframe with column headers `['text_split_0','text_split_1','text_split_2']`

`'text_split_1'` is the column that contains our search term / anchor point

`'text_split_0'` and `'text_split_2'` are the columns that contain text before and after the search terms respectively

In [None]:
# `output` was produced by df_sentences.apply(...), so it has one row of split columns
# per sentence and shares df_sentences' index. Attach it to df_sentences; df holds one
# row per title/abstract and its index does not line up with the sentence rows.
sample_input = df_sentences.join(output)
sample_input

In [None]:
# Group on the first word (regardless of part of speech) that comes after the anchor point
def group_first_word(row):
    """Return {'group': w} where w is the first word of row['text_split_2'].

    Leading whitespace is skipped, so text that starts with a space (as it does
    after a literal substring split) still yields its first word rather than ''.
    When there is no word at all, the group is ''.
    """
    words = row['text_split_2'].split()
    return {'group': words[0] if words else ''}

# Sort AFTER joining: join() keeps sample_input's row order, so sorting the group
# column on its own beforehand has no effect on the joined frame or the exported CSV.
output = sample_input.join(
    sample_input.apply(group_first_word, axis=1, result_type='expand')
).sort_values(by=['group'])

output.to_csv('outputs/coreference_firstword.csv')
output

In [None]:
# Group on the first verb that comes after the anchor point, using NLTK part-of-speech tagging
def group_first_verb(row):
    """Return {'group': v} where v is the first verb after the anchor, or ''.

    The three split columns are word-tokenized separately and tagged together as
    one token sequence; the search for a tag starting with 'V' begins right
    after the tokens of 'text_split_0' and 'text_split_1'.
    """
    before, anchor, after = (
        nltk.word_tokenize(row[column])
        for column in ('text_split_0', 'text_split_1', 'text_split_2')
    )
    tagged = nltk.pos_tag(before + anchor + after)
    first_candidate = len(before) + len(anchor)
    verb = next(
        (word for word, tag in tagged[first_candidate:] if tag.startswith('V')),
        '',
    )
    return {'group': verb}

# Sort AFTER joining: join() keeps sample_input's row order, so sorting the group
# column on its own beforehand has no effect on the joined frame or the exported CSV.
output = sample_input.join(
    sample_input.apply(group_first_verb, axis=1, result_type='expand')
).sort_values(by=['group'])

output.to_csv('outputs/coreference_firstverb.csv')
output

In [None]:
# Group on the main verb in the sentence, using allennlp dependency parsing (based on demo code)
def group_main_verb(row):
    """Return {'group': w} where w is the root word of the sentence's dependency parse.

    The sentence is rebuilt by joining the three split columns with single
    spaces and stripping the ends, then passed to dependency_predictor; the
    word at the root of the returned 'hierplane_tree' is used as the group
    (presumably the main verb -- verify on real output).
    """
    pieces = (row['text_split_0'], row['text_split_1'], row['text_split_2'])
    sentence = ' '.join(pieces).strip()
    parse = dependency_predictor.predict(sentence=sentence)
    root_word = parse['hierplane_tree']['root']['word']
    return {'group': root_word}

# Sort AFTER joining: join() keeps sample_input's row order, so sorting the group
# column on its own beforehand has no effect on the joined frame or the exported CSV.
output = sample_input.join(
    sample_input.apply(group_main_verb, axis=1, result_type='expand')
).sort_values(by=['group'])

output.to_csv('outputs/coreference_mainverb.csv')
output

# Export CSVs

# Misc utility functions

In [None]:
import pandas as pd
pd.set_option("display.max_rows", None)

# Reload the exported main-verb grouping and show it ordered by group
pd.read_csv('outputs/coreference_mainverb.csv').sort_values(by='group')