In [1]:
import pandas as pd
pd.set_option("display.max_rows", None)

In [2]:
# Do one-time large imports

# For sentence tokenization
from nltk import tokenize

# For coreference resolution
from allennlp.predictors.predictor import Predictor
import allennlp_models.coref
coref_predictor = Predictor.from_path(
    "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
)

# For part-of-speech tagging
import nltk
nltk.download('averaged_perceptron_tagger')

# For dependency parsing
from allennlp.predictors.predictor import Predictor
import allennlp_models.structured_prediction
dependency_predictor = Predictor.from_path(
    "https://storage.googleapis.com/allennlp-public-models/biaffine-dependency-parser-ptb-2020.04.06.tar.gz"
)


Did not use initialization regex that was passed: _context_layer._module.weight_hh.*
Did not use initialization regex that was passed: _context_layer._module.weight_ih.*
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/cephcyn/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
import csv
import pandas as pd
import matplotlib.pyplot as plt

# Identify the term we are splitting on
search_word = "BERT"

# Read in the dataframe containing entire paper abstracts (NOT pre-split into sentences)
df = pd.read_csv('data/nlp-align_BERT.csv')

# Split apart the 'Title' and 'Abstract' columns
def separate_title_abstract(group):
    row = group.loc[0]
    abs_text = tokenize.sent_tokenize(row['Abstract'])
    return pd.DataFrame({
        'URL': [row['URL']] * 2,
        'ID': [row['ID']] * 2,
        'Type': ['Title', 'Abstract'],
        'Text': [row['Title'], row['Abstract']]
    })

# Restructure the dataframe to be more usable...
df = df.groupby('ID', group_keys=False).apply(
    lambda row: separate_title_abstract(row)
).reset_index(drop=True)

df

Unnamed: 0,URL,ID,Type,Text
0,https://www.semanticscholar.org/paper/BERT%3A-...,0,Title,BERT: Pre-training of Deep Bidirectional Trans...
1,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,We introduce a new language representation mod...
2,https://www.semanticscholar.org/paper/RoBERTa%...,1,Title,RoBERTa: A Robustly Optimized BERT Pretraining...
3,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,Language model pretraining has led to signific...
4,https://www.semanticscholar.org/paper/DistilBE...,2,Title,"DistilBERT, a distilled version of BERT: small..."
5,https://www.semanticscholar.org/paper/DistilBE...,2,Abstract,As Transfer Learning from large-scale pre-trai...
6,https://www.semanticscholar.org/paper/BERT-Red...,3,Title,BERT Rediscovers the Classical NLP Pipeline
7,https://www.semanticscholar.org/paper/BERT-Red...,3,Abstract,Pre-trained text encoders have rapidly advance...
8,https://www.semanticscholar.org/paper/What-Doe...,4,Title,What Does BERT Look At? An Analysis of BERT's ...
9,https://www.semanticscholar.org/paper/What-Doe...,4,Abstract,Large pre-trained neural networks such as BERT...


In [4]:
# Split the full-abstract CSV into a CSV containing individual sentences instead
def sentence_tokenize(group):
    row = group.reset_index(drop=True).loc[0]
    sentences = tokenize.sent_tokenize(row['Text'])
    return pd.DataFrame({
        'URL': [row['URL']] * (len(sentences)),
        'ID': [row['ID']] * (len(sentences)),
        'Type': [row['Type']] * (len(sentences)),
        'Index': list(range(len(sentences))),
        'Text': sentences
    })

df_sentences = df.groupby(['ID', 'Type'], group_keys=False).apply(
    lambda row: sentence_tokenize(row)
).reset_index(drop=True)

df_sentences

Unnamed: 0,URL,ID,Type,Index,Text
0,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,0,We introduce a new language representation mod...
1,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,1,"Unlike recent language representation models, ..."
2,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,2,"As a result, the pre-trained BERT model can be..."
3,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,3,It obtains new state-of-the-art results on ele...
4,https://www.semanticscholar.org/paper/BERT%3A-...,0,Title,0,BERT: Pre-training of Deep Bidirectional Trans...
5,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,0,Language model pretraining has led to signific...
6,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,1,"Training is computationally expensive, often d..."
7,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,2,We present a replication study of BERT pretrai...
8,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,3,We find that BERT was significantly undertrain...
9,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,4,Our best model achieves state-of-the-art resul...


In [None]:
# # Create a test dataframe so we can run models without taking impractically long
# # TODO: this is causing some type inconsistencies, fix those?

# temp_df = pd.DataFrame.from_dict(
#     {'URL': 'abc', 
#      'ID': '0', 
#      'Title': 'Paper Title',
#      'Abstract': 'The BERT language model (LM) (Devlin et al., 2019) is surprisingly good at answering cloze-style questions about relational facts. Petroni et al. (2019) take this as evidence that it memorizes factual knowledge during pre-training. We take issue with this interpretation and argue that the performance of BERT is partly due to reasoning about (the surface form of) entity names, e.g., guessing that a person with an Italian-sounding name speaks Italian.'
#     }
# )

# Splitting functions

Assume we have an input dataframe with some number of columns, at least one of which is titled `Text` and is the column containing each sentence of the abstract.

In [5]:
# split_0 is the text literally preceding the anchor
# split_1 is the text that the anchor consists of
# split_2 is the text literally following the anchor
# split_tokens is the list of tokens that split identifies
# split_anchor_span is a tuple (anchor_first_token_index, anchor_last_token_index) or null if there is no anchor
splitting_headers = ['split_0','split_1','split_2', 'split_tokens', 'split_anchor_span']

In [6]:
# Split on the search word, regardless of whitespace (if search word is A and we have word CAR, it slices it up)
def split_term_literal(row, search_word):
    # Splits on first instance ONLY
    output = []
    if search_word in row['Text']:
        output = row['Text'].split(search_word, maxsplit=1)
        output.insert(1, search_word)
        output = [i.strip() for i in output]
        # if the beginning string is empty don't include it in the tokens list
        pre_split = output[0].split(' ') if output[0] != '' else []
        post_split = output[2].split(' ') if output[2] != '' else []
        output.append(pre_split + [search_word] + post_split)
        output.append((len(pre_split), len(pre_split)+1))
    else:
        output = [row['Text'].strip(),'','',row['Text'].strip().split(' '),None]
    return dict(zip(splitting_headers,output))

literal_output = df_sentences.apply(
    lambda row: split_term_literal(row, search_word), 
    axis=1, result_type='expand')

literal_output

Unnamed: 0,split_0,split_1,split_2,split_tokens,split_anchor_span
0,We introduce a new language representation mod...,BERT,", which stands for Bidirectional Encoder Repre...","[We, introduce, a, new, language, representati...","(8, 9)"
1,"Unlike recent language representation models,",BERT,is designed to pre-train deep bidirectional re...,"[Unlike, recent, language, representation, mod...","(5, 6)"
2,"As a result, the pre-trained",BERT,model can be fine-tuned with just one addition...,"[As, a, result,, the, pre-trained, BERT, model...","(5, 6)"
3,It obtains new state-of-the-art results on ele...,,,"[It, obtains, new, state-of-the-art, results, ...",
4,,BERT,: Pre-training of Deep Bidirectional Transform...,"[BERT, :, Pre-training, of, Deep, Bidirectiona...","(0, 1)"
5,Language model pretraining has led to signific...,,,"[Language, model, pretraining, has, led, to, s...",
6,"Training is computationally expensive, often d...",,,"[Training, is, computationally, expensive,, of...",
7,We present a replication study of,BERT,"pretraining (Devlin et al., 2019) that careful...","[We, present, a, replication, study, of, BERT,...","(6, 7)"
8,We find that,BERT,"was significantly undertrained, and can match ...","[We, find, that, BERT, was, significantly, und...","(3, 4)"
9,Our best model achieves state-of-the-art resul...,,,"[Our, best, model, achieves, state-of-the-art,...",


In [7]:
# Split on the search word, taking care to only split on whitespace
def split_term_whitespace(row, search_word):
    # Splits on first instance ONLY
    output = []
    if search_word in row['Text']:
        tokens = row['Text'].split(' ')
        for i in range(len(tokens)):
            if search_word in tokens[i]:
                output=[' '.join(tokens[:i]), tokens[i], ' '.join(tokens[i+1:])]
                break
        output = [i.strip() for i in output]
        # if the beginning string is empty don't include it in the tokens list
        pre_split = output[0].split(' ') if output[0] != '' else []
        post_split = output[2].split(' ') if output[2] != '' else []
        output.append(pre_split + [output[1]] + post_split)
        output.append((len(pre_split), len(pre_split)+1))
    else:
        output = [row['Text'].strip(),'','',row['Text'].strip().split(' '),None]
    return dict(zip(splitting_headers,output))

whitespace_output = df_sentences.apply(
    lambda row: split_term_whitespace(row, search_word), 
    axis=1, result_type='expand')

whitespace_output

Unnamed: 0,split_0,split_1,split_2,split_tokens,split_anchor_span
0,We introduce a new language representation mod...,"BERT,",which stands for Bidirectional Encoder Represe...,"[We, introduce, a, new, language, representati...","(8, 9)"
1,"Unlike recent language representation models,",BERT,is designed to pre-train deep bidirectional re...,"[Unlike, recent, language, representation, mod...","(5, 6)"
2,"As a result, the pre-trained",BERT,model can be fine-tuned with just one addition...,"[As, a, result,, the, pre-trained, BERT, model...","(5, 6)"
3,It obtains new state-of-the-art results on ele...,,,"[It, obtains, new, state-of-the-art, results, ...",
4,,BERT:,Pre-training of Deep Bidirectional Transformer...,"[BERT:, Pre-training, of, Deep, Bidirectional,...","(0, 1)"
5,Language model pretraining has led to signific...,,,"[Language, model, pretraining, has, led, to, s...",
6,"Training is computationally expensive, often d...",,,"[Training, is, computationally, expensive,, of...",
7,We present a replication study of,BERT,"pretraining (Devlin et al., 2019) that careful...","[We, present, a, replication, study, of, BERT,...","(6, 7)"
8,We find that,BERT,"was significantly undertrained, and can match ...","[We, find, that, BERT, was, significantly, und...","(3, 4)"
9,Our best model achieves state-of-the-art resul...,,,"[Our, best, model, achieves, state-of-the-art,...",


In [8]:
# Run coreference resolution over the entire abstract, not individual sentences
output = df.apply(
    lambda row: coref_predictor.predict(row['Text']), axis=1, result_type='expand')
df_merged = df.join(output)

df_merged

Unnamed: 0,URL,ID,Type,Text,top_spans,antecedent_indices,predicted_antecedents,document,clusters
0,https://www.semanticscholar.org/paper/BERT%3A-...,0,Title,BERT: Pre-training of Deep Bidirectional Trans...,"[[0, 0], [2, 11], [6, 8], [6, 11]]","[[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3], [0,...","[-1, -1, -1, -1]","[BERT, :, Pre, -, training, of, Deep, Bidirect...",[]
1,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,We introduce a new language representation mod...,"[[0, 0], [1, 1], [2, 17], [11, 11], [13, 17], ...","[[2, 3, 4, 9, 10, 11, 12, 13, 14, 15, 16, 17, ...","[-1, -1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1...","[We, introduce, a, new, language, representati...","[[[2, 17], [25, 25], [59, 59], [105, 105], [11..."
2,https://www.semanticscholar.org/paper/RoBERTa%...,1,Title,RoBERTa: A Robustly Optimized BERT Pretraining...,"[[0, 0], [2, 7], [5, 5]]","[[0, 1, 2], [0, 1, 2], [0, 1, 2]]","[-1, -1, -1]","[RoBERTa, :, A, Robustly, Optimized, BERT, Pre...",[]
3,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,Language model pretraining has led to signific...,"[[0, 2], [4, 4], [6, 8], [10, 14], [13, 14], [...","[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[Language, model, pretraining, has, led, to, s...","[[[49, 49], [79, 79], [101, 101], [142, 142], ..."
4,https://www.semanticscholar.org/paper/DistilBE...,2,Title,"DistilBERT, a distilled version of BERT: small...","[[0, 0], [0, 6], [0, 7], [0, 14], [6, 6], [8, ...","[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1...","[-1, -1, -1, -1, -1, -1]","[DistilBERT, ,, a, distilled, version, of, BER...",[]
5,https://www.semanticscholar.org/paper/DistilBE...,2,Abstract,As Transfer Learning from large-scale pre-trai...,"[[1, 10], [4, 10], [15, 18], [15, 19], [15, 20...","[[12, 13, 14, 15, 16, 17, 19, 43, 44, 45, 46, ...","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1...","[As, Transfer, Learning, from, large, -, scale...","[[[4, 10], [23, 25]], [[47, 47], [104, 104], [..."
6,https://www.semanticscholar.org/paper/BERT-Red...,3,Title,BERT Rediscovers the Classical NLP Pipeline,"[[0, 0], [2, 5]]","[[0, 1], [0, 1]]","[-1, -1]","[BERT, Rediscovers, the, Classical, NLP, Pipel...",[]
7,https://www.semanticscholar.org/paper/BERT-Red...,3,Abstract,Pre-trained text encoders have rapidly advance...,"[[0, 4], [7, 7], [8, 16], [11, 11], [11, 12], ...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[Pre, -, trained, text, encoders, have, rapidl...","[[[21, 25], [43, 44], [91, 92]], [[15, 15], [5..."
8,https://www.semanticscholar.org/paper/What-Doe...,4,Title,What Does BERT Look At? An Analysis of BERT's ...,"[[2, 2], [3, 3], [9, 10], [9, 11]]","[[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3], [0,...","[-1, -1, 0, -1]","[What, Does, BERT, Look, At, ?, An, Analysis, ...","[[[2, 2], [9, 10]]]"
9,https://www.semanticscholar.org/paper/What-Doe...,4,Abstract,Large pre-trained neural networks such as BERT...,"[[0, 8], [8, 8], [11, 15], [11, 35], [15, 15],...","[[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 39, ...","[-1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1...","[Large, pre, -, trained, neural, networks, suc...","[[[0, 8], [28, 28]], [[70, 80], [83, 83]], [[8..."


In [9]:
import re

# transform the output of coreference resolution into something that is more easily manipulated
# split it across multiple sentences so each indiv sentence row can still work
def reinterpret_coref_clusters(row, search_word, sentences):
    # Create dicts to map full-document to indiv sentence data
    src = sentences.loc[sentences['ID'] == row['ID']].loc[sentences['Type'] == row['Type']]['Text']
    curr_sentence = 0
    consumed = 0
    sent_mapping = {}
    sent_content = {}
    last_sent_end = 0
    doct_mapping = {}
    doct_split = []
    for i in range(len(row['document'])):
        if row['document'][i].strip() != '':
            if row['document'][i] not in src.iloc[curr_sentence][consumed:]:
                doct_split.append(row['document'][last_sent_end:i])
                last_sent_end = i
                curr_sentence += 1
                consumed = 0
            offset = src.iloc[curr_sentence][consumed:].index(row['document'][i])
            sent_mapping[i] = curr_sentence
            if curr_sentence not in sent_content:
                sent_content[curr_sentence] = []
            sent_content[curr_sentence].append(i)
            doct_mapping[i] = i - last_sent_end
            consumed += offset + len(row['document'][i])
        else:
            sent_mapping[i] = curr_sentence
            sent_content[curr_sentence].append(i)
            doct_mapping[i] = i - last_sent_end
        doct_split.append(row['document'][last_sent_end:])
    # Select the cluster that best matches the search word
    selcluster_idx = -1
    selcluster_ct = 0
    for i in range(len(row['clusters'])):
        currcluster_ct = 0
        for c in row['clusters'][i]:
            currcluster_ct += len(re.findall(f'{search_word}', ' '.join(row['document'][c[0]:c[1]+1])))
        if currcluster_ct > selcluster_ct:
            selcluster_idx = i
            selcluster_ct = currcluster_ct
    # Build the output row
    output = [sent_mapping, sent_content, doct_mapping, selcluster_idx]
    return dict(zip(['sent_mapping', 'sent_content', 'doct_mapping', 'selcluster_idx'],output))

output = df_merged.apply(
    lambda row: reinterpret_coref_clusters(row, search_word, df_sentences), 
    axis=1, result_type='expand')
df_merged = df_merged.join(output)

df_merged.to_csv(f'outputs/coreference-partial.csv')
df_merged

Unnamed: 0,URL,ID,Type,Text,top_spans,antecedent_indices,predicted_antecedents,document,clusters,sent_mapping,sent_content,doct_mapping,selcluster_idx
0,https://www.semanticscholar.org/paper/BERT%3A-...,0,Title,BERT: Pre-training of Deep Bidirectional Trans...,"[[0, 0], [2, 11], [6, 8], [6, 11]]","[[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3], [0,...","[-1, -1, -1, -1]","[BERT, :, Pre, -, training, of, Deep, Bidirect...",[],"{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: ...","{0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]}","{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: ...",-1
1,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,We introduce a new language representation mod...,"[[0, 0], [1, 1], [2, 17], [11, 11], [13, 17], ...","[[2, 3, 4, 9, 10, 11, 12, 13, 14, 15, 16, 17, ...","[-1, -1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1...","[We, introduce, a, new, language, representati...","[[[2, 17], [25, 25], [59, 59], [105, 105], [11...","{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: ...","{0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...","{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: ...",0
2,https://www.semanticscholar.org/paper/RoBERTa%...,1,Title,RoBERTa: A Robustly Optimized BERT Pretraining...,"[[0, 0], [2, 7], [5, 5]]","[[0, 1, 2], [0, 1, 2], [0, 1, 2]]","[-1, -1, -1]","[RoBERTa, :, A, Robustly, Optimized, BERT, Pre...",[],"{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0}","{0: [0, 1, 2, 3, 4, 5, 6, 7]}","{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7}",-1
3,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,Language model pretraining has led to signific...,"[[0, 2], [4, 4], [6, 8], [10, 14], [13, 14], [...","[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[Language, model, pretraining, has, led, to, s...","[[[49, 49], [79, 79], [101, 101], [142, 142], ...","{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: ...","{0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...","{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: ...",1
4,https://www.semanticscholar.org/paper/DistilBE...,2,Title,"DistilBERT, a distilled version of BERT: small...","[[0, 0], [0, 6], [0, 7], [0, 14], [6, 6], [8, ...","[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5], [0, 1...","[-1, -1, -1, -1, -1, -1]","[DistilBERT, ,, a, distilled, version, of, BER...",[],"{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: ...","{0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...","{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: ...",-1
5,https://www.semanticscholar.org/paper/DistilBE...,2,Abstract,As Transfer Learning from large-scale pre-trai...,"[[1, 10], [4, 10], [15, 18], [15, 19], [15, 20...","[[12, 13, 14, 15, 16, 17, 19, 43, 44, 45, 46, ...","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1...","[As, Transfer, Learning, from, large, -, scale...","[[[4, 10], [23, 25]], [[47, 47], [104, 104], [...","{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: ...","{0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...","{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: ...",2
6,https://www.semanticscholar.org/paper/BERT-Red...,3,Title,BERT Rediscovers the Classical NLP Pipeline,"[[0, 0], [2, 5]]","[[0, 1], [0, 1]]","[-1, -1]","[BERT, Rediscovers, the, Classical, NLP, Pipel...",[],"{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0}","{0: [0, 1, 2, 3, 4, 5]}","{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}",-1
7,https://www.semanticscholar.org/paper/BERT-Red...,3,Abstract,Pre-trained text encoders have rapidly advance...,"[[0, 4], [7, 7], [8, 16], [11, 11], [11, 12], ...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[Pre, -, trained, text, encoders, have, rapidl...","[[[21, 25], [43, 44], [91, 92]], [[15, 15], [5...","{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: ...","{0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...","{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: ...",0
8,https://www.semanticscholar.org/paper/What-Doe...,4,Title,What Does BERT Look At? An Analysis of BERT's ...,"[[2, 2], [3, 3], [9, 10], [9, 11]]","[[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3], [0,...","[-1, -1, 0, -1]","[What, Does, BERT, Look, At, ?, An, Analysis, ...","[[[2, 2], [9, 10]]]","{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 1, 7: ...","{0: [0, 1, 2, 3, 4, 5], 1: [6, 7, 8, 9, 10, 11]}","{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 0, 7: ...",0
9,https://www.semanticscholar.org/paper/What-Doe...,4,Abstract,Large pre-trained neural networks such as BERT...,"[[0, 8], [8, 8], [11, 15], [11, 35], [15, 15],...","[[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 39, ...","[-1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1...","[Large, pre, -, trained, neural, networks, suc...","[[[0, 8], [28, 28]], [[70, 80], [83, 83]], [[8...","{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: ...","{0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...","{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: ...",2


In [10]:
# Split based on co-references to any phrase containing BERT, using allennlp coreference resolution
# This does NOT preserve the original sentence spacing
# REQUIRES THAT WE ALREADY RAN THE COREFERENCE PREDICTOR - this func does NOT do all of the work!
def split_term_coreference(row, search_word, lookup, fallback):
    # Splits on first coref instance ONLY
    # there's probably a cleaner way to do this...
    lookup_row = lookup.loc[lookup['ID']==row['ID']].loc[lookup['Type']==row['Type']].to_dict(orient='records')[0]
    if lookup_row['selcluster_idx'] == -1:
        # if we didn't identify any clusters that match the search term, use our fallback method
        return fallback(row, search_word)
    split_clusters = lookup_row['clusters'][lookup_row['selcluster_idx']]
    output = []
    for i in range(len(split_clusters)):
        c = split_clusters[i]
        if lookup_row['sent_mapping'][c[0]] == lookup_row['sent_mapping'][c[0]] == row['Index']:
            sentence_start = lookup_row['sent_content'][row['Index']][0]
            sentence_end = lookup_row['sent_content'][row['Index']][-1]
            pre_split = lookup_row['document'][sentence_start:c[0]]
            anchor = lookup_row['document'][c[0]:c[1]+1]
            post_split = lookup_row['document'][c[1]+1:sentence_end]
            output=[' '.join(pre_split),
                    ' '.join(anchor),
                    ' '.join(post_split)]
            output.append(lookup_row['document'][sentence_start:sentence_end])
            output.append((len(pre_split), len(pre_split)+len(anchor)))
            break
    if output == []:
        # if there wasn't any reference in the sentence found, use our fallback method
        return fallback(row, search_word)
    return dict(zip(splitting_headers,output))

coreference_output = df_sentences.apply(
    lambda row: split_term_coreference(row, search_word, df_merged, split_term_whitespace), 
    axis=1, result_type='expand')

coreference_output

Unnamed: 0,split_0,split_1,split_2,split_tokens,split_anchor_span
0,We introduce,a new language representation model called BER...,,"[We, introduce, a, new, language, representati...","(2, 18)"
1,"Unlike recent language representation models ,",BERT,is designed to pre - train deep bidirectional ...,"[Unlike, recent, language, representation, mod...","(6, 7)"
2,"As a result , the pre - trained",BERT,model can be fine - tuned with just one additi...,"[As, a, result, ,, the, pre, -, trained, BERT,...","(8, 9)"
3,,It,obtains new state - of - the - art results on ...,"[It, obtains, new, state, -, of, -, the, -, ar...","(0, 1)"
4,,BERT:,Pre-training of Deep Bidirectional Transformer...,"[BERT:, Pre-training, of, Deep, Bidirectional,...","(0, 1)"
5,Language model pretraining has led to signific...,,,"[Language, model, pretraining, has, led, to, s...",
6,"Training is computationally expensive, often d...",,,"[Training, is, computationally, expensive,, of...",
7,We present a replication study of,BERT,"pretraining ( Devlin et al . , 2019 ) that car...","[We, present, a, replication, study, of, BERT,...","(6, 7)"
8,We find that,BERT,"was significantly undertrained , and can match...","[We, find, that, BERT, was, significantly, und...","(3, 4)"
9,Our best model achieves state-of-the-art resul...,,,"[Our, best, model, achieves, state-of-the-art,...",


# Grouping functions

Assume we have an input dataframe with column headers `['split_0','split_1','split_2', 'split_tokens', 'split_anchor_span']`

`'split_1'` is the column that contains our search term / anchor point

`'split_0'` and `'split_2'` are the columns that contain text before and after the search terms respectively

In [11]:
# group is the text uniquely identifying a group
grouping_headers = ['group']

In [24]:
sample_input = df_sentences.join(coreference_output)
sample_input

Unnamed: 0,URL,ID,Type,Index,Text,split_0,split_1,split_2,split_tokens,split_anchor_span
0,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,0,We introduce a new language representation mod...,We introduce,a new language representation model called BER...,,"[We, introduce, a, new, language, representati...","(2, 18)"
1,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,1,"Unlike recent language representation models, ...","Unlike recent language representation models ,",BERT,is designed to pre - train deep bidirectional ...,"[Unlike, recent, language, representation, mod...","(6, 7)"
2,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,2,"As a result, the pre-trained BERT model can be...","As a result , the pre - trained",BERT,model can be fine - tuned with just one additi...,"[As, a, result, ,, the, pre, -, trained, BERT,...","(8, 9)"
3,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,3,It obtains new state-of-the-art results on ele...,,It,obtains new state - of - the - art results on ...,"[It, obtains, new, state, -, of, -, the, -, ar...","(0, 1)"
4,https://www.semanticscholar.org/paper/BERT%3A-...,0,Title,0,BERT: Pre-training of Deep Bidirectional Trans...,,BERT:,Pre-training of Deep Bidirectional Transformer...,"[BERT:, Pre-training, of, Deep, Bidirectional,...","(0, 1)"
5,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,0,Language model pretraining has led to signific...,Language model pretraining has led to signific...,,,"[Language, model, pretraining, has, led, to, s...",
6,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,1,"Training is computationally expensive, often d...","Training is computationally expensive, often d...",,,"[Training, is, computationally, expensive,, of...",
7,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,2,We present a replication study of BERT pretrai...,We present a replication study of,BERT,"pretraining ( Devlin et al . , 2019 ) that car...","[We, present, a, replication, study, of, BERT,...","(6, 7)"
8,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,3,We find that BERT was significantly undertrain...,We find that,BERT,"was significantly undertrained , and can match...","[We, find, that, BERT, was, significantly, und...","(3, 4)"
9,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,4,Our best model achieves state-of-the-art resul...,Our best model achieves state-of-the-art resul...,,,"[Our, best, model, achieves, state-of-the-art,...",


In [25]:
# Group on the literal first word that comes after the anchor point
def group_first_word(row):
    if (row['split_anchor_span'] is not None) and \
            row['split_anchor_span'][1] < len(row['split_tokens']):
        output = [row['split_tokens'][row['split_anchor_span'][1]]]
    else:
        output = ['']
    return dict(zip(grouping_headers, output))

output = sample_input.apply(
    lambda row: group_first_word(row), 
    axis=1, result_type='expand').sort_values(by=['group'])

output = sample_input.join(output)
output

Unnamed: 0,URL,ID,Type,Index,Text,split_0,split_1,split_2,split_tokens,split_anchor_span,group
0,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,0,We introduce a new language representation mod...,We introduce,a new language representation model called BER...,,"[We, introduce, a, new, language, representati...","(2, 18)",
1,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,1,"Unlike recent language representation models, ...","Unlike recent language representation models ,",BERT,is designed to pre - train deep bidirectional ...,"[Unlike, recent, language, representation, mod...","(6, 7)",is
2,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,2,"As a result, the pre-trained BERT model can be...","As a result , the pre - trained",BERT,model can be fine - tuned with just one additi...,"[As, a, result, ,, the, pre, -, trained, BERT,...","(8, 9)",model
3,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,3,It obtains new state-of-the-art results on ele...,,It,obtains new state - of - the - art results on ...,"[It, obtains, new, state, -, of, -, the, -, ar...","(0, 1)",obtains
4,https://www.semanticscholar.org/paper/BERT%3A-...,0,Title,0,BERT: Pre-training of Deep Bidirectional Trans...,,BERT:,Pre-training of Deep Bidirectional Transformer...,"[BERT:, Pre-training, of, Deep, Bidirectional,...","(0, 1)",Pre-training
5,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,0,Language model pretraining has led to signific...,Language model pretraining has led to signific...,,,"[Language, model, pretraining, has, led, to, s...",,
6,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,1,"Training is computationally expensive, often d...","Training is computationally expensive, often d...",,,"[Training, is, computationally, expensive,, of...",,
7,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,2,We present a replication study of BERT pretrai...,We present a replication study of,BERT,"pretraining ( Devlin et al . , 2019 ) that car...","[We, present, a, replication, study, of, BERT,...","(6, 7)",pretraining
8,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,3,We find that BERT was significantly undertrain...,We find that,BERT,"was significantly undertrained , and can match...","[We, find, that, BERT, was, significantly, und...","(3, 4)",was
9,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,4,Our best model achieves state-of-the-art resul...,Our best model achieves state-of-the-art resul...,,,"[Our, best, model, achieves, state-of-the-art,...",,


In [26]:
# Group on the first verb that comes after the anchor point, using NLTK part-of-speech tagging
def group_first_verb(row):
    tokens = [nltk.word_tokenize(row['split_0']), 
              nltk.word_tokenize(row['split_1']),
              nltk.word_tokenize(row['split_2'])]
    tokens_pos = nltk.pos_tag([item for sublist in tokens for item in sublist])
    verb = ''
    for i in range(len(tokens[0])+len(tokens[1]), len(tokens_pos)):
        if tokens_pos[i][1].startswith('V'):
            verb = tokens_pos[i][0]
            break
    output = [verb]
    return dict(zip(grouping_headers, output))

output = sample_input.apply(
    lambda row: group_first_verb(row), 
    axis=1, result_type='expand').sort_values(by=['group'])

output = sample_input.join(output)
output

Unnamed: 0,URL,ID,Type,Index,Text,split_0,split_1,split_2,split_tokens,split_anchor_span,group
0,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,0,We introduce a new language representation mod...,We introduce,a new language representation model called BER...,,"[We, introduce, a, new, language, representati...","(2, 18)",
1,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,1,"Unlike recent language representation models, ...","Unlike recent language representation models ,",BERT,is designed to pre - train deep bidirectional ...,"[Unlike, recent, language, representation, mod...","(6, 7)",is
2,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,2,"As a result, the pre-trained BERT model can be...","As a result , the pre - trained",BERT,model can be fine - tuned with just one additi...,"[As, a, result, ,, the, pre, -, trained, BERT,...","(8, 9)",be
3,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,3,It obtains new state-of-the-art results on ele...,,It,obtains new state - of - the - art results on ...,"[It, obtains, new, state, -, of, -, the, -, ar...","(0, 1)",obtains
4,https://www.semanticscholar.org/paper/BERT%3A-...,0,Title,0,BERT: Pre-training of Deep Bidirectional Trans...,,BERT:,Pre-training of Deep Bidirectional Transformer...,"[BERT:, Pre-training, of, Deep, Bidirectional,...","(0, 1)",
5,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,0,Language model pretraining has led to signific...,Language model pretraining has led to signific...,,,"[Language, model, pretraining, has, led, to, s...",,
6,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,1,"Training is computationally expensive, often d...","Training is computationally expensive, often d...",,,"[Training, is, computationally, expensive,, of...",,
7,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,2,We present a replication study of BERT pretrai...,We present a replication study of,BERT,"pretraining ( Devlin et al . , 2019 ) that car...","[We, present, a, replication, study, of, BERT,...","(6, 7)",measures
8,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,3,We find that BERT was significantly undertrain...,We find that,BERT,"was significantly undertrained , and can match...","[We, find, that, BERT, was, significantly, und...","(3, 4)",was
9,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,4,Our best model achieves state-of-the-art resul...,Our best model achieves state-of-the-art resul...,,,"[Our, best, model, achieves, state-of-the-art,...",,


In [27]:
# Group on the main verb in the sentence, using allennlp dependency parsing (based on demo code)
def group_main_verb(row):
    p = dependency_predictor.predict(
        sentence=' '.join([row['split_0'], row['split_1'], row['split_2']]).strip()
    )
    output = [p['hierplane_tree']['root']['word']]
    return dict(zip(grouping_headers, output))

output = sample_input.apply(
    lambda row: group_main_verb(row), 
    axis=1, result_type='expand').sort_values(by=['group'])

output = sample_input.join(output)
output

Unnamed: 0,URL,ID,Type,Index,Text,split_0,split_1,split_2,split_tokens,split_anchor_span,group
0,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,0,We introduce a new language representation mod...,We introduce,a new language representation model called BER...,,"[We, introduce, a, new, language, representati...","(2, 18)",stands
1,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,1,"Unlike recent language representation models, ...","Unlike recent language representation models ,",BERT,is designed to pre - train deep bidirectional ...,"[Unlike, recent, language, representation, mod...","(6, 7)",designed
2,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,2,"As a result, the pre-trained BERT model can be...","As a result , the pre - trained",BERT,model can be fine - tuned with just one additi...,"[As, a, result, ,, the, pre, -, trained, BERT,...","(8, 9)",conceptually
3,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,3,It obtains new state-of-the-art results on ele...,,It,obtains new state - of - the - art results on ...,"[It, obtains, new, state, -, of, -, the, -, ar...","(0, 1)",absolute
4,https://www.semanticscholar.org/paper/BERT%3A-...,0,Title,0,BERT: Pre-training of Deep Bidirectional Trans...,,BERT:,Pre-training of Deep Bidirectional Transformer...,"[BERT:, Pre-training, of, Deep, Bidirectional,...","(0, 1)",BERT
5,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,0,Language model pretraining has led to signific...,Language model pretraining has led to signific...,,,"[Language, model, pretraining, has, led, to, s...",,led
6,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,1,"Training is computationally expensive, often d...","Training is computationally expensive, often d...",,,"[Training, is, computationally, expensive,, of...",,is
7,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,2,We present a replication study of BERT pretrai...,We present a replication study of,BERT,"pretraining ( Devlin et al . , 2019 ) that car...","[We, present, a, replication, study, of, BERT,...","(6, 7)",measures
8,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,3,We find that BERT was significantly undertrain...,We find that,BERT,"was significantly undertrained , and can match...","[We, find, that, BERT, was, significantly, und...","(3, 4)",We
9,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,4,Our best model achieves state-of-the-art resul...,Our best model achieves state-of-the-art resul...,,,"[Our, best, model, achieves, state-of-the-art,...",,of


In [28]:
# Helper function
# Return the match bounds of the sequence of elements of given sizes starting at list1[i1] and list2[i2] 
# that match
# If no given size is returned, returns max matching sequence length
# (ratio of element matches must be 1:some or some:1 between l1 and l2)
# Returns [(l1 bounds), (l2 bounds)] or None if they do not match
def list_elements_match(list1, list2, i1, i2, size1=None, size2=None):
    matchlen = 0
    if size1 is not None and size2 is not None:
        # check for exact text match
        matchlen = len(''.join(list1[i1:i1+size1]))
        if ''.join(list1[i1:i1+size1]) != ''.join(list2[i2:i2+size2]):
            return None
    elif size1 is not None:
        # and size2 is none
        matchlen = len(''.join(list1[i1:i1+size1]))
        if ''.join(list1[i1:i1+size1]) != ''.join(list2[i2:])[:matchlen]:
            return None
    elif size2 is not None:
        # and size1 is none
        matchlen = len(''.join(list2[i2:i2+size2]))
        if ''.join(list2[i2:i2+size2]) != ''.join(list1[i1:])[:matchlen]:
            return None
    else:
        # both are none; just calculate the match length
        matchlen = 0
        while l1concat[matching] == l2concat[matching]:
            matchlen += 1
    matchphrase = ''.join(list1[i1:])[:matchlen]
    # get the exact bounds for list1
    bound1 = 0
    for i in range(len(list1)-i1):
        if ''.join(list1[i1:i1+i]) == matchphrase:
            bound1 = i
            break
    # get the exact bounds for list2
    bound2 = 0
    for i in range(len(list2)-i2):
        if ''.join(list2[i2:i2+i]) == matchphrase:
            bound2 = i
            break
    return [(i1, i1+bound1), (i2, i2+bound2)]

In [29]:
# Group on the POS of the anchor point, using allennlp dependency parsing (based on demo code)
# I'm defining the "POS of a phrase" as the POS of the lowest node that contains the entire phrase
def group_pos_anchor(row, context=1):
    if row['split_anchor_span'] is None:
        return dict(zip(grouping_headers, ['']))
    p = dependency_predictor.predict(
        sentence=' '.join(row['split_tokens']).strip()
    )
    # build out a more usable version of the dependency tree with information about tree level!
    # tree_array[n] = (parent, level) so the root node is (0, 0)
    tree_array = [(h, -1) for h in p['predicted_heads']]
    need_connection = [0]
    level = 0
    while len(need_connection) > 0:
        need_connection_update = []
        for i in range(len(tree_array)):
            if tree_array[i][0] in need_connection:
                tree_array[i] = (tree_array[i][0], level)
                need_connection_update.append(i+1)
        need_connection = need_connection_update
        level += 1
    # Figure out what indexes our anchor fits into
    # Assume that the anchor is contiguous text
    # TODO: is this always true?
    for i in range(len(p['words'])):
        match = list_elements_match(
            p['words'], row['split_tokens'], i, row['split_anchor_span'][0], 
            size2=row['split_anchor_span'][1]-row['split_anchor_span'][0])
        if match is not None:
            break
    # Find the smallest containing dependency node
    matching_nodes = [(i, tree_array[i][0], tree_array[i][1]) for i in range(match[0][0], match[0][1]+1)]
    matching_nodes = list(set(matching_nodes))
    while len(matching_nodes) > 1:
        matching_nodes.sort(key=lambda x: x[2])
        parent = matching_nodes.pop()[1]
        matching_nodes.append((parent-1, tree_array[parent-1][0], tree_array[parent-1][1]))
        matching_nodes = list(set(matching_nodes))
    labeltiers = []
    while len(labeltiers) < context:
        labeltiers.append(p['pos'][matching_nodes[0][0]])
        parent = matching_nodes[0][1]
        if parent == 0:
            break
        matching_nodes[0] = (parent-1, tree_array[parent-1][0], tree_array[parent-1][1])
    return dict(zip(grouping_headers, [labeltiers]))

output = sample_input.apply(
    lambda row: group_pos_anchor(row, context=3), 
    axis=1, result_type='expand')

output = sample_input.join(output)
output

Unnamed: 0,URL,ID,Type,Index,Text,split_0,split_1,split_2,split_tokens,split_anchor_span,group
0,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,0,We introduce a new language representation mod...,We introduce,a new language representation model called BER...,,"[We, introduce, a, new, language, representati...","(2, 18)","[DET, NOUN, VERB]"
1,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,1,"Unlike recent language representation models, ...","Unlike recent language representation models ,",BERT,is designed to pre - train deep bidirectional ...,"[Unlike, recent, language, representation, mod...","(6, 7)",[VERB]
2,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,2,"As a result, the pre-trained BERT model can be...","As a result , the pre - trained",BERT,model can be fine - tuned with just one additi...,"[As, a, result, ,, the, pre, -, trained, BERT,...","(8, 9)","[NOUN, ADV, PUNCT]"
3,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,3,It obtains new state-of-the-art results on ele...,,It,obtains new state - of - the - art results on ...,"[It, obtains, new, state, -, of, -, the, -, ar...","(0, 1)","[VERB, NOUN, PROPN]"
4,https://www.semanticscholar.org/paper/BERT%3A-...,0,Title,0,BERT: Pre-training of Deep Bidirectional Trans...,,BERT:,Pre-training of Deep Bidirectional Transformer...,"[BERT:, Pre-training, of, Deep, Bidirectional,...","(0, 1)",[PROPN]
5,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,0,Language model pretraining has led to signific...,Language model pretraining has led to signific...,,,"[Language, model, pretraining, has, led, to, s...",,
6,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,1,"Training is computationally expensive, often d...","Training is computationally expensive, often d...",,,"[Training, is, computationally, expensive,, of...",,
7,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,2,We present a replication study of BERT pretrai...,We present a replication study of,BERT,"pretraining ( Devlin et al . , 2019 ) that car...","[We, present, a, replication, study, of, BERT,...","(6, 7)","[PROPN, ADP, NOUN]"
8,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,3,We find that BERT was significantly undertrain...,We find that,BERT,"was significantly undertrained , and can match...","[We, find, that, BERT, was, significantly, und...","(3, 4)","[ADV, VERB, PRON]"
9,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,4,Our best model achieves state-of-the-art resul...,Our best model achieves state-of-the-art resul...,,,"[Our, best, model, achieves, state-of-the-art,...",,


# Export CSVs

In [None]:
# Export whitespace-based split data
output = df_sentences.join(whitespace_output).apply(
    lambda row: group_first_word(row), 
    axis=1, result_type='expand').sort_values(by=['group'])
sample_input.join(output).to_csv(f'outputs/whitespace_firstword.csv')

output = df_sentences.join(whitespace_output).apply(
    lambda row: group_first_verb(row), 
    axis=1, result_type='expand').sort_values(by=['group'])
sample_input.join(output).to_csv(f'outputs/whitespace_firstverb.csv')

output = df_sentences.join(whitespace_output).apply(
    lambda row: group_main_verb(row), 
    axis=1, result_type='expand').sort_values(by=['group'])
sample_input.join(output).to_csv(f'outputs/whitespace_mainverb.csv')

output = df_sentences.join(whitespace_output).apply(
    lambda row: group_pos_anchor(row, context=3), 
    axis=1, result_type='expand')
sample_input.join(output).to_csv(f'outputs/whitespace_anchorpos.csv')

In [31]:
output = df_sentences.join(whitespace_output).apply(
    lambda row: group_pos_anchor(row, context=3), 
    axis=1, result_type='expand')
sample_input.join(output).to_csv(f'outputs/whitespace_anchorpos.csv')

output = df_sentences.join(coreference_output).apply(
    lambda row: group_pos_anchor(row, context=3), 
    axis=1, result_type='expand')
sample_input.join(output).to_csv(f'outputs/coreference_anchorpos.csv')

In [None]:
# Export coreference-based split data
output = df_sentences.join(coreference_output).apply(
    lambda row: group_first_word(row), 
    axis=1, result_type='expand').sort_values(by=['group'])
sample_input.join(output).to_csv(f'outputs/coreference_firstword.csv')

output = df_sentences.join(coreference_output).apply(
    lambda row: group_first_verb(row), 
    axis=1, result_type='expand').sort_values(by=['group'])
sample_input.join(output).to_csv(f'outputs/coreference_firstverb.csv')

output = df_sentences.join(coreference_output).apply(
    lambda row: group_main_verb(row), 
    axis=1, result_type='expand').sort_values(by=['group'])
sample_input.join(output).to_csv(f'outputs/coreference_mainverb.csv')

output = df_sentences.join(coreference_output).apply(
    lambda row: group_pos_anchor(row, context=3), 
    axis=1, result_type='expand')
sample_input.join(output).to_csv(f'outputs/coreference_anchorpos.csv')

# Misc utility functions

In [None]:
import pandas as pd
pd.set_option("display.max_rows", None)

pd.read_csv(f'outputs/coreference_mainverb.csv').sort_values(by=['group'])