In [1]:
import pandas as pd
# Lift pandas' row limit so displayed DataFrames are rendered in full rather than truncated.
pd.set_option("display.max_rows", None)

In [2]:
import csv
import pandas as pd
import matplotlib.pyplot as plt

# Anchor term: the splitting functions below cut each sentence around this word.
search_word = "BERT"
# One row per sentence; the displayed output shows URL, ID, Type, Index and Text columns.
df = pd.read_csv('data/nlp-align_BERT.csv')

# Display the loaded frame for inspection.
df

Unnamed: 0,URL,ID,Type,Index,Text
0,https://www.semanticscholar.org/paper/BERT%3A-...,0,Title,0,BERT: Pre-training of Deep Bidirectional Trans...
1,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,0,We introduce a new language representation mod...
2,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,1,"Unlike recent language representation models, ..."
3,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,2,"As a result, the pre-trained BERT model can be..."
4,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,3,BERT is conceptually simple and empirically po...
5,https://www.semanticscholar.org/paper/BERT%3A-...,0,Abstract,4,It obtains new state-of-the-art results on ele...
6,https://www.semanticscholar.org/paper/RoBERTa%...,1,Title,0,RoBERTa: A Robustly Optimized BERT Pretraining...
7,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,0,Language model pretraining has led to signific...
8,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,1,"Training is computationally expensive, often d..."
9,https://www.semanticscholar.org/paper/RoBERTa%...,1,Abstract,2,We present a replication study of BERT pretrai...


# Splitting functions

Assume the input dataframe has a column named `Text` (any other columns are ignored by the splitting functions). Each row of `Text` holds a single sentence taken from a paper's title or abstract.

In [3]:
# Output column names used by the splitting functions, in order:
# text before the anchor, the anchor itself, text after the anchor.
splitting_headers = ['text_split_0','text_split_1','text_split_2']

In [4]:
# Split on the search word as a raw substring, ignoring word boundaries
# (searching for "A" inside the word "CAR" cuts it into "C", "A", "R").
def split_search_term_literal(row, search_word, headers=None):
    """Split ``row['Text']`` around the first literal occurrence of ``search_word``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a string under ``'Text'``.
        search_word: Substring to split on; only the first occurrence is used.
        headers: Optional sequence of three keys for the returned dict. When
            omitted, the notebook-level ``splitting_headers`` list is used.

    Returns:
        dict mapping the three headers to (text before, search word, text after).
        If ``search_word`` does not occur, the result is (full text, '', '').

    Raises:
        ValueError: if ``search_word`` is the empty string.
    """
    if headers is None:
        headers = splitting_headers
    # str.partition returns (before, sep, after) for the first occurrence and
    # (text, '', '') when the separator is absent -- exactly the shape we need.
    return dict(zip(headers, row['Text'].partition(search_word)))

# df = df.join()
df.apply(
    split_search_term_literal,
    axis=1,
    result_type='expand',
    args=(search_word,),
)

Unnamed: 0,text_split_0,text_split_1,text_split_2
0,,BERT,: Pre-training of Deep Bidirectional Transform...
1,We introduce a new language representation mod...,BERT,", which stands for Bidirectional Encoder Repre..."
2,"Unlike recent language representation models,",BERT,is designed to pre-train deep bidirectional r...
3,"As a result, the pre-trained",BERT,model can be fine-tuned with just one additio...
4,,BERT,is conceptually simple and empirically powerf...
5,It obtains new state-of-the-art results on ele...,,
6,Ro,BERT,a: A Robustly Optimized BERT Pretraining Approach
7,Language model pretraining has led to signific...,,
8,"Training is computationally expensive, often d...",,
9,We present a replication study of,BERT,"pretraining (Devlin et al., 2019) that carefu..."


In [5]:
# Split on the search word, but only cut at spaces so the anchor is a whole token
def split_search_term_whitespace(row, search_word, headers=None):
    """Split ``row['Text']`` around the first space-delimited token containing ``search_word``.

    The text is tokenised on single space characters; the first token that
    contains ``search_word`` becomes the anchor (so punctuation or affixes
    attached to it, e.g. ``"BERT,"`` or ``"RoBERTa:"``, stay with the anchor).

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a string under ``'Text'``.
        search_word: Term to look for inside individual tokens.
        headers: Optional sequence of three keys for the returned dict. When
            omitted, the notebook-level ``splitting_headers`` list is used.

    Returns:
        dict mapping the three headers to (text before, anchor token, text after).
        If no single token contains ``search_word`` -- either it is absent, or it
        spans several tokens (e.g. a multi-word term) -- the result is
        (full text, '', '').
    """
    if headers is None:
        headers = splitting_headers
    text = row['Text']
    tokens = text.split(' ')
    # Start from the "no match" shape so a term that occurs in the text but in
    # no single token (e.g. one containing a space) cannot leave the result
    # unassigned.
    splitted = [text, '', '']
    for i, token in enumerate(tokens):
        if search_word in token:
            # Splits on first matching token ONLY
            splitted = [' '.join(tokens[:i]), token, ' '.join(tokens[i + 1:])]
            break
    return dict(zip(headers, splitted))

df.apply(
    split_search_term_whitespace,
    axis=1,
    result_type='expand',
    args=(search_word,),
)

Unnamed: 0,text_split_0,text_split_1,text_split_2
0,,BERT:,Pre-training of Deep Bidirectional Transformer...
1,We introduce a new language representation mod...,"BERT,",which stands for Bidirectional Encoder Represe...
2,"Unlike recent language representation models,",BERT,is designed to pre-train deep bidirectional re...
3,"As a result, the pre-trained",BERT,model can be fine-tuned with just one addition...
4,,BERT,is conceptually simple and empirically powerful.
5,It obtains new state-of-the-art results on ele...,,
6,,RoBERTa:,A Robustly Optimized BERT Pretraining Approach
7,Language model pretraining has led to signific...,,
8,"Training is computationally expensive, often d...",,
9,We present a replication study of,BERT,"pretraining (Devlin et al., 2019) that careful..."


In [None]:
from allennlp.predictors.predictor import Predictor
import allennlp_models.coref
# Build a predictor from a remote model archive (the file name suggests a
# SpanBERT-large coreference model) and run it on a single example sentence.
# NOTE(review): this cell is unexecuted (In [None]) and `predictor` is not used
# by any other cell visible here -- confirm it is still needed.
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz")
predictor.predict(
  document="The woman reading a newspaper sat on the bench with her dog."
)

# Grouping functions

Assume we have an input dataframe with column headers `['text_split_0','text_split_1','text_split_2']`

`'text_split_1'` is the column that contains our search term / anchor point

`'text_split_0'` and `'text_split_2'` are the columns that contain text before and after the search terms respectively

In [6]:
# Whitespace-aware split of every sentence; this is the input to the grouping functions.
sample_input = df.apply(
    split_search_term_whitespace,
    axis=1,
    result_type='expand',
    args=(search_word,),
)

sample_input

Unnamed: 0,text_split_0,text_split_1,text_split_2
0,,BERT:,Pre-training of Deep Bidirectional Transformer...
1,We introduce a new language representation mod...,"BERT,",which stands for Bidirectional Encoder Represe...
2,"Unlike recent language representation models,",BERT,is designed to pre-train deep bidirectional re...
3,"As a result, the pre-trained",BERT,model can be fine-tuned with just one addition...
4,,BERT,is conceptually simple and empirically powerful.
5,It obtains new state-of-the-art results on ele...,,
6,,RoBERTa:,A Robustly Optimized BERT Pretraining Approach
7,Language model pretraining has led to signific...,,
8,"Training is computationally expensive, often d...",,
9,We present a replication study of,BERT,"pretraining (Devlin et al., 2019) that careful..."


In [7]:
# Group on the first word (regardless of part of speech) that comes after the anchor point
def group_first_word(row):
    """Label a row with the first word of the text that follows the anchor.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a string under
            ``'text_split_2'`` (the text after the anchor).

    Returns:
        dict with ``'group'`` (the first whitespace-delimited word, or ``''``
        when there is no following text) and ``'text_split_2'`` (unchanged).
    """
    after = row['text_split_2']
    # Splitting on arbitrary whitespace skips any leading space. The literal
    # splitter leaves one there (e.g. " is designed ..."), which would
    # otherwise make the first "word" an empty string.
    words = after.split(None, 1)
    first_word = words[0] if words else ''
    return {'group': first_word, 'text_split_2': after}

(
    sample_input
    .apply(group_first_word, axis=1, result_type='expand')
    .sort_values(by='group')
)

Unnamed: 0,group,text_split_2
115,,
74,,
145,,
71,,
147,,
69,,
68,,
67,,
65,,
64,,


In [8]:
import nltk
# group_first_verb calls both nltk.word_tokenize and nltk.pos_tag. The tokenizer
# needs the Punkt data and the tagger needs the perceptron model, so fetch both;
# otherwise a fresh environment fails with a LookupError on the first tokenize call.
# NOTE(review): newer NLTK releases may name these resources 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' -- confirm against the installed version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Group on the first verb found after the anchor point (NLTK part-of-speech tagging)
def group_first_verb(row):
    """Label a row with the first verb that appears after the anchor.

    The three split columns are tokenised separately and tagged together as
    one sequence, so the tagger sees the whole sentence, but only tokens that
    come from ``text_split_2`` are considered when looking for a verb.

    Args:
        row: Mapping-like object with strings under ``'text_split_0'``,
            ``'text_split_1'`` and ``'text_split_2'``.

    Returns:
        dict with ``'group'`` (the first token after the anchor whose tag
        starts with ``'V'``, or ``''`` if there is none) and
        ``'text_split_2'`` (unchanged).
    """
    before = nltk.word_tokenize(row['text_split_0'])
    anchor = nltk.word_tokenize(row['text_split_1'])
    after = nltk.word_tokenize(row['text_split_2'])

    tagged = nltk.pos_tag(before + anchor + after)

    # Skip everything up to and including the anchor tokens.
    start = len(before) + len(anchor)
    first_verb = next(
        (word for word, tag in tagged[start:] if tag.startswith('V')),
        '',
    )
    return {'group': first_verb, 'text_split_2': row['text_split_2']}

(
    sample_input
    .apply(group_first_verb, axis=1, result_type='expand')
    .sort_values(by='group')
)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/cephcyn/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,group,text_split_2
0,,Pre-training of Deep Bidirectional Transformer...
94,,
93,,
92,,heads.
91,,
143,,
89,,
88,,
87,,
144,,
