In [None]:
import pandas as pd
# Show every row when a DataFrame is displayed (no truncation).
pd.options.display.max_rows = None

In [None]:
import csv
import pandas as pd
import matplotlib.pyplot as plt

# Anchor term that the splitting functions below look for in each sentence.
search_word = "BERT"

# Input table; the cells below read its `Text`, `Index`, `ID` and `Type` columns.
df = pd.read_csv(filepath_or_buffer='data/nlp-align_BERT.csv')

df

# Splitting functions

Assume we have an input dataframe with any number of columns, one of which is titled `Text` and contains one sentence of an abstract per row.

In [None]:
# Output column names shared by every splitting function:
# text before the anchor, the anchor itself, text after the anchor.
splitting_headers = [f'text_split_{position}' for position in range(3)]

In [None]:
# Split on the search word, regardless of whitespace (if search word is A and we have word CAR, it slices it up)
def split_search_term_literal(row, search_word):
    """Split ``row['Text']`` around the first literal occurrence of ``search_word``.

    The match ignores word boundaries, so the term is cut out even from the
    middle of a longer word.

    Args:
        row: Mapping / Series with a ``'Text'`` string entry.
        search_word: Substring to split on.

    Returns:
        dict keyed by ``splitting_headers``: text before the match, the match,
        text after the match. Without a match the whole text goes in the first
        slot and the other two are empty strings.
    """
    # str.partition yields (before, sep, after) on the first hit and
    # (text, '', '') when the separator is absent.
    before, anchor, after = row['Text'].partition(search_word)
    return dict(zip(splitting_headers, [before, anchor, after]))

# Preview only: the split columns are displayed, not joined back onto df
# (a `df = df.join(...)` step was left disabled here).
df.apply(split_search_term_literal, axis=1, result_type='expand', args=(search_word,))

In [None]:
# Split on the search word, taking care to only split on whitespace
def split_search_term_whitespace(row, search_word):
    """Split ``row['Text']`` around the first space-delimited token containing ``search_word``.

    The text is tokenised on single spaces; the first token that contains
    ``search_word`` becomes the anchor, so the anchor is always a whole token
    (e.g. ``RoBERTa`` when searching for ``BERT``).

    Args:
        row: Mapping / Series with a ``'Text'`` string entry.
        search_word: Substring to look for inside individual tokens.

    Returns:
        dict keyed by ``splitting_headers``: text before the anchor token, the
        anchor token, text after it. When no single token contains
        ``search_word`` the whole text goes in the first slot and the other two
        are empty strings. This includes a search term that itself spans a
        space, which previously raised ``UnboundLocalError``.
    """
    text = row['Text']
    # Default covers "term absent" and "term present but not inside one token".
    splitted = [text, '', '']
    tokens = text.split(' ')
    for i, token in enumerate(tokens):
        if search_word in token:
            # Splits on the first matching token ONLY.
            splitted = [' '.join(tokens[:i]), token, ' '.join(tokens[i+1:])]
            break
    return dict(zip(splitting_headers, splitted))

# Preview the token-level split for every row (displayed, not stored).
df.apply(split_search_term_whitespace, axis=1, result_type='expand', args=(search_word,))

In [None]:
from allennlp.predictors.predictor import Predictor
import allennlp_models.coref
# Load the AllenNLP coreference model archive from the given URL.
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz")
# Run coreference on a three-sentence sample text. The backslash continuations
# sit inside the string literal, so the leading spaces of each continued line
# are part of the document text; the alignment cell below skips tokens that are
# whitespace-only.
p = predictor.predict(
  document="The BERT language model (LM) (Devlin et al., 2019) is surprisingly good at answering cloze-style questions about relational facts. \
  Petroni et al. (2019) take this as evidence that BERT memorizes factual knowledge during pre-training. \
  We take issue with this interpretation and argue that the performance of BERT is partly due to reasoning about (the surface form of) entity names, e.g., guessing that a person with an Italian-sounding name speaks Italian."
)

In [None]:
# The same three sentences as the coref input, one list entry per sentence.
p_src=["The BERT language model (LM) (Devlin et al., 2019) is surprisingly good at answering cloze-style questions about relational facts.",
  "Petroni et al. (2019) take this as evidence that BERT memorizes factual knowledge during pre-training.",
  "We take issue with this interpretation and argue that the performance of BERT is partly due to reasoning about (the surface form of) entity names, e.g., guessing that a person with an Italian-sounding name speaks Italian."]

# Align each coref token with (sentence index, character offset in that sentence).
#   mapping[token_index]      -> character offset of the token inside its sentence
#   sent_mapping[token_index] -> index into p_src of the sentence holding the token
current_sent_counter = 0   # sentence currently being consumed
consumed_len = 0           # characters of that sentence already matched
mapping = {}
sent_mapping = {}
for di, token in enumerate(p['document']):
    if token.strip() != '':
        # Move to the next sentence when the token cannot be found in the
        # UNCONSUMED part of the current one. Testing the whole sentence (as
        # before) failed to advance when the token occurred only earlier in
        # the sentence, and the .index() call below then raised ValueError.
        if token not in p_src[current_sent_counter][consumed_len:]:
            current_sent_counter += 1
            consumed_len = 0
        remaining_index = p_src[current_sent_counter][consumed_len:].index(token)
        mapping[di] = consumed_len + remaining_index
        sent_mapping[di] = current_sent_counter
        consumed_len += remaining_index + len(token)
    else:
        # Whitespace-only tokens are pinned to the current position.
        mapping[di] = consumed_len
        sent_mapping[di] = current_sent_counter
print(mapping)
print(sent_mapping)

In [None]:
# TODO convert the full list of cluster spans into something that can be interpreted across multiple sentences

In [None]:
import re

spans = []
# Disabled exploration of the raw span / antecedent output:
# for ts in p['top_spans']:
#     spans.append(p['document'][ts[0]:ts[1]+1])
# print(spans)
# for i in range(len(p['predicted_antecedents'])):
#     if p['predicted_antecedents'][i] != -1:
#         print(spans[i], spans[p['predicted_antecedents'][i]])
print(p['clusters'])

# Pick the coreference cluster whose mentions contain the search word most often.
# Each cluster entry c is an inclusive [start, end] token span into p['document'].
selected_index = 0
selected_count = 0
for i, cluster in enumerate(p['clusters']):
    curr_count = 0
    for c in cluster:
        mention = ' '.join(p['document'][c[0]:c[1]+1])
        print(mention)
        # Literal substring count: the search word is plain text, not a regex,
        # so regex metacharacters in it must not be interpreted.
        curr_count += mention.count(search_word)
    # Strict '>' keeps the earliest cluster on ties.
    if curr_count > selected_count:
        selected_index = i
        selected_count = curr_count
    print()
print(selected_index, selected_count)

In [None]:
from allennlp.predictors.predictor import Predictor
import allennlp_models.coref
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz")

# Build one row per (ID, Type) with the sentences concatenated in `Index` order.
# Coreference has to see the whole abstract rather than single sentences, so
# the merged table is created before the model is run.
sentences_in_order = df.sort_values(by=['Index'])
df_merged = (
    sentences_in_order
    .groupby(['ID', 'Type'])['Text']
    .apply(' '.join)
    .reset_index()
)

# Run the coref model on every merged text and attach each key of the
# prediction dict as its own column.
coref_columns = df_merged.apply(
    lambda row: predictor.predict(row['Text']),
    axis=1,
    result_type='expand',
)
df_merged = df_merged.join(coref_columns)

df_merged

In [None]:
# Inspect the grouped `Text` selection used to build df_merged
# (this displays the groupby object itself, not the grouped data).
(
    df
    .sort_values(by=['Index'])
    .groupby(['ID', 'Type'])['Text']
)

In [None]:
# Split based on co-references to any phrase containing BERT, using allennlp coreference resolution
# TODO should this preserve the original sentence spacing?
def split_search_term_coreference(row, search_word):
    """Split ``row['Text']`` around the first token containing ``search_word``.

    NOTE: no coreference information is used yet. The logic is the same
    space-token split as the whitespace splitter, kept as a placeholder until
    the coreference clusters are wired in.

    Args:
        row: Mapping / Series with a ``'Text'`` string entry.
        search_word: Substring to look for inside individual tokens.

    Returns:
        dict keyed by ``splitting_headers``: text before the anchor token, the
        anchor token, text after it. When no single token contains
        ``search_word`` the whole text goes in the first slot and the other two
        are empty strings. This includes a search term that itself spans a
        space, which previously raised ``UnboundLocalError``.
    """
    text = row['Text']
    # Default covers "term absent" and "term present but not inside one token".
    splitted = [text, '', '']
    tokens = text.split(' ')
    for i, token in enumerate(tokens):
        if search_word in token:
            # Splits on the first matching token ONLY.
            splitted = [' '.join(tokens[:i]), token, ' '.join(tokens[i+1:])]
            break
    return dict(zip(splitting_headers, splitted))

# Preview the split produced by the coreference splitter (displayed, not stored).
df.apply(split_search_term_coreference, axis=1, result_type='expand', args=(search_word,))

# Grouping functions

Assume we have an input dataframe with column headers `['text_split_0','text_split_1','text_split_2']`

`'text_split_1'` is the column that contains our search term / anchor point

`'text_split_0'` and `'text_split_2'` are the columns that contain the text before and after the search term, respectively

In [None]:
# Input for the grouping functions: the token-level split of every row.
sample_input = df.apply(
    split_search_term_whitespace,
    axis=1,
    result_type='expand',
    args=(search_word,),
)

sample_input

In [None]:
# Group on the first word (regardless of part of speech) that comes after the anchor point
def group_first_word(row):
    """Return the first word after the anchor as the grouping key.

    Args:
        row: Mapping / Series with a ``'text_split_2'`` string entry (the text
            that follows the anchor).

    Returns:
        ``{'group': word}`` where ``word`` is the first whitespace-delimited
        word of ``text_split_2``, or ``''`` when there is no following text.
    """
    # split() with no argument ignores leading/repeated whitespace, so text
    # that starts with a space (as left behind by a literal split on the
    # search word) still yields its first real word instead of ''.
    words = row['text_split_2'].split()
    return {'group': words[0] if words else ''}

# One 'group' value per row: the first word following the anchor.
output = sample_input.apply(group_first_word, axis=1, result_type='expand').sort_values(by=['group'])

# Drop any 'group' column left over from a previous grouping (or a re-run of
# this cell) before joining; otherwise join() raises "columns overlap".
# The join aligns on the row index, so sample_input keeps its own row order.
sample_input = sample_input.drop(columns=['group'], errors='ignore').join(output)

sample_input.to_csv('outputs/whitespace_firstword.csv')

# Kept as the last expression so the notebook actually renders the table.
sample_input

In [None]:
import nltk
# group_first_verb uses both nltk.word_tokenize (needs the Punkt tokenizer
# data) and nltk.pos_tag (needs the perceptron tagger); fetch both so the
# cell works on a fresh environment.
# NOTE(review): newer NLTK releases may name these resources differently
# ('punkt_tab', 'averaged_perceptron_tagger_eng') — confirm against the pinned version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Group on the first verb that comes after the anchor point, using NLTK part-of-speech tagging
def group_first_verb(row):
    """Return the first verb after the anchor as the grouping key.

    The three split columns are tokenised separately and then POS-tagged as
    one sequence, so the tagger sees the whole sentence while the search for a
    verb starts only after the anchor's tokens.

    Args:
        row: Mapping / Series with ``'text_split_0'``, ``'text_split_1'`` and
            ``'text_split_2'`` string entries.

    Returns:
        ``{'group': verb}`` where ``verb`` is the first token after the anchor
        whose tag starts with ``'V'``, or ``''`` if there is none.
    """
    before = nltk.word_tokenize(row['text_split_0'])
    anchor = nltk.word_tokenize(row['text_split_1'])
    after = nltk.word_tokenize(row['text_split_2'])

    tagged = nltk.pos_tag(before + anchor + after)

    # Skip everything up to and including the anchor tokens.
    first_after_anchor = len(before) + len(anchor)
    first_verb = next(
        (word for word, tag in tagged[first_after_anchor:] if tag.startswith('V')),
        '',
    )
    return {'group': first_verb}

# One 'group' value per row: the first verb following the anchor.
output = sample_input.apply(group_first_verb, axis=1, result_type='expand').sort_values(by=['group'])

# sample_input may already carry a 'group' column from an earlier grouping
# cell; joining a second 'group' column raises "columns overlap but no suffix
# specified", so remove the stale one first. The join aligns on the row index.
sample_input = sample_input.drop(columns=['group'], errors='ignore').join(output)

sample_input.to_csv('outputs/whitespace_firstverb.csv')

# Kept as the last expression so the notebook actually renders the table.
sample_input

In [None]:
from allennlp.predictors.predictor import Predictor
import allennlp_models.structured_prediction
# Rebinds `predictor`: from here on it is the dependency-parser model loaded
# from this archive, no longer the coreference model loaded in earlier cells.
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/biaffine-dependency-parser-ptb-2020.04.06.tar.gz")

# Group on the main verb in the sentence, using allennlp dependency parsing (based on demo code)
def group_main_verb(row):
    """Return the root word of the sentence's dependency parse as the grouping key.

    The sentence is rebuilt by joining the three split columns with single
    spaces and stripping the ends, then parsed with the module-level
    ``predictor``.

    Args:
        row: Mapping / Series with ``'text_split_0'``, ``'text_split_1'`` and
            ``'text_split_2'`` string entries.

    Returns:
        ``{'group': word}`` where ``word`` is ``hierplane_tree.root.word`` of
        the prediction (presumably the main verb — the parser decides the root).
    """
    pieces = [row[column] for column in ('text_split_0', 'text_split_1', 'text_split_2')]
    sentence = ' '.join(pieces).strip()
    parse = predictor.predict(sentence=sentence)
    root_word = parse['hierplane_tree']['root']['word']
    return {'group': root_word}

# One 'group' value per row: the root word of the dependency parse.
output = sample_input.apply(group_main_verb, axis=1, result_type='expand').sort_values(by=['group'])

# sample_input may already carry a 'group' column from an earlier grouping
# cell; joining a second 'group' column raises "columns overlap but no suffix
# specified", so remove the stale one first. The join aligns on the row index.
sample_input = sample_input.drop(columns=['group'], errors='ignore').join(output)

sample_input.to_csv('outputs/whitespace_mainverb.csv')

# Kept as the last expression so the notebook actually renders the table.
sample_input