In [1]:
import os

os.sys.path.insert(0, '../script')
from webnlg import WebNLGCorpus
from textacy import extract, spacy_utils, preprocess
from textacy import similarity
import spacy

In [2]:
# Load the large English model without the NER component.
nlp = spacy.load('en_core_web_lg', disable=['ner'])

# Workaround, see https://github.com/explosion/spaCy/issues/1574#issuecomment-391732372
# (presumably the loaded model does not flag its stop words -- confirm against the issue).
# Set is_stop by hand for the original, capitalised and upper-case spelling of each stop word.
for word in nlp.Defaults.stop_words:
    for w in (word, word[0].upper() + word[1:], word.upper()):
        # Look up the casing variant `w`; looking up `word` here would flag
        # the same lexeme three times and never touch the other spellings.
        lex = nlp.vocab[w]
        lex.is_stop = True

train = WebNLGCorpus.load('train')

# TODO: 
* LOG AND TRACK CASES WHERE THE ALGORITHM WASN'T ABLE TO DELEXICALIZE ALL PREDICATES

In [102]:
%%time

import re 
import pandas as pd
import pickle
from tqdm import tqdm
import string

def test_if_overlaps_with(tested_span, to_test_span=None):
    
    # doesn't overlap with None
    if not to_test_span:
        return False
    
    # tests if the spans have overlap
    return tested_span.start_char <= to_test_span.end_char and to_test_span.start_char <= tested_span.end_char

# Translation table sending every character of string.punctuation to a blank.
# Despite the name, punctuation is replaced by a space, not deleted.
translate_punct_to_none = str.maketrans(dict.fromkeys(string.punctuation, ' '))


def preprocess_text_to_compare(s):
    """Return `s` with each punctuation character swapped for one space."""
    without_punctuation = s.translate(translate_punct_to_none)
    return without_punctuation
    

# Regex matching any run of one or more consecutive dots; used with .sub('.', ...)
#    to collapse such a run into a single dot.
#    Some lexicalizations contain doubled dots, which may be typos or may be an
#    abbreviation's dot followed by the sentence-final dot.
c_dot = re.compile(r'\.{1,}')
# Regex to extract PREDICATE tags.
#    Tags are written in the format PREDICATE-$predicate_string$;
#    the capture group returns the predicate_string part.
c_predicate = re.compile(r'PREDICATE-\$(.*?)\$')

# Regex matching every character that is not an uppercase ASCII letter;
#    deleting the matches keeps only the capitals of a string (used to build
#    an abbreviation-like form of an object name).
c_remove_lowercase = re.compile(r'[^A-Z]')

def extract_from_entry(entry, ngram_lim=(1, 12), threshold_full=0.8, threshold_abbrev=0.5):
    """Delexicalize every lexicalization of `entry`.

    For each lexicalization, the n-gram most similar to the object of each
    triple is replaced by a ``PREDICATE-$<predicate>$`` tag, provided its
    similarity reaches a threshold and it does not overlap the n-gram
    chosen just before it.

    Parameters
    ----------
    entry : object
        Must provide `lexes()` (iterable of lexicalization strings) and
        `get_data()` (iterable of mappings with 'predicate' and 'object' keys).
    ngram_lim : tuple
        Arguments forwarded to `range`, i.e. n-gram sizes
        `ngram_lim[0] .. ngram_lim[1] - 1` are generated.
    threshold_full : float
        Minimum Levenshtein similarity between an n-gram and the full object string.
    threshold_abbrev : float
        Minimum Levenshtein similarity between an n-gram and the capitals-only
        form of the object; only consulted when the full comparison fails.

    Returns
    -------
    list of (str, list of list of str)
        One pair per lexicalization: the tagged text, and for every sentence
        of that text the predicates found in it, in order of appearance.
        The text is the dot-normalized lexicalization (runs of dots collapsed
        to one dot), which is the text the character offsets refer to.
    """
    positions = []

    # The comparison strings of the objects do not depend on the lexicalization
    # or on the n-gram, so compute them once per entry.
    triples = []
    for triple in entry.get_data():
        object_preprocessed = preprocess_text_to_compare(triple['object'])
        triples.append({'predicate': triple['predicate'],
                        'object': triple['object'],
                        'object_preprocessed': object_preprocessed,
                        'object_abbrev_preprocessed': c_remove_lowercase.sub('', object_preprocessed)})

    # Predicates ordered by decreasing object length (stable for equal
    # lengths), so longer objects get first pick of the n-grams.
    sorted_predicates = [t['predicate']
                         for t in sorted(triples, key=lambda t: len(t['object']), reverse=True)]

    # for each lexicalization
    for text in entry.lexes():
        # The span offsets come from the dot-normalized text, so the editable
        # character buffer must be built from that same text; building it from
        # the raw text shifts every replacement after a collapsed run of dots.
        normalized_text = c_dot.sub('.', text)

        # str is immutable, so tags are spliced into a list of characters
        text_char = list(normalized_text)

        doc = nlp(normalized_text)

        ngrams = []
        for n in range(*ngram_lim):
            ngrams.extend(extract.ngrams(doc, n, filter_punct=False, filter_stops=False, filter_nums=False))

        # One candidate row per (n-gram, triple) pair, n-gram major; the row
        # order matters because nlargest keeps the first row on ties.
        sims = []
        for ngram in ngrams:
            ngram_preprocessed = preprocess_text_to_compare(ngram.text)

            for triple in triples:
                sims.append({'ngram': ngram,
                             'predicate': triple['predicate'],
                             'object': triple['object'],
                             'sim': similarity.levenshtein(ngram_preprocessed,
                                                           triple['object_preprocessed']),
                             'sim_abbrev': similarity.levenshtein(ngram_preprocessed,
                                                                  triple['object_abbrev_preprocessed'])})

        # (n-gram, predicate) pairs selected for replacement
        choosen = []

        # Without candidates (no n-grams or no triples) there is nothing to select.
        if sims:
            g_ = pd.DataFrame(sims)
            choosen_ngram = None

            for predicate in sorted_predicates:

                # removes the candidates overlapping the previously chosen n-gram
                if choosen_ngram is not None and not g_.empty:
                    overlaps = g_.ngram.apply(lambda n: test_if_overlaps_with(n, choosen_ngram))
                    g_ = g_[~overlaps]

                candidates = g_[g_.predicate == predicate]

                # Every candidate for this predicate may already be filtered out;
                # treat that like a failed match instead of raising IndexError.
                if candidates.empty:
                    choosen_ngram = None
                    continue

                choosen_row = candidates.nlargest(1, 'sim')

                if choosen_row.sim.values[0] < threshold_full:

                    # fall back to the comparison against the capitals-only form
                    choosen_row = candidates.nlargest(1, 'sim_abbrev')

                    if choosen_row.sim_abbrev.values[0] < threshold_abbrev:
                        choosen_ngram = None
                        continue

                choosen_ngram = choosen_row.ngram.values[0]
                choosen.append((choosen_ngram, predicate))

        # Splice the tags in from the end of the text towards the start so the
        # offsets of the spans still to be replaced stay valid.
        for ngram, predicate in sorted(choosen, key=lambda pair: pair[0].end_char, reverse=True):
            text_char[ngram.start_char: ngram.end_char] = f'PREDICATE-${predicate}$'

        final = ''.join(text_char)

        # predicates per sentence of the tagged text
        predicate_position = [c_predicate.findall(sent.text) for sent in nlp(final).sents]

        positions.append((final, predicate_position))

    return positions
    

# Accumulator for the per-entry alignments.
result = []

# The full-corpus run and the pickle dump below are commented out,
# so this cell leaves `result` empty and writes no file.
#for entry in tqdm(list(train)):
#    result.append(extract_from_entry(entry))

#with open('alignment3', 'bw') as f:
#   pickle.dump(result, f)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 42 µs
Parser   : 109 ms


In [113]:
# Materialise the training corpus as a list so entries can be picked by index,
# then display the tenth entry from the end.
X = list(train)
X[-10]

Triple info: category=WrittenWork eid=Id239

	Modified triples:

United_States | leaderTitle | President_of_the_United_States
1634:_The_Ram_Rebellion | country | United_States
United_States | ethnicGroup | Asian_Americans


	Lexicalizations:

1634 The Ram Rebellion comes from the United States where one of the ethnic groups is Asian Americans and the leader of the country is called the President.
1634 The Ram Rebellion comes from the United States, where the title of the leader is the President and where there are many Asian Americans.
1634: The Ram Rebellion was written in the US; which is home to many Asian Americans.

In [112]:
# Delexicalize the entry shown above with a lower full-match threshold (0.5 instead of the default 0.8).
# NOTE(review): in the recorded output the 'United States' / 'US' mention is tagged leaderTitle
# and no country tag appears -- looks like a misalignment; confirm.
extract_from_entry(X[-10], threshold_full=0.5)

[('1634 The Ram Rebellion comes PREDICATE-$leaderTitle$ where one of the ethnic groups is PREDICATE-$ethnicGroup$ and the leader of the country is called the President.',
  [['leaderTitle', 'ethnicGroup']]),
 ('1634 The Ram Rebellion comes PREDICATE-$leaderTitle$, where the title of the leader is the President and where there are many PREDICATE-$ethnicGroup$.',
  [['leaderTitle', 'ethnicGroup']]),
 ('1634: The Ram Rebellion was written in the PREDICATE-$leaderTitle$; which is home to many PREDICATE-$ethnicGroup$.',
  [['leaderTitle', 'ethnicGroup']])]

In [None]:
# -500
# Gary_Cohn_(comics) -> The comic character Bolt was created by Paris Cullins and Gary Cohn, the former being a United States national.
# -600
# Brazil -> Brazilian-based