In [1]:
import os

os.sys.path.insert(0, '../script')
from webnlg import WebNLGCorpus
from textacy import extract, spacy_utils, preprocess
from textacy import similarity
import spacy

In [2]:
nlp = spacy.load('en_core_web_lg', disable=['ner'])

# Workaround for stop words not being flagged on the loaded vocab, see
# https://github.com/explosion/spaCy/issues/1574#issuecomment-391732372
# The flag has to be set on every case variant of the stop word (as written,
# capitalised, upper-cased), so the lexeme is looked up by the variant itself.
for word in nlp.Defaults.stop_words:
    for variant in (word, word[0].upper() + word[1:], word.upper()):
        nlp.vocab[variant].is_stop = True

train = WebNLGCorpus.load('train')

# TODO: 
* LOG AND TRACK CASES WHERE THE ALGORITHM WASN'T ABLE TO DELEXICALIZE ALL PREDICATES

In [3]:
%%time

import re 
import pandas as pd
import pickle
from tqdm import tqdm
import string

def test_if_overlaps_with(tested_span, to_test_span=None):
    
    # doesn't overlap with None
    if not to_test_span:
        return False
    
    # tests if the spans have overlap
    return tested_span.start_char <= to_test_span.end_char and to_test_span.start_char <= tested_span.end_char

# translation table mapping every ASCII punctuation character to a single space
translate_punct_to_none = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def preprocess_text_to_compare(s):
    """Return ``s`` with each ASCII punctuation character replaced by a space."""
    return s.translate(translate_punct_to_none)
    

# Matches a run of one or more dots; used to collapse such runs into a single dot.
#    Some lexicalizations contain two consecutive dots. These may be typos, or an
#    abbreviation dot immediately followed by the sentence-final dot.
c_dot = re.compile(r'\.{1,}')
# Matches PREDICATE tags.
#    Tags are written in the format PREDICATE-$predicate_string$;
#    the capture group yields the predicate_string part.
c_predicate = re.compile(r'PREDICATE-\$(.*?)\$')

# Matches every character that is NOT an ASCII uppercase letter (A-Z), i.e. not only
# lowercase letters but also spaces, digits and accented characters.
# Substituting matches with '' leaves only the uppercase letters of a string.
c_remove_lowercase = re.compile(r'[^A-Z]')

def extract_from_entry(entry, ngram_lim=(1, 12), threshold_full=0.5, threshold_abbrev=0.5,
                       similarity_full=similarity.levenshtein,
                       similarity_abbrev=similarity.levenshtein):
    """Delexicalize the objects of an entry's triples in each of its lexicalizations.

    For every lexicalization, all n-grams of the text are compared against the
    object of every triple. For each predicate (longest object first) the most
    similar n-gram that does not overlap the previously chosen one is replaced
    by the tag ``PREDICATE-$<predicate>$``.

    Parameters
    ----------
    entry
        Object exposing ``lexes()`` (iterable of lexicalization strings) and
        ``get_data()`` (iterable of mappings with 'predicate' and 'object' keys).
    ngram_lim : tuple
        Arguments forwarded to ``range``; the upper bound is exclusive, so the
        default ``(1, 12)`` considers n-grams of 1 to 11 tokens.
    threshold_full : float
        Minimum ``similarity_full`` score for an n-gram to be accepted as the
        object's surface form.
    threshold_abbrev : float
        Minimum ``similarity_abbrev`` score, used as a fallback when the best
        full-form score is below ``threshold_full``.
    similarity_full, similarity_abbrev : callable
        Functions taking two strings and returning a score where higher means
        more similar. ``similarity_abbrev`` compares the n-gram against the
        uppercase letters of the object only.

    Returns
    -------
    list of tuple
        One ``(delexicalized_text, predicates_per_sentence)`` pair per
        lexicalization, where ``predicates_per_sentence`` holds, for each
        sentence of the delexicalized text, the predicates found in it in
        order of appearance. Runs of dots in the text are collapsed to a
        single dot. Predicates for which no n-gram reaches either threshold
        are left untagged.
    """
    triples = list(entry.get_data())

    # the preprocessed forms of the objects do not depend on the n-gram,
    #    so they are computed once instead of once per n-gram
    prepared_triples = []
    for triple in triples:
        object_preprocessed = preprocess_text_to_compare(triple['object'])
        object_abbrev_preprocessed = c_remove_lowercase.sub('', object_preprocessed)
        prepared_triples.append((triple, object_preprocessed, object_abbrev_preprocessed))

    # predicates are processed from the longest object to the shortest
    #    (sorted is stable, ties keep the order of the triples)
    sorted_predicates = [triple['predicate']
                         for triple in sorted(triples, key=lambda t: len(t['object']), reverse=True)]

    columns = ['ngram', 'predicate', 'object', 'sim', 'sim_abbrev']

    positions = []

    # for each lexicalization
    for text in entry.lexes():

        # the doc is built from the dot-collapsed text, so the span offsets refer
        #    to that text; the char array MUST be built from the same string,
        #    otherwise every replacement after a collapsed '..' is shifted
        normalized_text = c_dot.sub('.', text)

        # creates an array of chars from the lexicalization
        #    used to replace objects strings by tags
        #    because, in Python, str is immutable
        text_char = list(normalized_text)

        doc = nlp(normalized_text)

        sims = []

        for n in range(*ngram_lim):
            for ngram in extract.ngrams(doc, n, filter_punct=False, filter_stops=False, filter_nums=False):

                ngram_preprocessed = preprocess_text_to_compare(ngram.text)

                for triple, object_preprocessed, object_abbrev_preprocessed in prepared_triples:

                    sims.append({'ngram': ngram,
                                 'predicate': triple['predicate'],
                                 'object': triple['object'],
                                 'sim': similarity_full(ngram_preprocessed, object_preprocessed),
                                 'sim_abbrev': similarity_abbrev(ngram_preprocessed, object_abbrev_preprocessed)})

        # explicit columns keep the frame usable even when there are no candidates
        df = pd.DataFrame(sims, columns=columns)

        choosen_ngram = None
        choosen_rows = []

        candidates = df

        for predicate in sorted_predicates:

            # removes overlaps with the n-gram chosen for the previous predicate
            if choosen_ngram is not None and not candidates.empty:
                overlaps = candidates.ngram.apply(lambda n: test_if_overlaps_with(n, choosen_ngram))
                candidates = candidates[~overlaps.astype(bool)]

            predicate_rows = candidates[candidates.predicate == predicate]

            # no candidate left for this predicate: it can't be delexicalized
            if predicate_rows.empty:
                choosen_ngram = None
                continue

            choosen_row = predicate_rows.nlargest(1, 'sim')

            if choosen_row.sim.values[0] < threshold_full:

                # falls back to the comparison against the object's abbreviation
                choosen_row = predicate_rows.nlargest(1, 'sim_abbrev')

                if choosen_row.sim_abbrev.values[0] < threshold_abbrev:
                    choosen_ngram = None
                    continue

            choosen_ngram = choosen_row.ngram.values[0]
            choosen_rows.append(choosen_row.index.values[0])

        # replaces from the end of the text to its beginning, so the offsets of
        #    the spans still to be replaced are not shifted by earlier replacements
        choosen_spans = [(df.at[idx, 'ngram'], df.at[idx, 'predicate']) for idx in choosen_rows]
        choosen_spans.sort(key=lambda pair: pair[0].end_char, reverse=True)

        for span, predicate in choosen_spans:

            text_char[span.start_char: span.end_char] = f'PREDICATE-${predicate}$'

        final = ''.join(text_char)

        # predicates found in each sentence of the delexicalized text
        predicate_position = [c_predicate.findall(sent.text) for sent in nlp(final).sents]

        positions.append((final, predicate_position))

    return positions
    

# delexicalizes every training entry (tqdm shows the progress)
result = [extract_from_entry(entry) for entry in tqdm(list(train))]

# persists the alignments so the slow extraction doesn't have to be re-run
with open('alignment', 'bw') as f:
    pickle.dump(result, f)

100%|██████████| 6940/6940 [09:29<00:00, 12.18it/s]

CPU times: user 36min 54s, sys: 3.7 s, total: 36min 58s
Wall time: 9min 31s





In [5]:
# materializes the training corpus as a list so entries can be indexed by position
X = list(train)
X[-600]

Triple info: category=SportsTeam eid=Id128

	Modified triples:

Agremiação_Sportiva_Arapiraquense | league | Campeonato_Brasileiro_Série_C
Campeonato_Brasileiro_Série_C | country | Brazil
Agremiação_Sportiva_Arapiraquense | season | 2015


	Lexicalizations:

Agremiação Sportiva Arapiraquense play in the Campeonato Brasileiro Série C league in Brazil in 2015.
Agremiação Sportiva Arapiraquense play in Brazil's Campeonato Brasileiro Série C league and competed in the 2015 event.
Agremiação Sportiva Arapiraquense play in the Brazilian-based Campeonato Brasileiro Série C league and they played in the 2015 season.

In [6]:
result[-600]

[('Agremiação Sportiva Arapiraquense play in the PREDICATE-$league$ league in PREDICATE-$country$ in PREDICATE-$season$.',
  [['league', 'country', 'season']]),
 ("Agremiação Sportiva Arapiraquense play in PREDICATE-$country$'s PREDICATE-$league$ league and competed in the PREDICATE-$season$ event.",
  [['country', 'league', 'season']]),
 ('Agremiação Sportiva Arapiraquense play in the Brazilian-based PREDICATE-$league$ league and they played in the PREDICATE-$season$ season.',
  [['league', 'season']])]

In [10]:
extract_from_entry(X[-600], threshold_abbrev=0.1)

[('Agremiação Sportiva Arapiraquense play in the PREDICATE-$league$ league in PREDICATE-$country$ in PREDICATE-$season$.',
  [['league', 'country', 'season']]),
 ("Agremiação Sportiva Arapiraquense play in PREDICATE-$country$'s PREDICATE-$league$ league and competed in the PREDICATE-$season$ event.",
  [['country', 'league', 'season']]),
 ('Agremiação Sportiva Arapiraquense play in the PREDICATE-$country$-based PREDICATE-$league$ league and they played in the PREDICATE-$season$ season.',
  [['country', 'league', 'season']])]

In [None]:
# -500
# Gary_Cohn_(comics) -> The comic character Bolt was created by Paris Cullins and Gary Cohn, the former being a United States national.
# -600
# Brazil -> Brazilian-based
# 2
# 25.0 -> 25 -> lower the threshold_full?

# Creating a predicate ranking recommendation model

* todo: analyze the occurrence of different orderings in the same entry

In [22]:
from collections import Counter

# counts how often predicate p1 is immediately followed by predicate p2
#    inside the same sentence of a delexicalized lexicalization
counter_bigram = Counter()

for entry_result in result:

    for lexe_result in entry_result:

        # second element: list of predicates per sentence
        for sentence in lexe_result[1]:

            # pairs each predicate with its successor;
            #    yields nothing for sentences with fewer than two predicates
            counter_bigram.update(zip(sentence, sentence[1:]))

In [24]:
counter_bigram.most_common(10)

[(('country', 'leaderName'), 799),
 (('region', 'country'), 500),
 (('location', 'country'), 483),
 (('location', 'isPartOf'), 421),
 (('manager', 'club'), 392),
 (('leaderName', 'leaderName'), 314),
 (('isPartOf', 'country'), 303),
 (('country', 'ethnicGroup'), 295),
 (('club', 'club'), 280),
 (('leaderName', 'country'), 270)]

In [34]:
# total number of bigrams starting with each predicate
counter_sum = Counter()

for bigram, count in counter_bigram.items():

    first_predicate = bigram[0]
    counter_sum[first_predicate] += count

In [37]:
import numpy as np

# every predicate seen in either position of a bigram;
#    sorted so the predicate -> index mapping is the same on every run
predicates = sorted({p for bigram in counter_bigram for p in bigram})

from_key_to_int = {k: i for i, k in enumerate(predicates)}
from_int_to_key = {i: k for k, i in from_key_to_int.items()}

# a[i, j]: fraction of the bigrams starting with predicate i that are followed by predicate j
#    the matrix is indexed through from_key_to_int, so it must be sized by len(predicates)
#    (the previous size came from `counter_first_pos`, which is not defined in this notebook)
a = np.zeros((len(predicates), len(predicates)))

for (p1, p2), n in counter_bigram.items():

    a[from_key_to_int[p1], from_key_to_int[p2]] = n / counter_sum[p1]

In [82]:
a[from_key_to_int['country'], from_key_to_int['leaderName']]

0.29625509825732294

In [44]:
X[-200]

Triple info: category=WrittenWork eid=Id49

	Modified triples:

AIP_Advances | editor | A.T._Charlie_Johnson
A.T._Charlie_Johnson | almaMater | Harvard_University
A.T._Charlie_Johnson | doctoralAdvisor | Michael_Tinkham


	Lexicalizations:

A T Charlie Johnson, AIP Advances editor, graduated from Harvard University assisted by doctoral advisor Michael Tinkham.
A T Charlie Johnson is the editor AIP Advances. His alma mater was Harvard University and his doctoral adviser was Michael Tinkham.

In [78]:
from itertools import permutations

def get_predicates_sequence(entry):
    """Print, for every ordered pair of the entry's predicates, the pair and its value in ``a``.

    ``a`` is indexed through ``from_key_to_int``; a blank line is printed after each pair.
    Nothing is returned.
    """
    entry_predicates = [triple['predicate'] for triple in entry.get_data()]

    for first, second in permutations(entry_predicates, 2):

        print(first, second)

        row = from_key_to_int[first]
        col = from_key_to_int[second]
        print(a[row, col])

        print()

In [79]:
X[-300]

Triple info: category=University eid=Id7

	Modified triples:

AWH_Engineering_College | country | India
AWH_Engineering_College | established | 2001
AWH_Engineering_College | city | "Kuttikkattoor"


	Lexicalizations:

AWH Engineering College was established in 2001 in Kuttikkattoor, India.
Kuttikkattoor, India is the location of the AWH Engineering College which was established in 2001.
The AWH Engineering College was established in 2001 in Kuttikkattoor, India.

In [80]:
get_predicates_sequence(X[-300])

country established
0.03299962921764924

country city
0.006674082313681869

established country
0.09404388714733543

established city
0.09404388714733543

city country
0.40394973070017953

city established
0.08797127468581688



In [81]:
result[-300]

[('AWH Engineering College was established in PREDICATE-$established$ in PREDICATE-$city$ PREDICATE-$country$.',
  [['established'], ['city', 'country']]),
 ('PREDICATE-$city$ PREDICATE-$country$ is the location of the AWH Engineering College which was established in PREDICATE-$established$.',
  [['city', 'country', 'established']]),
 ('The AWH Engineering College was established in PREDICATE-$established$ in PREDICATE-$city$ PREDICATE-$country$.',
  [['established'], ['city', 'country']])]