In [24]:
from itertools import islice
from textacy import extract
import spacy
from collections import namedtuple

In [63]:
with open('../data/comp_sentences', 'r') as f:
    
    compressed_samples = [t[:-1] for t in islice(f, 0, 3)]
    
with open('../data/uncomp_sentences', 'r') as f:
    
    uncompressed_samples = [t[:-1] for t in islice(f, 0, 3)]

In [12]:
nlp = spacy.load('en_core_web_lg')

# Delexicalizing subject, verb, object

In [94]:
def delexicalize_spans(text, to_delexicalize):
    
    text_char = list(text)

    base = 0

    for tag, span in sorted(to_delexicalize, key=lambda s: s[1].start_char):

        text_char[base + span.start_char: base + span.end_char] = tag

        len_span = span.end_char - span.start_char

        base += len(tag) - len_span
        
    return ''.join(text_char)

def delexicalize_svo(text):
    
    text_doc = nlp(text)

    svos = []

    for i, svo in enumerate(extract.subject_verb_object_triples(text_doc)):

        svos.extend(zip([f'[subject-{i}]', f'[verb-{i}]', f'[object-{i}]'], svo))
        
    return delexicalize_spans(text, svos)    

In [73]:
print('uncompressed:\n\n{}\n\ndelexicalized:\n\n{}\n\ncompressed:\n\n{}\n\ndelexicalized:\n\n{}'\
      .format(uncompressed_samples[0],
              delexicalize_svo(uncompressed_samples[0]),
              compressed_samples[0],
              delexicalize_svo(compressed_samples[0])
             ))

uncompressed:

Serge Ibaka -- the Oklahoma City Thunder forward who was born in the Congo but played in Spain -- has been granted Spanish citizenship and will play for the country in EuroBasket this summer, the event where spots in the 2012 Olympics will be decided.

delexicalized:

[subject-0] [verb-0]City Thun[object-0] who was born in the Congo but played in Spain -- has been granted Spanish citizenship and will play for the country in EuroBasket this summer, the event where spots in the 2012 Olympics will be decided.

compressed:

Serge Ibaka has been granted Spanish citizenship and will play in EuroBasket.

delexicalized:

[subject-0] [verb-0] Spanish [object-0] and will play in EuroBasket.


In [71]:
print('uncompressed:\n\n{}\n\ndelexicalized:\n\n{}\n\ncompressed:\n\n{}\n\ndelexicalized:\n\n{}'\
      .format(uncompressed_samples[1],
              delexicalize_svo(uncompressed_samples[1]),
              compressed_samples[1],
              delexicalize_svo(compressed_samples[1])
             ))

uncompressed:

MILAN -Catania held Roma to a 1-1 draw in Serie A on Wednesday as the teams played out the remaining 25 minutes of a game that was called off last month.

delexicalized:

[subject-0] [verb-0] [object-0] to a 1-1 draw in Serie A on Wednesday as the [subject-1] [verb-1] out the remaining 25 [object-1] of a game that was called off last month.

compressed:

Catania held Roma to a 1 1 draw in Serie A.

delexicalized:

[subject-0] [verb-0] [object-0] to a 1 1 draw in Serie A.


# Delexicalizing named entities

In [98]:
from itertools import groupby

def delexicalize_ner(text):
    
    text_doc = nlp(text)
    
    ners = [(f'[{span.label_}]', span) for span in extract.named_entities(text_doc)]
    
    return delexicalize_spans(text, ners)

In [99]:
print('uncompressed:\n\n{}\n\ndelexicalized:\n\n{}\n\ncompressed:\n\n{}\n\ndelexicalized:\n\n{}'\
      .format(uncompressed_samples[0],
              delexicalize_ner(uncompressed_samples[0]),
              compressed_samples[0],
              delexicalize_ner(compressed_samples[0])
             ))

uncompressed:

Serge Ibaka -- the Oklahoma City Thunder forward who was born in the Congo but played in Spain -- has been granted Spanish citizenship and will play for the country in EuroBasket this summer, the event where spots in the 2012 Olympics will be decided.

delexicalized:

[PERSON] -- the [GPE] City Thunder forward who was born in the [GPE] but played in [GPE] -- has been granted [NORP] citizenship and will play for the country in [GPE] this [DATE], the event where spots in the [EVENT] will be decided.

compressed:

Serge Ibaka has been granted Spanish citizenship and will play in EuroBasket.

delexicalized:

[PERSON] has been granted [NORP] citizenship and will play in [ORG].


# Delexicalize everything

## svo

In [111]:
%%time

with open('../data/comp_sentences', 'r') as f_in, open('../data/delex_svo_comp_sentences', 'w') as f_out:
        
    for line in islice(f_in, 0, 100):
        
        text = line[:-1]
        
        delex_text = delexicalize_svo(f'{text}\n')
        
        f_out.write(delex_text)

CPU times: user 2.89 s, sys: 15.6 ms, total: 2.91 s
Wall time: 767 ms


In [112]:
%%time

with open('../data/uncomp_sentences', 'r') as f_in, open('../data/delex_svo_uncomp_sentences', 'w') as f_out:
        
    for line in islice(f_in, 0, 100):
        
        text = line[:-1]
        
        delex_text = delexicalize_svo(f'{text}\n')
        
        f_out.write(delex_text)

CPU times: user 3.61 s, sys: 0 ns, total: 3.61 s
Wall time: 950 ms


In [109]:
%%time

with open('../data/comp_sentences', 'r') as f_in, open('../data/delex_ner_comp_sentences', 'w') as f_out:
        
    for line in islice(f_in, 0, 100):
        
        text = line[:-1]
        
        delex_text = delexicalize_ner(f'{text}\n')
        
        f_out.write(delex_text)

CPU times: user 2.86 s, sys: 15.6 ms, total: 2.88 s
Wall time: 771 ms


In [110]:
%%time

with open('../data/uncomp_sentences', 'r') as f_in, open('../data/delex_ner_uncomp_sentences', 'w') as f_out:
        
    for line in islice(f_in, 0, 100):
        
        text = line[:-1]
        
        delex_text = delexicalize_ner(f'{text}\n')
        
        f_out.write(delex_text)

CPU times: user 3.34 s, sys: 0 ns, total: 3.34 s
Wall time: 898 ms
