In [2]:
from textacy import extract
import spacy

In [4]:
# Load the spaCy pipeline 'en_core_web_lg'; the resulting `nlp` object is
# called on raw text by the example cells further down.

nlp = spacy.load('en_core_web_lg')

In [5]:
# Register a writable Token attribute that stores the placeholder tag used
# for delexicalization (None while the token is not delexicalized).
# force=True keeps this cell re-runnable: without it, registering an
# extension that already exists raises a ValueError.
spacy.tokens.Token.set_extension('delexicalized_tag', default=None, force=True)

# Register a Token getter that yields the delexicalized rendering of a token:
#    its tag followed by the token's trailing whitespace if a tag is set,
#    otherwise the original text with its trailing whitespace.
spacy.tokens.Token.set_extension(
    'delexicalized_with_ws',
    getter=lambda token: (
        token._.delexicalized_tag + token.whitespace_
        if token._.delexicalized_tag
        else token.text_with_ws
    ),
    force=True,
)

# Register a Doc getter that yields the delexicalized version of the full doc
#    by concatenating the delexicalized rendering of every token.
spacy.tokens.Doc.set_extension(
    'delexicalized',
    getter=lambda doc: ''.join(t._.delexicalized_with_ws for t in doc),
    force=True,
)

def delexicalize_spans(doc, tag_span):
    """Merge each given span of ``doc`` into one token and tag that token.

    Args:
        doc: the spaCy Doc that owns the spans; it is modified in place.
        tag_span: iterable of ``(tag, span)`` pairs. Each span is merged into
            a single token whose ``_.delexicalized_tag`` is set to ``tag``.

    Returns:
        None.

    All merges are queued on ``doc.retokenize()`` and applied together when
    the context exits; this replaces ``Span.merge()``, which was deprecated
    in spaCy 2.1 and removed in spaCy 3. The retokenizer rejects overlapping
    spans, so empty spans and spans that share a token with an earlier span
    are skipped (the first tag wins).
    """
    # token indices already covered by a span accepted for merging
    claimed = set()

    with doc.retokenize() as retokenizer:
        for tag, span in tag_span:
            indices = range(span.start, span.end)

            # skip empty spans and spans overlapping an already accepted one
            if not indices or not claimed.isdisjoint(indices):
                continue

            claimed.update(indices)
            # the '_' key assigns custom extension attributes to the merged token
            retokenizer.merge(span, attrs={'_': {'delexicalized_tag': tag}})

# Delexicalizing subject, verb, object

In [6]:
def delexicalize_svo(doc):
    """Delexicalize ``doc`` using its subject-verb-object triples.

    The triples come from ``extract.subject_verb_object_triples(doc)``. The
    members of the i-th triple are paired, in order, with the tags
    ``[subject-i]``, ``[verb-i]`` and ``[object-i]``; the resulting
    ``(tag, span)`` pairs are handed to ``delexicalize_spans``.
    """
    tagged_spans = []

    for idx, triple in enumerate(extract.subject_verb_object_triples(doc)):
        # one tag per role, suffixed with the index of the triple
        role_tags = (f'[subject-{idx}]', f'[verb-{idx}]', f'[object-{idx}]')

        for tag, span in zip(role_tags, triple):
            tagged_spans.append((tag, span))

    delexicalize_spans(doc, tagged_spans)

In [29]:
# Example: delexicalize the subject/verb/object triples of one sentence.

text = "On Monday, Mr. Trump trained his attention on the Senate, where only four Republican votes are needed to pass the measure, should Democrats remain united, as expected."

doc = nlp(text)
delexicalize_svo(doc)

# Print the doc text, an empty line, then the delexicalized text.
print(doc, doc._.delexicalized, sep='\n\n')

On Monday, Mr. Trump trained his attention on the Senate, where only four Republican votes are needed to pass the measure, should Democrats remain united, as expected. 

On Monday, [subject-0] [verb-0] his [object-0] on the Senate, where only four Republican votes are needed to pass the measure, should Democrats remain united, as expected. 


# Delexicalizing named entities

In [33]:
def delexicalize_ner(doc):
    """Delexicalize ``doc`` using its named entities.

    Each span yielded by ``extract.named_entities(doc)`` is paired with a tag
    made of its ``label_`` wrapped in square brackets; the pairs are handed
    to ``delexicalize_spans``, whose result is returned.
    """
    tagged_entities = []

    for entity in extract.named_entities(doc):
        tagged_entities.append((f'[{entity.label_}]', entity))

    return delexicalize_spans(doc, tagged_entities)

In [34]:
# Re-parse the sentence first: delexicalize_svo() already merged and tagged
# tokens of the previous `doc` in place, so reusing that doc would mix the
# SVO tags into the named-entity output.
doc = nlp(text)

delexicalize_ner(doc)

print(doc)

print()

print(doc._.delexicalized)

On Monday, Mr. Trump trained his attention on the Senate, where only four Republican votes are needed to pass the measure, should Democrats remain united, as expected. 

On [DATE], Mr. [PERSON] trained his attention on the [ORG], where [CARDINAL] [NORP] votes are needed to pass the measure, should [NORP] remain united, as expected. 
