<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import spacy
from spacy import displacy
from spacy.tokenizer import Tokenizer

nlp = spacy.load("en_core_web_md")
def custom_tokenizer(nlp):
    inf = list(nlp.Defaults.infixes)               # Default infixes
    inf.remove(r"(?<=[0-9])[+\-\*^](?=[0-9-])")    # Remove the generic op between numbers or between a number and a -
    inf = tuple(inf)                               # Convert inf to tuple
    infixes = inf + tuple([r"(?<=[0-9])[+*^](?=[0-9-])", r"(?<=[0-9])-(?=-)"])  # Add the removed rule after subtracting (?<=[0-9])-(?=[0-9]) pattern
    infixes = [x for x in infixes if '-|–|—|--|---|——|~' not in x] # Remove - between letters rule
    infix_re = spacy.util.compile_infix_regex(infixes)

    return Tokenizer(nlp.vocab, prefix_search=nlp.tokenizer.prefix_search,
                                suffix_search=nlp.tokenizer.suffix_search,
                                infix_finditer=infix_re.finditer,
                                token_match=nlp.tokenizer.token_match,
                                rules=nlp.Defaults.tokenizer_exceptions)

nlp.tokenizer = custom_tokenizer(nlp)

doc = nlp('adapting or protecting infrastructure or their operation in transportation on roads, waterways or railways')
options = {"compact": True, "bg": "#09a3d5",
           "color": "white", "font": "Source Sans Pro"}
displacy.serve(doc, style="dep", options=options)




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [2]:
for token in doc:
    print(token.text + '\t' + token.dep_ + '\t' + token.head.text + '\t' + token.head.pos_ + '\t',
            [child for child in token.children])

adapting	ROOT	adapting	VERB	 [or, protecting]
or	cc	adapting	VERB	 []
protecting	conj	adapting	VERB	 [infrastructure, in]
infrastructure	dobj	protecting	VERB	 [or, operation]
or	cc	infrastructure	NOUN	 []
their	poss	operation	NOUN	 []
operation	conj	infrastructure	NOUN	 [their]
in	prep	protecting	VERB	 [transportation]
transportation	pobj	in	ADP	 [on]
on	prep	transportation	NOUN	 [roads]
roads	pobj	on	ADP	 [,, waterways]
,	punct	roads	NOUN	 []
waterways	conj	roads	NOUN	 [or, railways]
or	cc	waterways	NOUN	 []
railways	conj	waterways	NOUN	 []


In [3]:
import spacy
from spacy.symbols import NOUN,VERB,ADJ
import itertools

def insert_before(list_, ele, target):
    for i in range(len(list_)):
        if list_[i] == target:
            list_.insert(i, ele)
    return list_


def resume_dict(root, dict_=[]):  
    if dict_ == []:
        return [] 
    elif dict_ == {'compound': [], 'prep': [], 'pobj': [], 'dobj': []}:
        return [root]    
    
    if isinstance(dict_['pobj'], dict):
        dict_['pobj'] = [dict_['pobj']]          
    if isinstance(dict_['dobj'], dict):
        dict_['dobj'] = [dict_['dobj']]
     
    if root.pos == ADJ:
        line = [root] + dict_['compound'] + dict_['prep'] + dict_['pobj']
    else:
        line = dict_['compound'] + [root] + dict_['dobj'] + dict_['prep'] + dict_['pobj']
    return line 


def dive_line(line, to_dive):
    res = []
    for i in to_dive:
        dict_ = line[i]
        temp = []
        for key,val in dict_.items():
            res_undict = resume_dict(key,val) # a list 1D
            
            # check whether there is still dict in res_undict
            to_dive = []
            for j in range(len(res_undict)):
                if isinstance(res_undict[j],dict):
                    to_dive.append(j)
            if to_dive:
                l_split = iter(dive_line(res_undict, to_dive))
                
            for k in to_dive:
                res_undict[k] = next(l_split)
                        
            temp.append(res_undict)       
        res.append(temp)
  
    return res


def decarde_concat_right(l1, l2):      
    if not l2:
        return l1 
    if not l1:
        return l2
    
    if not isinstance(l1, list):  
        l1 = [l1]
    elif isinstance(l1, list) and len(l1) == 1 and isinstance(l1[0], list) and len(l1[0]) == 1:
        l1 = l1[0]
        
    return [l1 + e2 for e2 in l2]

def decarde_concat_left(l1, l2):      
    if not l2:
        return l1 
    if not l1:
        return l2
    
    if not isinstance(l2, list):  
        l2 = [l2]
    elif isinstance(l2, list) and len(l2) == 1 and isinstance(l2[0], list) and len(l2[0]) == 1:
        l2 = l2[0]
        
    return [e1 + l2 for e1 in l1]

def decarde_concat(l1, l2):   
    res = []
    if not l2:
        return l1 
    if not l1:
        return l2
            
    try:
        return [e1 + e2 for e1 in l1 for e2 in l2]
    except TypeError:
        try:
            return decarde_concat_right(l1,l2)
        except TypeError:
            return decarde_concat_left(l1,l2)
        


def unlist(line): # given a line ls(list with lists in it), returns a list of 1D list
    if not line:
        return 
    elif not isinstance(line, list):
        return line
    elif isinstance(line, list) and len(line) == 1 and isinstance(line[0], list) and len(line[0]) == 1:
        return line[0]
    else:      
        for i in range(len(line)):
            token = line[i]
            if isinstance(token, list):
                len_ = len(token)
                if len_ > 1:                   
                    lines = decarde_concat_right(line[:i], line[i])
                    if i < len(line)-1:
                        lines = decarde_concat(lines, unlist(line[i+1:]))
               
                    return lines 
                else: # list with length 1
                    return token[0]              
        return line  
    
    

def extract_CP(doc, co_hypo):
    res = {}
    for hypo in co_hypo:
        res[hypo] = {'compound': [], 'prep': [], 'pobj': [], 'dobj': []}
    
    # find elements if possible
    for token in co_hypo:
        for child in token.children:
            if child.dep_ in ['compound', 'amod']:
                res[token]['compound'].append(child)
                for ttoken in doc:
                    if ttoken.dep_ in ['compound', 'amod'] and ttoken.head == child:
                        # insert just before the final token
                        res[token]['compound'] = insert_before(res[token]['compound'], ttoken, child)
                        
            elif child.dep_ in ['prep', 'acl']:
                res[token]['prep'].append(child)
                for gchild in child.children:
                    if (child.dep_ == 'prep' and gchild.dep_ == 'pobj'):
                        res[token]['pobj'].append(gchild)
                        # complete compound for pobj
                        for ttoken in doc:
                            if ttoken.dep_ in ['compound', 'amod'] and ttoken.head == gchild:
                                res[token]['pobj'] = insert_before(res[token]['pobj'], ttoken, gchild)
                    elif (child.dep_ == 'acl' and gchild.dep_ == 'dobj'):
                        res[token]['dobj'].append(gchild)
                                
            elif token.pos == VERB and child.dep_ == 'dobj':
                res[token]['dobj'].append(child)
                # complete compound for dobj
                for ttoken in doc:
                    if ttoken.dep_ in ['compound', 'amod'] and ttoken.head == child:
                        res[token]['dobj'] = insert_before(res[token]['dobj'], ttoken, child)
    
    # complete elements for co-hypo
    for dep in ['compound', 'prep', 'pobj', 'dobj']:
        for token in co_hypo:
            if token.head == co_hypo[0] and not res[token][dep]:
                res[token][dep] = res[co_hypo[0]][dep]
                
    # if elements of head is empty 
    if res[co_hypo[0]] == {'compound': [], 'prep': [], 'pobj': [], 'dobj': []}:
        for hypo in co_hypo[1:]:
            if res[hypo] != {'compound': [], 'prep': [], 'pobj': [], 'dobj': []}:
                res[co_hypo[0]] = res[hypo]   
#====================================================================================================================#    
#====================================================================================================================#
    # if all pobj have only compounds as children then return 
    for hypo, dic in res.items(): 
        pobj = dic['pobj'] # pobj in each co-hyponym
        if pobj and isinstance(pobj,list):            
            all_conj = True
            for ppobj in pobj: 
                 if ppobj.dep_ != 'conj': 
                        all_conj = False
                        break            
            if len(pobj) > 1: 
                if all_conj:
                    pobj_cohypo = pobj
                else:
                    pobj = pobj[-1] 
                    pobj_cohypo = [pobj]
            else:                     
                pobj = pobj[0]
                pobj_cohypo = [pobj]

            # extract other potential co-hypo            
            for token in doc:
                if token.dep_ == 'conj' and token not in pobj_cohypo and token.head in pobj_cohypo:
                    pobj_cohypo.append(token)
            has_prep = False
            for child in pobj.children:
                if child.dep_ in ['prep','acl']: 
                    has_prep = True
                    break

            if has_prep or len(pobj_cohypo) > 1:
                res[hypo]['pobj'] = extract_CP(doc, pobj_cohypo)

        elif pobj and isinstance(pobj,dict): # if pobj is dict
            for item, item_dic in pobj.items():
                item_pobj = item_dic['pobj']
                if item_pobj and isinstance(item_pobj,dict):
                    pobj_cohypo = [pobj for pobj in item_pobj.keys()]
                    # extract other potential co-hypo
                    for token in doc:
                        if token.dep_ == 'conj' and token not in pobj_cohypo and token.head in pobj_cohypo:
                            pobj_cohypo.append(token)
                    if len(pobj_cohypo) > 1:
                        res[hypo]['pobj'] = extract_CP(doc, pobj_cohypo)
                        break


                elif item_pobj and isinstance(item_pobj, list):
                    if len(item_pobj) > 1:                      
                        item_pobj = item_pobj[-1] 
                    else: 
                        item_pobj = item_pobj[0]

                    has_prep = False
                    pobj_cohypo = [item_pobj]

                    # extract other potential co-hypo
                    for token in doc:
                        if token.dep_ == 'conj' and token not in pobj_cohypo and token.head in pobj_cohypo:
                            pobj_cohypo.append(token)
                    for child in item_pobj.children:
                        if child.dep_ in ['prep','acl']: 
                            has_prep = True
                            break

                    if has_prep or len(pobj_cohypo) > 1:
                        res[hypo]['pobj'] = extract_CP(doc, pobj_cohypo)

#====================================================================================================================#
#====================================================================================================================#

    # same process for dobj
    for hypo, dic in res.items(): 
        dobj = dic['dobj'] # dobj in each co-hyponym

        if dobj and isinstance(dobj,list):            
            all_conj = True
            for ddobj in dobj: 
                 if ddobj.dep_ != 'conj': 
                        all_conj = False
                        break            
            if len(dobj) > 1: 
                if all_conj:
                    dobj_cohypo = dobj
                else:
                    dobj = dobj[-1] 
                    dobj_cohypo = [dobj]
            else:                     
                dobj = dobj[0]
                dobj_cohypo = [dobj]
            has_prep = False

            
            # extract other potential co-hypo            
            for token in doc:
                if token.dep_ == 'conj' and token not in dobj_cohypo and token.head in dobj_cohypo:
                    dobj_cohypo.append(token)

            for child in dobj.children:
                if child.dep_ in ['prep','acl']: 
                    has_prep = True
                    break

            if has_prep or len(dobj_cohypo) > 1:
                res[hypo]['dobj'] = extract_CP(doc, dobj_cohypo)

        elif dobj and isinstance(dobj,dict): # if dobj is dict
            for item, item_dic in dobj.items():
                item_dobj = item_dic['dobj']
                if item_dobj and isinstance(item_dobj,dict):
                    dobj_cohypo = [dobj for dobj in item_dobj.keys()]
                    # extract other potential co-hypo
                    for token in doc:
                        if token.dep_ == 'conj' and token not in dobj_cohypo and token.head in dobj_cohypo:
                            dobj_cohypo.append(token)
                    if len(dobj_cohypo) > 1:
                        res[hypo]['dobj'] = extract_CP(doc, dobj_cohypo)
                        break


                elif item_dobj and isinstance(item_dobj, list):
                    if len(item_dobj) > 1:                      
                        item_dobj = item_dobj[-1] 
                    else: 
                        item_dobj = item_dobj[0]

                    has_prep = False
                    dobj_cohypo = [item_dobj]

                    # extract other potential co-hypo
                    for token in doc:
                        if token.dep_ == 'conj' and token not in dobj_cohypo and token.head in dobj_cohypo:
                            dobj_cohypo.append(token)
                    for child in item_dobj.children:
                        if child.dep_ in ['prep','acl']: 
                            has_prep = True
                            break

                    if has_prep or len(dobj_cohypo) > 1:
                        res[hypo]['dobj'] = extract_CP(doc, dobj_cohypo)
#====================================================================================================================#
#====================================================================================================================#                          
    return res


                                
def parsing(phrase, model=nlp):
    doc = model(phrase)
    
    # add root term
    main_principle = []
    co_hypo = []
    for token in doc:
        # find root and co-hyponyms
        if token.pos in [NOUN, VERB, ADJ] and token.dep_ == 'ROOT':
            main_principle.append(token) 
            for child in token.children:
                if child.dep_ == 'conj':
                    co_hypo.append(child)
                    # find conj of conj
                    for gchild in child.children:
                        if gchild.dep_ == 'conj':
                            co_hypo.append(gchild)
     
    co_hypo = main_principle+(co_hypo) # usually the first being the root node   
    res_dict = extract_CP(doc, co_hypo) 

    temp = []
    for co_hypo, decorations in res_dict.items():
        # if the co-hypo is an ADJ
        if co_hypo.pos == ADJ:
            decorations = {'compound': [], 'prep': [], 'pobj': [], 'dobj': []}
            for child in co_hypo.children:
                if child.dep_ == 'conj':
                    if len(res_dict[child]['compound'])>1:
                        decorations['compound'] = res_dict[child]['compound'][1:]
                    decorations['compound'].append(child)
                    decorations['prep'] = res_dict[child]['prep']
                    decorations['pobj'] = res_dict[child]['pobj']
                    temp.append(resume_dict(co_hypo, decorations))
                                    
        else:
            temp.append(resume_dict(co_hypo, decorations))     

    res = []


    for line in temp: # results with pobj as dictionary ==> transfor then into list
        to_dive = []
        for i in range(len(line)):
            if isinstance(line[i],dict):
                to_dive.append(i)

        if to_dive: 
            l_split = iter(dive_line(line, to_dive))
            for i in to_dive:
                line[i] = next(l_split)

        res.append(line)      

    fini = False
    while not fini:
        try:
            res = [' '.join([word.text for word in line]) for line in res]
            fini = True
        except AttributeError:
            temp = res 
            res = []
            for line in temp:
                res.extend(unlist(line))       
    return res 

In [4]:
parsing('technical subjects covered by former uspc cross-reference art collections [xracs] and digests')

['digests']

In [5]:
parsing('billing, invoicing, buying or selling transactions')

['transactions']

In [6]:
parsing("technologies or applications for mitigation or adaptation against climate change")

['technologies for mitigation against climate change',
 'technologies for adaptation against climate change',
 'applications for mitigation against climate change',
 'applications for adaptation against climate change']

In [7]:
parsing("controlling or monitoring, e.g. of flood or hurricane")

['controlling of flood',
 'controlling of hurricane',
 'monitoring of flood',
 'monitoring of hurricane']

In [8]:
doc = nlp('billing, invoicing, buying or selling transactions')
options = {"compact": True, "bg": "#09a3d5",
           "color": "white", "font": "Source Sans Pro"}
displacy.serve(doc, style="dep", options=options)


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [9]:
parsing('planning or developing urban green infrastructure')

['planning urban green infrastructure',
 'developing urban green infrastructure']

In [10]:
parsing('adapting or protecting infrastructure or their operation in transportation on roads, waterways or railways')

['adapting infrastructure in roads',
 'adapting infrastructure in waterways',
 'adapting infrastructure in railways',
 'adapting operation in roads',
 'adapting operation in waterways',
 'adapting operation in railways',
 'protecting infrastructure in roads',
 'protecting infrastructure in waterways',
 'protecting infrastructure in railways',
 'protecting operation in roads',
 'protecting operation in waterways',
 'protecting operation in railways']

In [11]:
parsing("uninterruptible or back-up power supplies integrating renewable energies")

['uninterruptible power supplies integrating',
 'back-up power supplies energies integrating']

In [12]:
parsing('controlling or monitoring of flood or hurricane')

['controlling of flood',
 'controlling of hurricane',
 'monitoring of flood',
 'monitoring of hurricane']

In [13]:
parsing('restoration and protection of coral reefs')

['restoration of coral reefs', 'protection of coral reefs']

In [14]:
parsing('monitoring or fighting invasive species')

['monitoring invasive species', 'fighting invasive species']

In [15]:
parsing('capture or disposal of greenhouse gases')

['capture of greenhouse gases', 'disposal of greenhouse gases']

In [16]:
parsing('artificial reefs or seaweed')

['artificial reefs', 'artificial seaweed']

In [17]:
parsing('restoration or protection of coral reefs')

['restoration of coral reefs', 'protection of coral reefs']

In [18]:
parsing('hard structures, e.g. dams, dykes or breakwaters')

['hard structures', 'dams', 'dykes', 'breakwaters']

In [19]:
parsing("forecasting, e.g. risk assessment or mapping")

['forecasting']

In [20]:
parsing("risk assessment or mapping")

['risk assessment', 'risk mapping']