<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [70]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_md")
doc = nlp("technologies or applications for mitigation or adaptation against climate change")
options = {"compact": True, "bg": "#09a3d5",
           "color": "white", "font": "Source Sans Pro"}
displacy.serve(doc, style="dep", options=options)




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [71]:
for token in doc:
    print(token.text + '\t' + token.dep_ + '\t' + token.head.text + '\t' + token.head.pos_ + '\t',
            [child for child in token.children])

technologies	ROOT	technologies	NOUN	 [or, applications, for]
or	cc	technologies	NOUN	 []
applications	conj	technologies	NOUN	 []
for	prep	technologies	NOUN	 [mitigation]
mitigation	pobj	for	ADP	 [or, adaptation]
or	cc	mitigation	NOUN	 []
adaptation	conj	mitigation	NOUN	 [against]
against	prep	adaptation	NOUN	 [change]
climate	compound	change	NOUN	 []
change	pobj	against	ADP	 [climate]


In [224]:
import spacy
from spacy.symbols import NOUN,VERB
import itertools

def insert_before(list_, ele, target):
    for i in range(len(list_)):
        if list_[i] == target:
            list_.insert(i, ele)
    return list_

def resume_dict(root, dict_=[]):  
    if dict_ == []:
        return []
    
    if isinstance(dict_['pobj'], dict):
        dict_['pobj'] = [dict_['pobj']]  
        
    line = dict_['compound'] + [root] + dict_['prep'] + dict_['pobj']
    return line 
    

def extract_CP(doc, co_hypo):
    res = {}
    for hypo in co_hypo:
        res[hypo] = {'compound': [], 'prep': [], 'pobj': []}
    
    # find elements if possible
    for token in co_hypo:
        for child in token.children:
            if child.dep_ in ['compound', 'amod']:
                res[token]['compound'].append(child)
                for ttoken in doc:
                    if ttoken.dep_ in ['compound', 'amod'] and ttoken.head == child:
                        # insert just before the final token
                        res[token]['compound'] = insert_before(res[token]['compound'], ttoken, child)
                        
            elif child.dep_ == 'prep':
                res[token]['prep'].append(child)
                for gchild in child.children:
                    if gchild.dep_ == 'pobj':
                        res[token]['pobj'].append(gchild)
                        # complete compound for pobj
                        for ttoken in doc:
                            if ttoken.dep_ in ['compound', 'amod'] and ttoken.head == gchild:
                                res[token]['pobj'] = insert_before(res[token]['pobj'], ttoken, gchild)
    
    # complete elements for co-hypo
    for dep in ['compound', 'prep', 'pobj']:
        for token in co_hypo:
            if token.head == co_hypo[0] and not res[token][dep]:
                res[token][dep] = res[co_hypo[0]][dep]
                
    # if elements of head is empty 
    if res[co_hypo[0]] == {'compound': [], 'prep': [], 'pobj': []}:
        for hypo in co_hypo[1:]:
            if res[hypo] != {'compound': [], 'prep': [], 'pobj': []}:
                res[co_hypo[0]] = res[hypo]
   

    # if all pobj have only compounds as children then return 
    for hypo, dic in res.items(): 
        pobj = dic['pobj'] # pobj in each co-hyponym
        if pobj:
            if len(pobj) > 1: 
                pobj = pobj[-1] 
            else: 
                pobj = pobj[0]

            has_prep = False
            pobj_cohypo = [pobj]
            
            for child in pobj.children:
                if child.dep_ == 'prep': has_prep = True
                if child.dep_ == 'conj': pobj_cohypo.append(child)

            if has_prep or len(pobj_cohypo) > 1:
                res[hypo]['pobj'] = extract_CP(doc, pobj_cohypo)

    return res 
                                
                                
def parsing(phrase, model=nlp):
    doc = model(phrase)
    
    # add root term
    main_principle = []
    co_hypo = []
    for token in doc:
        # find root and co-hyponyms
        if token.pos in [NOUN, VERB] and token.dep_ == 'ROOT':
            main_principle.append(token) 
            for child in token.children:
                if child.dep_ == 'conj':
                    co_hypo.append(child)
                    # find conj of conj
                    for gchild in child.children:
                        if gchild.dep_ == 'conj':
                            co_hypo.append(gchild)
     
    co_hypo = main_principle+(co_hypo) # usually the first being the root node    
    res_dict = extract_CP(doc, co_hypo)      
    
    temp = []
    for co_hypo, decorations in res_dict.items():
        temp.append(resume_dict(co_hypo, decorations))
    
    res = []
    for line in temp: # results with pobj as dictionary ==> transfor then into list
        if isinstance(line[-1],dict):
            
            roots = iter(line[-1].keys())
            l_split = list(itertools.repeat(line[:-1], len(line[-1])))
            
            for i in range(len(l_split)):
                root = next(roots)
                l_split[i] = l_split[i] + resume_dict(root, line[-1][root])
                
            res.extend(l_split)
            
        else: res.append(line)
            
    res = [' '.join([word.text for word in line]) for line in res]
    return res 

In [225]:
parsing("technologies or applications for mitigation or adaptation against climate change")

['technologies for mitigation against climate change',
 'technologies for adaptation against climate change',
 'applications for mitigation against climate change',
 'applications for adaptation against climate change']

In [169]:
doc = nlp("restoration or protection of coral reefs")
options = {"compact": True, "bg": "#09a3d5",
           "color": "white", "font": "Source Sans Pro"}
displacy.serve(doc, style="dep", options=options)




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [240]:
parsing('restoration and protection of coral reefs')

['restoration of coral reefs', 'protection of coral reefs']

In [189]:
doc = nlp("monitoring or fighting invasive species")
options = {"compact": True, "bg": "#09a3d5",
           "color": "white", "font": "Source Sans Pro"}
displacy.serve(doc, style="dep", options=options)


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [190]:
for token in doc:
    print(token.text + '\t' + token.dep_ + '\t' + token.head.text + '\t' + token.head.pos_ + '\t',
            [child for child in token.children])

monitoring	ROOT	monitoring	NOUN	 [or, fighting]
or	cc	monitoring	NOUN	 []
fighting	conj	monitoring	NOUN	 [species]
invasive	amod	species	NOUN	 []
species	dobj	fighting	VERB	 [invasive]


In [239]:
parsing('monitoring or fighting invasive species')

['monitoring', 'fighting']

In [193]:
doc = nlp("capture or disposal of greenhouse gases")
options = {"compact": True, "bg": "#09a3d5",
           "color": "white", "font": "Source Sans Pro"}
displacy.serve(doc, style="dep", options=options)


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [228]:
parsing('capture or disposal of greenhouse gases')

['capture of greenhouse gases', 'disposal of greenhouse gases']

In [237]:
parsing('artificial reefs or seaweed')

['artificial reefs', 'artificial seaweed']

In [238]:
parsing('restoration or protection of coral reefs')

['restoration of coral reefs', 'protection of coral reefs']

In [230]:
parsing('hard structures, e.g. dams, dykes or breakwaters')

['hard structures', 'dams', 'dykes', 'breakwaters']

In [218]:
doc = nlp("controlling or monitoring, e.g. of flood or hurricane")
options = {"compact": True, "bg": "#09a3d5",
           "color": "white", "font": "Source Sans Pro"}
displacy.serve(doc, style="dep", options=options)




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [219]:
for token in doc:
    print(token.text + '\t' + token.dep_ + '\t' + token.head.text + '\t' + token.head.pos_ + '\t',
            [child for child in token.children])

controlling	ROOT	controlling	VERB	 [or, monitoring, ,, of]
or	cc	controlling	VERB	 []
monitoring	conj	controlling	VERB	 []
,	punct	controlling	VERB	 []
e.g.	advmod	of	ADP	 []
of	prep	controlling	VERB	 [e.g., flood]
flood	pobj	of	ADP	 [or, hurricane]
or	cc	flood	NOUN	 []
hurricane	conj	flood	NOUN	 []


In [233]:
parsing("controlling or monitoring, e.g. of flood or hurricane")

['controlling of flood',
 'controlling of hurricane',
 'monitoring of flood',
 'monitoring of hurricane']

In [235]:
doc = nlp("forecasting, e.g. risk assessment or mapping")
options = {"compact": True, "bg": "#09a3d5",
           "color": "white", "font": "Source Sans Pro"}
displacy.serve(doc, style="dep", options=options)




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [234]:
parsing("forecasting, e.g. risk assessment or mapping")

['forecasting']

In [236]:
parsing("risk assessment or mapping")

['risk assessment', 'risk mapping']