In [1]:
# install ipydagred3
# install benepar and download benepar_fr

import ipydagred3
import spacy
from spacy.tokens import Doc, Span, Token
from spacy import displacy
from benepar.spacy_plugin import BeneparComponent

# Load the spaCy pipeline registered under the shortcut name "fr".
# NOTE(review): presumably the French model; the bare "fr" shortcut relies on
# the installed spaCy version still resolving it - confirm.
nlp = spacy.load("fr")
# Append a benepar component built from the 'benepar_fr' model to the pipeline.
# NOTE(review): the ConstituencyParser defined further down reads
# `sent._.parse_string`, which is presumably provided by this component - verify.
nlp.add_pipe(BeneparComponent('benepar_fr'))

In [2]:
# Human-readable (French) descriptions of constituency labels, shown as node
# tooltips in the rendered tree. Labels missing from this dict fall back to the
# raw label. Feel free to complete this dict and open a PR.
#
# "NP" is a noun *phrase* (syntagme nominal), not a proper noun: in the demo
# sentence "le petit chat" is tagged NP-SUJ.
tooltips = {
    "NP-SUJ": "Syntagme nominal sujet",
    "PONCT": "Ponctuation",
    "VN": "Noyau verbal",
    "ADV": "Adverbe",
    "AP-ATS": "attribut du sujet",
    "NC": "Nom commun",
    "DET": "Déterminant",
    "V": "Verbe",
    "SENT": "Phrase racine",
    "NP-OBJ": "Syntagme nominal objet",
    "Srel": "Subordonnée relative",
}

In [5]:
import collections

# Undo whatever a previous execution of this cell registered, so the cell can be
# re-run without `Span.set_extension` complaining about existing extensions.
# Each registration is checked on its own: with a single try/except, a missing
# pipe would abort the cleanup before the extensions were removed, and a bare
# `except` would also hide unrelated errors.
if "constituency_parser" in nlp.pipe_names:
    nlp.remove_pipe("constituency_parser")
if Span.has_extension("constituency"):
    Span.remove_extension("constituency")
if Span.has_extension("show_constituency"):
    Span.remove_extension("show_constituency")
if Span.has_extension("search_constituency"):
    Span.remove_extension("search_constituency")


def _flatten(l):
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, (str, bytes)):
            yield from flatten(el)
        else:
            yield el

flatten = lambda l: list(_flatten(l))

class ConstituencyParser():
    """spaCy pipeline component exposing each sentence's constituency tree.

    For every sentence of a processed document the component reads the
    bracketed parse stored in ``sent._.parse_string``, converts it into a tree
    of plain dicts and stores two callables on the sentence:

    * ``sent._.show_constituency()`` renders the tree as an ipydagred3 widget;
    * ``sent._.search_constituency(labels)`` returns the slices of the
      sentence covered by constituents whose label is in ``labels``.

    Tree nodes are dicts with the keys ``nodeID``, ``nodeType`` ("Internal" or
    "Leaf"), ``name``, ``children`` and, for leaves only, ``wordIndex`` (the
    position of the word inside its sentence).
    """
    name = "constituency_parser"

    def __init__(self):
        """Register the custom Span attributes used by the component.

        Note that "constituency" is registered but never assigned by this class.
        """
        Span.set_extension("constituency", default=None)
        Span.set_extension("show_constituency", default=None)
        Span.set_extension("search_constituency", default=None)

    def __call__(self, doc):
        """Attach the show/search callables to every sentence of ``doc``.

        Returns:
            The same ``doc``, as expected from a pipeline component.
        """
        for sent in doc.sents:
            nodes, parsed = self.processConstituency(sent._.parse_string)
            # The callables are built by helper methods so that each one is
            # bound to *this* sentence's tree. Lambdas created directly in the
            # loop would all share the loop variables and end up pointing at
            # the last sentence.
            sent._.set("show_constituency", self._bindShow(nodes, parsed))
            sent._.set("search_constituency", self._bindSearch(parsed, sent))
        return doc

    def _bindShow(self, nodes, parsed):
        """Return a zero-argument callable rendering the given tree."""
        return lambda: self.showDependencyGraph(nodes, parsed)

    def _bindSearch(self, parsed, sent):
        """Return a callable searching the given tree for a list of labels."""
        return lambda _type: self.find(_type, parsed, sent)

    def processConstituency(self, pStr):
        """Parse a bracketed constituency string into a tree of dicts.

        A label followed by whitespace opens an internal node; text directly
        followed by ")" is a word (leaf) and closes the node that contains it.

        Args:
            pStr: bracketed parse, e.g. "(SENT (NP (DET le) (NC chat)))".

        Returns:
            ``(nodes, root)``: every node in creation order, and the last
            top-level node that was closed (``None`` when the string contains
            no complete tree).
        """
        nodes = []
        root = None
        cur = ""
        stack = []
        nid = 0
        wordIndex = 0
        for ch in pStr:
            if ch == ' ' or ch == '\n':
                if cur:
                    newNode = {
                        "nodeID": nid,
                        "nodeType": "Internal",
                        "name": cur,
                        "children": []
                    }
                    cur = ""
                    nid += 1
                    if stack:
                        stack[-1]["children"].append(newNode)
                    stack.append(newNode)
                    nodes.append(newNode)
            elif ch == ')':
                if cur:
                    newNode = {
                        "nodeID": nid,
                        "nodeType": "Leaf",
                        "name": cur,
                        "wordIndex": wordIndex,
                        "children": []
                    }
                    cur = ""
                    nid += 1
                    wordIndex += 1
                    if stack:
                        stack[-1]["children"].append(newNode)
                    nodes.append(newNode)
                # Close the innermost open node. Closing the last open node
                # means a complete tree was read, whether or not a word was
                # just consumed; an unbalanced ")" is ignored.
                if stack:
                    if len(stack) == 1:
                        root = stack[0]
                    stack.pop()
            elif ch == '(':
                continue
            else:
                cur = cur + ch
        return nodes, root

    def showDependencyGraph(self, nodes, parsed):
        """Render the tree as an ipydagred3 widget in the notebook.

        Args:
            nodes: all nodes of the tree (as returned by processConstituency).
            parsed: the root node, or ``None`` when there is no tree.

        Returns:
            Whatever ``display`` returns for the widget.
        """
        g = ipydagred3.Graph()
        for node in nodes:
            label = node["name"]
            g.setNode(str(node["nodeID"]),
                      label=label,
                      # fall back to the raw label when no description exists
                      tooltip=tooltips.get(label, label),
                      rx=5,
                      ry=5,
                      # nodes without children (the words) are highlighted
                      style="fill: " + ("white" if node["children"] else "#00bcd4"))

        def setEdge(parent):
            for child in parent["children"]:
                g.setEdge(str(parent["nodeID"]), str(child["nodeID"]))
                setEdge(child)

        if parsed is not None:
            setEdge(parsed)
        widget = ipydagred3.DagreD3Widget(graph=g)
        return display(widget)

    def getWordIndex(self, node):
        """Return the word index of a leaf, or a nested list mirroring the subtree."""
        if node["nodeType"] == "Leaf":
            return node["wordIndex"]
        return [self.getWordIndex(childNode) for childNode in node["children"]]

    def getSpan(self, node):
        """Placeholder that is not implemented; always returns ``None``."""
        return

    def getString(self, node):
        """Return the words covered by ``node`` joined with single spaces."""
        if node["nodeType"] == "Leaf":
            return node["name"]
        return ' '.join(self.getString(childNode) for childNode in node["children"])

    def search(self, _types, node):
        """Collect every descendant of ``node`` whose name is in ``_types``.

        ``node`` itself is never part of the result. Matching direct children
        come first, followed by the matches found below each child in order.
        """
        matches = [child for child in node["children"] if child["name"] in _types]
        for child in node["children"]:
            matches.extend(self.search(_types, child))
        return matches

    def _leafWordIndexes(self, node):
        """Return the flat list of word indexes of all leaves under ``node``."""
        if node["nodeType"] == "Leaf":
            return [node["wordIndex"]]
        indexes = []
        for child in node["children"]:
            indexes.extend(self._leafWordIndexes(child))
        return indexes

    def find(self, _types, node, sent):
        """Return the slices of ``sent`` covered by constituents named in ``_types``.

        Args:
            _types: container of constituent labels to look for.
            node: root of the tree to search (``None`` yields no result).
            sent: the sentence the tree was built from; sliced by word index.

        Returns:
            One ``sent[first:last + 1]`` slice per matching constituent.
        """
        if node is None:
            return []
        spans = []
        for _node in self.search(_types, node):
            indexes = self._leafWordIndexes(_node)
            if not indexes:
                # a constituent without any word covers nothing
                continue
            spans.append(sent[min(indexes): max(indexes) + 1])
        return spans
        

# Add the ConstituencyParser to the spaCy pipeline.
# Instantiating the component registers the custom Span extensions
# ("constituency", "show_constituency", "search_constituency").
constituencyParser = ConstituencyParser()
# `last=True` is passed because the component reads `sent._.parse_string`.
# NOTE(review): this presumably places it after the benepar component - confirm.
nlp.add_pipe(constituencyParser, last=True)


In [8]:
# Demo: run the pipeline on a sample sentence and query its constituents.
doc = nlp("le petit chat joue dans le grand jardin vert.")
# Only the first sentence of the document is inspected.
sent = list(doc.sents)[0]
print("=========================\n")
print("SENTENCE  : ", sent)
# search_constituency takes a list of constituent labels and returns the
# matching slices of the sentence.
print("SUBJECTs  : ", sent._.search_constituency(["NP-SUJ"]))
print("OBJECTs   : ", sent._.search_constituency(["NP-OBJ"]))
print("VERBS     : ", sent._.search_constituency(["VN"]))
# Render the constituency tree of the sentence as an interactive widget.
sent._.show_constituency()




SENTENCE  :  le petit chat joue dans le grand jardin vert.
SUBJECTs  :  [le petit chat]
OBJECTs   :  []
VERBS     :  [joue]


DagreD3Widget()