In [1]:
# `display`, `HTML` and `Image` are part of the public `IPython.display` API;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML, Image

# Inject a CSS rule that sets the `.container` element to 95% width.
display(HTML("<style>.container { width:95% !important; }</style>"))
%config IPCompleter.use_jedi=False

In [2]:
import os
from itertools import chain, product

import networkx as nx
import pandas as pd
import requests
from py2neo import Graph, Node, Relationship

In [14]:
def get_root_token(tokens):
    """Return the ``ID`` of the first token whose ``HEAD`` is ``'0'``.

    ``tokens`` is an iterable of dicts that each have ``'HEAD'`` and ``'ID'``
    keys. Raises ``IndexError`` when no token has ``HEAD == '0'``.
    """
    root_candidates = []
    for token in tokens:
        if token['HEAD'] == '0':
            root_candidates.append(token)
    return root_candidates[0]['ID']

def get_token_by_id(tokens, token_id):
    """Return the first token dict in ``tokens`` whose ``'ID'`` equals ``token_id``.

    Raises ``IndexError`` when no token carries that ID.
    """
    matching = []
    for token in tokens:
        if token['ID'] == token_id:
            matching.append(token)
    return matching[0]

def get_children_ids_of_id(tokens, token_id):
    """Return the ``'ID'`` of every token whose ``'HEAD'`` equals ``token_id``.

    IDs are returned in the order the tokens appear in ``tokens``; the list is
    empty when no token points at ``token_id``.
    """
    child_ids = []
    for token in tokens:
        if token['HEAD'] == token_id:
            child_ids.append(token['ID'])
    return child_ids
    

def turn_feats_to_dict(feats_value):
    """Parse a ``Key=Value|Key=Value`` feature string into a dict.

    Parameters
    ----------
    feats_value : str or None
        Pipe-separated ``Key=Value`` items. The placeholder ``'_'``, an empty
        string and ``None`` all mean "no features".

    Returns
    -------
    dict
        Feature name -> feature value, both strings.

    Raises
    ------
    ValueError
        If an item contains no ``'='`` at all.
    """
    if not feats_value or feats_value == '_':
        return {}
    # Split on the first '=' only, so a value that itself contains '=' is kept
    # intact instead of breaking the key/value unpacking.
    pairs = (item.split('=', 1) for item in feats_value.split('|'))
    return {key: value for key, value in pairs}

Run the following in the [neo4j console](http://localhost:7474/browser/) (password: 1234) to delete every node and relationship in the database:
```
MATCH (n) DETACH DELETE n
```

In [4]:
# Connection settings are read from the environment so that credentials do not
# have to live in the notebook. The defaults are the values that were
# previously hard-coded here, so the cell behaves the same when nothing is set.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://linguistic-annotation_neo4j_1:7687/data/db")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "1234")

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [5]:
# FIELDS = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']

# The treebank is named once here; every URL below is derived from these two
# values, so switching treebank only requires editing them.
UD_TREEBANK = 'UD_French-ParTUT'
UD_FILE_PREFIX = 'fr_partut'

REPO = 'https://github.com/UniversalDependencies/' + UD_TREEBANK #TODO: Pick a repo at random that starts with UD

# Common prefix of the raw .conllu files on the repository's master branch.
_RAW_CONLLU_PREFIX = (
    'https://raw.githubusercontent.com/UniversalDependencies/'
    + UD_TREEBANK + '/master/' + UD_FILE_PREFIX + '-ud-'
)
TEST_CONLLU = _RAW_CONLLU_PREFIX + 'test.conllu'
DEV_CONLLU = _RAW_CONLLU_PREFIX + 'dev.conllu'
TRAIN_CONLLU = _RAW_CONLLU_PREFIX + 'train.conllu'

In [6]:
%%time
# Download the training split. The timeout keeps the cell from hanging, and
# raise_for_status() stops an HTTP error page from being parsed as CoNLL-U.
response = requests.get(TRAIN_CONLLU, timeout=30)
response.raise_for_status()
data = response.text

# Sentences are separated by blank lines. Filtering out empty blocks (instead
# of dropping the last element unconditionally) keeps the final sentence even
# when the file does not end with a blank line.
sentences = [
    block.strip('\n').split('\n')
    for block in data.split('\n\n')
    if block.strip()
]

# Column names come from the '# global.columns = ...' header when the file has
# one; otherwise fall back to the ten standard CoNLL-U columns.
if sentences and sentences[0][0].startswith('# global.columns'):
    fields = sentences[0].pop(0)
    fields = fields.split(' = ')[-1].split()
    if not sentences[0]:
        # The header stood alone in its own block; nothing else is left in it.
        sentences.pop(0)
else:
    fields = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']

# Token rows are the tab-separated lines; comment lines start with '#'.
sentences_data = [
    [line.split('\t') for line in sentence if not line.startswith('#')]
    for sentence in sentences
]
# Drop only the leading '#' and split on the first ' = ', so metadata values
# that contain '# ' or ' = ' themselves are not truncated.
sentences_metadata = [
    [line[1:].strip().split(' = ', 1) for line in sentence if line.startswith('#')]
    for sentence in sentences
]

data_dicts = [[{a:b for a,b in zip(fields, token)} for token in sentence] for sentence in sentences_data]
# Comment lines without a ' = ' part carry no key/value pair and are skipped.
metadata_dicts = [
    {pair[0]: pair[1] for pair in metadata if len(pair) == 2}
    for metadata in sentences_metadata
]

# Look metadata up by key rather than by position, so extra comment lines in
# front of 'sent_id' / 'text' cannot shift the values.
dataset = [
    {'sentence_id': m['sent_id'], 'sentence': m['text'], 'tokens': d}
    for d, m in zip(data_dicts, metadata_dicts)
]

CPU times: user 190 ms, sys: 9.58 ms, total: 200 ms
Wall time: 290 ms


In [7]:
# Use the first sentence of the downloaded training split as a running example.
sentence = dataset[0]

In [8]:
sentence.keys()

dict_keys(['sentence_id', 'sentence', 'tokens'])

In [9]:
sentence['sentence']

'La distribution de la présente version de ce contrat ne crée aucune relation juridique entre les parties au contrat présenté ci-après et Creative Commons.'

In [10]:
sentence['sentence_id']

'fr_partut-ud-3'

In [11]:
len(sentence['tokens'])

27

In [12]:
# Token dicts of the example sentence: one dict per non-comment row, keyed by
# the column names in `fields`.
tokens = sentence['tokens']

In [15]:
# Breadth-first walk over the dependency tree of `tokens`, starting from the
# root token (HEAD == '0'). Every (head_id, child_id) pair is collected in
# `edge_list`, level by level; within one head, children keep token order.
root_id = get_root_token(tokens)
ids_at_current_depth = [root_id]
edge_list = []

while ids_at_current_depth:
    # IDs found on the next level down; they become the frontier afterwards.
    descendants = []
    for head_id in ids_at_current_depth:
        children = get_children_ids_of_id(tokens, head_id)
        edge_list.extend((head_id, child_id) for child_id in children)
        descendants.extend(children)
    ids_at_current_depth = descendants

In [16]:
print(edge_list)

[('11', '2'), ('11', '10'), ('11', '13'), ('11', '26'), ('2', '1'), ('2', '6'), ('13', '12'), ('13', '14'), ('13', '17'), ('6', '3'), ('6', '4'), ('6', '5'), ('6', '9'), ('17', '15'), ('17', '16'), ('17', '20'), ('17', '24'), ('9', '7'), ('9', '8'), ('20', '18'), ('20', '19'), ('20', '21'), ('24', '23'), ('24', '25'), ('21', '22')]


In [18]:
# Nodes resolved so far in this run, keyed by token ID, so each token is
# looked up in the database at most once.
nodes_by_token_id = {}


def _get_or_build_token_node(token_id):
    """Return the 'Token' node for ``token_id``, reusing a stored one if found.

    The node properties are the token's fields plus ``name`` (a copy of
    ``FORM``). They are built on a copy, so the token dicts held in
    ``tokens`` / ``dataset`` are not modified by this cell.
    """
    if token_id not in nodes_by_token_id:
        token = get_token_by_id(tokens, token_id)
        properties = dict(token, name=token['FORM'])
        existing = graph.nodes.match("Token", **properties).first()
        if existing is None:
            existing = Node('Token', **properties)
        nodes_by_token_id[token_id] = existing
    return nodes_by_token_id[token_id]


for a, b in edge_list:
    rel = Relationship(_get_or_build_token_node(a), 'IS_HEAD_OF', _get_or_build_token_node(b))
    # Creating the relationship also creates whichever end nodes are not yet
    # stored in the database, so separate create calls per node are not needed.
    # NOTE(review): re-running this cell still adds the relationships again;
    # clear the database first if a clean graph is required.
    graph.create(rel)