In [1]:
from IPython.core.display import display, HTML, Image
display(HTML("<style>.container { width:95% !important; }</style>"))
%config IPCompleter.use_jedi=False

In [2]:
import os
from itertools import chain

import networkx as nx
import pandas as pd
import requests
from py2neo import Graph, Node, Relationship

Run the following in the [neo4j console](http://localhost:7474/browser/) (password 1234) to delete every node and relationship in the database:
```
MATCH (n) DETACH DELETE n
```

In [3]:
# Connection settings can be overridden through environment variables so that real
# credentials need not be written into the notebook. The defaults are the values that
# were previously hard-coded here, so behaviour is unchanged when nothing is set.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://linguistic-annotation_neo4j_1:7687/data/db")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "1234")
graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [4]:
# FIELDS = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
# GitHub page of the treebank repository.
REPO = 'https://github.com/UniversalDependencies/UD_French-ParTUT' #TODO: Pick a repo at random that starts with UD
# Raw-file URLs of the test, dev and train splits; they differ only in the split name.
TEST_CONLLU, DEV_CONLLU, TRAIN_CONLLU = (
    'https://raw.githubusercontent.com/UniversalDependencies/UD_French-ParTUT/master/fr_partut-ud-' + split + '.conllu'
    for split in ('test', 'dev', 'train')
)

In [5]:
%%time
response = requests.get(TRAIN_CONLLU)
data = response.text
sentences = [sentence.split('\n') for sentence in data.split('\n\n')[:-1]]
fields = sentences[0].pop(0)
fields = fields.split(' = ')[-1].split()

sentences_data = [[e.split('\t') for e in sentence if not e.startswith('#')] for sentence in sentences]
sentences_metadata = [[e.split('# ')[-1] for e in sentence if e.startswith('#')] for sentence in sentences]
sentences_metadata = [[e.split(' = ') for e in sentence] for sentence in sentences_metadata]

data_dicts = [[{a:b for a,b in zip(fields, token)} for token in sentence] for sentence in sentences_data]
# metadata_dicts = [dict(item) for item in sentences_metadata]

dataset = [{'sentence_id': m[0][1], 'sentence': m[1][1], 'tokens': d} for d,m in zip(data_dicts, sentences_metadata)]

CPU times: user 188 ms, sys: 10.4 ms, total: 198 ms
Wall time: 676 ms


In [11]:
# Use the first parsed sentence as the running example for the cells below.
sentence = dataset[0] # This needs to be turned into an edge list

In [7]:
# List the keys of the example sentence record.
sentence.keys()

dict_keys(['sentence_id', 'sentence', 'tokens'])

In [8]:
# Show the value stored under 'sentence' for the example record.
sentence['sentence']

'La distribution de la présente version de ce contrat ne crée aucune relation juridique entre les parties au contrat présenté ci-après et Creative Commons.'

In [9]:
# Show the value stored under 'sentence_id' for the example record.
sentence['sentence_id']

'fr_partut-ud-3'

In [10]:
# Count the token dictionaries in the example sentence.
len(sentence['tokens'])

27

### Notes

- Every token is a dictionary
- Every token has an ID
- Every token has a HEAD that refers to another token's (dict) ID
- The linguistic root of the sentence has a HEAD of 0 (which refers to the root node of the sentence) and a DEPREL value of 'root'.

In [23]:
# Token dictionaries of the example sentence; display the second and third as a sample.
tokens = sentence['tokens']
tokens[1:3]

[{'ID': '2',
  'FORM': 'distribution',
  'LEMMA': 'distribution',
  'UPOS': 'NOUN',
  'XPOS': 'S',
  'FEATS': 'Gender=Fem|Number=Sing',
  'HEAD': '11',
  'DEPREL': 'nsubj',
  'DEPS': '_',
  'MISC': '_'},
 {'ID': '3',
  'FORM': 'de',
  'LEMMA': 'de',
  'UPOS': 'ADP',
  'XPOS': 'E',
  'FEATS': '_',
  'HEAD': '6',
  'DEPREL': 'case',
  'DEPS': '_',
  'MISC': '_'}]

In [24]:
def get_root_token(tokens):
    """Return the root token of a sentence.

    Parameters
    ----------
    tokens : iterable of dict
        Token dictionaries, each with a 'HEAD' entry.

    Returns
    -------
    dict
        The first token whose 'HEAD' is the string '0'.

    Raises
    ------
    IndexError
        If no token has a 'HEAD' of '0' (including when `tokens` is empty).
    """
    # next() stops at the first match instead of building a list of every candidate.
    root = next((t for t in tokens if t['HEAD'] == '0'), None)
    if root is None:
        # Same exception type as indexing an empty list, but with a message that
        # says what was actually missing.
        raise IndexError("no token with HEAD == '0' found")
    return root

def turn_feats_to_dict(feats_value):
    """Convert a FEATS string such as 'Gender=Fem|Number=Sing' into a dict.

    Parameters
    ----------
    feats_value : str or None
        '|'-separated 'Name=Value' pairs, or '_' when there are no features.

    Returns
    -------
    dict
        Feature names mapped to their values; empty for '_', '' or None.

    Raises
    ------
    ValueError
        If a '|'-separated item contains no '='.
    """
    # '_' is the placeholder for "no features"; empty or missing input is treated the same
    # instead of failing while unpacking.
    if not feats_value or feats_value == '_':
        return {}
    # Split on the first '=' only so that a value which itself contains '=' stays intact.
    return dict(pair.split('=', 1) for pair in feats_value.split('|'))

In [28]:
# Look up the example sentence's root token and display it.
root = get_root_token(tokens)
root

{'ID': '11',
 'FORM': 'crée',
 'LEMMA': 'créer',
 'UPOS': 'VERB',
 'XPOS': 'V',
 'FEATS': 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin',
 'HEAD': '0',
 'DEPREL': 'root',
 'DEPS': '_',
 'MISC': '_'}

In [29]:
# Expand the root token's FEATS string into a dictionary.
turn_feats_to_dict(root['FEATS'])

{'Mood': 'Ind',
 'Number': 'Sing',
 'Person': '3',
 'Tense': 'Pres',
 'VerbForm': 'Fin'}