In [2]:
# Directory holding the JSONL inputs, and the file name of each split
# (train / valid / test) that the tag-collection loop reads.
data_dir = './data'
files = tuple(split + '_expanded.jsonl' for split in ('train', 'valid', 'test'))

In [3]:
import os
import json

In [4]:
# Gather every distinct value of the 'upos', 'xpos' and 'deprel' token fields
# found in the files listed in `files` (resolved relative to `data_dir`).
upos_set, xpos_set, deprel_set = set(), set(), set()

for split_name in files:
    with open(os.path.join(data_dir, split_name), 'r', encoding='utf-8') as jsonl_file:
        for raw_line in jsonl_file:
            # Whitespace-only lines carry no document; skip them.
            if not raw_line.strip():
                continue

            # Each remaining line is parsed as one JSON document whose
            # 'content' entry is iterated sentence by sentence; every
            # sentence exposes its tokens under 'token_info'.
            document = json.loads(raw_line)
            for sentence in document['content']:
                for token in sentence['token_info']:
                    upos_set.add(token['upos'])
                    xpos_set.add(token['xpos'])
                    deprel_set.add(token['deprel'])

In [9]:
# Number of distinct UPOS values collected, followed by the values in sorted order.
print(len(upos_set))
print(sorted(upos_set))

17
['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']


In [11]:
# Number of distinct XPOS values collected, followed by the values in sorted order.
print(len(xpos_set))
print(sorted(xpos_set))

50
['$', "''", ',', '-LRB-', '-LSB-', '-RRB-', '-RSB-', '.', ':', 'ADD', 'AFX', 'CC', 'CD', 'DT', 'EX', 'FW', 'HYPH', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NFP', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``']


In [12]:
# Number of distinct deprel values collected, followed by the values in sorted order.
print(len(deprel_set))
print(sorted(deprel_set))

48
['acl', 'acl:relcl', 'advcl', 'advmod', 'amod', 'appos', 'aux', 'aux:pass', 'case', 'cc', 'cc:preconj', 'ccomp', 'compound', 'compound:prt', 'conj', 'cop', 'csubj', 'csubj:pass', 'dep', 'det', 'det:predet', 'discourse', 'dislocated', 'expl', 'fixed', 'flat', 'flat:foreign', 'goeswith', 'iobj', 'list', 'mark', 'nmod', 'nmod:npmod', 'nmod:poss', 'nmod:tmod', 'nsubj', 'nsubj:pass', 'nummod', 'obj', 'obl', 'obl:npmod', 'obl:tmod', 'orphan', 'parataxis', 'punct', 'root', 'vocative', 'xcomp']


In [23]:
# Save the sorted UPOS values to 'upos.txt', one per line, with no trailing
# newline after the last entry.
# The encoding is stated explicitly so the file does not depend on the
# platform's default locale and matches the UTF-8 used to read the JSONL input.
with open('upos.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(sorted(upos_set)))

In [24]:
# Save the sorted XPOS values to 'xpos.txt', one per line, with no trailing
# newline after the last entry.
# The encoding is stated explicitly so the file does not depend on the
# platform's default locale and matches the UTF-8 used to read the JSONL input.
with open('xpos.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(sorted(xpos_set)))

In [25]:
# Save the sorted deprel values to 'deprel.txt', one per line, with no trailing
# newline after the last entry.
# The encoding is stated explicitly so the file does not depend on the
# platform's default locale and matches the UTF-8 used to read the JSONL input.
with open('deprel.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(sorted(deprel_set)))