In [1]:
import os
import json
import glob
import random

In [2]:
def read_conll_sentences(lines):
    """Group CoNLL-style lines into sentences.

    Every non-blank line must consist of a token and its tag separated by a
    single space. A blank (or whitespace-only) line closes the sentence that
    is currently being collected.

    Args:
        lines: iterable of text lines, e.g. an open text file.

    Returns:
        A list of sentences; each sentence is a list of (token, tag) tuples.
        Empty sentences (from leading or repeated blank lines) are skipped.

    Raises:
        ValueError: if a non-blank line does not split into exactly two
            space-separated fields.
    """
    parsed = []
    current = []
    for line in lines:
        stripped = line.strip()
        if stripped:
            token, tag = stripped.split(" ")
            current.append((token, tag))
        elif current:
            # Blank line: the current sentence is complete.
            parsed.append(current)
            current = []
    if current:
        # The input may end without a trailing blank line; keep the last sentence.
        parsed.append(current)
    return parsed


sentences = []
# sorted() because glob returns matches in arbitrary order and the seeded
# shuffle further down is only reproducible for a fixed load order.
for fn in sorted(glob.glob("data/ler.conll")):
    with open(fn, "r", encoding="utf-8") as f:
        sentences.extend(read_conll_sentences(f))

len(sentences)

66723

In [3]:
# Peek at the first parsed sentence: a list of (token, tag) tuples.
sentences[0]

[('Prozesskostenhilfe', 'O'),
 ('-', 'O'),
 ('Entschädigung', 'O'),
 ('für', 'O'),
 ('überlange', 'O'),
 ('Verfahrensdauer', 'O'),
 ('-', 'O'),
 ('Revisionsverfahren', 'O')]

In [4]:
# Shuffle a copy rather than `sentences` itself: the loaded list keeps its
# file order, and re-running this cell always produces the same split
# (an in-place shuffle would reshuffle the already shuffled list).
shuffled = list(sentences)
random.seed(10)
random.shuffle(shuffled)

# Cut points for an 80 % / 10 % / 10 % train / validation / test split.
split_1 = int(0.8 * len(shuffled))
split_2 = int(0.9 * len(shuffled))

train = shuffled[0:split_1]
val = shuffled[split_1:split_2]
test = shuffled[split_2:]


In [5]:
# Combined size of the three splits.
sum(len(part) for part in (train, val, test))

66723

In [6]:
def write_file(sents, file):
    """Write sentences to `file` as JSON Lines.

    One JSON object is written per sentence, with the keys "tokens" and
    "tags" holding parallel lists.

    Args:
        sents: iterable of sentences, each a sequence of (token, tag) pairs.
        file: path of the output file; it is overwritten, UTF-8 encoded.
    """
    with open(file, "w", encoding="utf-8") as f_out:
        for sent in sents:
            record = {
                "tokens": [tok for tok, _ in sent],
                "tags": [label for _, label in sent],
            }
            print(json.dumps(record), file=f_out)

def write_conll(sents, file):
    """Write sentences to `file` in a tab-separated CoNLL layout.

    Each (token, tag) pair becomes one "token<TAB>tag" line, and every
    sentence is followed by an empty line.

    Args:
        sents: iterable of sentences, each a sequence of (token, tag) pairs
            of strings.
        file: path of the output file; it is overwritten, UTF-8 encoded.
    """
    with open(file, "w", encoding="utf-8") as f_out:
        for sent in sents:
            for tok, label in sent:
                f_out.write("\t".join((tok, label)) + "\n")
            # Empty line terminates the sentence.
            f_out.write("\n")

In [7]:
# Export every split twice: first as JSON Lines, then as CoNLL.
for split_sents, json_path in (
    (train, "data/train.json"),
    (val, "data/validation.json"),
    (test, "data/test.json"),
):
    write_file(split_sents, json_path)

for split_sents, conll_path in (
    (train, "data/train.conll"),
    (val, "data/validation.conll"),
    (test, "data/test.conll"),
):
    write_conll(split_sents, conll_path)

In [13]:
# Fraction of each split kept for the reduced "small" data sets.
SMALL_FRACTION = 0.05

train_small = train[0:int(len(train) * SMALL_FRACTION)]
val_small = val[0:int(len(val) * SMALL_FRACTION)]
test_small = test[0:int(len(test) * SMALL_FRACTION)]

# The validation files must get val_small; writing the full `val` here would
# make validation_small.* identical to validation.*.
write_file(train_small, "data/train_small.json")
write_file(val_small, "data/validation_small.json")
write_file(test_small, "data/test_small.json")

write_conll(train_small, "data/train_small.conll")
write_conll(val_small, "data/validation_small.conll")
write_conll(test_small, "data/test_small.conll")

In [14]:
def get_all_tags(sents):
    """Return every tag found in `sents` as one flat list.

    Args:
        sents: iterable of sentences, each a sequence of (token, tag) pairs.

    Returns:
        A list with the tag of every pair, in sentence order; duplicates are
        kept.
    """
    # Iterate the argument, not the module-level `sentences`, so the result
    # reflects the subset that was actually passed in.
    return [tag for sentence in sents for _, tag in sentence]

In [15]:
# Number of distinct tag strings that get_all_tags returns for `train`.
len({label for label in get_all_tags(train)})

39

In [16]:
# Number of distinct tag strings that get_all_tags returns for `train_small`.
len({label for label in get_all_tags(train_small)})

39

In [12]:
# Collect the tags with a "B-" prefix across all sentences. Their count
# must be 53,632 (the number of entities).
begin_tags = [label for label in get_all_tags(sentences) if label[:2] == "B-"]
len(begin_tags)

53632