In [None]:
import os
if not os.path.isfile("estrutura_ud.py"):
    ! wget https://raw.githubusercontent.com/alvelvis/ACDC-UD/master/estrutura_ud.py
import estrutura_ud
try:
    import stanza
except Exception:
    ! pip3 install stanza
    import stanza
    stanza.download('pt')
nlp = stanza.Pipeline('pt')
import pprint
import shutil
import unicodedata
import itertools

In [None]:
# Working folders: one holds the tag lexicons (*.txt), the other the corpora (*.conllu).
tagset_folder = "tagset"
conllu_folder = "conllu"

# Create whichever working folder is still missing.
for folder in (tagset_folder, conllu_folder):
    if os.path.isdir(folder):
        continue
    os.mkdir(folder)

# Index of the CoNLL-U column that receives the B=/I= labels.
col_sema = 8
# Per-file data, keyed by file name (filled in by the loading step).
files = {}
# Tag name -> set of lowercase entity strings (filled in from the tagset folder).
tags = {}
# Lemma cache, pre-seeded with words that must be kept as their own lemma.
lemmas = {word: word for word in ('ums', 'santos', 'estai', 'veio')}

# Sanity check: both working folders must exist from here on.
if not (os.path.isdir(tagset_folder) and os.path.isdir(conllu_folder)):
    raise Exception("Folder not found.")

def remove_accents(input_str):
    """Return ``input_str`` with combining marks stripped.

    The string is NFKD-normalized so that accented letters split into a base
    character plus combining marks; every character whose Unicode combining
    class is non-zero is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    base_chars = (ch for ch in decomposed if unicodedata.combining(ch) == 0)
    return "".join(base_chars)

def stanza_lemmatize(string):
    """Run ``string`` through the module-level ``nlp`` pipeline and return its lemmas.

    The lemmas of every token dict that has a ``'lemma'`` key are collected in
    document order and joined with single spaces.
    """
    collected = []
    for sentence in nlp(string).to_dict():
        for token in sentence:
            if 'lemma' in token:
                collected.append(token['lemma'])
    return " ".join(collected)

def count_tokens(string):
    """Compute basic size statistics for a CoNLL-U formatted string.

    Returns a dict with:
      * ``'tokens'``: number of 10-column rows whose first column has no ``-``;
      * ``'sentences'``: number of non-blank chunks separated by blank lines;
      * ``'types'``: set of distinct values found in the second column of
        every 10-column row.
    """
    token_total = 0
    word_forms = set()
    for row in string.splitlines():
        columns = row.split("\t")
        if len(columns) != 10:
            continue
        # NOTE(review): rows whose first column contains "-" still contribute
        # to 'types' although they are excluded from 'tokens' -- confirm intended.
        word_forms.add(columns[1])
        if '-' not in columns[0]:
            token_total += 1
    sentence_total = sum(1 for chunk in string.split("\n\n") if chunk.strip())
    return {
        'tokens': token_total,
        'sentences': sentence_total,
        'types': word_forms,
    }

def tag_sentence(sentence, entity, tag):
    """Label every occurrence of ``entity`` in ``sentence`` with ``tag``.

    Parameters
    ----------
    sentence : list
        Lines of one sentence. Each item is either a list of columns (a token
        row) or a plain string (any other line, e.g. a comment).
    entity : list of str
        The n-gram to look for, one accent-stripped lowercase word per item.
    tag : str
        Tag name written as ``B=<tag>`` on the first token of a match and
        ``I=<tag>`` on the following ones.

    Returns
    -------
    list
        A new list holding the string lines first, then the token rows, with
        rows whose first column contains ``-`` put back at their original
        index. Matched token rows are modified in place in column ``col_sema``.

    A window matches when either its word forms (column 1) or its lemmas
    (column 2), lowercased and accent-stripped, equal ``entity``.
    """
    tokens = [line for line in sentence if isinstance(line, list) and '-' not in line[0]]
    metadata = [line for line in sentence if isinstance(line, str)]
    # Rows with "-" in the first column are never matched; remember where they were.
    mwt = {position: line for position, line in enumerate(sentence)
           if isinstance(line, list) and '-' in line[0]}
    ngram_span = len(entity)
    # An empty entity can match nothing (and would otherwise fail on entity[0]).
    if ngram_span:
        for start in range(len(tokens) - ngram_span + 1):
            window = tokens[start:start + ngram_span]
            ngram_word = [remove_accents(token[1].lower()) for token in window]
            ngram_lemma = [remove_accents(token[2].lower()) for token in window]
            if entity != ngram_word and entity != ngram_lemma:
                continue
            # "campos" is only tagged when the word form is capitalized.
            # The check must look at the word form (column 1); the first
            # column starts with a digit, so testing it could never succeed.
            if entity[0] == "campos" and not window[0][1].startswith("C"):
                continue
            for offset, token in enumerate(window):
                label = "{}={}".format("I" if offset else "B", tag)
                previous = [x for x in token[col_sema].split("|") if x != "O"]
                # dict.fromkeys removes duplicates but keeps a stable order,
                # so the output does not change from one run to the next.
                token[col_sema] = "|".join(dict.fromkeys(previous + [label]))
    sentence = metadata + tokens
    # Ascending insertion restores the set-aside rows to their original indexes.
    for position in mwt:
        sentence.insert(position, mwt[position])
    return sentence

def count_tags(text):
    """List the tag name of every ``B=`` label found in ``text``.

    Only 10-column rows whose first column has no ``-`` are inspected. For each
    of them, column ``col_sema`` is split on ``|`` and every piece starting with
    ``B=`` is recorded with the ``B=`` text removed. The result keeps one entry
    per label, in reading order.
    """
    frequency = []
    for row in text.splitlines():
        columns = row.split("\t")
        if len(columns) != 10 or '-' in columns[0]:
            continue
        for label in columns[col_sema].split("|"):
            if label.startswith("B="):
                frequency.append(label.replace("B=", ""))
    return frequency

def apply_rules(text):
    """Round-trip ``text`` through an ``estrutura_ud.Corpus`` and return it as a string.

    The nested loop visits every token of every sentence but currently does
    nothing with them; it is the place where per-token rules would be added.
    """
    parsed = estrutura_ud.Corpus()
    parsed.build(text)
    for parsed_sentence in parsed.sentences.values():
        for _token in parsed_sentence.tokens:
            # No rules implemented yet.
            pass
    return parsed.to_str()

# Build tagset lexicon
# Every *.txt file in the tagset folder defines one tag. The tag name is the
# upper-cased file name without ".TXT", with "_" turned into ":" and spaces
# turned into "_". Its entities are the non-empty lines that do not start
# with "#", lowercased.
# Files are visited in sorted order so runs are reproducible, and are read as
# UTF-8 explicitly instead of relying on the platform's default encoding.
for file in sorted(os.listdir(tagset_folder)):
    if not file.lower().endswith(".txt"):
        continue
    tag = file.upper().split(".TXT")[0].replace("_", ":").replace(" ", "_")
    with open(os.path.join(tagset_folder, file), encoding="utf-8") as f:
        text = f.read()
    tags[tag] = {
        x.strip().lower()
        for x in text.splitlines()
        if x.strip() and not x.strip().startswith("#")
    }

# Load conllu files and gather initial statistics
# For every *.conllu file: keep the raw text, split it into sentences (chunks
# separated by a blank line) and turn each 10-column row into a list of
# columns whose `col_sema` column is reset to "O". Any other line stays a
# plain string.
# Files are visited in sorted order so runs are reproducible, and are read as
# UTF-8 explicitly instead of relying on the platform's default encoding.
for file in sorted(os.listdir(conllu_folder)):
    if not file.lower().endswith(".conllu"):
        continue
    with open(os.path.join(conllu_folder, file), encoding="utf-8") as f:
        text = f.read()
    sentences = []
    for chunk in text.split("\n\n"):
        if not chunk.strip():
            continue
        lines = []
        for line in chunk.splitlines():
            columns = line.split("\t")
            if len(columns) == 10:
                columns[col_sema] = "O"
                lines.append(columns)
            else:
                lines.append(line)
        sentences.append(lines)
    files[file] = {'text': text, 'tagged': sentences, 'tags_frequency': []}
    # Adds the 'tokens', 'sentences' and 'types' entries.
    files[file].update(count_tokens(text))

# Tag each file
for f, file in enumerate(files):
    # Cheap pre-filter: only entities whose accent-stripped text occurs
    # somewhere in the accent-stripped, lowercased file are tried.
    text_normalized = remove_accents(files[file]['text'].lower())
    for t, tag in enumerate(tags):
        # Sorted so that the processing order (and the progress log) is the
        # same on every run; iterating the set directly is not reproducible.
        candidates = sorted(x for x in tags[tag] if remove_accents(x) in text_normalized)
        for e, entity in enumerate(candidates):
            if entity not in lemmas:
                lemmas[entity] = stanza_lemmatize(entity)
            print("{}/{} - {}/{} - {} / {}: {} - {} {}".format(f+1, len(files), t+1, len(tags), e+1, len(tags[tag]), entity, lemmas[entity], "-- skip lemma" if lemmas[entity] == entity else ""))
            # Try the entity as written and, when different, its lemmatized form.
            ngrams = [entity]
            if lemmas[entity] != entity:
                ngrams.append(lemmas[entity])
            for ngram in ngrams:
                if not ngram:
                    continue
                # Split once per n-gram rather than once per sentence.
                ngram_words = [remove_accents(x) for x in ngram.split(" ")]
                for s, sentence in enumerate(files[file]['tagged']):
                    files[file]['tagged'][s] = tag_sentence(sentence, ngram_words, tag)
    # Serialize into a fresh string. Joining rows inside a shallow copy of
    # 'tagged' would overwrite the token lists stored in 'tagged' itself.
    output = "\n\n".join(
        "\n".join("\t".join(line) if isinstance(line, list) else line for line in sentence)
        for sentence in files[file]['tagged']
    )
    files[file]['output'] = apply_rules(output)
    # Assigned (not extended) so that re-running this step never double counts.
    files[file]['tags_frequency'] = count_tags(files[file]['output'])


In [None]:
# Stats
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0

# Every tag that occurs in at least one file.
all_tags = set()
for name in files:
    all_tags.update(files[name]['tags_frequency'])

# Number of named entities per file: one entry of 'tags_frequency' per entity.
ne_count = {name: len(files[name]['tags_frequency']) for name in files}

# Per-file table, densest files first. A file without tokens gets ratios of
# 0.0 instead of aborting the whole report with a division by zero.
rows = ["File;%NE;NE;TTR;SENT;TOKENS"]
for file in sorted(files, reverse=True, key=lambda x: _safe_ratio(ne_count[x], files[x]['tokens'])):
    rows.append("{};{};{};{};{};{}".format(
        file,
        _safe_ratio(ne_count[file], files[file]['tokens']),
        ne_count[file],
        _safe_ratio(len(files[file]['types']), files[file]['tokens']),
        files[file]['sentences'],
        files[file]['tokens']
    ))
stats = "\n".join(rows)
with open("files_stats.csv", "w", encoding="utf-8") as f:
    f.write(stats)
print(stats + "\n")

# Per-tag totals: number of entities and number of files containing the tag.
tag_totals = dict.fromkeys(all_tags, 0)
tag_files = dict.fromkeys(all_tags, 0)
for name in files:
    frequency = files[name]['tags_frequency']
    for tag in frequency:
        tag_totals[tag] += 1
    for tag in set(frequency):
        tag_files[tag] += 1

# Per-tag table, most frequent first. The inner sort makes ties come out in
# alphabetical order, so the report is the same on every run.
rows = ["Tag;NE;FILES"]
for tag in sorted(sorted(all_tags), reverse=True, key=tag_totals.get):
    rows.append("{};{};{}".format(tag, tag_totals[tag], tag_files[tag]))
stats = "\n".join(rows)
with open("tags_stats.csv", "w", encoding="utf-8") as f:
    f.write(stats)
print(stats + "\n")

In [None]:
# Save tagged files
# The "tagged" folder is recreated from scratch so that files left over from
# an earlier run do not linger.
if os.path.isdir("tagged"):
    shutil.rmtree("tagged")
os.mkdir("tagged")
for file in files:
    # Written as UTF-8 explicitly instead of relying on the platform's
    # default encoding.
    with open(os.path.join("tagged", file), "w", encoding="utf-8") as f:
        f.write(files[file]['output'])
    #os.system("meld --diff {}/{} tagged/{}".format(conllu_folder, file, file))

In [None]:
# Find from which word a lemma came
def GetKey(dictA, val):
    """Return the first key of ``dictA`` whose value equals ``val``.

    Keys are checked in the dict's iteration order. When no value matches,
    the string ``"key doesn't exist"`` is returned instead.
    """
    matching_keys = (key for key, value in dictA.items() if val == value)
    return next(matching_keys, "key doesn't exist")

# Example lookup: show the first entry of `lemmas` whose stored lemma is "presença".
GetKey(lemmas, "presença")