In [1]:
# !python -m pip install ufal.morphodita -U --user

In [29]:
from ufal.morphodita import *
import os
import json
from typing import List
import pandas as pd

In [2]:
def encode_entities(text):
    """Escape the characters ``&``, ``<``, ``>`` and ``"`` as XML/HTML entities.

    Single quotes are left untouched.
    """
    # One pass over the input: every character is mapped at most once, so the
    # '&' of an emitted entity is never escaped a second time.
    entity_table = str.maketrans({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
    })
    return text.translate(entity_table)

In [3]:
from ufal.morphodita import (
    Forms, TokenRanges, Tokenizer_newCzechTokenizer, 
    Tokenizer_newEnglishTokenizer, Tokenizer_newGenericTokenizer, 
    Tokenizer_newVerticalTokenizer, TaggedLemmasForms
)

class MorphoDiTaTokenizer:
    """Sentence and word tokenization on top of the MorphoDiTa tokenizers.

    Args:
        lang: tokenizer flavour, case-insensitive; one of ``"cs"``, ``"en"``,
            ``"generic"`` or ``"vertical"``.

    Raises:
        AssertionError: if ``lang`` is not one of the supported values.

    Note:
        ``self.forms`` and ``self.tokens`` are buffers reused by every call,
        so the generators returned by the two ``tokenize*`` methods of one
        instance must not be consumed interleaved.
    """

    SUPPORTED_LANGS = ("cs", "en", "generic", "vertical")

    def __init__(self, lang:str ="cs"):
        lang = lang.lower()
        assert lang in self.SUPPORTED_LANGS, (
            f"Unsupported tokenizer language {lang!r}; expected one of {self.SUPPORTED_LANGS}"
        )
        # Looked up at call time so that each language gets its own factory;
        # "en" previously fell through to the Czech tokenizer by mistake.
        factories = {
            "cs": Tokenizer_newCzechTokenizer,
            "en": Tokenizer_newEnglishTokenizer,
            "generic": Tokenizer_newGenericTokenizer,
            "vertical": Tokenizer_newVerticalTokenizer,
        }
        self.tokenizer = factories[lang]()
        self.forms = Forms()
        self.tokens = TokenRanges()

    def tokenizeSentences(self, text: str):
        """Yield each sentence of ``text`` as a substring of the original text.

        The substring spans from the start of the sentence's first token to
        the end of its last token, so whitespace between sentences is dropped.
        """
        self.tokenizer.setText(text)
        while self.tokenizer.nextSentence(self.forms, self.tokens):
            first = self.tokens[0].start
            last = self.tokens[-1].start + self.tokens[-1].length
            yield text[first:last]

    def tokenizeWords(self, text: str):
        """Yield every token (word form) of ``text``, sentence by sentence."""
        self.tokenizer.setText(text)
        while self.tokenizer.nextSentence(self.forms, self.tokens):
            yield from self.forms

def detokenize(txt):
    """Re-attach punctuation, quotes and brackets that a tokenizer split off.

    Removes the space before ``.``, ``,`` and ``?``, turns the `` and '' quote
    tokens into ``"``, joins apostrophes to the preceding word and converts
    the ``-LRB-`` / ``-RRB-`` tokens back into parentheses.
    """
    # TFIDF to transformer (de)tokenization fixes
    # The rules run one after another on the running result, so order matters
    # (e.g. " ''" has to be handled before the single " '").
    rewrite_rules = (
        (" .", "."),
        (" ,", ","),
        (" ?", "?"),
        ("`` ", '"'),
        (" ''", '"'),
        (" '", "'"),
        ("-LRB- ", "("),
        ("-RRB-", ")"),
    )
    for needle, replacement in rewrite_rules:
        txt = txt.replace(needle, replacement)
    return txt

def detokenize2(txt):
    """Extended variant of the detokenizer.

    In addition to the period/comma/question-mark, quote and ``-LRB-`` /
    ``-RRB-`` rules it also removes the space before ``:`` and ``;`` and the
    spaces just inside parentheses.
    """
    # updated detokenize, most models are not trained with this...
    # Rules are applied sequentially; the bracket clean-up must come after the
    # -LRB-/-RRB- conversion so that it also sees the brackets produced there.
    rewrite_rules = (
        (" .", "."),
        (" ,", ","),
        (" ?", "?"),
        (" :", ":"),
        (" ;", ";"),
        ("`` ", '"'),
        (" ''", '"'),
        (" '", "'"),
        ("-LRB- ", "("),
        ("-RRB-", ")"),
        ("( ", "("),
        (" )", ")"),
    )
    for needle, replacement in rewrite_rules:
        txt = txt.replace(needle, replacement)
    return txt


In [4]:
class MorphoDiTa:
    """Wrapper around a MorphoDiTa tagger model: lemmatization, morphological
    analysis and a negation check for single words.

    Args:
        path: path of the ``.tagger`` model file.

    Raises:
        RuntimeError: if the model cannot be loaded from ``path``.

    Note:
        ``self.forms``, ``self.lemmas`` and ``self.tokens`` are buffers that
        every method overwrites; after a call they hold the data of the last
        processed sentence/word.
    """

    # Index of the negation flag in the 15-character positional tags and the
    # value that marks a negated form (e.g. ``VpYS---XR-NA---``).
    NEGATION_INDEX = 10
    NEGATION_MARK = 'N'

    def __init__(self, path):
        self.tagger = Tagger_load(path)
        if self.tagger is None:
            # Fail here with a clear message instead of an AttributeError on
            # the first use of the missing tagger a few lines below.
            raise RuntimeError(f"Cannot load MorphoDiTa tagger from {path!r}")
        self.forms = Forms()
        self.lemmas = TaggedLemmas()
        self.lemmas_forms = TaggedLemmasForms()
        self.tokens = TokenRanges()
        self.tokenizer = self.tagger.newTokenizer()
        self.morpho = self.tagger.getMorpho()

    @classmethod
    def _tag_is_negated(cls, tag) -> bool:
        """Return True if ``tag`` is long enough and carries the negation mark."""
        return len(tag) > cls.NEGATION_INDEX and tag[cls.NEGATION_INDEX] == cls.NEGATION_MARK

    def lemmatize(self, text):
        """Tag ``text`` and yield one ``(raw_lemma, tag)`` pair per token.

        The lemma is reduced with ``rawLemma``; the tag is yielded unchanged.
        ``rawLemma`` is a lemma operation and must not be applied to tags.
        """
        self.tokenizer.setText(text)
        while self.tokenizer.nextSentence(self.forms, self.tokens):
            self.tagger.tag(self.forms, self.lemmas)
            for lemma in self.lemmas:
                yield (self.morpho.rawLemma(lemma.lemma), lemma.tag)

    def analyze(self, text):
        """Print every morphological analysis of every token in ``text``.

        One line is printed per (token, analysis) pair, including whether the
        analysis is marked as negated.
        """
        self.tokenizer.setText(text)
        while self.tokenizer.nextSentence(self.forms, self.tokens):
            for form in self.forms:
                result = self.morpho.analyze(form, self.morpho.GUESSER, self.lemmas)
                guesser = "Guesser " if result == self.morpho.GUESSER else ""
                for lemma in self.lemmas:
                    print(f"Form: {form};  Guesser {guesser};  Lemma: {lemma.lemma} Tag: {lemma.tag};  Negation: {self._tag_is_negated(lemma.tag)}")

    def is_negation(self, word: str) -> bool:
        """Return True if at least one analysis of ``word`` is a negated form.

        Args:
            word: a single word; surrounding whitespace is ignored.

        Returns:
            False for empty input or input consisting of several words,
            otherwise whether any analysis carries the negation mark.

        Raises:
            AssertionError: if ``word`` is not a string.
        """
        assert isinstance(word, str), f"Given word {word} is not a string"
        pieces = word.split()
        if len(pieces) != 1:  # empty or more than one word
            return False
        # Analyse the stripped token, not the raw argument, so that padding
        # whitespace does not turn a known word into an unknown one.
        self.morpho.analyze(pieces[0], self.morpho.GUESSER, self.lemmas)
        # morphodita can generate more lemmas for a single word (e.g. Nejedlý)
        return any(self._tag_is_negated(lemma.tag) for lemma in self.lemmas)

In [5]:
# Tagger model file passed to MorphoDiTa(...) below (machine-specific absolute path).
path = "/mnt/data/factcheck/ufal/morphodita/czech-morfflex-pdt-161115/czech-morfflex-pdt-161115.tagger"

In [6]:
# Load the tagger once; the following cells reuse this instance.
mdt = MorphoDiTa(path)

In [7]:
# Sample word used by the next cells.
form = 'Nejedlý'

In [9]:
# True as soon as any analysis of the word has 'N' at index 10 of its tag.
mdt.is_negation(form)

True

In [10]:
# Run the analysis by hand; the analyses are written into the shared mdt.lemmas buffer.
result = mdt.morpho.analyze(form, mdt.morpho.GUESSER, mdt.lemmas)
# "Guesser " only when analyze() returned the GUESSER code, otherwise an empty string.
guesser = "Guesser " if result == mdt.morpho.GUESSER else ""

In [11]:
# One line per analysis held in mdt.lemmas; the last column compares index 10 of the tag with 'N'.
for lemma in mdt.lemmas:
    print(f"Form: {form};  Guesser {guesser};  Lemma: {lemma.lemma} Tag: {lemma.tag};  Negation: {lemma.tag[10] == 'N'}")

Form: Nejedlý;  Guesser ;  Lemma: Nejedlá_;S_^(*1ý) Tag: NNFP1-----A---6;  Negation: False
Form: Nejedlý;  Guesser ;  Lemma: Nejedlá_;S_^(*1ý) Tag: NNFP4-----A---6;  Negation: False
Form: Nejedlý;  Guesser ;  Lemma: Nejedlá_;S_^(*1ý) Tag: NNFP5-----A---6;  Negation: False
Form: Nejedlý;  Guesser ;  Lemma: Nejedlá_;S_^(*1ý) Tag: NNFS2-----A---6;  Negation: False
Form: Nejedlý;  Guesser ;  Lemma: Nejedlá_;S_^(*1ý) Tag: NNFS3-----A---6;  Negation: False
Form: Nejedlý;  Guesser ;  Lemma: Nejedlá_;S_^(*1ý) Tag: NNFS6-----A---6;  Negation: False
Form: Nejedlý;  Guesser ;  Lemma: Nejedlý_;S Tag: NNMP1-----A---6;  Negation: False
Form: Nejedlý;  Guesser ;  Lemma: Nejedlý_;S Tag: NNMP4-----A---6;  Negation: False
Form: Nejedlý;  Guesser ;  Lemma: Nejedlý_;S Tag: NNMP5-----A---6;  Negation: False
Form: Nejedlý;  Guesser ;  Lemma: Nejedlý_;S Tag: NNMS1-----A----;  Negation: False
Form: Nejedlý;  Guesser ;  Lemma: Nejedlý_;S Tag: NNMS5-----A----;  Negation: False
Form: Nejedlý;  Guesser ;  Lemma: 

In [12]:
# Number of analyses currently held in the shared mdt.lemmas buffer.
len(mdt.lemmas)

34

In [13]:
# Example sentence passed to mdt.analyze() in the next cell.
text = "Petr nešel do obchodu, protože nebylo ještě otevřeno."

In [14]:
# Prints every analysis of every token of the sentence.
mdt.analyze(text)

Form: Petr;  Guesser ;  Lemma: Petr_;Y Tag: NNMS1-----A----;  Negation: False
Form: nešel;  Guesser ;  Lemma: jít Tag: VpYS---XR-NA---;  Negation: True
Form: do;  Guesser ;  Lemma: do-1 Tag: RR--2----------;  Negation: False
Form: do;  Guesser ;  Lemma: do-7_^(předpona,_sam.) Tag: A2--------A----;  Negation: False
Form: obchodu;  Guesser ;  Lemma: obchod Tag: NNIS2-----A----;  Negation: False
Form: obchodu;  Guesser ;  Lemma: obchod Tag: NNIS3-----A----;  Negation: False
Form: obchodu;  Guesser ;  Lemma: obchod Tag: NNIS6-----A---1;  Negation: False
Form: ,;  Guesser ;  Lemma: , Tag: Z:-------------;  Negation: False
Form: protože;  Guesser ;  Lemma: protože Tag: J,-------------;  Negation: False
Form: nebylo;  Guesser ;  Lemma: být Tag: VpNS---XR-NA---;  Negation: True
Form: ještě;  Guesser ;  Lemma: ještě Tag: Db-------------;  Negation: False
Form: otevřeno;  Guesser ;  Lemma: otevřít Tag: VsNS---XX-AP---;  Negation: False
Form: .;  Guesser ;  Lemma: . Tag: Z:-------------;  Negatio

# Get the proportion of claims containing a negation

In [33]:
def read_jsonl_files(files: List[str]) -> dict:
    """Read claims and labels from several JSON Lines files.

    Input:
        files: list containing paths of jsonl files; every non-blank line must
            be a JSON object with the keys 'claim' and 'label'

    Return:
        dictionary ``{'claim': [...], 'label': [...]}`` with the records of all
        files merged in the order given; both lists have the same length

    Raises:
        OSError: if a file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks 'claim' or 'label'.
    """
    claims, labels = [], []
    for file in files:
        # Explicit UTF-8 so the result does not depend on the platform's
        # default encoding.
        with open(file, encoding="utf-8") as fr:
            for line in fr:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                record = json.loads(line)
                # Read both fields before appending so the lists stay aligned
                # even if one key is missing.
                claim, label = record['claim'], record['label']
                claims.append(claim)
                labels.append(label)
    return {'claim': claims, 'label': labels}


def process_jsonl_in_folder(path: str, split: str) -> pd.DataFrame:
    """Read jsonl files in the given folder and process them into a single DataFrame.

    Input:
        path: folder containing the ``*.jsonl`` files
        split: ``'all'`` to merge every ``.jsonl`` file in ``path``, otherwise
            the name of one split (file ``<path>/<split>.jsonl``)

    Return:
        DataFrame built from the dictionary returned by ``read_jsonl_files``
    """
    if split == 'all':
        # os.listdir returns entries in arbitrary order; sort so the row order
        # of the merged frame is reproducible across machines and runs.
        files = sorted(
            os.path.join(path, file) for file in os.listdir(path) if file.endswith(".jsonl")
        )
    else:
        # The extension belongs to the file name; joining it as a separate
        # path component would point at "<path>/<split>/.jsonl".
        files = [os.path.join(path, f"{split}.jsonl")]
    return pd.DataFrame(read_jsonl_files(files))

In [34]:
# NOTE: rebinds `path` (earlier the tagger model file) to the dataset folder.
path = '/mnt/data/factcheck/CTK/dataset/splits_concise_ctkId_s0_si0_t0.095_v0.12_source_77'
# Every .jsonl file located directly in that folder.
files = [os.path.join(path, file) for file in os.listdir(path) if file.endswith(".jsonl")]

In [35]:
# Display the discovered files.
files

['/mnt/data/factcheck/CTK/dataset/splits_concise_ctkId_s0_si0_t0.095_v0.12_source_77/validation.jsonl',
 '/mnt/data/factcheck/CTK/dataset/splits_concise_ctkId_s0_si0_t0.095_v0.12_source_77/test.jsonl',
 '/mnt/data/factcheck/CTK/dataset/splits_concise_ctkId_s0_si0_t0.095_v0.12_source_77/train.jsonl']

In [36]:
# Merge every .jsonl file of the folder into one DataFrame with 'claim' and 'label' columns.
df = process_jsonl_in_folder(path, 'all')

In [46]:
# A claim is counted once, as soon as one of its whitespace-separated words
# is flagged by is_negation (any() stops at the first hit).
neg_claims = sum(
    1
    for claim in df.claim
    if any(mdt.is_negation(w) for w in claim.split())
)

In [47]:
# Claims with at least one word flagged as negation, out of all claims.
print(f"Negative claims: {neg_claims} / {len(df.claim)}")

Negative claims: 329 / 3097
