Zunächst müssen einige Module importiert werden, die wir nutzen wollen.

In [None]:
from xml.etree import ElementTree as ET
import random
from random import *


Jetzt machen wir die OdeNet-XML-Datei auf, parsen das XML und öffnen eine Datei, in die die Ausgabe geschrieben wird.

In [None]:
# Parse the OdeNet XML once; every lookup function in this notebook reads the
# resulting module-level `lexicon` element. The input handle is only needed
# while parsing, so it is closed as soon as the tree has been built.
with open(r"deWordNet.xml", "r", encoding="utf-8") as de_wn:
    tree = ET.parse(de_wn)

# Output file for results. It is deliberately left open here.
# NOTE(review): nothing in the visible cells writes to or closes `out_lex` -
# confirm it is closed wherever it is used.
out_lex = open("out_lex.txt", "w", encoding="utf-8")

root = tree.getroot()

# `find` returns None when the root has no direct <Lexicon> child.
lexicon = root.find('Lexicon')


Mit check_word_lemma greift man auf Lexikon-Einträge zu: Für ein Wort bekommt man die Lexikon-ID, den Lemma-Wert, die Wortart (POS) sowie die Sense- und Synset-IDs der Synsets, in denen das Wort enthalten ist.

In [None]:
def check_word_lemma(word_to_check):
    """Look up the first lexical entry whose lemma equals ``word_to_check``.

    Walks the ``LexicalEntry`` elements of the module-level ``lexicon`` in
    document order and compares each lemma's ``writtenForm`` with the given
    word using plain string equality (so the match is case-sensitive).

    Args:
        word_to_check: The written form to search for.

    Returns:
        ``(lemma_id, lemma_value, pos, senses)`` for the first match, where
        ``senses`` is a list of ``[sense_id, synset_id]`` pairs, or ``None``
        when no entry has that written form.
    """
    for entry in lexicon.iter('LexicalEntry'):
        lemma_node = entry.find('Lemma')
        written_form = lemma_node.attrib['writtenForm']
        entry_id = entry.attrib['id']
        if written_form != word_to_check:
            continue
        part_of_speech = lemma_node.attrib['partOfSpeech']
        sense_pairs = [
            [sense_node.attrib['id'], sense_node.attrib['synset']]
            for sense_node in entry.iter('Sense')
        ]
        return (entry_id, written_form, part_of_speech, sense_pairs)
    return None

In [None]:
# Example: look up the lexicon entry for "Weihnachten". The result is the
# (lemma_id, lemma_value, pos, senses) tuple, or None if no entry matches.
check_word_lemma("Weihnachten")

Hier bekommt man die Lexikon-IDs für eine Liste von Wörtern.

In [None]:
def words2ids(wordlist):
    """Map words to the IDs of their OdeNet lexicon entries.

    Args:
        wordlist: Iterable of words; each is looked up with
            ``check_word_lemma``.

    Returns:
        The entry IDs of the words that were found, in input order. A word
        without an entry is reported on stdout as ``"<word> NOT IN ODENET"``
        and contributes nothing to the result.
    """
    word_id_list = []
    for word in wordlist:
        entry = check_word_lemma(word)
        # check_word_lemma signals "no such word" by returning None. Testing
        # for that explicitly (instead of catching every exception) keeps
        # real errors and Ctrl-C from being reported as a missing word.
        if entry is None:
            print(f"{word} NOT IN ODENET")
            continue
        word_id_list.append(entry[0])
    return word_id_list

In [None]:
# Example: entry IDs for the four season names. Any word without an entry is
# printed as "<word> NOT IN ODENET" and left out of the returned list.
words2ids(['Frühling','Sommer','Herbst','Winter'])

Mit check_word_id bekommt man für eine Lexikon-ID Lemma, POS, Synsets und Relationen.

In [None]:
def check_word_id(id):
    """Return lemma, POS, synsets and sense relations for a lexicon entry ID.

    Args:
        id: Value of the ``id`` attribute of the wanted ``LexicalEntry``.

    Returns:
        ``(lemma_value, pos, senses, relations)`` for the first entry with
        that ID. ``senses`` holds the synset ID of every sense of the entry
        and ``relations`` holds one ``(relType, target)`` tuple for every
        ``SenseRelation`` of every sense. Returns ``None`` when no entry has
        the given ID.
    """
    for lexentry in lexicon.iter('LexicalEntry'):
        if lexentry.attrib['id'] != id:
            continue
        lemma = lexentry.find('Lemma')
        lemma_value = lemma.attrib['writtenForm']
        pos = lemma.attrib['partOfSpeech']
        senses = []
        # Created once per entry so that relations of *all* senses are kept
        # and an entry without senses still yields an empty list.
        relations = []
        for sense in lexentry.iter('Sense'):
            senses.append(sense.attrib['synset'])
            for relation in sense.iter('SenseRelation'):
                relations.append(
                    (relation.attrib['relType'], relation.attrib['target'])
                )
        # Stop at the first match instead of scanning the rest of the lexicon.
        return (lemma_value, pos, senses, relations)
    return None

In [None]:
# Example: fetch lemma, POS, synset IDs and sense relations for the lexicon
# entry with ID 'w14145'.
check_word_id('w14145')

Mit words_in_synset bekommt man die Wörter, die in einem Synset sind.

In [None]:
def words_in_synset(id):
    """List the written forms of all lemmas that have a sense in a synset.

    Args:
        id: Synset ID, compared with the ``synset`` attribute of every
            ``Sense`` element in the module-level ``lexicon``.

    Returns:
        One written form per matching sense, in document order; an empty
        list when no sense points at the synset.
    """
    return [
        entry.find('Lemma').attrib['writtenForm']
        for entry in lexicon.iter('LexicalEntry')
        for sense_node in entry.iter('Sense')
        if sense_node.attrib['synset'] == id
    ]

In [None]:
# Example: written forms of all lemmas that have a sense in synset
# 'odenet-2754-n'.
words_in_synset('odenet-2754-n')

Mit check_synset bekommt man alle Informationen zu einem Synset.

In [None]:
def check_synset(id):
    """Return everything the lexicon stores about one synset.

    Args:
        id: Value of the ``id`` attribute of the wanted ``Synset`` element.

    Returns:
        ``(ili, en_definition, de_definition, relations, words)`` for the
        first synset with that ID, or ``None`` when there is none.

        * ``ili`` - the synset's ``ili`` attribute.
        * ``en_definition`` - the ``dc:description`` attribute, or ``[]``
          when the attribute is absent.
        * ``de_definition`` - stripped text of the ``Definition`` child, or
          ``[]`` when there is no such child.
        * ``relations`` - ``(relType, target)`` for every ``SynsetRelation``.
        * ``words`` - result of ``words_in_synset(id)``.
    """
    for synset in lexicon.iter('Synset'):
        if synset.attrib['id'] != id:
            continue
        ili = synset.attrib['ili']
        en_definition = synset.attrib.get(
            "{http://purl.org/dc/elements/1.1/}description", []
        )
        definition = synset.find('Definition')
        if definition is not None:
            # An empty <Definition/> has text None; treat it as "" instead of
            # failing on None.strip().
            de_definition = (definition.text or "").strip()
        else:
            de_definition = []
        relations = [
            (relation.attrib['relType'], relation.attrib['target'])
            for relation in synset.iter('SynsetRelation')
        ]
        # The member-word scan walks the whole lexicon, so it is done only
        # once a matching synset has actually been found.
        words = words_in_synset(id)
        return (ili, en_definition, de_definition, relations, words)
    return None

In [None]:
# Example: ILI, definitions, synset relations and member words of synset
# "odenet-25555-a" (None if the synset does not exist).
check_synset("odenet-25555-a")

In [None]:
def _related_words(word, rel_type):
    """Collect the synsets linked to ``word`` by one synset relation type.

    Shared implementation of the ``*_word`` lookups below. For every sense of
    ``word`` it inspects the relations of the sense's synset and keeps those
    whose type equals ``rel_type``.

    Args:
        word: Written form, looked up with ``check_word_lemma``.
        rel_type: Relation type string as stored in the lexicon, e.g.
            ``"hypernym"``.

    Returns:
        List of ``(sense_id, target_synset_id, target_words)`` tuples, where
        ``target_words`` comes from ``words_in_synset``. Empty when the word
        exists but has no relation of that type.

    Raises:
        TypeError: When ``check_word_lemma`` (unknown word) or
            ``check_synset`` (unknown synset) returns ``None``. Callers in
            this notebook rely on an exception to detect unknown words.
    """
    # Index 3 of the entry tuple is the list of [sense_id, synset_id] pairs.
    senses = check_word_lemma(word)[3]
    related = []
    for sense_id, synset_id in senses:
        # Index 3 of the synset tuple is the list of (relType, target) pairs.
        relations = check_synset(synset_id)[3]
        for found_type, target in relations:
            if found_type == rel_type:
                related.append((sense_id, target, words_in_synset(target)))
    return related


def hypernyms_word(word):
    """Return ``(sense_id, synset_id, words)`` for every ``hypernym`` relation of ``word``."""
    return _related_words(word, "hypernym")


def hyponyms_word(word):
    """Return ``(sense_id, synset_id, words)`` for every ``hyponym`` relation of ``word``."""
    return _related_words(word, "hyponym")


def meronyms_word(word):
    """Return ``(sense_id, synset_id, words)`` for every ``mero_part`` relation of ``word``."""
    return _related_words(word, "mero_part")


def holonyms_word(word):
    """Return ``(sense_id, synset_id, words)`` for every ``holo_part`` relation of ``word``."""
    return _related_words(word, "holo_part")


def antonyms_word(word):
    """Return ``(sense_id, synset_id, words)`` for every ``antonym`` relation of ``word``."""
    return _related_words(word, "antonym")

In [None]:
# Example: (sense_id, hypernym_synset_id, words) tuples for "Barsch".
hypernyms_word("Barsch")

In [None]:
# Example: (sense_id, synset_id, words) tuples for the "mero_part" relations
# of "Morgenland".
meronyms_word("Morgenland")

In [None]:
# Print a complete report for one word: lemma and POS, every synset it
# belongs to, and the words reachable through each supported relation type.
myword = "übertragen"
lemma_id, lemma_value, pos, senses = check_word_lemma(myword)
print(f"{lemma_value} {pos} ")
for sense in senses:
    synset_id = sense[1]
    print(f"SENSE: {synset_id}  {check_synset(synset_id)}\n")
# One line per relation type, in the same order as before.
for heading, lookup in (
    ("HYPERNYMS: ", hypernyms_word),
    ("HYPONYMS: ", hyponyms_word),
    ("MERONYMS: ", meronyms_word),
    ("HOLONYMS: ", holonyms_word),
    ("ANTONYMS: ", antonyms_word),
):
    print(heading + str(lookup(myword)))


In [None]:
import re
import networkx as nx
from networkx.readwrite import json_graph
from fourlang.text_to_4lang import TextTo4lang
from fourlang.lexicon import Lexicon
from graphviz import Source
from scripts.parse_data import read
from scripts.check_output import compare_graphs
from tqdm import tqdm

In [None]:
# Load the "de" dataset through the project helper scripts.parse_data.read.
# NOTE(review): the cells below read `premise`, `hypothesis` and `score` from
# it by integer label and call len() on it - presumably a pandas DataFrame
# with a default 0..n-1 index; confirm against `read`.
data_frame = read("de", graded=False)
# Bare expression: evaluates the frame without storing or printing it.
data_frame

In [None]:
"""
for i in check_word_lemma("Papier")[3]:
    print(check_synset(i[1]))
"""
# Single-word trial of the OdeNet hypernym lookup: collect the lower-cased
# hypernym words of one premise and print them.
premise = "jazz"
hyper_premise_names_all = []
prems = []
try:
    prems = hypernyms_word(premise)
except Exception:
    # The lookup raises when the word has no entry in its given spelling;
    # retry with a leading capital before giving up.
    try:
        prems = hypernyms_word(premise.capitalize())
    except Exception:
        # Only report the miss. The `not_found` counter is first assigned in
        # a later cell, so touching it here raised NameError.
        print(premise)
        prems = []

# Each result tuple carries the hypernym synset's words at index 2.
for prem in prems:
    hyper_premise_names_all += prem[2]

hyper_premise_lower = [i.lower() for i in hyper_premise_names_all]
print(hyper_premise_lower)

In [None]:
# OdeNet baseline: predict 1 when the hypothesis is one of the (lower-cased)
# OdeNet hypernym words of the premise, else 0. `not_found` counts premises
# whose lookup failed in both the given and the capitalised spelling.
preds_de = []
not_found = 0
for index in tqdm(range(len(data_frame))):
    premise = data_frame.premise[index]
    hypothesis = data_frame.hypothesis[index]
    # NOTE(review): read but never used in this loop.
    score = data_frame.score[index]

    try:
        prems = hypernyms_word(premise)
    except Exception:
        # `Exception` rather than a bare except, so that Ctrl-C can still
        # stop this long-running loop instead of being counted as a miss.
        try:
            prems = hypernyms_word(premise.capitalize())
        except Exception:
            not_found += 1
            prems = []

    # Each result tuple carries the hypernym synset's words at index 2.
    hyper_premise_lower = {
        name.lower() for prem in prems for name in prem[2]
    }

    preds_de.append(1 if hypothesis in hyper_premise_lower else 0)

In [None]:
# Write one "<premise> <hypothesis> <prediction>" line per prediction.
# NOTE(review): `preds` is only assigned in later cells of this notebook, so
# one of them has to run before this cell.
# The file is only written, so plain "w" is enough. The encoding is set
# explicitly, like the other text files opened in this notebook, so that
# non-ASCII words do not depend on the platform default.
with open("result_binary", "w", encoding="utf-8") as f:
    for i, pred in enumerate(preds):
        premise = data_frame.premise[i]
        hypothesis = data_frame.hypothesis[i]
        f.write(f"{premise} {hypothesis} {pred}\n")

In [None]:
# Use the 4lang votes unchanged as the predictions (shallow copy, same order).
# NOTE(review): `fourlang_votes` is not defined in the visible cells.
preds = list(fourlang_votes)

In [None]:
import os
from collections import defaultdict
from nltk.corpus import wordnet as wn

# Lookup table built from the tab-separated file "dictionaries/de_to_en":
# lower-cased field 1 maps to the list of lower-cased field 3 values
# (0-based field numbers). The evaluation loop uses it to translate premise
# and hypothesis words before querying WordNet.
dictionary = defaultdict(list)
# Read-only access is sufficient ("r+" also required write permission).
# NOTE(review): utf-8 is assumed, matching the other files opened in this
# notebook - confirm the dictionary file's encoding.
with open("dictionaries/de_to_en", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line has no fields to index.
            continue
        fields = line.split("\t")
        dictionary[fields[1].lower()].append(fields[3].lower())

In [None]:
# Ensemble prediction per dataset row. A row is labelled 1 when any of three
# signals fires, checked in this order:
#   1. a WordNet lemma name of a translated hypothesis is also a lemma name
#      of a hypernym reachable from a translated premise,
#   2. the OdeNet baseline (`preds_de`) predicted 1,
#   3. the 4lang vote for the row is the string '1'.
# NOTE(review): `fourlang_votes` is not defined in the visible cells.
preds = []
not_found = 0
for index in tqdm(range(len(data_frame))):
    premise = data_frame.premise[index]
    hypothesis = data_frame.hypothesis[index]
    score = data_frame.score[index]

    # Translate both words through `dictionary`. The membership test avoids
    # creating an empty defaultdict entry for a word without a translation;
    # such a word simply contributes no WordNet names.
    premise_translations = dictionary[premise] if premise in dictionary else []
    hypothesis_translations = (
        dictionary[hypothesis] if hypothesis in dictionary else []
    )

    # Lemma names of every hypernym in the closure of each premise synset.
    hyper_premise_names = set()
    for translation in premise_translations:
        for premise_syn in wn.synsets(translation):
            for ancestor in premise_syn.closure(lambda s: s.hypernyms()):
                hyper_premise_names.update(
                    lemma.name() for lemma in ancestor.lemmas()
                )

    # Lemma names of the hypothesis synsets themselves.
    hyp_syn_names = set()
    for translation in hypothesis_translations:
        for hyp_syn in wn.synsets(translation):
            hyp_syn_names.update(lemma.name() for lemma in hyp_syn.lemmas())

    wordnet_hit = bool(hyp_syn_names & hyper_premise_names)
    if wordnet_hit or preds_de[index] == 1 or fourlang_votes[index] == '1':
        preds.append(1)
    else:
        preds.append(0)