<a href="https://colab.research.google.com/github/bieropener/TW_thesis_Marsman/blob/main/TW_thesis_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install dependencies

In [None]:
!pip install spacy
# install the trained Dutch pipeline, used for tokenizing the corpus
!python -m spacy download nl_core_news_sm
!pip install lxml
# clone the repository of OpenDutchWordnet, the dictionary
!rm -rf OpenDutchWordnet
!git clone https://github.com/cltl/OpenDutchWordnet.git
!python setup.py install
!pip install rdflib

Import all tools

In [None]:
# --- Imports: standard library, third-party packages, then the cloned dictionary package ---
import xml.etree.ElementTree as ET

import lxml
import spacy
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

from OpenDutchWordnet import Wn_grid_parser

# --- Shared resources used by the functions below ---
# Dutch spaCy pipeline, used for tokenizing and POS-tagging the corpus
nlp = spacy.load("nl_core_news_sm")
# Open Dutch WordNet, used to look up dictionary meanings
odwn = Wn_grid_parser(Wn_grid_parser.odwn)
# BERTje tokenizer and model, used for contextualized word embeddings
tokenizer = AutoTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")
model = AutoModel.from_pretrained("GroNLP/bert-base-dutch-cased")

Function definitions for preprocessing the Lassy Small corpus.

In [None]:
# takes an XML file and extracts the plain-text sentence(s) from that file
def Lassy_XML_to_text(file):
  """Return the text of every <sentence> element found in an XML file.

  Args:
    file: a filename or file object, passed straight to ``ET.parse``.

  Returns:
    A list of strings, one per <sentence> element that has text, in
    document order. <sentence> elements without any text are skipped,
    because their ``.text`` is ``None`` and a ``None`` entry would break
    a later ``' '.join`` over the result.
  """
  # the root element contains all other elements of the tree
  root = ET.parse(file).getroot()
  return [sent.text for sent in root.iter('sentence') if sent.text is not None]

# takes a list of XML filenames and returns the plain text of all those files as one string
def Lassy_filelist_to_text(file_list):
  """Concatenate the sentences of several XML files into a single text.

  Args:
    file_list: iterable of XML filenames, processed in the given order.

  Returns:
    One string containing every sentence returned by ``Lassy_XML_to_text``
    for each file, separated by single spaces.
  """
  collected = []
  for filename in file_list:
    # each file contributes a list of sentences; keep them in file order
    collected.extend(Lassy_XML_to_text(filename))
  return ' '.join(collected)

Function definitions for retrieving and comparing the basic and contextual meanings of lexical units in a plain-text corpus.

In [None]:
# Function to get basic meanings for all nouns in a tokenized text.
def get_basic_meanings(corpus):
  """Map each ambiguous noun token to the definition of its first dictionary sense.

  Args:
    corpus: iterable of token objects exposing ``pos_`` and ``text``.

  Returns:
    dict mapping token -> definition of the ``<text>-n-1`` entry in ``odwn``.
    Only tokens tagged "NOUN" for which ``odwn`` reports more than one noun
    sense, and for which that first entry is found, are included.
  """
  basic_meanings = {}
  for token in corpus:
    if token.pos_ != "NOUN":
      continue
    sense_count = odwn.lemma_num_senses(token.text, pos='noun')
    # words with a single sense have no alternative meaning to contrast with
    if not (sense_count > 1):
      continue
    first_sense = odwn.les_find_le(f"{token.text}-n-1")
    if not first_sense:
      continue
    basic_meanings[token] = first_sense.get_definition()
  return basic_meanings

# Function to get the contextual embedding of a token using BERTje.
def get_contextual_embedding(LU, sent):
  """Return the contextual embedding of a lexical unit within a sentence.

  The sentence is encoded once and the lexical unit's word pieces are
  located in the token sequence recovered from that same encoding. The
  encoded input contains the special tokens the tokenizer adds, so the
  positions line up with ``last_hidden_state``. Searching the output of
  ``tokenizer.tokenize(sent)`` instead (which has no special tokens) would
  shift every index and return the vector of a neighbouring token.

  Args:
    LU: token object exposing ``text``.
    sent: the sentence (string) in which the lexical unit occurs.

  Returns:
    The ``last_hidden_state`` vector at the position of the first word
    piece of the first occurrence of ``LU.text`` in the encoded sentence.

  Raises:
    ValueError: if the word pieces of ``LU.text`` cannot be found in the
      encoded (possibly truncated) sentence.
  """
  inputs = tokenizer(sent, return_tensors = "pt", padding = True, truncation = True)
  # inference only: no gradients are needed for the embeddings
  with torch.no_grad():
    outputs = model(**inputs)
  # tokens exactly as the model saw them, special tokens included
  sentence_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
  LU_tokens = tokenizer.tokenize(LU.text)
  LU_index = -1
  if LU_tokens:
    for i in range(len(sentence_tokens) - len(LU_tokens) + 1):
      if sentence_tokens[i:i + len(LU_tokens)] == LU_tokens:
        LU_index = i
        break
  if LU_index == -1:
    raise ValueError(f"Could not find tokens for {LU.text} in the tokenized sentence")
  # contextualized embedding of the first word piece of the lexical unit
  LU_embedding = outputs.last_hidden_state[0, LU_index, :]
  return LU_embedding

# Function to get the synset embeddings for a token.
def get_synset_embeddings(LU):
  """Embed the definition of every noun sense that ``odwn`` lists for a token.

  Args:
    LU: token object exposing ``text``.

  Returns:
    A list of ``(le, embedding)`` tuples, one per sense entry
    ``<text>-n-<k>`` that is found. ``embedding`` is the model's
    ``last_hidden_state`` for the definition, averaged over dim 1.
    The list is empty (never ``None``) when there are no senses, so
    callers can always iterate over the result.
  """
  synsets = []
  meanings_num = odwn.lemma_num_senses(LU.text, pos='noun')
  # sense numbers start at 1; the range is empty when there are no senses
  for sense_num in range(1, meanings_num + 1):
    le = odwn.les_find_le(f"{LU.text}-n-{sense_num}")
    if le:
      synset_text = le.get_definition()
      inputs = tokenizer(synset_text, return_tensors="pt", padding = True, truncation = True)
      # inference only: no gradients are needed for the embeddings
      with torch.no_grad():
        outputs = model(**inputs)
      synset_embedding = outputs.last_hidden_state.mean(dim=1)
      synsets.append((le, synset_embedding))
  return synsets

# function that retrieves contextual meanings for noun tokens in a dictionary, using the corpus for context
def get_contextual_meanings(corpus, basic_dict):
  """Pick, for every token, the dictionary sense closest to its use in context.

  Args:
    corpus: unused; kept so existing callers keep working.
    basic_dict: dict whose keys are tokens exposing ``sent.text``
      (the values are not read here).

  Returns:
    dict mapping token -> definition of the sense whose definition
    embedding has the highest cosine similarity with the token's
    contextual embedding. Tokens without any sense embedding are left out.
  """
  con_meanings = {}
  for token in basic_dict:
    sentence = token.sent.text
    # convert once per token; the same vector is compared against every sense
    word_vector = get_contextual_embedding(token, sentence).detach().numpy().reshape(1, -1)
    synsets = get_synset_embeddings(token)
    if not synsets:
      continue
    best_synset = None
    # cosine similarity can be zero or negative, so start below any possible value
    max_similarity = float("-inf")
    for le, synset_embedding in synsets:
      synset_vector = synset_embedding.detach().numpy().reshape(1, -1)
      similarity = cosine_similarity(word_vector, synset_vector)[0][0]
      if similarity > max_similarity:
        max_similarity = similarity
        best_synset = le
    # record the winner once, after all senses have been compared
    if best_synset is not None:
      con_meanings[token] = best_synset.get_definition()
  return con_meanings

def compare_meanings(basic_dict, contextual_dict):
  """Collect the words whose contextual meaning differs from their basic meaning.

  Args:
    basic_dict: dict mapping word -> basic meaning.
    contextual_dict: dict mapping word -> contextual meaning.

  Returns:
    dict mapping word -> ``(basic meaning, contextual meaning)`` for every
    word present in both dictionaries whose two meanings are not equal.
    Words without a contextual meaning are skipped instead of raising
    ``KeyError``.
  """
  non_literal_dict = {}
  for word, meaning in basic_dict.items():
    # no contextual meaning was determined for this word: nothing to compare
    if word not in contextual_dict:
      continue
    contextual_meaning = contextual_dict[word]
    if meaning != contextual_meaning:
      non_literal_dict[word] = (meaning, contextual_meaning)
  return non_literal_dict

def preprocess(text):
  """Lowercase and tokenize a text, then drop tokens that should not be analysed.

  Args:
    text: the raw text as one string.

  Returns:
    A list of tokens from ``nlp`` applied to the lowercased text, in
    original order, without punctuation tokens, digit tokens, or tokens
    whose text is exactly "mrw".
  """
  doc = nlp(text.lower())
  # one pass with the same three filters, applied in the same order
  return [
    token for token in doc
    if not token.is_punct and not token.is_digit and token.text != "mrw"
  ]

def MIPVU(corpus):
  """Run the full pipeline on a raw text.

  Preprocesses the text, looks up the basic meaning of its nouns,
  determines their contextual meaning, and returns the result of
  ``compare_meanings`` on the two.

  Args:
    corpus: the raw text as one string.

  Returns:
    The dict produced by ``compare_meanings``.
  """
  tokens = preprocess(corpus)
  basic = get_basic_meanings(tokens)
  contextual = get_contextual_meanings(tokens, basic)
  return compare_meanings(basic, contextual)

Import corpus and preprocess if necessary (for Lassy Small corpus). Then apply MIPVU.

In [None]:
# A small subset of the corpus, as a list of filenames (one sentence file per entry).
file_list = [
  'WR-P-E-E-0000000018.p.1.s.1.xml',
  'WR-P-E-E-0000000018.p.2.s.1.xml',
  'WR-P-E-E-0000000018.p.2.s.2.xml',
  'WR-P-E-E-0000000018.p.2.s.3.xml',
  'WR-P-E-E-0000000018.p.2.s.4.xml',
  'WR-P-E-E-0000000018.p.2.s.5.xml',
  'WR-P-E-E-0000000018.p.3.s.1.xml',
  'WR-P-E-E-0000000018.p.4.s.1.xml',
  'WR-P-E-E-0000000018.p.5.s.1.xml',
  'WR-P-E-E-0000000018.p.6.s.1.xml',
  'WR-P-E-E-0000000018.p.7.s.1.xml',
  'WR-P-E-E-0000000018.p.7.s.2.xml',
  'WR-P-E-E-0000000018.p.7.s.3.xml',
  'WR-P-E-E-0000000018.p.7.s.4.xml',
  'WR-P-E-E-0000000018.p.7.s.5.xml',
  'WR-P-E-E-0000000018.p.7.s.6.xml',
  'WR-P-E-E-0000000018.p.7.s.7.xml',
  'WR-P-E-E-0000000018.p.8.s.1.xml',
  'WR-P-E-E-0000000018.p.8.s.2.xml',
  'WR-P-E-E-0000000018.p.9.s.1.xml',
  'WR-P-E-E-0000000018.p.10.s.1.xml',
  'WR-P-E-E-0000000018.p.10.s.2.xml',
  'WR-P-E-E-0000000018.p.10.s.3.xml',
  'WR-P-E-E-0000000018.p.10.s.4.xml',
  'WR-P-E-E-0000000018.p.10.s.5.xml',
  'WR-P-E-E-0000000018.p.11.s.1.xml',
  'WR-P-E-E-0000000018.p.11.s.2.xml',
  'WR-P-E-E-0000000018.p.12.s.1.xml',
  'WR-P-E-E-0000000018.p.13.s.1.xml',
  'WR-P-E-E-0000000018.p.14.s.1.xml',
  'WR-P-E-E-0000000018.p.14.s.2.xml',
  'WR-P-E-E-0000000018.p.14.s.3.xml',
  'WR-P-E-E-0000000018.p.14.s.4.xml',
  'WR-P-E-E-0000000018.p.14.s.5.xml',
  'WR-P-E-E-0000000018.p.14.s.6.xml',
  'WR-P-E-E-0000000018.p.14.s.7.xml',
  'WR-P-E-E-0000000018.p.15.s.1.xml',
  'WR-P-E-E-0000000018.p.15.s.2.xml',
]
corpus_Lassy = Lassy_filelist_to_text(file_list)
# Print the joined text to check that the listed files together form exactly one text.
print(corpus_Lassy)
Lassy_MIPVU = MIPVU(corpus_Lassy)
# Each word on one line, followed by its (basic, contextual) meaning pair on the next.
for word, meaning in Lassy_MIPVU.items():
  print(word, meaning, sep="\n")

Try it out on a sample sentence

In [None]:
# Sample sentence: run the pipeline on it and list the words it flags.
test = 'De student moest een berg huiswerk maken in een weekend.'
haha = MIPVU(test)
# Each word on one line, followed by its (basic, contextual) meaning pair on the next.
for word, meaning in haha.items():
  print(word, meaning, sep="\n")

Evaluation with the Bruggen corpus.

In [None]:
# Read the file and run the pipeline on its text.
# The `with` block closes the file again as soon as it has been read.
# NOTE(review): assumes the file is UTF-8 encoded — adjust `encoding` if it is not.
with open('Bruggen (2024) bijlage 1.txt', 'r', encoding='utf-8') as Bruggen_file:
  Bruggen_corpus = Bruggen_file.read()
Bruggen_MIPVU = MIPVU(Bruggen_corpus)

In [None]:
# Show every flagged word together with its (basic, contextual) meaning pair.
for word in Bruggen_MIPVU:
  meaning = Bruggen_MIPVU[word]
  print(f"word: {word}, meaning: {meaning}")