# Scoring MTT details using BabelNet and spaCy

Here, I am developing code to detect markers of mental time travel (MTT) in texts using spaCy.

In [None]:
# import packages
import pandas as pd
import spacy
import requests
import time
import re
from spacy.matcher import Matcher
from collections import defaultdict

In [None]:
# load data
data = pd.read_csv("../data/raw/test_data.csv")

# add new ID: response ID and trial number joined by an underscore
data["ID_new"] = data["ResponseId"].astype(str) + "_" + data["trial"].astype(str)

# create dataframe with relevant variables
# .copy() makes the selection an independent frame, so adding columns to
# `data` later does not trigger pandas' SettingWithCopyWarning
selected = ['ID_new', 'text']
data = data[selected].copy()


Note that you need a [BabelNet](https://babelnet.org/) account to obtain an API key. Insert the API key from your account into the following code snippet.

In [None]:
# BabelNet request settings
API_KEY = "YOUR_API_KEY"  # personal API key (replace the placeholder)
BASE_URL = "https://babelnet.io/v9/"  # base URL the endpoint name is appended to
HEADERS = {"Accept-Encoding": "gzip"}  # headers sent with every request
LANG = "DE"  # search language: German
sense_cache = dict()  # cache to reduce API calls

In [None]:
# Load the large German spaCy pipeline. The model package has to be installed
# beforehand (e.g. `python -m spacy download de_core_news_lg`).
nlp = spacy.load("de_core_news_lg")  

In [None]:
# functions for creating dictionaries related to emotion and perception words
 
# API call to get BabelNet senses
def get_babelnet_senses(word, lang=LANG):
    """Return the BabelNet senses for ``word``, served from the cache if possible.

    Args:
        word: Lemma to look up.
        lang: BabelNet search-language code; defaults to ``LANG``.

    Returns:
        The decoded JSON list of senses. An empty list is returned when the
        request fails, the status code is not 200, or the payload is not a
        list. Failed lookups are not cached, so a later call tries again.
    """
    # Key on word AND language: the same word looked up in another language
    # must not be answered from this language's cached result.
    cache_key = (word, lang)
    if cache_key in sense_cache:
        return sense_cache[cache_key]

    url = BASE_URL + "getSenses"
    params = {"lemma": word, "searchLang": lang, "key": API_KEY}
    try:
        # The timeout keeps a single stalled connection from hanging the run.
        resp = requests.get(url, params=params, headers=HEADERS, timeout=30)
    except requests.RequestException as exc:
        print(f"Fehler bei '{word}': {exc!r}")
        return []
    finally:
        # Pause after every request attempt (presumably to stay within the
        # API's rate limit -- confirm against the BabelNet terms).
        time.sleep(1)

    if resp.status_code != 200:
        print(f"Fehler bei '{word}': {resp.status_code} – {resp.text[:100]!r}")
        return []

    senses = resp.json()
    if not isinstance(senses, list):
        # Callers iterate over sense dicts; anything else (e.g. an error
        # object) is reported and treated like a failed lookup.
        print(f"Fehler bei '{word}': unerwartete Antwort – {str(senses)[:100]!r}")
        return []

    sense_cache[cache_key] = senses
    return senses

# function to extract lemmas
def extract_lemma_from_sense(sense, lang=LANG, allowed_pos={"NOUN", "ADJ", "VERB"}):
    """Return the usable lemma of one BabelNet sense as a set of 0 or 1 strings.

    Args:
        sense: Sense dictionary; its ``"properties"`` entry is read.
        lang: Required value of the sense's ``"language"`` property.
        allowed_pos: Accepted values of the ``"pos"`` property. The default
            set is only read, never modified.

    Returns:
        ``{lemma}`` with the lower-cased ``"fullLemma"`` when the sense is in
        ``lang``, has an allowed part of speech, and the lemma is non-empty
        and free of digits, the listed punctuation and characters outside the
        Basic Multilingual Plane; otherwise an empty set.
    """
    # `or` also covers keys that are present but null in the JSON.
    props = sense.get("properties") or {}
    lemma = (props.get("fullLemma") or "").lower()
    if not lemma:
        # A sense without a lemma must not contribute an empty string.
        return set()
    if props.get("language") != lang or props.get("pos") not in allowed_pos:
        return set()
    if re.search(r"[0-9()@#\[\]{}:;,\-`´—]+", lemma) or re.search(r'[\U00010000-\U0010ffff]', lemma):
        return set()
    return {lemma}

# function to get the lemmas from babelnet
def get_babelnet_lemmas(seed_words):
    """Collect the lemmas of all BabelNet senses found for the seed words.

    For every seed word the senses are fetched with ``get_babelnet_senses``
    and filtered through ``extract_lemma_from_sense``. Progress is printed
    per seed word, followed by the overall number of distinct lemmas.

    Returns:
        Set with the union of all extracted lemmas.
    """
    collected = set()
    for seed in seed_words:
        found = get_babelnet_senses(seed)
        print(f"{len(found)} Senses für '{seed}' gefunden")
        collected.update(*(extract_lemma_from_sense(entry) for entry in found))
    print(f"  ⇒ insgesamt {len(collected)} Lemmas")
    return collected

In [None]:
# Create dictionaries of emotion and perception words

# German seed words for basic emotions and for thoughts, grouped by category
emotion_seeds = {
    "Wut": [
        "Wut", "Zorn", "ärgerlich", "verärgert", "zornig",
    ],
    "Freude": [
        "Freude", "Glück", "fröhlich", "zufrieden", "glücklich",
    ],
    "Angst": [
        "Angst", "Furcht", "ängstlich",
    ],
    "Traurigkeit": [
        "Traurigkeit", "Kummer", "unglücklich", "traurig",
    ],
    "Ekel": [
        "Ekel", "widerlich",
    ],
    "Liebe": [
        "Liebe", "lieben",
    ],
    "Hass": [
        "Hass", "hassen", "Abneigung",
    ],
    "Stress": [
        "Stress", "gestresst", "überfordert", "unruhig",
    ],
    "Schuld": [
        "Schuld", "schuldig", "Reue", "bereuen",
    ],
    "Scham": [
        "Scham", "peinlich", "beschämt", "verlegen",
    ],
    "Stolz": [
        "Stolz", "stolz", "selbstbewusst",
    ],
    "Verzweiflung": [
        "Verzweiflung", "hoffnungslos", "verlassen", "hilflos",
    ],
    "Gedanken": [
        "Gedanke", "denken", "vermuten", "hoffen", "glauben",
    ],
}

# German seed words for perceptions, grouped by sense modality
perception_seeds = {
    "Wahrnehmung": [
        "Wahrnehmung", "wahrnehmen", "Empfindung", "empfinden",
    ],
    "Sehen": [
        "sehen", "Anblick", "Aussicht", "anschauen",
        "hell", "dunkel", "bunt",
        "rot", "blau", "grün", "gelb", "orange", "lila",
    ],
    "Hören": [
        "hören", "Geräusch", "Lärm", "Musik", "laut", "leise",
    ],
    "Tasten": [
        "fühlen", "berühren", "tasten", "taktil",
        "weich", "hart", "glatt", "rau",
    ],
    "Riechen": [
        "riechen", "Geruch",
    ],
    "Schmecken": [
        "schmecken", "Geschmack", "salzig", "süß", "bitter", "würzig",
    ],
    "Temperatur": [
        "Temperatur", "Hitze", "Kälte", "kalt", "warm", "heiß",
    ],
    "Körperwahrnehmung": [
        "Schmerz", "schmerzen", "Hunger", "hungrig",
        "Durst", "durstig", "Müdigkeit", "müde",
    ],
}

# function to build dictionary
def build_lemma_dict(seed_dict):
    """Map every category of ``seed_dict`` to the list of its BabelNet lemmas.

    Each category's seed words are passed to ``get_babelnet_lemmas``; the
    resulting set is stored as a list under the same category key.
    """
    return {
        category: list(get_babelnet_lemmas(seed_words))
        for category, seed_words in seed_dict.items()
    }

# dictionary for emotions: {category: [lemmas]} built from the emotion seed
# words via BabelNet lookups
emotion_dict = build_lemma_dict(emotion_seeds)

# dictionary for perceptions: {category: [lemmas]} built from the perception
# seed words via BabelNet lookups
perception_dict = build_lemma_dict(perception_seeds)

In [None]:
# function to define spacy matcher
def build_matcher(nlp, category_dict, label_prefix):
    """Build a spaCy ``Matcher`` with one lemma rule per category.

    Args:
        nlp: Loaded spaCy pipeline; its vocabulary backs the matcher.
        category_dict: Mapping ``{category: iterable of lemmas}``.
        label_prefix: Prefix of the match keys, which are named
            ``"<label_prefix>_<category>"``.

    Returns:
        A ``Matcher`` that reports one match per token whose lemma belongs to
        a category. Categories without any lemma get no rule.
    """
    matcher = Matcher(nlp.vocab)
    for category, lemmas in category_dict.items():
        # LEMMA is compared as an exact string. The dictionary lemmas may be
        # lower-cased while spaCy's German pipelines typically keep nouns
        # capitalised in `lemma_` ("Wut", not "wut"), so register the
        # original, lower-cased and capitalised spelling of every lemma.
        variants = set()
        for lemma in lemmas:
            variants.update((lemma, lemma.lower(), lemma.capitalize()))
        variants.discard("")  # an empty lemma can never be a real token
        if not variants:
            continue
        # One single-token pattern with an IN set per category.
        pattern = [{"LEMMA": {"IN": sorted(variants)}}]
        matcher.add(f"{label_prefix}_{category}", [pattern])
    return matcher

# matcher for emotion words; its match keys are named "EMOTION_<category>"
emotion_matcher = build_matcher(nlp, emotion_dict, "EMOTION")

# matcher for perception words; its match keys are named "PERCEPTION_<category>"
perception_matcher = build_matcher(nlp, perception_dict, "PERCEPTION")

In [None]:
# define time markers and the function that finds them

# German calendar vocabulary. monate, wochentage and holidays carry no \b
# anchors, so they also match inside longer words.
monate = r"(Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)"
wochentage = r"(Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag)"
holidays = r"(Weihnachten|Heiligabend|Ostern|Neujahr|Pfingsten|Silvester|Karfreitag)"
# Times of day. "Abend" takes the place of "Frühjahr", which is a season and
# is already covered by jahreszeiten.
tageszeiten = r"\b(Abend|abends|Morgen|morgens|Mittag|mittags|Nacht|nachts|Nachmittag|nachmittags|Vormittag|vormittags)\b"
zeiten = r"\b(heute|gestern|morgen|vorgestern|übermorgen|Wochenende|Woche|Monat|Jahr)\b"
jahreszeiten = r"\b(Frühling|Frühjahr|Sommer|Herbst|Winter)\b"

# Date patterns, ordered from most to least specific: extract_time_info gives
# earlier patterns priority when two matches cover the same text.
date_patterns = [
    r"\b\d{1,2}\.\d{1,2}\.\d{2,4}\b",            # numeric date, e.g. 12.03.2021
    r"\b\d{1,2}\.?\s+" + monate + r"\s+\d{4}",   # e.g. 12. März 2021
    r"\b\d{4}\b",                                # four-digit number (year)
    r"\bum\s+\d{1,2}(:\d{2})?\s*Uhr",            # e.g. um 8 Uhr, um 8:30 Uhr
    r"\b(Am\s+\d{1,2}\.\s+" + monate + r")\b",   # e.g. Am 12. März
    r"\bam\s+(\d{1,2})\b",                       # e.g. am 12
    wochentage,
    monate,
    holidays,
    tageszeiten,
    zeiten,
    jahreszeiten,
]


# function to extract time markers
def extract_time_info(text, doc):
    """Return the time expressions found in ``text``, each counted once.

    Candidates are the entities of ``doc`` labelled ``DATE`` or ``TIME`` and
    the case-insensitive matches of every entry in ``date_patterns``. Because
    the patterns overlap ("Am 5. Mai 2020" is hit by the full-date, year,
    day and month patterns), a candidate is dropped when it overlaps one that
    was accepted earlier; entities come first, then the patterns in list
    order.

    Args:
        text: Raw text; the entity offsets of ``doc`` must refer to it.
        doc: Parsed document providing ``ents``.

    Returns:
        List of matched strings (always the complete match, never a regex
        sub-group), ordered by their position in ``text``.
    """
    candidates = [
        (ent.start_char, ent.end_char, ent.text)
        for ent in doc.ents
        if ent.label_ in ("DATE", "TIME")
    ]
    for pattern in date_patterns:
        # finditer + group(0) yields the whole match even for patterns that
        # contain capture groups.
        candidates.extend(
            (found.start(), found.end(), found.group(0))
            for found in re.finditer(pattern, text, re.IGNORECASE)
        )

    accepted = []
    for start, end, matched in candidates:
        overlaps = any(start < kept_end and kept_start < end
                       for kept_start, kept_end, _ in accepted)
        if not overlaps:
            accepted.append((start, end, matched))

    accepted.sort()
    return [matched for _, _, matched in accepted]

In [None]:
# define function for text analysis
def analyze_text(text):
    """Extract event, place, time, emotion and perception markers from a text.

    Args:
        text: Text to analyse; it is parsed with the module-level ``nlp``.

    Returns:
        Tuple ``(events, places, times, emotions, perceptions)``:
        events are the lemmas of ROOT tokens tagged as VERB; places are
        ``(text, label)`` pairs of entities labelled GPE, LOC or FAC; times
        come from ``extract_time_info``; emotions and perceptions hold one
        category name per matcher hit. Events are discarded when none of the
        other four lists has an entry.
    """
    doc = nlp(text)
    emotion_matches = emotion_matcher(doc)
    perception_matches = perception_matcher(doc)

    # Extracting elements
    events = [token.lemma_ for token in doc if token.dep_ == "ROOT" and token.pos_ == "VERB"]
    # NOTE(review): confirm which entity labels the loaded pipeline actually
    # emits; labels it does not use can never match here.
    places = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in ["GPE", "LOC", "FAC"]]
    times = extract_time_info(text, doc)
    # The match id resolves to the rule key; stripping the prefix leaves the category.
    emotions = [nlp.vocab.strings[match_id].replace("EMOTION_", "") for match_id, _, _ in emotion_matches]
    perceptions = [nlp.vocab.strings[match_id].replace("PERCEPTION_", "") for match_id, _, _ in perception_matches]

    # An event only counts when it comes with at least one other detail.
    if not places and not times and not emotions and not perceptions:
        events = []
    return events, places, times, emotions, perceptions

# Run analyze_text on every entry of the 'text' column; each element of
# `results` is the tuple (events, places, times, emotions, perceptions)
results = data['text'].apply(analyze_text)

In [None]:
# convert results to separate columns
# Reusing the index of `results` keeps every row aligned with its row in
# `data`, because the column assignment below matches rows by index label.
results_df = pd.DataFrame(
    results.tolist(),
    columns=['events', 'places', 'times', 'emotions', 'perceptions'],
    index=results.index,
)

# count the occurrences
results_df['event_count'] = results_df['events'].apply(len)
results_df['place_count'] = results_df['places'].apply(len)
results_df['time_count'] = results_df['times'].apply(len)
results_df['emotion_count'] = results_df['emotions'].apply(len)
results_df['perception_count'] = results_df['perceptions'].apply(len)

# add the counts to the original data frame
cols_to_add = ['event_count', 'place_count', 'time_count', 'emotion_count', 'perception_count']
for col in cols_to_add:
    data[col] = results_df[col]

# display the extracted markers and their counts
results_df


In [None]:
# save data in new csv file (the row index is not written)
data.to_csv('../data/scored/spacy_scored.csv', index=False)