# Carry out factor analysis

---

* Load the dataset

---

In [None]:
import pandas as pd

# Read the semicolon-separated sample file into a DataFrame
data = pd.read_csv(
    "representative_sample_with_context_withoutDiscard.csv",
    sep=";",
)

# Sanity check: report row count and column names, then show the first rows
print(
    "\n=== Initial Data Summary ===",
    f"Dataset: {len(data)} rows, columns: {data.columns.tolist()}",
    sep="\n",
)
print("\n=== Dataset Preview ===", data.head(), sep="\n")

---

* Install the required packages: spaCy, the large German model (`de_core_news_lg`), and `factor_analyzer`

---

In [None]:
# Install SpaCy for linguistic data processing
!pip install spacy

# Download the large German model
!python -m spacy download de_core_news_lg

# Install factor_analyzer for factor analysis
!pip install factor_analyzer

---

* Create new tagging rules because spaCy mislabels some words
---

In [None]:
import spacy
from spacy.language import Language
from spacy.tokens import Token

# Load the large German spaCy model (de_core_news_lg); the resulting `nlp`
# pipeline object is extended with the custom retagger at the end of this cell
nlp = spacy.load("de_core_news_lg")

def _rule(lemma, pos, tag, morph=""):
    """Build one retagging entry with the four annotation fields."""
    return {"lemma": lemma, "pos": pos, "tag": tag, "morph": morph}


# Manual annotation overrides, keyed by the lower-cased word form.
# Each value provides the replacement lemma, coarse POS ("pos"),
# fine-grained tag ("tag") and morphology string ("morph", may be empty).
retagging_rules = {
    # Interjections, greetings/farewells and chat formulas
    "ah": _rule("ah", "INTJ", "ITJ"),
    "ahh": _rule("ah", "INTJ", "ITJ"),
    "ahhh": _rule("ah", "INTJ", "ITJ"),
    "ahhhh": _rule("ah", "INTJ", "ITJ"),
    "ciao": _rule("ciao", "INTJ", "ITJ"),
    "ciaoi": _rule("ciao", "INTJ", "ITJ"),
    "halli": _rule("hallo", "INTJ", "ITJ"),
    "hallo": _rule("hallo", "INTJ", "ITJ"),
    "hallöchen": _rule("hallo", "INTJ", "ITJ"),
    "hello": _rule("hallo", "INTJ", "ITJ"),
    "hey": _rule("hey", "INTJ", "ITJ"),
    "heyy": _rule("hey", "INTJ", "ITJ"),
    "heyyy": _rule("hey", "INTJ", "ITJ"),
    "heyhey": _rule("hey", "INTJ", "ITJ"),
    "hi": _rule("hi", "INTJ", "ITJ"),
    "lg": _rule("LG", "INTJ", "ITJ"),
    "lol": _rule("lol", "INTJ", "ITJ"),
    "mfg": _rule("MFG", "INTJ", "ITJ"),
    "moin": _rule("moin", "INTJ", "ITJ"),
    "moinsen": _rule("moinsen", "INTJ", "ITJ"),
    "oh": _rule("oh", "INTJ", "ITJ"),
    "ohh": _rule("oh", "INTJ", "ITJ"),
    "ohhh": _rule("oh", "INTJ", "ITJ"),
    "ohhhh": _rule("oh", "INTJ", "ITJ"),
    "tschau": _rule("tschau", "INTJ", "ITJ"),
    "tschüss": _rule("tschüss", "INTJ", "ITJ"),
    "tschüsschen": _rule("tschüss", "INTJ", "ITJ"),
    "tschüssi": _rule("tschüss", "INTJ", "ITJ"),
    "tschüß": _rule("tschüss", "INTJ", "ITJ"),

    # "würdest"/"würdet": the lemma must be "werden", not the word form itself
    "würdest": _rule(
        "werden", "AUX", "VAFIN",
        "Mood=Sub|Number=Sing|Person=2|Tense=Past|VerbForm=Fin",
    ),
    "würdet": _rule(
        "werden", "AUX", "VAFIN",
        "Mood=Sub|Number=Plur|Person=2|Tense=Past|VerbForm=Fin",
    ),

    # Contracted "gibt's"/"macht's": finite verbs, not imperatives (VIMP)
    "gibts": _rule(
        "geben", "VERB", "VVFIN",
        "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
    ),
    "machts": _rule(
        "machen", "VERB", "VVFIN",
        "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
    ),

    # Abbreviations written without their periods
    "eigtl": _rule("eigentlich", "ADV", "ADV"),
    "etc": _rule("etc", "X", "XY"),
    "nh": _rule("nh", "X", "XY"),
    "usw": _rule("usw", "X", "XY"),
    "vlt": _rule("vielleicht", "ADV", "ADV"),
    "vllt": _rule("vielleicht", "ADV", "ADV"),
    "zb": _rule("zb", "X", "XY"),
}

# Register the custom token attributes that carry the overriding annotations.
# Every attribute defaults to None; force=True overwrites an extension that is
# already registered under the same name.
for _extension_name in ("custom_lemma", "custom_pos", "custom_tag", "custom_morph"):
    Token.set_extension(_extension_name, default=None, force=True)


@Language.component("custom_retagger")
def custom_retagger(doc):
    """Store manual annotations on every token covered by ``retagging_rules``.

    A token matches when its lower-cased text is a key of ``retagging_rules``.
    For each match the rule's lemma, POS, tag and morph values are written to
    the token's ``custom_*`` extension attributes and the change is printed.
    The token's built-in annotation attributes are not modified here.

    Args:
        doc: The document being processed. ``doc.user_data["question_id"]`` is
            used in the log line when present.

    Returns:
        The same ``doc`` object.
    """
    # Placeholder id is used when the caller attached no question id
    question_id = doc.user_data.get("question_id", "unknown_id")

    for token in doc:
        # Lower-casing makes the lookup case-insensitive
        rule = retagging_rules.get(token.text.lower())
        if rule is None:
            continue

        # Override whatever was stored in the custom attributes before
        token._.custom_lemma = rule["lemma"]
        token._.custom_pos = rule["pos"]
        token._.custom_tag = rule["tag"]
        token._.custom_morph = rule["morph"]
        print(f"Question ID: {question_id}, Retagged: {token.text} -> {rule}")

    return doc


# Add custom retagger to the pipeline; last=True appends it after all
# components that are already part of the loaded model
nlp.add_pipe("custom_retagger", last=True)


---
* Check the tagging pipeline: `custom_retagger` must be listed (it is added last, so it should appear at the end)

---

In [None]:
# Show the pipeline components in processing order and fail fast when the
# custom retagger is missing, instead of relying on a visual check of the output
print(nlp.pipe_names)
assert "custom_retagger" in nlp.pipe_names, "custom_retagger is missing from the spaCy pipeline"

---
* Define and extract linguistic features

---

In [None]:
import spacy
from tqdm import tqdm
from spacy.matcher import Matcher


# Ensure tqdm works with pandas' apply function
tqdm.pandas()

# The German spaCy model is intentionally NOT reloaded here: `nlp` is reused
# from the retagging cell, where custom_retagger was added to the pipeline.
# A fresh spacy.load("de_core_news_lg") would not contain that component.


# --------------------------------------------------------------------------------
# Define word lists for semantic analysis
# --------------------------------------------------------------------------------
# VERBS
# Each set contains verb lemmas for one semantic class.

# Causative verbs (expository, argumentative): express cause-effect relations.
causative_verbs = {
    "animieren", "anregen", "anspornen", "auslösen", "bedingen", "bewegen",
    "bewerkstelligen", "bewirken", "entfachen", "entfesseln", "entstehen",
    "ergeben", "ermuntern", "ermöglichen", "erregen", "erwecken", "evozieren",
    "folgen", "führen", "herbeiführen", "hervorrufen", "hinauslaufen",
    "induzieren", "initiieren", "kontrollieren", "lassen", "leiten", "lenken",
    "machen", "motivieren", "münden", "resultieren", "schaffen", "sorgen",
    "stärken", "triggern", "veranlassen", "verlaufen", "verursachen",
}

# Communication verbs (conversational, narrative): mark interaction or dialogue.
communication_verbs = {
    "anfragen", "anklingeln", "anmailen", "anrufen", "anschreiben",
    "anschreien", "ansimsen", "smsen", "antexten", "ausplaudern", "aussagen",
    "babbeln", "begründen", "behaupten", "beichten", "bekanntgeben",
    "bekanntmachen", "bekennen", "benachrichtigen", "bereden", "berichten",
    "besprechen", "chatten", "debattieren", "diskutieren", "disputieren",
    "durchklingeln", "durchsprechen", "einräumen", "erfragen", "erklären",
    "erkundigen", "erläutern", "erzählen", "erörtern", "fragen", "gestehen",
    "grüßen", "hinterfragen", "informieren", "kommunizieren", "kontaktieren",
    "kundtun", "mailen", "melden", "mitteilen", "nachfragen", "nachhaken",
    "offenbaren", "plaudern", "publikmachen", "quasseln", "quatschen", "rufen",
    "sagen", "schnacken", "schreiben", "schreien", "simsen", "sprechen",
    "telefonieren", "texten", "unterhalten", "unterrichten", "verhandeln",
    "verraten", "verständigen", "veröffentlichen", "vorlesen", "wissenlassen",
    "zugeben", "äußern",
}

# Desire verbs (reflective, persuasive): convey personal or collective intentions.
desire_verbs = {
    "abzielen", "anpeilen", "anvisieren", "beabsichtigen", "bezwecken",
    "erhoffen", "ersehnen", "erstreben", "ersuchen", "erträumen", "erwünschen",
    "fokussieren", "herbeiwünschen", "hoffen", "intendieren", "konzentrieren",
    "sehnen", "sinnen", "streben", "suchen", "träumen", "wünschen",
}

# Epistemic verbs (expository, evaluative): mark reasoning, belief or uncertainty.
epistemic_verbs = {
    "ahnen", "annehmen", "anzweifeln", "ausgehen", "bezweifeln", "erahnen",
    "erwarten", "klingen", "meinen", "mutmaßen", "rechnen", "scheinen",
    "schätzen", "spekulieren", "unterstellen", "vermuten", "vorausahnen",
    "vorhersehen", "wirken", "wittern", "zweifeln",
}

# Existence verbs (descriptive, narrative): establish states, entities or
# general conditions. "sein" is deliberately left out.
# NOTE(review): "lieben" looks out of place here; possibly "liegen" was meant -
# confirm before changing, since it affects the counts.
existence_verbs = {
    "anhalten", "aufhalten", "auftreten", "befinden", "bestehen", "bleiben",
    "dauern", "enthalten", "existieren", "fortbestehen", "fortdauern",
    "halten", "herrschen", "leben", "lieben", "standhalten", "überdauern",
    "überleben", "verbleiben", "verbringen", "verweilen", "vorkommen",
    "vorliegen", "weiterbestehen", "weitergehen", "wohnen", "währen",
}

# Justification verbs (formal, argumentative): give reasons or validate claims.
justification_verbs = {
    "argumentieren", "begründen", "bekräftigen", "belegen", "bescheinigen",
    "bestätigen", "beteuern", "beweisen", "bezeugen", "demonstrieren",
    "einstehen", "legitimieren", "nachweisen", "rechtfertigen", "stärken",
    "stützen", "untermauern", "validieren", "verargumentieren",
    "vergewissern", "zeigen",
}

# Mental verbs (reflective, analytical): highlight thought and introspection.
mental_verbs = {
    "auseinandersetzen", "ausklügeln", "beachten", "bedenken", "beherzigen",
    "berücksichtigen", "brüten", "denken", "durchdenken", "erinnern",
    "erkennen", "erwägen", "finden", "glauben", "grübeln", "heißen",
    "nachdenken", "nachgrübeln", "nachsinnen", "reflektieren", "sehen",
    "sinnieren", "studieren", "überdenken", "überlegen", "vergessen",
    "verstehen", "wissen",
}


# --------------------------------------------------------------------------------
# ADJECTIVES

# Attitudinal adjectives (reflective, persuasive, personal): express
# subjectivity, personal stance and evaluation. Entries are adjective lemmas.
attitudinal_adj = {
    "abartig", "absurd", "affig", "albern", "angepasst", "angepisst",
    "anmaßend", "anständig", "arrogant", "asi", "asozial", "assi", "assig",
    "aufdringlich",
    "begeistert", "beleidigt", "bescheiden", "bescheuert", "beschränkt",
    "blöd", "blödsinnig", "boshaft", "brav", "böse",
    "charmant", "chillig", "cool",
    "dankbar", "dienlich", "direkt", "doof", "dramatisch", "dreist", "dumm",
    "durchgeknallt",
    "egoistisch", "egozentrisch", "ehrlich", "eifersüchtig", "eigenartig",
    "eigensinnig", "eingebildet", "eingeschnappt", "eitel", "engagiert",
    "enthusiastisch", "entspannt", "enttäuscht", "erfreut", "erschrocken",
    "erstaunt", "euphorisch", "extrovertiert",
    "fair", "falsch", "fantastisch", "fies", "frech", "freundlich",
    "frustriert", "förderlich",
    "geduldig", "geil", "geläufig", "gemein", "genervt", "gereizt",
    "geschmacklos", "gespenstisch", "gewöhnlich", "großartig", "großzügig",
    "gruselig", "gut", "günstig",
    "harmonisch", "hasserfüllt", "heiter", "herablassend", "hilfreich",
    "hinterfotzig", "hinterhältig", "hirnrissig", "hochnäsig",
    "hoffnungsvoll", "höflich",
    "ichbezogen", "idealistisch", "ignorant", "impulsiv", "inkonsequent",
    "introvertiert", "ironisch", "irre",
    "kacke", "kindisch", "klasse", "klassisch", "konservativ", "korrekt",
    "krass",
    "lahm", "langweilig", "launisch", "leichtgläubig", "liebenswert",
    "liebenswürdig", "liebevoll", "listig", "locker", "lächerlich",
    "manisch", "melancholisch", "merkwürdig", "mies", "misstrauisch",
    "moralisch", "motiviert", "mutlos", "mysteriös",
    "nachdenklich", "nachlässig", "nachteilig", "naiv", "narzisstisch",
    "negativ", "neidisch", "nett", "neugierig", "normal", "nützlich",
    "obsessiv", "okay", "optimistisch",
    "pampig", "peinlich", "perplex", "pervers", "pessimistisch", "positiv",
    "provokant",
    "rational", "realistisch", "rechtens", "reizbar", "respektlos",
    "respektvoll", "richtig", "riskant", "rücksichtslos", "rücksichtsvoll",
    "sachlich", "schadenfroh", "schaurig", "scheiße", "schlau", "schlecht",
    "schlicht", "schrecklich", "schräg", "schädlich", "schön", "schüchtern",
    "selbstgefällig", "selbstkritisch", "selbstverliebt", "seltsam",
    "sentimental", "sicher", "sinnfrei", "sonderbar", "speziell", "spießig",
    "sprunghaft", "stolz", "streng", "stur",
    "taktlos", "teuer", "theatralisch", "tolerant", "toll", "traurig",
    "töricht",
    "umgänglich", "unfair", "ungeduldig", "ungewöhnlich", "ungünstig",
    "unheimlich", "unhöflich", "unnahbar", "unnatürlich", "unnormal",
    "unruhig", "unsensibel", "unsicher", "unsinnig", "unverschämt",
    "unwichtig", "urig",
    "verblüfft", "verdattert", "verletzend", "verletzt", "verliebt",
    "vermessen", "vernünftig", "verrückt", "verschroben", "versnobt",
    "vertrauensvoll", "vertrauenswürdig", "verwundert", "verzweifelt",
    "verärgert", "vorsichtig", "vorteilhaft",
    "weinerlich", "weltfremd", "wichtig", "widerlich", "witzig", "wunderlich",
    "wütend",
    "zickig", "zornig", "zufrieden", "zurückhaltend", "zwanglos", "zynisch",
    "öde", "übel", "übergriffig", "überheblich", "üblich",

    # Misspelled entries of the earlier list, kept so that anything that
    # matched before still matches. The correctly spelled lemmas
    # ("angepasst", "gespenstisch", "hochnäsig") are included above.
    "angepassrt", "gespentisch", "hochnäßig",
}

# Descriptive adjectives (narrative, expository, descriptive): focus on
# sensory and physical detail.
descriptive_adjectives = {
    "asymmetrisch", "bitter", "breit", "bunt", "dunkel", "durchsichtig",
    "eckig", "einfarbig", "einheitlich", "elastisch", "farbig", "farblos",
    "fest", "feucht", "gerade", "glatt", "groß", "hart", "hell", "hoch",
    "horizontal", "kalt", "kantig", "klar", "klebrig", "klein", "krumm",
    "kräftig", "kurz", "lang", "langsam", "laut", "leer", "leicht", "leise",
    "locker", "luftdicht", "matt", "mild", "nass", "niedrig", "oval",
    "pflanzlich", "rau", "rund", "salzig", "sauber", "sauer", "scharf",
    "schmal", "schmutzig", "schnell", "schwach", "schwer", "spitz", "stark",
    "staubig", "still", "stinkend", "stumpf", "symmetrisch", "süß", "tief",
    "tierisch", "transparent", "trocken", "trüb", "vertikal", "voll", "warm",
    "wasserdicht", "weich", "zäh",
}

# Modal adjectives (instructional, evaluative): indicate necessity or possibility.
modal_adjectives = {
    "akzeptabel", "angeblich", "annehmbar", "bedenklich", "denkbar",
    "einleuchtend", "erforderlich", "erreichbar", "fraglich", "fragwürdig",
    "haltlos", "hinnehmbar", "hypothetisch", "machbar", "mutmaßlich",
    "möglich", "notwendig", "nötig", "offensichtlich", "plausibel",
    "problematisch", "realisierbar", "realistisch", "tragbar", "unabdingbar",
    "unabwendbar", "unausweichlich", "unbedenklich", "unbegründet",
    "unerlässlich", "ungewiss", "unglaubwürdig", "unmöglich", "unplausibel",
    "unsachlich", "unvermeidbar", "unvermeidlich", "unvertretbar",
    "unwahrscheinlich", "vermeidbar", "vermutlich", "verpflichtend",
    "vertretbar", "voraussichtlich", "wünschenswert", "zulässig", "zumutbar",
    "zweifelhaft", "zwielichtig", "zwingend", "überflüssig",
}

# Social/relational adjectives (expository, academic): describe relationships
# or social roles.
social_adjectives = {
    "antidemokratisch", "beruflich", "bürgerlich", "demokratisch",
    "diktatorisch", "einheimisch", "ethisch", "familiär", "freundschaftlich",
    "gemeinsam", "gemeinschaftlich", "gerichtlich", "gesellschaftlich",
    "humanitär", "illegal", "individuell", "interkulturell", "international",
    "kameradschaftlich", "kollegial", "kollektiv", "kooperativ", "kulturell",
    "legal", "lokal", "menschlich", "militärisch", "moralisch",
    "multikulturell", "national", "partnerschaftlich", "persönlich",
    "politisch", "privat", "rechtlich", "rechtmäßig", "regional", "sozial",
    "soziokulturell", "unkollegial", "unpersönlich", "unpolitisch",
    "unrechtmäßig", "unverantwortlich", "verantwortlich", "vereint",
    "wirtschaftlich", "zivil", "zwischenmenschlich", "öffentlich",
    "überregional",
}

# Scientific/technical adjectives (academic, technical): describe technical
# properties.
technical_adjectives = {
    "akustisch", "analog", "anorganisch", "biologisch", "chemisch",
    "deskriptiv", "digital", "dynamisch", "effizient", "elektrisch",
    "elektronisch", "empirisch", "experimentell", "genetisch", "hydraulisch",
    "ineffizient", "klinisch", "magnetisch", "manuell", "mathematisch",
    "mechanisch", "mikroskopisch", "naturwissenschaftlich", "normativ",
    "optisch", "organisch", "physikalisch", "praktisch", "programmierbar",
    "präzise", "qualitativ", "quantitativ", "robust", "statisch",
    "statistisch", "technisch", "theoretisch", "thermisch", "ungenau",
    "unprogrammierbar", "wissenschaftlich",
}


# --------------------------------------------------------------------------------
# ADVERBS

# Semantic adverb classes: class name -> set of adverb lemmas.
# Some lemmas occur in more than one class (e.g. "inzwischen", "mittlerweile"
# and "weiterhin" are both temporal and linking).
adverb_classes = dict(
    # Possibility adverbs: subjectivity and tentative stance
    adv_poss={
        "anscheinend", "augenscheinlich", "erkennbar", "ersichtlich",
        "eventuell", "gegebenenfalls", "höchstwahrscheinlich", "mutmaßlich",
        "möglicherweise", "offenbar", "offenkundig", "offensichtlich",
        "potenziell", "unwahrscheinlich", "vermutlich", "vielleicht",
        "voraussichtlich", "wahrscheinlich", "womöglich",
    },
    # Place adverbs: narrative style, focus on setting
    adv_loc={
        "außen", "außerhalb", "bergab", "bergauf", "da", "dort", "draußen",
        "drinnen", "drüben", "entlang", "gegenüber", "hier", "hinten",
        "hinter", "innen", "irgendwo", "links", "mittendrin", "neben",
        "nebenan", "nirgendwo", "oben", "oberhalb", "rechts", "rückwärts",
        "seitlich", "unten", "unterhalb", "vorn", "vorne", "vorwärts",
        "woanders", "überall",
    },
    # Time adverbs: narrative style, focus on chronology
    adv_temp={
        "anfangs", "augenblicklich", "bald", "bereits", "bisher", "bislang",
        "damals", "danach", "dauerhaft", "demnächst", "eben", "einmal",
        "einst", "ewig", "früher", "gegenwärtig", "gerade", "gestern",
        "gleich", "grad", "grade", "heute", "heutzutage", "immer",
        "inzwischen", "irgendwann", "jemals", "jetzt", "künftig", "kürzlich",
        "lange", "letztendlich", "letztens", "manchmal", "mehrmals", "meist",
        "meistens", "mittlerweile", "momentan", "morgen", "nachher",
        "nachträglich", "neulich", "nie", "niemals", "noch", "nun", "oft",
        "oftmals", "plötzlich", "schlussendlich", "schon", "seitdem",
        "seither", "selten", "soeben", "sofort", "später", "stets",
        "unentwegt", "vorgestern", "vorher", "vorhin", "weiterhin", "wieder",
        "zeitlebens", "zeitweise", "zukünftig", "übermorgen",
    },
    # Linking adverbials: logical structuring and cohesion, typical of
    # expository or formal texts
    adv_link={
        "allerdings", "andererseits", "anschließend", "anstatt",
        "ausdrücklich", "außerdem", "dadurch", "daher", "daneben",
        "daraufhin", "darum", "darüber", "dazu", "demnach", "dennoch",
        "deshalb", "dessen", "deswegen", "diesem", "ebenfalls", "ebenso",
        "einerseits", "ergänzend", "ferner", "folglich", "gleichwohl",
        "hierbei", "hinaus", "indes", "insbesondere", "inzwischen", "jedoch",
        "mittlerweile", "obendrein", "schließlich", "sodann", "somit",
        "stattdessen", "trotzdem", "vielmehr", "weiterhin", "währenddessen",
        "zugleich", "zusätzlich", "zwischenzeitlich", "überdies",
    },
)


# --------------------------------------------------------------------------------
# PREPOSITIONS

# Semantic preposition classes: class name -> set of lemmas.
# "zu" is listed both as a local/temporal and as a causal preposition.
preposition_classes = dict(
    # Local and temporal prepositions: narrative flow, chronological
    # organisation, spatial relationships and descriptive elements
    prep_loc_temp={
        "ab", "an", "auf", "aus", "außerhalb", "bei", "binnen", "bis",
        "entfernt", "entlang", "hinter", "in", "innerhalb", "nach", "nachdem",
        "nahe", "neben", "ob", "oberhalb", "seit", "unter", "unterhalb", "von",
        "vor", "während", "zu", "zwischen", "über",
    },
    # Modal prepositions: manner, means and instrumentality
    prep_mod={
        "abgesehen", "anhand", "anstatt", "ausgenommen", "ausschließlich",
        "außer", "bezüglich", "betreffend", "betreffs", "durch", "für",
        "einschließlich", "entgegen", "entsprechend", "exklusive", "gegen",
        "gegenüber", "gemäß", "hinsichtlich", "inklusive", "laut", "mit",
        "mithilfe", "mitsamt", "ohne", "per", "statt", "via", "vorausgesetzt",
        "wider", "zugunsten", "zuliebe", "zuzüglich",
    },
    # Causal prepositions: reasoning and cause-effect relationships
    prep_caus={
        "angesichts", "anlässlich", "aufgrund", "dank", "halber", "infolge",
        "mangels", "trotz", "obwohl", "um", "unbeschadet", "ungeachtet",
        "wegen", "zu", "zwecks",
    },
)




# --------------------------------------------------------------------------------
# Define function to extract linguistic features

# functional features are based on pos-tags
# semantic features are based on word lists
# --------------------------------------------------------------------------------


# Module-level accumulator for the tokenized and tagged data: extract_features()
# appends one record per token so the tagged output can be saved to a file for
# future reference.
# NOTE(review): the list is only reset when this cell is re-run; applying
# extract_features() to the same rows again without re-running the cell would
# append duplicate records.
token_data = []


def extract_features(row):
    # Get the text and metadata
    question_context = row["question_context"]
    question_id = row["question_id"]
    question_id_individual = row["question_id_individual"]
    conceptual_question_type = row["Conceptual Question Type"]
    functional_question_type = row["Functional Question Type"]
    question_individual = row["question_individual"]

    # Create a SpaCy Doc object and attach the question_id
    doc = nlp.make_doc(question_context)  # Only tokenize initially
    doc.user_data["question_id"] = question_id  # Attach question_id to doc.user_data
    doc = nlp(doc)  # Process through the pipeline

    # Store tokenized and tagged information
    for token in doc:
        token_data.append({
            "question_id": question_id,
            "question_id_individual": question_id_individual,
            "Conceptual Question Type": conceptual_question_type,
            "Functional Question Type": functional_question_type,
            "question_context": question_context,
            "question_individual": question_individual,
            "token": token.text,
            "lemma": token._.custom_lemma or token.lemma_,
            "pos": token._.custom_pos or token.pos_,
            "tag": token._.custom_tag or token.tag_,
            "morph": token._.custom_morph or token.morph
        })


    # Calculate counts from semantic classes first to then be able to substract them: Adverbs
    adv_specific = sum(1 for token in doc if (
        token._.custom_tag or token.tag_) == "ADV" and any((
            token._.custom_lemma or token.lemma_) in adverb_classes[key] for key in adverb_classes))
    # Calculate counts from semantic classes first to then be able to substract them: Prepositions
    prep_specific = sum(1 for token in doc if (
        token._.custom_tag or token.tag_) in {"APPR", "APPO", "APZR"} and any((
            token._.custom_lemma or token.lemma_) in preposition_classes[key] for key in preposition_classes))


    features = {

# --------------------------------------------------------------------------------

        "word_count": len(doc),  # Total number of tokens in the line

# --------------------------------------------------------------------------------
# NOUNS
        # Common nouns: General information density > emphasis on objects, concepts, or categories rather than specific entities. Common in expository, descriptive, or argumentative texts, such as news articles or academic writing.
        "nn_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) in {"PROPN", "NOUN"} and (
                token._.custom_tag or token.tag_) == "NN"),
        # Proper nouns: Specificity and Personalization > focus on specific entities, like people, places, organizations, or events. Common in narrative or biographical texts, where storytelling or real-life examples dominate.
        "ne_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) in {"PROPN", "NOUN"} and (
                token._.custom_tag or token.tag_) == "NE"),


# --------------------------------------------------------------------------------
# ARTICLES
        # Definite articles: focus on shared knowledge, cohesion, and established references >> academic, narrative, and procedural
        "art_def_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) == "DET" and (
                token._.custom_tag or token.tag_) == "ART" and any(
                definite == "Def" for definite in token.morph.get("Definite", []))),
        # Indefinite articles: exploratory or descriptive tendencies + introducing new information >> creative, conversational, or expository texts
        "art_indef_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) == "DET" and (
                token._.custom_tag or token.tag_) == "ART" and any(
                definite == "Ind" for definite in token.morph.get("Definite", []))),


# --------------------------------------------------------------------------------
# VERBS
        # All verbs:
          # Temporal framing (present-focused vs. past-focused).
          # Text type and register (e.g., narrative vs. expository).
        # Present tense, indicative mood: narrative, description, actions and events, real time, Reflects hypothetical or ongoing possibilities, obligations, or abilities. Often used in constructing present perfect tenses. >> involvement, interaction, and narrativity
        "v_pres_ind_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) in {"AUX", "VERB"} and (
                token._.custom_tag or token.tag_) in {"VVFIN", "VAFIN", "VMFIN"} and any(
                    tense == "Pres" for tense in token.morph.get("Tense", [])) and any(
                        mood == "Ind" for mood in token.morph.get("Mood", []))),
        # Present tense, subjunctive mood (Konjunktiv I): reported speech and indirect discourse >> formal or journalistic style, polite or neutral tone, emphasis on objectivity and detachment
        "v_pres_sub_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) in {"AUX", "VERB"} and (
                token._.custom_tag or token.tag_) in {"VVFIN", "VAFIN", "VMFIN"} and any(
                    tense == "Pres" for tense in token.morph.get("Tense", [])) and any(
                        mood == "Sub" for mood in token.morph.get("Mood", []))),
        # Past tense, indicative mood: description of past events, narrativity, hypothetical situations or past obligations. past perfect or passive constructions.
        "v_past_ind_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) in {"AUX", "VERB"} and (
                token._.custom_tag or token.tag_) in {"VVFIN", "VAFIN", "VMFIN"} and any(
                    tense == "Past" for tense in token.morph.get("Tense", [])) and any(
                        mood == "Ind" for mood in token.morph.get("Mood", []))),
        # Past tense, subjunctive mood (Konjunktiv II): hypotheticals, counterfactual scenarios, or wishful thinking >> speculative or emotional tone, common in reflective or argumentative texts, softening statements, hypotheticals, or politeness, formality + higher complexity + condensing information
        "v_past_sub_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) in {"AUX", "VERB"} and (
                token._.custom_tag or token.tag_) in {"VVFIN", "VAFIN", "VMFIN"} and any(
                    tense == "Past" for tense in token.morph.get("Tense", [])) and any(
                        mood == "Sub" for mood in token.morph.get("Mood", []))),


        # Infinitive + Particle "zu" before infinitives: formality + higher complexity + condensing information (often appear with subordinate clauses) + instructional/directive >> informational density and formality
        "v_inf_count": sum(1 for token in doc if (
            (token._.custom_pos or token.pos_) in {"AUX", "VERB"} and (
                token._.custom_tag or token.tag_) in {"VVINF", "VMINF", "VAINF", "VVIZU"}) or (
                    token._.custom_pos or token.pos_) == "PART" and (
                        token._.custom_tag or token.tag_) == "PTKZU"),


        # Verbs in perfect forms
        # VVPP: narrative: past events, VAPP: formal exposition or detailed procedural descriptions, VMPP: hypothetical or speculative discourse
        "v_pp_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) in {"AUX", "VERB"} and (
                token._.custom_tag or token.tag_) in {"VVPP", "VAPP", "VMPP"}),


        # Verb lists (see above)
        "v_caus_count": sum(1 for token in doc if (
            token._.custom_lemma or token.lemma_) in causative_verbs),
        "v_comm_count": sum(1 for token in doc if (
            token._.custom_lemma or token.lemma_) in communication_verbs),
        "v_desire_count": sum(1 for token in doc if (
            token._.custom_lemma or token.lemma_) in desire_verbs),
        "v_epist_count": sum(1 for token in doc if (
            token._.custom_lemma or token.lemma_) in epistemic_verbs),
        "v_exist_count": sum(1 for token in doc if (
            token._.custom_lemma or token.lemma_) in existence_verbs),
        "v_justif_count": sum(1 for token in doc if (
            token._.custom_lemma or token.lemma_) in justification_verbs),
        "v_mental_count": sum(1 for token in doc if (
            token._.custom_lemma or token.lemma_) in mental_verbs),

# --------------------------------------------------------------------------------
# ADJECTIVES
        # Attributive adjectives(spacy assigns token.pos_ == "ADJ" to token.tag_ == "ADJD")
        # Positive: Indicates neutral descriptions, often seen in narratives, descriptive prose, or scientific texts describing phenomena (e.g., eine hohe Temperatur).
        "adja_pos_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) == "ADJ" and (
                token._.custom_tag or token.tag_) == "ADJA" and any(
                    degree == "Pos" for degree in token.morph.get("Degree", []))),
        # Comparative: Suggests a comparative focus, typical in evaluative or analytical texts.
        "adja_cmp_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) == "ADJ" and (
                token._.custom_tag or token.tag_) == "ADJA" and any(
                    degree == "Cmp" for degree in token.morph.get("Degree", []))),
        # Superlative: Often used for emphasis or ranking, seen in promotional language, reviews, or advertising.
        "adja_sup_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) == "ADJ" and (
                token._.custom_tag or token.tag_) == "ADJA" and any(
                    degree == "Sup" for degree in token.morph.get("Degree", []))),


        # Adverbial adjectives (spacy assigns token.pos_ == "ADV" to token.tag_ == "ADJD")
        # Positive: Indicates evaluation or descriptions of states, common in spoken language, dialogues, or personal narratives.
        "adjd_pos_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) == "ADV" and (
                token._.custom_tag or token.tag_) == "ADJD" and any(
                    degree == "Pos" for degree in token.morph.get("Degree", []))),
        # Comparative: Reflects relative assessments, seen in conversational comparisons or personal judgments.
        "adjd_cmp_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) == "ADV" and (
                token._.custom_tag or token.tag_) == "ADJD" and any(
                    degree == "Cmp" for degree in token.morph.get("Degree", []))),
        # Superlative: Indicates strong evaluations, often found in rhetorical or persuasive contexts.
        "adjd_sup_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) == "ADV" and (
                token._.custom_tag or token.tag_) == "ADJD" and any(
                    degree == "Sup" for degree in token.morph.get("Degree", []))),


        # Adjective lists (see above)
        "adj_attit_count": sum(1 for token in doc if (
            token._.custom_lemma or token.lemma_) in attitudinal_adj and (
                token._.custom_tag or token.tag_) in {"ADJA", "ADJD"}),
        "adj_descr_count": sum(1 for token in doc if (
            token._.custom_lemma or token.lemma_) in descriptive_adjectives and (
                token._.custom_tag or token.tag_) in {"ADJA", "ADJD"}),
        "adj_mod_count": sum(1 for token in doc if (
            token._.custom_lemma or token.lemma_) in modal_adjectives and (
                token._.custom_tag or token.tag_) in {"ADJA", "ADJD"}),
        "adj_soc_count": sum(1 for token in doc if (
            token._.custom_lemma or token.lemma_) in social_adjectives and (
                token._.custom_tag or token.tag_) in {"ADJA", "ADJD"}),
        "adj_tech_count": sum(1 for token in doc if (
            token._.custom_lemma or token.lemma_) in technical_adjectives and (
                token._.custom_tag or token.tag_) in {"ADJA", "ADJD"}),

# --------------------------------------------------------------------------------
# ADVERBS
        # Adverbs from previously defined adverb classes
        "adv_specific": adv_specific,
        # Adverbs minus previously defined adverb classes
        "adv_general_count": sum(1 for token in doc if (
            token._.custom_tag or token.tag_) == "ADV") - adv_specific,
        # Specific adverb classes
        "adv_poss_count": sum(1 for token in doc if (
            token._.custom_lemma or token.lemma_) in adverb_classes["adv_poss"] and (
                token._.custom_tag or token.tag_) == "ADV"),
        "adv_loc_count": sum(1 for token in doc if (
            token._.custom_lemma or token.lemma_) in adverb_classes["adv_loc"] and (
                token._.custom_tag or token.tag_) == "ADV"),
        "adv_temp_count": sum(1 for token in doc if (
            token._.custom_lemma or token.lemma_) in adverb_classes["adv_temp"] and (
                token._.custom_tag or token.tag_) == "ADV"),
        "adv_link_count": sum(1 for token in doc if (
            token._.custom_lemma or token.lemma_) in adverb_classes["adv_link"] and (
                token._.custom_tag or token.tag_) == "ADV"),

# --------------------------------------------------------------------------------
# PREPOSITIONS AND OTHERS
        # Prepositions > formality and information density
        # Prepositions from previously defined preposition classes
        "prep_specific": prep_specific,
        # Prepositions minus previously defined preposition classes
        "prep_general_count": sum(1 for token in doc if (
            token._.custom_tag or token.tag_) in {"APPR", "APPO", "APZR"}) - prep_specific,
        # Specific preposition classes
        "prep_loc_temp_count": sum(1 for token in doc if (
            token._.custom_lemma or token.lemma_) in preposition_classes["prep_loc_temp"] and (
                token._.custom_tag or token.tag_) in {"APPR", "APPO", "APZR"}),
        "prep_mod_count": sum(1 for token in doc if (
            token._.custom_lemma or token.lemma_) in preposition_classes["prep_mod"] and (
                token._.custom_tag or token.tag_) in {"APPR", "APPO", "APZR"}),
        "prep_caus_count": sum(1 for token in doc if (
            token._.custom_lemma or token.lemma_) in preposition_classes["prep_caus"] and (
                token._.custom_tag or token.tag_) in {"APPR", "APPO", "APZR"}),

        # Contractions > real time production > intimacy
        "prep_contra_count": sum(1 for token in doc if (
            token._.custom_tag or token.tag_) == "APPRART"),

# --------------------------------------------------------------------------------
# PRONOUNS: referring to shared personal knowledge + real time production

        # Demonstrative pronouns: Structured, cohesive Writing, adds emphasis in informal texts
        "pron_dem_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) in {"DET", "PRON"} and (
                token._.custom_tag or token.tag_) in {"PDS", "PDAT"}),

        # Indefinite pronouns: use in abstract or argumentative texts signals generalizations; in diary-like or reflective texts to express uncertainty
        "pron_ind_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) in {"DET", "PRON"} and (
                token._.custom_tag or token.tag_) in {"PIS", "PIAT", "PIDAT"}),

        # Personal pronouns + reflexive personal pronouns
        # 1st + 2nd person > Conversational or interactive registers (e.g., dialogues, speeches). + Personal or informal texts (e.g., letters, blogs)
        "pron_pers_1_2_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) == "PRON" and (
                token._.custom_tag or token.tag_) in {"PPER", "PRF"} and any(
                    p in {"1", "2"} for p in token.morph.get("Person", []))),
        # 3rd person > Narrative or descriptive registers (e.g., fiction, historical accounts). Formal or impersonal texts (e.g., academic writing).
        "pron_pers_3_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) == "PRON" and (
                token._.custom_tag or token.tag_) in {"PPER", "PRF"} and any(
                    p == "3" for p in token.morph.get("Person", []))),

        # Possessive pronouns
        # Personal writing: Diaries, blogs, and letters: personal tone, interaction, or subjective involvement.
        # Require "DET" to exclude mislabelled verb forms of "meinen": Token: meine, Lemma: mein, Pos: VERB
        "pron_poss_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) == "DET" and (
                token._.custom_tag or token.tag_) in {"PPOSS", "PPOSAT"} and any(
                    p == "Yes" for p in token.morph.get("Poss", []))),

        # Relative pronouns > explanatory / expository + elaborating information
        "pron_rel_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) == "PRON" and (
                token._.custom_tag or token.tag_) in {"PRELS", "PRELAT"}),

        # Pronominal adverb: dafür, dabei, deswegen, trotzdem > justifying/explaining > purpose-showing; provide cohesion in logical arguments
        # for some reason, spacy matches with PROAV instead of PAV
        "pron_adv_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) == "ADV" and (
                token._.custom_tag or token.tag_) == "PROAV"),

# --------------------------------------------------------------------------------
# CONJUNCTIONS / PARTICLES

        # Subordinating conjunctions: indicate syntactically complex texts with multiple layers of ideas, reveal a focus on logical relations (cause-effect, conditions, etc.). Common in analytical or argumentative texts, academic writing, technical texts, and formal prose.
        "conj_sub_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) == "SCONJ" and (
                token._.custom_tag or token.tag_) in {"KOUI", "KOUS"}),
        # coordinating conjunctions: spoken language and informal texts, conveying clear and straightforward relationships, typical for conversational registers and narrative or instructional texts.
        "conj_coor_count": sum(1 for token in doc if (
            token._.custom_pos or token.pos_) == "CCONJ" and (
                token._.custom_tag or token.tag_) == "KON"),
        # Comparative particle: Descriptive: evaluative writing, comparative analysis, or descriptive texts. Interpersonal: conversational texts or spoken language. Formal: paired with subordinating conjunctions (als ob, wie wenn), contribute to syntactic complexity in formal texts
        "conj_comp_count": sum(1 for token in doc if (
            token._.custom_tag or token.tag_) == "KOKOM"),

# --------------------------------------------------------------------------------
# OTHERS

        # Interjections > informal, conversational tone
        "interj_count": sum(1 for token in doc if (
            token._.custom_tag or token.tag_) == "ITJ"),

        # WH-words > information-seeking, interactivity
        "wh_count": sum(1 for token in doc if (
            (token._.custom_pos or token.pos_) == "PRON" and (
                token._.custom_tag or token.tag_) == "PWS") or (
                (token._.custom_pos or token.pos_) == "DET" and (
                    token._.custom_tag or token.tag_) == "PWAT") or (
                    (token._.custom_pos or token.pos_) in {"CCONJ", "ADV"} and (
                        token._.custom_tag or token.tag_) == "PWAV")),

        # Response particles: conversational, interactive; involvement and speaker stance
        "resp_part_count": sum(1 for token in doc if (
            token._.custom_tag or token.tag_) == "PTKANT"),

        # Modellkennungen / Special characters >> specific vocabulary / technical
        "spec_char_count": sum(1 for token in doc if (
            token._.custom_tag or token.tag_) == "XY")


    }
    return features



# Extract features for each row and expand the returned dicts into columns
# (progress_apply shows a progress bar while the rows are processed).
features = data.progress_apply(extract_features, axis=1).apply(pd.Series)

# Snapshot the per-token records only AFTER the rows have been processed.
# NOTE(review): token_data appears to be filled while extract_features runs
# (the follow-up cell checks that it is "not 0" / "still empty"); building the
# frame before the apply would then write an empty CSV on a fresh kernel - confirm.
token_df = pd.DataFrame(token_data)
token_df.to_csv("tagged_tokens_data.csv", index=False, encoding="utf-8")

# Combine features with the original dataset. Same-named columns left over from
# a previous run of this cell are dropped first, so re-running does not create
# duplicate column labels (which would break later `data["..."]` lookups).
data = pd.concat(
    [data.drop(columns=features.columns, errors="ignore"), features],
    axis=1,
)

# Save the enriched dataset to inspect later
data.to_csv("enriched_data.csv", index=False)
print(data.head())

---
* Save pos-tagged data to file for future reference

---

In [None]:
# Sanity-check the collected token records before persisting them.
print(type(token_data))  # Should be a list
print(len(token_data))  # Should not be 0
print(token_data[:5])  # Print first 5 entries

if len(token_data) > 0 and isinstance(token_data[0], dict):
    # Records look valid: build the frame, report its layout and save it.
    token_df = pd.DataFrame(token_data)
    print(token_df.shape)
    print("Columns in token_df:", token_df.columns.tolist())

    token_df.to_csv("tagged_tokens_data.csv", index=False, encoding="utf-8")
    print("Saved successfully.")
else:
    # Nothing usable was collected. Do not touch token_df or the CSV here:
    # writing a stale/empty frame and reporting success would hide the problem.
    print("token_data is either empty or incorrectly structured.")

---
* Extract features to review pos-tagging (only for quality management)
---

In [None]:
# Quality-management helper, deliberately disabled: everything between the two
# ''' markers below is a single string literal, so none of it is executed
# (as the only expression in the cell it is merely echoed as the cell's output).
# The quoted code defines debug_all_to_file(data, output_file_path), which runs
# `nlp` on each row's "question_context" and writes every token tagged "XY"
# (text, lemma, POS, tag, morph) plus the question_id and source text to a file,
# followed by the total number of matches.
# To use it, remove the enclosing ''' markers; it needs the `nlp` pipeline and the
# custom token attributes (token._.custom_*), presumably registered earlier in
# the notebook.
'''
import spacy

# Function to debug matches for a specific feature across all rows, including question_id
def debug_all_to_file(data, output_file_path):
    """
    Debug function to write matches for a feature rule for all rows,
    including the question_id, into a file.
    Parameters:
        data: The DataFrame containing text data and question_id.
        output_file_path: The path to the output file.
    """
    total_matches = 0  # Counter for total matches

    with open(output_file_path, "w", encoding="utf-8") as file:
        for index, row in data.iterrows():
            text = row["question_context"]  # column for feature extraction
            question_id = row["question_id"]  # question_id for documentation
            doc = nlp(text)

            # Find matches based on the rule
            matches = [
                (token.text, token._.custom_lemma or token.lemma_,
                 token._.custom_pos or token.pos_,
                 token._.custom_tag or token.tag_,
                 token._.custom_morph or token.morph)
                for token in doc
                if (token._.custom_tag or token.tag_) == "XY"
                #if (token._.custom_tag or token.tag_) in {"PWS", "PWAT", "PWAV"}
            ]

            # Increment total matches count
            total_matches += len(matches)

            # Write matches if any are found
            if matches:
                file.write(f"Question ID: {question_id}\n")
                file.write(f"Row {index}:\n")
                for match in matches:
                    file.write(f"Token: {match[0]}, Lemma: {match[1]}, POS: {match[2]}, Tag: {match[3]}, Morph: {match[4]}\n")
                file.write(f"Original text: {text}\n")
                file.write("-" * 50 + "\n")

        # Write total number of matches
        file.write(f"Total matches found: {total_matches}\n")

# Run the function for all rows in the dataset
debug_all_to_file(data, "Tokens with tag 'XY'_afterRetagging_250128.txt")
'''

---
* Compare word counts with Conceptual Question Types

---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure 'word_count' is numeric; entries that cannot be parsed become NaN
# and the corresponding rows are removed.
data['word_count'] = pd.to_numeric(data['word_count'], errors='coerce')
data = data.dropna(subset=['word_count'])

# Group by 'Conceptual Question Type' and calculate summary statistics
grouped_stats = data.groupby('Conceptual Question Type')['word_count'].describe()

# Widen the pandas display only while printing. option_context restores the
# previous settings on exit, even if something inside the block raises, so the
# display options can no longer leak into later cells.
with pd.option_context('display.width', None, 'display.max_colwidth', None):
    print("Descriptive Statistics of Word Count by Conceptual Question Type:")
    print(grouped_stats)


# Boxplot to visualize distribution
plt.figure(figsize=(12,6))
sns.boxplot(x='Conceptual Question Type', y='word_count', data=data)
plt.xticks(rotation=90)  # Rotate x-axis labels for readability
plt.xlabel("Conceptual Question Type")
plt.ylabel("Word Count")
#plt.title("Distribution of Text length by Conceptual Question Type")
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.font_manager as fm

# --- Data Preparation ---
# Ensure 'word_count' is numeric and drop rows where it is missing/unparseable
data['word_count'] = pd.to_numeric(data['word_count'], errors='coerce')
data = data.dropna(subset=['word_count'])

# Group by 'Conceptual Question Type' and print summary statistics.
# The wide display settings are scoped to the print: option_context restores
# the previous options on exit, even if the block raises.
grouped_stats = data.groupby('Conceptual Question Type')['word_count'].describe()
with pd.option_context('display.width', None, 'display.max_colwidth', None):
    print("Descriptive Statistics of Word Count by Conceptual Question Type:")
    print(grouped_stats)

# --- Font and Palette Setup ---
# Register the font file with matplotlib (adjust path as needed) and select it
font_path = 'lmroman10-regular.otf'
fm.fontManager.addfont(font_path)
plt.rcParams['font.family'] = 'Latin Modern Roman'

# Choose a specific color from the "colorblind" palette
my_color = sns.color_palette("colorblind")[8]

# --- Create the Boxplot ---
# Increase the figure height for a less stumpy look (e.g., 12x8 inches)
fig, ax = plt.subplots(figsize=(12, 8))

# Order the x-axis categories alphabetically
order = sorted(data['Conceptual Question Type'].unique())

# Create the boxplot with the given order and color
sns.boxplot(x='Conceptual Question Type', y='word_count', data=data, color=my_color, ax=ax, order=order)

# Rotate x-axis labels for readability and set their font size
plt.xticks(rotation=45, ha="right", fontsize=16)

# Set y-tick label size
ax.tick_params(axis='y', labelsize=16)

# Remove top and right spines
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

# Set axis labels with specified font sizes and padding
ax.set_xlabel("Conceptual Question Type", fontsize=20, labelpad=20)
ax.set_ylabel("Word Count", fontsize=20, labelpad=20)

# Add horizontal grid lines
ax.grid(axis="y", linestyle="--", linewidth=0.5, alpha=0.7)

plt.tight_layout()

# Save the figure as a PDF file BEFORE showing it: some backends release the
# figure once show() returns, so writing the file first is the safe order.
fig.savefig("WordCountDistribution.pdf", format="pdf", bbox_inches="tight")
plt.show()

---
* Compute raw feature counts

---

In [None]:
import pandas as pd

# Total number of occurrences of each linguistic feature, summed over all rows.
feature_counts = data[[
                       # Nouns
                       "nn_count", "ne_count",
                       # Determiners
                       "art_def_count", "art_indef_count",
                       # Verbs
                       "v_pres_ind_count", "v_pres_sub_count", "v_past_ind_count", "v_past_sub_count",
                       "v_inf_count", "v_pp_count",
                       "v_caus_count", "v_comm_count", "v_desire_count", "v_epist_count", "v_exist_count", "v_justif_count", "v_mental_count",
                       # Adjectives
                       "adja_pos_count", "adja_cmp_count", "adja_sup_count",
                       "adjd_pos_count", "adjd_cmp_count", "adjd_sup_count",
                       "adj_attit_count", "adj_descr_count", "adj_mod_count", "adj_soc_count", "adj_tech_count",
                       # Adverbs
                       "adv_general_count", "adv_poss_count", "adv_loc_count", "adv_temp_count", "adv_link_count",
                       # Prepositions
                       "prep_general_count", "prep_loc_temp_count", "prep_mod_count", "prep_caus_count", "prep_contra_count",
                       # Pronouns
                       "pron_dem_count", "pron_ind_count", "pron_pers_1_2_count", "pron_pers_3_count", "pron_poss_count", "pron_rel_count", "pron_adv_count",
                       # Conjunctions / Particles
                       "conj_sub_count", "conj_coor_count", "conj_comp_count",
                       # Others
                       "interj_count", "wh_count", "resp_part_count", "spec_char_count"]
                      ].sum()


# Print feature counts without truncation. The display limits are lifted only
# inside this block; option_context restores the previous settings on exit,
# even if printing raises.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print("Feature Counts Across Dataset:")
    print(feature_counts)

---
* Compute descriptive statistics for each linguistic feature
* Plot boxplot for visualisation
---

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# List of feature columns (with the '_count' suffix)
feature_columns = [
    # Nouns
    "nn_count", "ne_count",
    # Determiners
    "art_def_count", "art_indef_count",
    # Verbs
    "v_pres_ind_count", "v_pres_sub_count", "v_past_ind_count", "v_past_sub_count",
    "v_inf_count", "v_pp_count",
    "v_caus_count", "v_comm_count", "v_desire_count", "v_epist_count", "v_exist_count", "v_justif_count", "v_mental_count",
    # Adjectives
    "adja_pos_count", "adja_cmp_count", "adja_sup_count",
    "adjd_pos_count", "adjd_cmp_count", "adjd_sup_count",
    "adj_attit_count", "adj_descr_count", "adj_mod_count", "adj_soc_count", "adj_tech_count",
    # Adverbs
    "adv_general_count", "adv_poss_count", "adv_loc_count", "adv_temp_count", "adv_link_count",
    # Prepositions
    "prep_general_count", "prep_loc_temp_count", "prep_mod_count", "prep_caus_count", "prep_contra_count",
    # Pronouns
    "pron_dem_count", "pron_ind_count", "pron_pers_1_2_count", "pron_pers_3_count", "pron_poss_count", "pron_rel_count", "pron_adv_count",
    # Conjunctions / Particles
    "conj_sub_count", "conj_coor_count", "conj_comp_count",
    # Others
    "interj_count", "wh_count", "resp_part_count", "spec_char_count"
]

# Calculate descriptive statistics for each feature across all texts.
# We compute: mean, min, max, and standard deviation.
stats_df = data[feature_columns].agg(['mean', 'min', 'max', 'std'])

# Compute the range (max - min) for each feature and add it as a new row.
stats_df.loc['range'] = stats_df.loc['max'] - stats_df.loc['min']

# Reorder the rows in the desired order: mean, min, max, range, std.
stats_df = stats_df.loc[['mean', 'min', 'max', 'range', 'std']]

# Rename the columns to drop the '_count' suffix.
stats_df.columns = [col.replace('_count', '') for col in stats_df.columns]

# Print both orientations with the display limits lifted: the wide table has
# one column per feature and would otherwise be cut off with "..." under the
# default max_columns setting. option_context restores the options on exit.
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', None):
    print("Descriptive Statistics for Each Feature:")
    print(stats_df)

    # Transposed view: one row per feature.
    print("\nDescriptive Statistics (Features as rows):")
    print(stats_df.T)



# Convert the data to long format for easier plotting
data_long = data[feature_columns].melt(var_name='feature', value_name='count')
# Remove the '_count' suffix for clarity in the plots. regex=False makes the
# literal match explicit regardless of the pandas version's default.
data_long['feature'] = data_long['feature'].str.replace('_count', '', regex=False)

plt.figure(figsize=(14, 8))
sns.boxplot(x='feature', y='count', data=data_long)
plt.xticks(rotation=90)
plt.xlabel("Feature")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# --- Data Preparation ---
# `data` must already hold the raw per-text feature counts.
# Columns to plot, grouped by word class (all carry the '_count' suffix).
feature_columns = [
    # Nouns
    "nn_count", "ne_count",
    # Determiners
    "art_def_count", "art_indef_count",
    # Verbs
    "v_pres_ind_count", "v_pres_sub_count", "v_past_ind_count", "v_past_sub_count",
    "v_inf_count", "v_pp_count",
    "v_caus_count", "v_comm_count", "v_desire_count", "v_epist_count",
    "v_exist_count", "v_justif_count", "v_mental_count",
    # Adjectives
    "adja_pos_count", "adja_cmp_count", "adja_sup_count",
    "adjd_pos_count", "adjd_cmp_count", "adjd_sup_count",
    "adj_attit_count", "adj_descr_count", "adj_mod_count", "adj_soc_count", "adj_tech_count",
    # Adverbs
    "adv_general_count", "adv_poss_count", "adv_loc_count", "adv_temp_count", "adv_link_count",
    # Prepositions
    "prep_general_count", "prep_loc_temp_count", "prep_mod_count", "prep_caus_count",
    "prep_contra_count",
    # Pronouns
    "pron_dem_count", "pron_ind_count", "pron_pers_1_2_count", "pron_pers_3_count",
    "pron_poss_count", "pron_rel_count", "pron_adv_count",
    # Conjunctions / Particles
    "conj_sub_count", "conj_coor_count", "conj_comp_count",
    # Others
    "interj_count", "wh_count", "resp_part_count", "spec_char_count",
]

# Long format (one row per text/feature pair), with the '_count' suffix
# stripped from the feature labels so the axis ticks stay short.
data_long = (
    data[feature_columns]
    .melt(var_name='feature', value_name='count')
    .assign(feature=lambda long_df: long_df['feature'].str.replace('_count', ''))
)

# Alphabetical order of the feature labels for the x-axis
order = sorted(data_long['feature'].unique())

# --- Font and Palette Setup ---
# Register the font file with matplotlib (adjust path as needed), then select it
font_path = 'lmroman10-regular.otf'
fm.fontManager.addfont(font_path)
plt.rcParams['font.family'] = 'Latin Modern Roman'

# One fixed colour taken from seaborn's "colorblind" palette
my_color = sns.color_palette("colorblind")[8]

# --- Create the Boxplot ---
fig, ax = plt.subplots(figsize=(20, 8))
sns.boxplot(data=data_long, x='feature', y='count', order=order, color=my_color, ax=ax)

# Tick labels: slanted x labels, larger y labels
plt.xticks(rotation=45, ha="right", fontsize=14)
ax.tick_params(axis='y', labelsize=16)

# Hide the top and right frame lines for a cleaner look
for spine_name in ("top", "right"):
    ax.spines[spine_name].set_visible(False)

# Axis titles
ax.set_xlabel("Feature", fontsize=20, labelpad=20)
ax.set_ylabel("Count", fontsize=20, labelpad=20)

# Dashed horizontal guide lines
ax.grid(axis="y", linestyle="--", linewidth=0.5, alpha=0.7)

plt.tight_layout()
plt.show()

# Write the figure to a PDF file
fig.savefig("FeatureCountDistribution.pdf", format="pdf", bbox_inches="tight")

---

* Normalize frequencies (by 100 words) to ensure comparability

---

In [None]:
# List of columns to exclude from normalization (easier than the other way around)
exclude_columns = {
    "adv_specific",
    "Conceptual Question Type",
    "Functional Question Type",
    "prep_specific",
    "question_context",
    "question_id",
    "question_id_individual",
    "question_individual",
    "token_count",
    "word_count",  # Necessary for normalization # switch back to WORD_COUNT
}

# Identify columns to normalize: everything that is neither excluded nor
# already a normalized "_freq" column.
columns_to_normalize = [col for col in data.columns if col not in exclude_columns and not col.endswith("_freq")]

# Filter rows with word_count > 0 to avoid division by zero. The explicit
# .copy() gives an independent frame, so adding columns below does not write
# into a slice of the previous `data` (which raises SettingWithCopyWarning).
data = data[data["word_count"] > 0].copy()

# Normalize by `word_count` if frequency columns do not already exist.
# Frequencies are per 100 words because the texts are mostly short (under 500 words).
if not any(col + "_freq" in data.columns for col in columns_to_normalize):
    # One vectorised division for all columns, attached in a single concat
    # instead of inserting the new columns one at a time.
    normalized = data[columns_to_normalize].div(data["word_count"], axis=0) * 100
    data = pd.concat([data, normalized.add_suffix("_freq")], axis=1)

# Extract normalized frequency columns dynamically
frequencies = [col for col in data.columns if col.endswith("_freq")]
feature_data = data[frequencies]

# Output the frequency data for inspection
print(feature_data.head())
print(data.columns)  # Check available columns

---
* Standardize features to mean of 0 and standard deviation of 1

---

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the z-score scaler on the frequency features, then apply it to the same
# data; `scaled_data` is the standardized feature matrix.
scaler = StandardScaler()
scaler.fit(feature_data)
scaled_data = scaler.transform(feature_data)

---
* Compute feature variances: Factor analysis requires that features have sufficient variance. If some features have zero or near-zero variance, it could cause issues during matrix decomposition.

---

In [None]:
# Variances at or above this value count as "sufficient" for factor analysis;
# anything below is treated as (near-)zero variance.
LOW_VARIANCE_THRESHOLD = 1e-6

# Calculate variance for each feature
variances = feature_data.var()

# Check for zero or near-zero variance
low_variance_features = variances[variances < LOW_VARIANCE_THRESHOLD]

# Print both series without truncation. The display limits are lifted only
# inside this block; option_context restores the previous settings on exit,
# even if printing raises.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print("Feature variances:")
    print(variances)
    print("Low-variance features:")
    print(low_variance_features)

---

* Remove low-variance features (if they exist)

---

In [None]:
# When any feature was flagged as low-variance, remove those columns and
# recompute the standardized matrix from the reduced feature set.
if low_variance_features.size > 0:
    feature_data = feature_data.drop(low_variance_features.index, axis=1)
    scaled_data = StandardScaler().fit_transform(feature_data)

---
* Plot scree plot to determine number of factors
* Compute factor loadings
* Carry out varimax and promax rotation to decide which one to choose
* Compute variance explained by each factor
---

In [None]:
from factor_analyzer import FactorAnalyzer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.font_manager as fm
from IPython.display import display


def highlight_values(val):
    """Return the CSS background for one factor loading, keyed on its magnitude.

    |val| >= 0.30 -> red, >= 0.20 -> orange, >= 0.10 -> yellow, otherwise no
    styling (empty string). Defined once here instead of inside the rotation loop.
    """
    abs_val = abs(val)
    if abs_val >= 0.30:
        return 'background-color: red'
    elif abs_val >= 0.20:
        return 'background-color: orange'
    elif abs_val >= 0.10:
        return 'background-color: yellow'
    return ''


# Ensure scaled_data is a DataFrame so the loadings can be labelled with the
# feature names (assumes scaled_data and feature_data are defined above).
scaled_data = pd.DataFrame(scaled_data, columns=feature_data.columns)

# Compute eigenvalues for the Scree Plot. Only the eigenvalues of the
# correlation matrix are used from this fit, so the rotation step is skipped.
fa = FactorAnalyzer(rotation=None)
fa.fit(scaled_data)
eigenvalues = fa.get_eigenvalues()[0]

# --- Font and Style Setup ---
font_path = 'lmroman10-regular.otf'
fm.fontManager.addfont(font_path)
plt.rcParams['font.family'] = 'Latin Modern Roman'

# --- Create the Scree Plot ---
fig, ax = plt.subplots(figsize=(8, 5))
# Plot eigenvalues with markers and a connecting line
ax.plot(range(1, len(eigenvalues) + 1), eigenvalues, marker="o", linestyle="-", color="black")
# Plot horizontal line at eigenvalue = 1
ax.axhline(y=1, color="r", linestyle="--", label="Eigenvalue = 1")

# Set axis labels with larger fonts and extra padding
ax.set_xlabel("Number of Factors", fontsize=20, labelpad=20)
ax.set_ylabel("Eigenvalue", fontsize=20, labelpad=20)

# Set tick label sizes
ax.tick_params(axis="x", labelsize=16)
ax.tick_params(axis="y", labelsize=16)

# Add legend with adjusted font sizes
ax.legend(fontsize=16, title_fontsize=20)

# Add grid lines (dashed) on both axes
ax.grid(axis="both", linestyle="--", linewidth=0.5, alpha=0.7)

# Remove top and right spines for a cleaner look
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

plt.tight_layout()

# Save the scree plot as a PDF file BEFORE showing it: some backends release
# the figure once show() returns, so writing the file first is the safe order.
fig.savefig("ScreePlot.pdf", format="pdf", bbox_inches="tight")
plt.show()


# Decide number of factors (manually based on Scree Plot)
num_factors = 7  # Adjust this based on the plot
factor_labels = [f"Factor {i+1}" for i in range(num_factors)]

# Try both Varimax (orthogonal) and Promax (oblique)
rotations = ["varimax", "promax"]
for rotation in rotations:
    print(f"\n=== Factor Analysis with {rotation.upper()} Rotation ===")

    fa = FactorAnalyzer(n_factors=num_factors, rotation=rotation)
    fa.fit(scaled_data)

    # Extract factor loadings (rows: features, columns: factors)
    loadings = pd.DataFrame(
        fa.loadings_,
        index=scaled_data.columns,
        columns=factor_labels
    )

    # Sort features by their highest absolute loading for readability
    sorted_loadings = loadings.abs().max(axis=1).sort_values(ascending=False)
    loadings = loadings.loc[sorted_loadings.index]

    # Display loadings in Jupyter with color formatting
    print("Factor Loadings (high loadings > 0.30 are salient):")
    display(loadings.style.map(highlight_values))

    # Proportion of variance explained by each factor (second element of
    # get_factor_variance())
    variance_explained = pd.DataFrame(
        {"Explained Variance": fa.get_factor_variance()[1]},
        index=factor_labels
    )
    print("\nVariance Explained by Each Factor:")
    display(variance_explained)

# NOTE: after the loop `fa` is the model from the LAST entry of `rotations`
# (promax). Later cells that read `fa` therefore work on the promax solution.

---
Compute factor correlations

---

In [None]:
import numpy as np

# Factor correlations are provided by the fitted model as `phi_`, which
# factor_analyzer fills in only for oblique rotations (e.g. promax).
# (loadings.T @ loadings is NOT a correlation matrix: its diagonal holds the
# sums of squared loadings per factor, not 1.)
phi = getattr(fa, "phi_", None)
if phi is None:
    # Orthogonal rotation (e.g. varimax) or no rotation: the factors are
    # uncorrelated by construction, so the correlation matrix is the identity.
    phi = np.eye(fa.loadings_.shape[1])

factor_labels = [f"Factor {i+1}" for i in range(phi.shape[0])]
factor_correlation_matrix = pd.DataFrame(
    phi,
    index=factor_labels,
    columns=factor_labels
)

print("Factor Correlation Matrix:")
display(factor_correlation_matrix)

---
Create table for factor loadings

---

In [None]:
import pandas as pd

# Tabulate the loadings of the fitted model `fa`: one row per feature
# (labels taken from scaled_data's columns), one "Factor k" column per factor.
factor_loadings = pd.DataFrame(
    data=fa.loadings_,
    columns=["Factor " + str(number) for number in range(1, fa.n_factors + 1)],
    index=scaled_data.columns,
)

# Persist the table; the feature names are written as the index column
factor_loadings.to_csv("factor_loadings.csv")

print("Factor loadings saved as factor_loadings.csv.")

---

* Compute factor scores to merge with Conceptual Question Types

---

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Number of example questions listed at each end (top and bottom) of a factor
N_EXAMPLES = 10
# Questions with more words than this are left out of the example listings
MAX_WORD_COUNT = 100
# Metadata columns carried along with the features; these are never standardised
META_COLUMNS = ["question_id", "Conceptual Question Type", "question_context", "question_individual", "word_count"]

# 1) Load data
factor_loadings = pd.read_csv("factor_loadings.csv", index_col=0)
enriched_data = pd.read_csv("enriched_data.csv")

# Standardize column/index names
factor_loadings.columns = factor_loadings.columns.str.strip()
factor_loadings.index = factor_loadings.index.str.strip()
enriched_data.columns = enriched_data.columns.str.strip()

# Remove `_freq` from factor_loadings feature names so they match enriched_data
factor_loadings.index = factor_loadings.index.str.replace("_freq", "", regex=False).str.strip()

# 2) Identify feature columns: everything from "nn_count" to the last column
feature_columns = enriched_data.columns[enriched_data.columns.get_loc("nn_count"):].str.strip()

# Ensure features match. The order of the loadings table is kept (a plain set
# intersection would yield an arbitrary, run-dependent order) and duplicates
# are dropped.
features_in_enriched = set(feature_columns)
common_features = list(dict.fromkeys(f for f in factor_loadings.index if f in features_in_enriched))
if not common_features:
    raise ValueError("No common features found between factor loadings and enriched data!")

factor_loadings = factor_loadings.loc[common_features]

columns_to_keep = META_COLUMNS + common_features
# .copy() so the standardisation below writes into an independent frame instead
# of a slice of the loaded data (avoids SettingWithCopyWarning)
enriched_data = enriched_data[columns_to_keep].copy()

# 3) Standardize the features.
# do NOT standardize question_id, question_context, or question type.
scaler = StandardScaler()
enriched_data[common_features] = scaler.fit_transform(enriched_data[common_features])

# 4) Assign each feature to the factor on which its absolute loading is
#    largest; features whose largest absolute loading is below the threshold
#    are not assigned to any factor.
threshold = 0.1
feature_to_factor = {}
for feature in factor_loadings.index:
    max_factor = factor_loadings.loc[feature].abs().idxmax()
    if abs(factor_loadings.loc[feature, max_factor]) >= threshold:
        feature_to_factor[feature] = max_factor

# 5) Create df_factor_scores with a row per question
df_factor_scores = enriched_data.copy()  # keep original columns
factor_names = factor_loadings.columns.tolist()

# Score of a factor = sum of the standardised values of the features assigned
# to it (0.0 when no feature is assigned). skipna=False keeps a missing feature
# value propagating to the score instead of being silently dropped.
# NOTE(review): the sign of the loading is not used here, so a feature with a
# negative loading is added rather than subtracted -- confirm this is intended.
for f_name in factor_names:
    assigned_features = [feat for feat, fac in feature_to_factor.items() if fac == f_name]
    if assigned_features:
        df_factor_scores[f_name] = enriched_data[assigned_features].sum(axis=1, skipna=False)
    else:
        df_factor_scores[f_name] = 0.0


# Keep only questions with at most MAX_WORD_COUNT words
short_questions = df_factor_scores[df_factor_scores["word_count"] <= MAX_WORD_COUNT].copy()

print(f"Number of questions with <= {MAX_WORD_COUNT} words: {len(short_questions)}")


# Example: retrieve top/bottom scoring questions for each factor.
# option_context lifts the display limits only for this listing and restores
# the previous settings afterwards, even if an error occurs.
with pd.option_context(
    'display.max_colwidth', None,
    'display.width', None,
    'display.max_rows', None,
    'display.max_columns', None,
):
    for factor in factor_names:
        shown_columns = META_COLUMNS + [factor]

        # Sort descending to see top-scoring (highest) texts
        top = short_questions.sort_values(by=factor, ascending=False).head(N_EXAMPLES)
        print(f"=== Factor: {factor} | Top {N_EXAMPLES} (Highest Scores) ===")
        display(top[shown_columns])

        # Sort ascending to see bottom-scoring (lowest or negative) texts
        bottom = short_questions.sort_values(by=factor, ascending=True).head(N_EXAMPLES)
        print(f"=== Factor: {factor} | Bottom {N_EXAMPLES} (Lowest/Negative Scores) ===")
        display(bottom[shown_columns])

        print("\n" + "="*80 + "\n")

In [None]:
import seaborn as sns
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt

# Register the font file with matplotlib, then select it globally by family name
font_path = 'lmroman10-regular.otf'
fm.fontManager.addfont(font_path)
plt.rcParams['font.family'] = 'Latin Modern Roman'

# Set Seaborn theme
# sns.set_style("whitegrid")
sns.set_palette("colorblind")

# ----------------------------------------------------------------------------
# 1) Group by question type and get mean factor scores
group_means = (
    df_factor_scores
    .groupby("Conceptual Question Type")[factor_names]
    .mean()
    .reset_index()
)

# 2) Convert to "long" format for Seaborn: one row per (question type, factor)
melted_factor_scores = group_means.melt(
    id_vars=["Conceptual Question Type"],
    var_name="Factor",
    value_name="Score"
)

# 3) Sort question types for consistent plotting
sorted_question_types = sorted(melted_factor_scores["Conceptual Question Type"].unique())

# 4) Plot
fig, ax = plt.subplots(figsize=(16, 7))

sns.pointplot(
    data=melted_factor_scores,
    x="Conceptual Question Type",
    y="Score",
    hue="Factor",
    dodge=True,
    markers="o",
    linestyle="none",
    order=sorted_question_types,
    err_kws={'linewidth': 0.5},
    linewidth=0.5,
    markersize=12,
    ax=ax
)

ax.set_ylim(-3.5, 3.5)

# Grid along y: configured once, it does not depend on the category loop below
ax.grid(axis="y", linestyle="--", linewidth=0.5, alpha=0.7)

# One vertical guide line per category
for i in range(len(sorted_question_types)):
    ax.axvline(x=i, color="gray", linestyle="--", linewidth=0.5, alpha=0.7)

# Add a horizontal line at y=0
ax.axhline(y=0, color="gray", linestyle="--", linewidth=0.5)

# Remove lines on top and on the right
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

# Rotate x-axis labels for readability
plt.xticks(rotation=45, ha="right", fontsize=16)
plt.yticks(fontsize=16)

# Labels & legend (legend is anchored outside the axes, to the right)
ax.set_xlabel("Conceptual Question Type", fontsize=20, labelpad=20)
ax.set_ylabel("Mean Factor Score", fontsize=20, labelpad=20)
ax.legend(title="Factors", bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0, fontsize=16, title_fontsize=20)

fig.tight_layout()

# Save before showing, so the file is written from the fully drawn figure
# regardless of what the active backend does to it in plt.show()
fig.savefig("factor_scores_CQT.pdf")

plt.show()

---
* Create one plot per factor with question types on the y-axis, ordered by their mean score (from highest to lowest)

---

In [None]:
import matplotlib.patches as mpatches

for factor in factor_names:
    # Mean scores of this factor per question type, smallest first
    factor_data = (
        group_means[["Conceptual Question Type", factor]]
        .copy()
        .sort_values(by=factor, ascending=True)
    )

    # Colour per bar: orange for scores >= 0, purple otherwise
    factor_data["bar_color"] = [
        "#FF9741" if score >= 0 else "#9D76E4"
        for score in factor_data[factor]
    ]

    # Horizontal bar chart, one bar per question type
    fig, ax = plt.subplots(figsize=(16, 7))
    bars = ax.barh(
        y=factor_data["Conceptual Question Type"],
        width=factor_data[factor],
        color=factor_data["bar_color"]
    )

    # Numeric label just past the end of each bar
    for bar in bars:
        score = bar.get_width()                           # bar length = mean score
        label_y = bar.get_y() + bar.get_height() / 2      # vertical centre of the bar

        # Non-negative bars are labelled on their right, negative bars on their left
        align, shift = ("left", 0.02) if score >= 0 else ("right", -0.02)

        ax.text(score + shift, label_y, f"{score:.2f}", va="center", ha=align, fontsize=12)

    # Vertical reference line at 0
    ax.axvline(0, color="gray", linewidth=1.2, linestyle="--")

    # Axis labels and tick label sizes
    ax.set_xlabel(f"{factor}: Factor Scores", fontsize=20, labelpad=20)
    ax.set_ylabel("Conceptual Question Type", fontsize=20, labelpad=20)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)

    # Horizontal grid lines
    ax.grid(axis="y", linestyle="--", linewidth=0.5, alpha=0.7)

    # Same fixed x-range for every factor
    ax.set_xlim(-3.5, 3.5)

    # Hide the top and right frame lines
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)

    # Legend explaining the two bar colours
    legend_handles = [
        mpatches.Patch(color="#FF9741", label="Positive Score"),
        mpatches.Patch(color="#9D76E4", label="Negative Score"),
    ]
    ax.legend(handles=legend_handles, loc="right", fontsize=16)

    plt.tight_layout()

    # One PDF per factor; spaces in the factor name become underscores
    fig.savefig(f"factor_scores_CQT_{factor.replace(' ', '_')}.pdf")

    # Show the figure, then close it to free memory
    plt.show()
    plt.close(fig)

---
* Present factor scores in numerical format
---

In [None]:
import pandas as pd

# Lift the column-width and line-width limits while the table is printed
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

# 1) Mean of every factor score within each question type
by_question_type = df_factor_scores.groupby("Conceptual Question Type")
group_means = by_question_type[factor_names].mean().reset_index()

# 2) Two-decimal copy for printing; group_means itself stays unrounded
group_means_rounded = group_means.round({name: 2 for name in factor_names})

print("Mean factor scores by question type:\n")
print(group_means_rounded)


# Put both display options back to their defaults
pd.reset_option('display.max_colwidth')
pd.reset_option('display.width')

---
* Use raw factor score data (before aggregation) to fit a General Linear Model + ANOVA for each factor
* Test whether the factor scores differ significantly by question type

---

In [None]:
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm

# Work on a copy of df_factor_scores in which the question-type column has a
# name without spaces, so it can be written directly in the model formula.
df = df_factor_scores.copy().rename(
    columns={"Conceptual Question Type": "ConceptualQuestionType"}
)

# One OLS model (factor score ~ question type) and one ANOVA table per factor
for factor in factor_names:
    print(f"\nGeneral Linear Model for {factor}:")

    # Q() quotes the response column so names with spaces or special
    # characters are accepted by the formula parser
    model = smf.ols(f'Q("{factor}") ~ C(ConceptualQuestionType)', data=df).fit()

    # Coefficients, p-values, etc.
    print(model.summary())

    # Type-2 ANOVA table: overall test for differences among question types
    anova_results = anova_lm(model, typ=2)
    print("\nANOVA results:")
    print(anova_results)