Functions from data treatment have been copy-pasted here. While this is not best practice, it allows this notebook to be completely independent of other files.

Imports haven't been formatted yet. Expect a lot of repetition.

In [None]:
# @title
import sys
!{sys.executable} -m pip install nltk
!{sys.executable} -m pip install textcomplexity
!{sys.executable} -m pip install stanza
!{sys.executable} -m pip install wordfreq
!{sys.executable} -m spacy download en_core_web_md
!{sys.executable} -m pip install tqdm spacy numpy

In [None]:
# @title
# Standard library imports
import json
from collections import Counter
from functools import lru_cache
from pprint import pprint
from typing import Dict, Set, Iterable, Optional, Any, Tuple
import importlib.resources as pkg_resources

# Third-party imports
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
import spacy
import stanza
import textcomplexity  # only used to access en.json
from tqdm.auto import tqdm

# Download required resources
# (stanza's English models and the two NLTK corpora named below).
stanza.download('en')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Make sure WordNet is available; if not, download it.
# The probe lookup raises LookupError when the corpus cannot be found, in
# which case the two NLTK downloads are attempted again.
try:
    _ = wn.synsets("dog")
except LookupError:
    nltk.download("wordnet")
    nltk.download("omw-1.4")

# Load spaCy model
# The "ner" and "textcat" components are disabled at load time.
nlp = spacy.load("en_core_web_md", disable=["ner", "textcat"])
# `spacy_nlp` is a second name for the same pipeline object as `nlp`, so the
# sentencizer added on the next line is part of both.
spacy_nlp = nlp
spacy_nlp.add_pipe("sentencizer")


# Stanza pipeline cache
@lru_cache(maxsize=None)  # Cache pipelines for different languages
def get_stanza_pipeline(lang: str):
    """
    Return the stanza Pipeline for ``lang``, building it on first request.

    Only English ('en') is supported. The English pipeline is created with
    the tokenize, pos, lemma, depparse and constituency processors (the
    constituency parser feeds the CS metric, the dependency parser the MDD
    metric). Results are memoised by ``lru_cache``, so repeated calls with
    the same argument return the same object.

    Raises:
        ValueError: if ``lang`` is anything other than 'en'.
    """
    # Guard clause: reject unsupported languages before touching stanza.
    if lang != 'en':
        raise ValueError(f"Unsupported language: {lang}")

    return stanza.Pipeline(
        lang='en',
        processors='tokenize,pos,lemma,depparse,constituency',
    )

# Universal POS tags treated as content words by the lexical measures
# (compared against stanza's `word.upos` in lexical_measures_from_doc).
# Must be defined at module level: the function reads it as a global.
CONTENT_UPOS = {"NOUN", "PROPN", "ADJ", "VERB", "ADV"}

# POS tags treated as content words by the discourse complexity functions
# (compared against spaCy's `tok.pos_` in is_content_token).
# Unlike CONTENT_UPOS, this set does not contain "PROPN".
CONTENT_POS =  {"NOUN", "VERB", "ADJ", "ADV"}

Debug

In [None]:
# @title
# Cache stanza pipelines to avoid re-loading models.
# Filled lazily by get_stanza_pipeline; keys are language codes.
_STANZA_PIPELINES: Dict[str, stanza.Pipeline] = {}

# UPOS tags considered content words (C); read by lexical_measures_from_doc.
CONTENT_UPOS = {"NOUN", "PROPN", "VERB", "ADJ", "ADV"}


@lru_cache()
def load_cow_top5000_en() -> Set[str]:
    """
    Return the COW-based frequent English content words as a set of
    lowercased word forms.

    The words are read from the "most_common" entry of the English language
    definition file (en.json) shipped inside the textcomplexity package.
    Each entry is a [word, xpos] pair; the POS tag is discarded.

    The result is memoised, so the file is read only once.
    """
    resource = pkg_resources.files(textcomplexity).joinpath("en.json")
    with resource.open("r", encoding="utf-8") as handle:
        definition = json.load(handle)

    # Keep only the word of every [word, xpos] pair, lowercased.
    return {word.lower() for word, _tag in definition["most_common"]}


def get_stanza_pipeline(lang: str = "en", use_gpu: bool = True) -> stanza.Pipeline:
    """
    Return the stanza Pipeline for ``lang``, creating and caching it on the
    first request.

    The pipeline is built with the tokenize, pos, lemma, depparse and
    constituency processors and stored in the module-level
    ``_STANZA_PIPELINES`` dict.

    NOTE: the cache is keyed by ``lang`` only, so ``use_gpu`` takes effect
    only on the call that actually creates the pipeline for that language.

    NOTE: You must have downloaded the models beforehand, e.g.:
        import stanza
        stanza.download('en')
    """
    pipeline = _STANZA_PIPELINES.get(lang)
    if pipeline is None:
        pipeline = stanza.Pipeline(
            lang=lang,
            processors="tokenize,pos,lemma,depparse,constituency",
            use_gpu=use_gpu,
            tokenize_no_ssplit=False,
        )
        _STANZA_PIPELINES[lang] = pipeline
    return pipeline

In [None]:
# @title
def _compute_mtld(tokens: Iterable[str], ttr_threshold: float = 0.72) -> Optional[float]:
    """
    Compute MTLD (Measure of Textual Lexical Diversity) for a list of tokens.

    MTLD = total_number_of_tokens / number_of_factors

    A factor is a contiguous segment where the running TTR stays >= threshold.
    When the TTR drops below the threshold, we close a factor (at the previous
    token) and start a new one. At the end, the remaining partial segment is
    counted as a fractional factor, with weight proportional to how close the
    final TTR is to the threshold.
    """
    tokens = [tok for tok in tokens if tok]
    if not tokens:
        return None

    types = set()
    factor_count = 0.0
    token_count_in_factor = 0

    for tok in tokens:
        token_count_in_factor += 1
        types.add(tok)
        ttr = len(types) / token_count_in_factor

        if ttr < ttr_threshold:
            factor_count += 1.0
            types = set()
            token_count_in_factor = 0

    # final partial factor
    if token_count_in_factor > 0:
        final_ttr = len(types) / token_count_in_factor
        if final_ttr < 1.0:
            fractional = (1.0 - final_ttr) / (1.0 - ttr_threshold)
            fractional = max(0.0, min(1.0, fractional))
            factor_count += fractional

    if factor_count == 0:
        return None

    return len(tokens) / factor_count



def _compute_lexical_density(total_tokens: int, content_tokens: int) -> Optional[float]:
    """
    LD = |C| / |T|
    where:
        |C| = number of content-word tokens
        |T| = total number of non-punctuation tokens
    """
    if total_tokens == 0:
        return None
    return content_tokens / total_tokens


def _compute_lexical_sophistication_cow(
    content_forms: Iterable[str],
    cow_top5000: set,
) -> Optional[float]:
    """
    LS = |{ w in C : w not in R }| / |C|
    where:
        C = content-word tokens (surface forms, lowercased)
        R = COW top-5000 content word forms (lowercased)
    """
    forms = [f for f in content_forms if f]
    if not forms:
        return None

    off_list = sum(1 for f in forms if f not in cow_top5000)
    return off_list / len(forms)


@lru_cache(maxsize=1)
def load_cow_top5000_en() -> Set[str]:
    """
    Load the COW top-5000 English content word forms from textcomplexity package data.

    The words are read from the 'most_common' entry of the package's
    'en.json' language definition, which holds [word, xpos] pairs; only the
    word is kept, lowercased, because callers compare it against lowercased
    surface forms. If 'most_common' is absent, a 'cow_top5000' entry is used
    instead (the key this function read previously); entries may then be
    plain strings.

    Returns:
        Set of lowercased word forms (empty if neither key is present).
    """
    # Use importlib.resources.files for modern package data access
    json_path = pkg_resources.files('textcomplexity').joinpath('en.json')
    # Explicit encoding: do not depend on the platform default.
    with json_path.open('r', encoding='utf-8') as f:
        data = json.load(f)

    entries = data.get("most_common")
    if entries is None:
        entries = data.get("cow_top5000", [])

    forms: Set[str] = set()
    for entry in entries:
        # Accept both [word, xpos] pairs and bare word strings.
        word = entry if isinstance(entry, str) else entry[0]
        forms.add(word.lower())
    return forms


def lexical_measures_from_doc(doc) -> Dict[str, Optional[float]]:
    """
    Derive the three lexical measures (MTLD, LD, LS) from a parsed stanza
    Document.

    Punctuation words (upos == "PUNCT") and words with neither lemma nor text
    are ignored. MTLD is computed over lowercased lemmas (falling back to the
    word text); LD is the share of words whose upos is in CONTENT_UPOS; LS is
    computed over the lowercased surface forms of those content words.

    Returns a dict with keys "MTLD", "LD" and "LS".
    """
    reference_forms = load_cow_top5000_en()

    lemmas = []          # one entry per counted (non-punctuation) word
    surface_forms = []   # one entry per content word

    for sentence in doc.sentences:
        for w in sentence.words:
            if w.upos == "PUNCT":
                continue

            lemma = (w.lemma or w.text or "").lower()
            if not lemma:
                continue
            lemmas.append(lemma)

            if w.upos in CONTENT_UPOS:
                surface_forms.append((w.text or "").lower())

    return {
        "MTLD": _compute_mtld(lemmas) if lemmas else None,
        "LD": _compute_lexical_density(len(lemmas), len(surface_forms)),
        "LS": _compute_lexical_sophistication_cow(surface_forms, reference_forms),
    }


def lexical_measures_from_text(text: str, lang: str = "en") -> Dict[str, Optional[float]]:
    """
    Parse one text with stanza and return its lexical measures.

    ``None`` is treated as an empty string and any other value is converted
    with ``str``. Blank input yields ``None`` for every measure without
    running the parser.
    """
    cleaned = "" if text is None else str(text)

    if cleaned.strip():
        pipeline = get_stanza_pipeline(lang)
        return lexical_measures_from_doc(pipeline(cleaned))

    return {"MTLD": None, "LD": None, "LS": None}



def compute_lexical_measures_df(
    df: pd.DataFrame,
    column: str = "text",
    lang: str = "en",
) -> Dict[str, Dict[Any, Optional[float]]]:
    """
    Run the lexical measures on every value of ``df[column]``.

    Returns one mapping per measure, each keyed by the DataFrame index:
        {
            "MTLD": {index: value},
            "LD":   {index: value},
            "LS":   {index: value},
        }
    """
    collected: Dict[str, Dict[Any, Optional[float]]] = {
        "MTLD": {},
        "LD": {},
        "LS": {},
    }

    for row_id, raw_text in df[column].items():
        scores = lexical_measures_from_text(raw_text, lang=lang)
        for measure, per_row in collected.items():
            per_row[row_id] = scores[measure]

    return collected

In [None]:
# @title
def mdd_from_doc(doc) -> Optional[float]:
    """
    Mean Dependency Distance (MDD) of a stanza Document.

    Per sentence s_i with dependency set D_i:
        MDD_i = (1 / |D_i|) * sum_{(h,d) in D_i} |h - d|
    Document level:
        MDD = (1 / k) * sum_i MDD_i

    Words whose head is ``None`` or 0 (the root) contribute no distance, and
    sentences without any remaining dependency are left out of the average.
    Returns ``None`` if no sentence contributes.
    """
    per_sentence = []

    for sentence in doc.sentences:
        spans = [
            abs(word.id - word.head)
            for word in sentence.words
            if word.head is not None and word.head != 0
        ]
        if spans:
            per_sentence.append(sum(spans) / len(spans))

    return sum(per_sentence) / len(per_sentence) if per_sentence else None



def _count_clauses_in_tree(tree) -> int:
    """
    Count clause nodes in a constituency tree.

    A simple and standard heuristic (PTB-style) is:
        count all nodes whose label starts with 'S'
        (S, SBAR, SBARQ, SINV, SQ, etc.).

    This aligns with the idea of counting finite and subordinate clauses
    as in Hunt (1965) and later complexity work.
    """
    if tree is None:
        return 0

    # Stanza's constituency tree: tree.label, tree.children
    count = 1 if getattr(tree, "label", "").startswith("S") else 0

    for child in getattr(tree, "children", []):
        # leaves can be strings or terminals without 'label'
        if hasattr(child, "label"):
            count += _count_clauses_in_tree(child)

    return count


def cs_from_doc(doc) -> Optional[float]:
    """
    Clauses per sentence (CS) of a stanza Document.

        CS = (1 / k) * sum_i L_i

    L_i is the clause count that ``_count_clauses_in_tree`` reports for the
    constituency tree of sentence s_i. Sentences that carry no constituency
    tree are skipped; if none has one, ``None`` is returned.
    """
    trees = (getattr(sentence, "constituency", None) for sentence in doc.sentences)
    per_sentence = [_count_clauses_in_tree(t) for t in trees if t is not None]

    if not per_sentence:
        return None
    return sum(per_sentence) / len(per_sentence)



def syntactic_measures_from_doc(doc) -> Dict[str, Optional[float]]:
    """
    Bundle the two syntactic measures of a stanza Document.

    Returns ``{"MDD": ..., "CS": ...}`` using ``mdd_from_doc`` and
    ``cs_from_doc`` respectively.
    """
    return {
        "MDD": mdd_from_doc(doc),
        "CS": cs_from_doc(doc),
    }


def syntactic_measures_from_text(text: str, lang: str = "en") -> Dict[str, Optional[float]]:
    """
    Parse one text with stanza and return its syntactic measures.

    ``None`` is treated as an empty string and any other value is converted
    with ``str``. Blank input yields ``None`` for both measures without
    running the parser.
    """
    cleaned = "" if text is None else str(text)

    if cleaned.strip():
        pipeline = get_stanza_pipeline(lang)
        return syntactic_measures_from_doc(pipeline(cleaned))

    return {"MDD": None, "CS": None}


def compute_syntactic_measures_df(
    df: pd.DataFrame,
    column: str = "text",
    lang: str = "en",
) -> Dict[str, Dict[Any, Optional[float]]]:
    """
    Run the syntactic measures on every value of ``df[column]``.

    Returns one mapping per measure, each keyed by the DataFrame index:
        {
            "MDD": {index: value},
            "CS":  {index: value},
        }
    """
    collected: Dict[str, Dict[Any, Optional[float]]] = {"MDD": {}, "CS": {}}

    for row_id, raw_text in df[column].items():
        scores = syntactic_measures_from_text(raw_text, lang=lang)
        for measure, per_row in collected.items():
            per_row[row_id] = scores[measure]

    return collected

In [None]:
# @title
# POS tags (spaCy `pos_` values) that approximate the set of content words.
CONTENT_POS = {"NOUN", "VERB", "ADJ", "ADV"}


def is_content_token(tok):
    """
    Tell whether a spaCy token counts as a content word.

    A token qualifies when it is alphabetic, is not a stopword, and its
    ``pos_`` tag is one of CONTENT_POS. Punctuation and other non-alphabetic
    tokens therefore never qualify.
    """
    if not tok.is_alpha or tok.is_stop:
        return False
    return tok.pos_ in CONTENT_POS


@lru_cache(maxsize=100000)
def get_related_lemmas(lemma):
    """
    Collect the lemmas WordNet relates to ``lemma``.

    For every synset of the (lowercased) lemma, the result gathers the lemma
    names of:
      - the synset itself (synonyms) and their antonyms
      - its hypernyms and hyponyms
      - its part / member / substance meronyms
      - its coordinate terms (other hyponyms of its hypernyms)

    Names are lowercased and underscores become spaces. The query lemma
    itself is removed from the result. Results are memoised per lemma.

    NOTE: NLTK's Synset has no 'troponyms()' method, so troponyms are not
    consulted here.
    """
    lemma = lemma.lower()

    def _names(synset):
        # Normalised lemma names of one synset.
        return (entry.name().lower().replace("_", " ") for entry in synset.lemmas())

    related = set()

    for synset in wn.synsets(lemma):
        # Synonyms plus the antonyms of each synonym.
        for entry in synset.lemmas():
            related.add(entry.name().lower().replace("_", " "))
            related.update(
                opposite.name().lower().replace("_", " ")
                for opposite in entry.antonyms()
            )

        parents = synset.hypernyms()

        # More general / more specific concepts.
        for parent in parents:
            related.update(_names(parent))
        for child in synset.hyponyms():
            related.update(_names(child))

        # Meronyms: part / member / substance.
        meronyms = (
            synset.part_meronyms()
            + synset.member_meronyms()
            + synset.substance_meronyms()
        )
        for meronym in meronyms:
            related.update(_names(meronym))

        # Coordinate terms: siblings under the same hypernym.
        for parent in parents:
            for sibling in parent.hyponyms():
                if sibling != synset:
                    related.update(_names(sibling))

    related.discard(lemma)
    return related


def lexical_cohesion_single(text, nlp):
    """
    Lexical Cohesion (LC) of one document:

        LC = |C| / m

    where |C| counts cohesive links between sentence pairs (shared content
    lemmas plus WordNet-related lemmas) and m is the number of alphabetic
    tokens in the document.

    Every pair of sentences (i < j) is compared, not only adjacent ones.
    Semantic links are looked up from the lemmas of the earlier sentence
    into the lemma set of the later one.

    Returns 0.0 when the text is not a non-blank string, has no alphabetic
    token, has fewer than 2 sentences, or has fewer than 2 sentences with at
    least one content lemma.
    """
    if not (isinstance(text, str) and text.strip()):
        return 0.0

    doc = nlp(text)

    # Denominator m: all alphabetic tokens.
    word_total = sum(1 for tok in doc if tok.is_alpha)
    if word_total == 0:
        return 0.0

    sents = list(doc.sents)
    if len(sents) < 2:
        # Cross-sentence cohesion needs at least two sentences.
        return 0.0

    # One set of content lemmas per sentence; empty sets are dropped.
    lemma_sets = []
    for sent in sents:
        current = {tok.lemma_.lower() for tok in sent if is_content_token(tok)}
        if current:
            lemma_sets.append(current)

    if len(lemma_sets) < 2:
        return 0.0

    links = 0
    for pos, earlier in enumerate(lemma_sets[:-1]):
        for later in lemma_sets[pos + 1:]:
            # 1) Lexical repetition: lemmas present in both sentences.
            links += len(earlier & later)
            # 2) Semantic relations via WordNet.
            for item in earlier:
                links += len(get_related_lemmas(item) & later)

    return float(links) / float(word_total)


def sentence_vector(sent, vector_size):
    """
    Mean of the token vectors of a sentence.

    Only tokens that have a vector and are neither punctuation nor
    whitespace take part. When no token qualifies, a float32 zero vector of
    length ``vector_size`` is returned instead.
    """
    usable = []
    for tok in sent:
        if tok.has_vector and not (tok.is_punct or tok.is_space):
            usable.append(tok.vector)

    if usable:
        return np.mean(usable, axis=0)
    return np.zeros(vector_size, dtype="float32")


def coherence_single(text, nlp):
    """
    Coherence (CoH) of one document: mean cosine similarity between the
    vectors of adjacent sentences.

        CoH = (1 / (k-1)) * sum_{i=1}^{k-1} cos(h_i, h_{i+1})

    h_i is the averaged token vector of sentence i (see ``sentence_vector``).
    Pairs in which either vector has zero norm are left out of the mean.

    Returns 0.0 when the text is not a non-blank string, when there are
    fewer than 2 sentences, or when no pair could be scored.

    Raises:
        ValueError: if the pipeline has no word vectors
            (``nlp.vocab.vectors_length == 0``).
    """
    if not isinstance(text, str) or not text.strip():
        return 0.0

    dim = nlp.vocab.vectors_length
    if dim == 0:
        raise ValueError(
            "The loaded spaCy model does not contain word vectors "
            "(nlp.vocab.vectors_length == 0). "
            "Use a model like 'en_core_web_md' or similar."
        )

    sents = list(nlp(text).sents)
    if len(sents) < 2:
        # A single sentence has no adjacent pair to compare.
        return 0.0

    vectors = [sentence_vector(sent, dim) for sent in sents]

    similarities = []
    for left, right in zip(vectors, vectors[1:]):
        scale = np.linalg.norm(left) * np.linalg.norm(right)
        if scale != 0.0:
            similarities.append(float(np.dot(left, right) / scale))
        # else: at least one of the two vectors is all zeros; pair skipped

    return float(np.mean(similarities)) if similarities else 0.0



def compute_lexical_cohesion_vector(df, nlp, column="text"):
    """
    Lexical cohesion (LC) score for every row of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame holding the texts.
    nlp : spaCy Language object
        Pre-loaded spaCy pipeline (lemmatizer, POS tagger, etc.).
    column : str, default "text"
        Column that contains the text.

    Returns
    -------
    np.ndarray
        1D float32 array of LC scores, one per row, in row order.
    """
    # Missing values become empty strings; everything else is stringified.
    cleaned = df[column].fillna("").astype(str)

    values = []
    for entry in cleaned:
        values.append(lexical_cohesion_single(entry, nlp))

    return np.array(values, dtype="float32")


def compute_coherence_vector(df, nlp, column="text"):
    """
    Coherence (CoH) score for every row of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame holding the texts.
    nlp : spaCy Language object
        Pre-loaded spaCy pipeline with word vectors.
    column : str, default "text"
        Column that contains the text.

    Returns
    -------
    np.ndarray
        1D float32 array of CoH scores, one per row, in row order.
    """
    # Missing values become empty strings; everything else is stringified.
    cleaned = df[column].fillna("").astype(str)

    values = []
    for entry in cleaned:
        values.append(coherence_single(entry, nlp))

    return np.array(values, dtype="float32")


def compute_discourse_measures(df, nlp, column="text"):
    """
    Compute the two discourse measures (LC, then CoH) for every row of a
    DataFrame.

    Returns
    -------
    dict
        {
            "LC":  np.ndarray of lexical cohesion scores,
            "CoH": np.ndarray of coherence scores
        }
    """
    return {
        "LC": compute_lexical_cohesion_vector(df, nlp, column=column),
        "CoH": compute_coherence_vector(df, nlp, column=column),
    }

In [None]:
# @title
def _analyze_text_all(text: str, lang: str = "en") -> Dict[str, Optional[float]]:
    """
    Parse a text with stanza and compute all measures (lexical + syntactic)
    in a single pass.

    Returns a dict with keys:
        "MTLD", "LD", "LS", "MDD", "CS"
    (Discourse measures LC/CoH are added later at DataFrame level, via spaCy.)
    """
    if text is None:
        text = ""
    text = str(text)

    if not text.strip():
        return {"MTLD": None, "LD": None, "LS": None, "MDD": None, "CS": None}

    nlp = get_stanza_pipeline(lang)
    doc = nlp(text)

    lex = lexical_measures_from_doc(doc)
    syn = syntactic_measures_from_doc(doc)

    out: Dict[str, Optional[float]] = {}
    out.update(lex)
    out.update(syn)
    return out


def compute_all_complexity_measures_df(
    df: pd.DataFrame,
    column: str = "text",
    lang: str = "en",
    spacy_nlp=None,
) -> Dict[str, Dict[Any, Optional[float]]]:
    """
    Compute all complexity measures for each row in df[column].

    The lexical and syntactic measures come from one stanza parse per row
    (run sequentially, with a progress bar); the discourse measures come
    from the spaCy pipeline.

    Args
    ----
    df : pandas.DataFrame
        DataFrame with a text column.
    column : str, default "text"
        Name of the text column.
    lang : str, default "en"
        Language code for stanza.
    spacy_nlp : spaCy Language, required for LC / CoH
        Pre-loaded spaCy pipeline with:
            - POS / lemmatizer for LC
            - word vectors for CoH (e.g. 'en_core_web_md').

    Returns
    -------
    dict
        {
            "MTLD": {index: value},
            "LD":   {index: value},
            "LS":   {index: value},
            "MDD":  {index: value},
            "CS":   {index: value},
            "LC":   {index: value},
            "CoH":  {index: value},
        }

    Raises
    ------
    ValueError
        If spacy_nlp is None. The check is made before any parsing starts,
        so a missing pipeline is reported immediately rather than after the
        stanza pass over the whole DataFrame.
    """
    # Fail fast: LC and CoH cannot be computed without the spaCy pipeline.
    if spacy_nlp is None:
        raise ValueError(
            "spacy_nlp must be provided to compute LC and CoH. "
            "Load a spaCy model with vectors, e.g. 'en_core_web_md', and "
            "pass it as spacy_nlp=..."
        )

    stanza_keys = ("MTLD", "LD", "LS", "MDD", "CS")
    results: Dict[str, Dict[Any, Optional[float]]] = {key: {} for key in stanza_keys}

    items = list(df[column].items())  # list[(index, text)]

    # ---- Lexical + syntactic (stanza) ----
    for idx, text in tqdm(
        items,
        total=len(items),
        desc="Computing lexical & syntactic complexity (sequential)",
    ):
        metrics = _analyze_text_all(text, lang=lang)
        for key in stanza_keys:
            results[key][idx] = metrics[key]

    # ---- Discourse measures (spaCy: LC & CoH) ----
    discourse = compute_discourse_measures(df, spacy_nlp, column=column)

    # The arrays are in row order; pair them back with the DataFrame index
    # and convert the numpy scalars to plain floats.
    results["LC"] = {idx: float(v) for idx, v in zip(df.index, discourse["LC"])}
    results["CoH"] = {idx: float(v) for idx, v in zip(df.index, discourse["CoH"])}

    return results

In [None]:
# @title
# Build (or fetch from the cache) the English stanza pipeline.
stanza_nlp = get_stanza_pipeline("en", use_gpu=True)

# Create the "aliases" (references) for the functions that look for different names
nlp_stanza = stanza_nlp
nlp_spacy = spacy_nlp # Cell needed because of a moment of confusion of mine. I am not even sure it is still needed

Preliminary Requirements.

Installs

In [None]:
# NOTE(review): the three commands below overlap (transformers, accelerate,
# bitsandbytes and agno each appear in more than one of them); the middle
# line additionally passes -U for bitsandbytes.
!pip install -q transformers accelerate bitsandbytes agno jsonlines
!pip install -U bitsandbytes
!pip install -q agno transformers accelerate bitsandbytes sentencepiece protobuf

Imports

In [None]:
import torch
import json
import jsonlines
import pandas as pd
from datetime import datetime
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from agno.agent import Agent

# Canonical ordering of the seven complexity measures; same names and order
# as the keys returned by compute_all_complexity_measures_df.
METRICS_ORDER = ['MTLD', 'LD', 'LS', 'MDD', 'CS', 'LC', 'CoH']

pip install ollama # although it gets installed later, for some reason a single installation is missing without this line.

LLM setup

In [None]:
# Install Ollama
!curl -fsSL https://ollama.com/install.sh | sh

import subprocess
import time
import os

# Start the Ollama server in the background
# We use env to force GPU detection if necessary
env = os.environ.copy()
env["OLLAMA_HOST"] = "127.0.0.1:11434"

print(" Avvio server Ollama...")
# The server's stdout and stderr are both redirected to ollama.log.
with open("ollama.log", "w") as f:
    subprocess.Popen(["ollama", "serve"], stdout=f, stderr=f, env=env)

# Critical wait for the server to initialize
time.sleep(10)

# Download the required model
# limit yourself to 1-2 models at a time so as not to fill the VRAM; 4 models only if we are using an A100 or similar
print(" Download modello...")
!ollama pull qwen2.5:7b-instruct-q4_K_M
!ollama pull llama3.1:8b
!ollama pull mistral:7b
!ollama pull falcon:7b

print(" Ollama è pronto e i modelli sono caricati.")

AGENTS setup

In [None]:
from agno.agent import Agent
from agno.models.ollama import Ollama

# Model handle shared by the agents defined below; `id` names one of the
# models pulled with `ollama pull` in the setup cell.
local_llm = Ollama(
    id="qwen2.5:7b-instruct-q4_K_M",
    options={
        "temperature": 0.01,
        "num_predict" : 2000, # switch to 12000 for the vikidia sweep
        "num_ctx": 8192,
    }
)

# SHARED GUIDELINES
# Prompt fragment describing the seven complexity measures; passed to the
# agents as part of their instructions. Words that had been split across
# lines mid-word (e.g. "di / vided", "ex / cluded", "well / formed") have
# been rejoined so the model receives intact words.
COMPLEXITY_GUIDELINES = """
[COMPLEXITY GUIDELINES]
Lexical diversity is measured with MTLD: the text is scanned left-to-right
and right-to-left, computing factor lengths—the number of tokens before the
running type-token ratio falls below 0.72—and MTLD is the text length
divided by the mean factor length; increasing MTLD means varying lemmas and
avoiding repeated phrasings.
Lexical density (LD) is evaluated through three quantities. Lexical density
is the proportion of content words among all tokens, where content words
are tokens tagged as NOUN, VERB, ADJ, or ADV (proper nouns are
excluded); increasing LD means using more information-bearing words and fewer
function-word fillers.
Lexical sophistication (LS) measures the proportion of advanced vocabulary in
a text by comparing content words against high-frequency vocabulary.
Specifically, LS is calculated as the ratio of sophisticated content-word tokens to
total content words. A content word is classified as sophisticated if its lemma
does not appear among the 5,000 most frequent English content-word lemmas.
Increasing LS means choosing more specific, lower-frequency vocabulary while
staying faithful to the source facts.
Mean Dependency Distance (MDD) reflects the average span between words
that depend on each other; higher values arise when the sentence
structure places modifiers and complements further from their heads (e.g.,
fronted clauses, heavy nominal modification, postponed complements,
relative clauses), increasing structural load. Higher MDD reflects longer,
well-formed dependencies—e.g., fronted adverbials, heavy nominal modification,
postponed complements, relative clauses whose antecedent is distant—thus a
greater structural/memory load.
Clausal density (CS) reflects how many clauses are packed into each sentence;
higher values arise when subordinate, complement, and relative clauses are
embedded rather than splitting ideas into multiple simple sentences. Higher
CS reflects packaging more propositions per sentence by adding subordinate
structures rather than relying on coordination or splitting into simple
sentences.
Lexical cohesion (LC) reflects how consistently the text maintains a lexical
thread across sentences through repetition and semantic relatedness (e.g.,
synonyms or semantically close terms); higher values indicate stronger linking of
entities and ideas over the paragraph.
Coherence (CoH) reflects how smoothly topics progress between adjacent
sentences; higher values indicate natural transitions, clear connections, and
sustained thematic continuity. Higher CoH indicates that sentences follow one
another naturally, with clear thematic continuity and well-signposted
transitions; abrupt topic shifts or loosely linked sentences reduce the score.
"""

# INTERMEDIATE OUTPUT STRUCTURE
from pydantic import BaseModel, Field
from typing import List, Optional



# WRITER AGENT: Text Complexification Assistant
# Produces the rewritten text: on the first turn from the source text, on later
# turns by applying the Critic's action plan to its own previous output.
# The prompt strings below were repaired: words that had been split across
# lines by hyphenation ("rewritten", "meta-commentary", "guidelines",
# "meta-text") are whole again, and the [TARGET COMPLEXITY PROFILE] tag is no
# longer broken by a line break, so the model sees the same tag names that the
# per-turn prompts use.
writer_agent = Agent(
    name="Writer",
    model=local_llm,
    description="""
    [ROLE]
    You are a Text Complexification Assistant in a multi-agent framework for text
    complexification. You interact with a Critic Assistant that evaluates the complexity
    of your outputs and, when needed, sends you short, concrete action plans for revision
    in JSON format. Never flip roles. Never try to provide an action plan. Only the Critic
    Assistant is allowed to create or modify action plans. You and the Critic Assistant share
    a common interest in collaborating to successfully complete the task.""",
    instructions=[
        # Shared metric definitions come first so both agents reason on the same terms.
        COMPLEXITY_GUIDELINES,
        """Your task is to rewrite a given source text so that the generated result is more complex
        in lexicon, syntax, and discourse complexity, according to the guidelines provided in
        [COMPLEXITY GUIDELINES] and satisfying all the constraints in [OBJECTIVES].
        When you are given a source text in [SOURCE TEXT] with its current complexity
        profile in [TEXT COMPLEXITY PROFILE] and a target complexity profile in
        [TARGET COMPLEXITY PROFILE], you must generate a rewritten version whose complexity
        measures, as defined in [COMPLEXITY GUIDELINES], satisfy all the constraints defined
        in [OBJECTIVES].
        When you are given a previous version of your own output in [PREVIOUS TEXT]
        together with an [ACTION PLAN] produced by the Critic Assistant, you must apply only
        the specified actions in the plan to rewrite [PREVIOUS TEXT].
        You must strictly follow the task description, the objectives, the action plan (when given),
        and the output format specified in the current prompt. You must output only the rewritten
        text, as a continuous passage, with no explanations, no metric values, and no
        meta-commentary. Never mention the multi-agent framework, the Critic Assistant, the
        guidelines, or the objectives in your output."""
    ],
    expected_output="""Return only the rewritten text, with no additional headings, no metric values, and no
    meta-text. Do not report any explanations.""",
    markdown=False,
    # The Writer only generates text; no tool calls are allowed.
    tool_call_limit=0,
)

# CRITIC AGENT: Evaluation and Action Planning
# Reviews the Writer's latest text and answers with a JSON object: either
# {"status": "objectives satisfied"} or a "revision required" action plan.
# The prompt strings below were repaired: the list of reviewed inputs had been
# collapsed into one run-on line ("information:- The text ...- Its ...") and
# the word "complexity" was split across two lines.
critic_agent = Agent(
    name="Critic",
    model=local_llm,
    description="""You are a Critic Assistant in a multi-agent framework for text complexification. You interact with a
    Text Complexification Assistant that rewrites texts according to shared complexity guidelines and
    objectives. Never flip roles. Never attempt to rewrite the text yourself. Only the Complexification
    Assistant is allowed to produce rewritten texts. You and the Complexification Assistant share a common
    interest in collaborating to successfully complete the task. You have always access to the original text
    in [SOURCE TEXT].""",
    instructions=[
        # Shared metric definitions come first so both agents reason on the same terms.
        COMPLEXITY_GUIDELINES,
        """Your task is to review the following information:
        - The text currently generated by the Complexification Assistant (in [CURRENT])
        - Its current complexity profile (in [TEXT COMPLEXITY PROFILE])
        - The target complexity profile (in [TARGET COMPLEXITY PROFILE])
        - Any additional diagnostics (in [DIAGNOSTICS])
        - The original text (in [SOURCE TEXT])
        Then, produce a concrete ACTION PLAN that helps the Complexification Assistant to rewrite the text
        in [CURRENT] so that all the constraints in [OBJECTIVES] are satisfied according to the definitions
        provided in [COMPLEXITY GUIDELINES]."""
    ],
    expected_output="""Your output must always be a single ACTION PLAN in valid JSON format, composed of a few precise,
    immediately actionable editing instructions directed to the Complexification Assistant. Each instruction
    must clearly indicate what kind of change is needed (lexical, syntactic, or discourse-related) and how it
    should move the text towards satisfying the complexity guidelines and objectives.
    Your ACTION PLAN should focus on modifications that increase lexical, syntactic, or discourse
    complexity, or that help satisfy length and structural constraints, as defined in [COMPLEXITY GUIDELINES]
    and [OBJECTIVES].
    You must strictly follow the task description, the guidelines, the objectives, and the required output
    format specified in the current prompt. You must output only a single ACTION PLAN in JSON
    format, with no rewritten text, no alternative candidate rewrites, and no additional explanations or
    meta-commentary outside the JSON structure.
    You must respond with only a single JSON object beginning with { and ending with }. Do not rewrite
    the text. Do not provide explanations, commentary, or any additional text outside the JSON object.
    If all objectives in [OBJECTIVES] are already satisfied, you must output exactly:
    {"status": "objectives satisfied"} and nothing else. Do not include an action plan field in this case.
    If at least one objective is not satisfied, you must output a JSON object of the following form:
    {
        "status": "revision required",
        "action_plan": [
        {
            "id": <integer>,
            "type": "<lexical | syntactic | discourse | length | mixed>",
            "target_metrics": ["<metric1>", "<metric2>", ...],
            "location": "<where to intervene in the CURRENT text>",
            "instruction": "<one concrete, immediately actionable edit>"
            },
        ...
        ]
    }

    The status field must be exactly "revision required" when you provide an action plan.
    The action plan array must contain between 1 and 6 actions.
    Each action must specify a single, immediately actionable edit, clearly indicating where to intervene
    (for example, “paragraph 2, sentences 3–5”) and what to do concretely (for example, “replace repeated
    ’important’ with ’crucial’, ’vital’, ’essential’”).
    An example of JSON response, when revision is required is as follows:

    {
        "status": "revision required",
        "action_plan": [
        {
            "id": 1,
            "type": "lexical",
            "target_metrics": ["MTLD", "LD"],
            "location": "paragraph 1, sentences 2-3",
            "instruction": "Replace the repeated phrase ’very important’ with more
            varied expressions such as ’crucial’, ’fundamental’,
            and ’pivotal’."
            }
        ]
    }
    """,
    markdown=False,
    # The Critic only produces the JSON plan; no tool calls are allowed.
    tool_call_limit=0,
)

In [None]:
# Smoke test: if the agent answers, everything is working correctly.

# Minimal throwaway agent bound to the same local model as the Writer and the Critic.
agent = Agent(
    name="tester",
    model=local_llm,
    description="Test di funzionamento del framework",
    instructions=["Verify if the system works. answer yes or no"],
    markdown=False,
    tool_call_limit=0,
)

# Send a single prompt and print the reply text.
response = agent.run("Are you online?")
print(response.content)


Dominance and Diagnostic logic

In [None]:
def check_dominance(curr_metrics, target_metrics, curr_text, complex_text):
    """Tell whether the current text meets the length and metric targets.

    Args:
        curr_metrics: Mapping metric name -> value for the current text.
        target_metrics: Mapping metric name -> value to reach.
        curr_text: The text being evaluated.
        complex_text: The reference text that defines the length window.

    Returns:
        True only when the whitespace-delimited word count of ``curr_text``
        lies within 80%-120% of that of ``complex_text`` and every metric in
        ``METRICS_ORDER`` is greater than or equal to its target.

    Note:
        Strict improvement over the source profile is not checked here; that
        would require the original source profile, which is not a parameter.
    """
    reference_len = len(complex_text.split())
    candidate_len = len(curr_text.split())
    lower_bound = 0.8 * reference_len
    upper_bound = 1.2 * reference_len
    within_length = lower_bound <= candidate_len <= upper_bound

    # Stop at the first metric that falls short of its target.
    every_metric_reached = True
    for name in METRICS_ORDER:
        if not curr_metrics[name] >= target_metrics[name]:
            every_metric_reached = False
            break

    return within_length and every_metric_reached

def get_diagnostics(curr_metrics, target_metrics, curr_text, complex_text):
    """Summarise which targets the current text still misses.

    Args:
        curr_metrics: Mapping metric name -> value for the current text.
        target_metrics: Mapping metric name -> value to reach.
        curr_text: The text being evaluated.
        complex_text: The reference text that defines the length window.

    Returns:
        A dict with three keys: ``below_target`` (metrics from
        ``METRICS_ORDER`` strictly below their target), ``drivers_missing``
        (the families "lexical", "syntactic", "discourse" that contain at
        least one such metric) and ``length_issues`` ("Text too short",
        "Text too long", or None when the word count is within 80%-120% of
        the reference).
    """
    shortfalls = []
    for metric in METRICS_ORDER:
        if curr_metrics[metric] < target_metrics[metric]:
            shortfalls.append(metric)

    # Metric families, listed in the order in which they are reported.
    families = (
        ('lexical', ('MTLD', 'LD', 'LS')),
        ('syntactic', ('MDD', 'CS')),
        ('discourse', ('LC', 'CoH')),
    )
    lagging_families = [
        label
        for label, members in families
        if any(member in shortfalls for member in members)
    ]

    # Length verdict against the 80%-120% window of the reference word count.
    reference_len = len(complex_text.split())
    candidate_len = len(curr_text.split())
    if candidate_len < 0.8 * reference_len:
        length_verdict = "Text too short"
    elif candidate_len > 1.2 * reference_len:
        length_verdict = "Text too long"
    else:
        length_verdict = None

    return {
        "below_target": shortfalls,
        "drivers_missing": lagging_families,
        "length_issues": length_verdict,
    }

Bridge Function

In [None]:
def compute_metrics(text):
    """Compute the seven complexity metrics for a single text string.

    Relies on helpers defined elsewhere in the notebook
    (``_analyze_text_all``, ``lexical_cohesion_single``,
    ``coherence_single``) and on the global spaCy pipeline ``spacy_nlp``.

    Args:
        text: The text to score.

    Returns:
        A dict with the keys "MTLD", "LD", "LS", "MDD", "CS", "LC" and "CoH".
        When ``text`` is not a string, is empty, or contains only whitespace,
        a dict mapping every metric in ``METRICS_ORDER`` to 0.0 is returned
        and no NLP pipeline is run.
    """
    # The type check comes first so that non-string inputs never have their
    # truth value evaluated; whitespace-only strings carry no tokens and are
    # treated like the empty string instead of being sent to the parsers.
    if not isinstance(text, str) or not text.strip():
        return {m: 0.0 for m in METRICS_ORDER}

    # 1. Lexical (MTLD, LD, LS) and syntactic (MDD, CS) measures in one pass;
    #    per the notebook, _analyze_text_all drives the Stanza pipeline itself.
    all_lex_syn_results = _analyze_text_all(text, lang='en')

    # 2. Discourse measures (LC, CoH); these helpers take the global spaCy object.
    lc_score = lexical_cohesion_single(text, spacy_nlp)
    coh_score = coherence_single(text, spacy_nlp)

    # Merge everything into the dict layout expected by the framework;
    # a measure missing from the analysis result defaults to 0.0.
    return {
        "MTLD": all_lex_syn_results.get("MTLD", 0.0),
        "LD": all_lex_syn_results.get("LD", 0.0),
        "LS": all_lex_syn_results.get("LS", 0.0),
        "MDD": all_lex_syn_results.get("MDD", 0.0),
        "CS": all_lex_syn_results.get("CS", 0.0),
        "LC": lc_score,
        "CoH": coh_score
    }

Cella di batch sampling

In [None]:
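# Use this cell before launching the full-scale analysis, to verify that the pipeline works correctly.
# The code below is wrapped in a string literal, so it is disabled until the enclosing triple quotes are removed.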

"""import json
import pandas as pd
from datetime import datetime
from pydantic import BaseModel, Field
from typing import List, Optional

# 1. DEFINIZIONE SCHEMI PYDANTIC
class ActionItem(BaseModel):
    id: int
    type: str = Field(description="lexical | syntactic | discourse | length | mixed")
    target_metrics: List[str]
    location: str
    instruction: str

class CriticResponse(BaseModel):
    status: str = Field(description="revision required | objectives satisfied")
    action_plan: Optional[List[ActionItem]] = None

# 2. Caricamento Dataset
df = pd.read_csv("CLEANED_final_complexity_ose_adv_ele.csv", sep='\t')

MODELS_TO_TEST = ["llama3.1:8b"]
NUM_ROWS_TO_TEST = 5
MAX_ITERATIONS = 10
METRICS_ORDER = ['MTLD', 'LD', 'LS', 'MDD', 'CS', 'LC', 'CoH']

def run_benchmark_session(row_index, df, output_file, model_id, max_k=10):
    row = df.iloc[row_index]
    source_text = row['Simple']
    complex_ref_text = row['Complex']
    target_profile = [round(float(row[f'Complex_{m}']), 3) for m in METRICS_ORDER]
    target_map = {m: float(row[f'Complex_{m}']) for m in METRICS_ORDER}

    ref_word_count = int(row['Complex_word_count'])
    min_w, max_w = int(ref_word_count * 0.8), int(ref_word_count * 1.2)

    run_id = f"exp_{model_id.replace(':', '_')}_row_{row_index}"
    current_text = source_text
    last_critic_json = {}
    previous_text_profile = [round(float(row[f'Simple_{m}']), 3) for m in METRICS_ORDER]

    for k in range(max_k + 1):
        print(f"       Iterazione {k}...", end=" ")
        mode = "bootstrap" if k == 0 else "refinement"

        # --- FASE 1: WRITER AGENT ---
        if k == 0:
            instruction = (
                f"[TASK]: Rewrite [SOURCE TEXT] to match [TARGET COMPLEXITY PROFILE].\n"
                f"[TARGET COMPLEXITY PROFILE]: {target_profile}\n"
                f"[SOURCE TEXT]: {source_text}\n"
                f"[OBJECTIVES]: Word count in [{min_w}, {max_w}].\n"
                f"[OUTPUT FORMAT]: Return only the rewritten text."
            )
            input_for_writer = source_text
        else:
            plan_to_pass = json.dumps(last_critic_json, indent=2)
            instruction = (
                f"Rewrite [PREVIOUS TEXT] applying ONLY the 'action plan' in [ACTION PLAN].\n"
                f"[ACTION PLAN]: {plan_to_pass}\n"
                f"[PREVIOUS TEXT]: {current_text}\n"
            )
            input_for_writer = current_text
            print(f"\n      [DEBUG WRITER] Action Plan inviato:\n{plan_to_pass}")

        writer_res = writer_agent.run(instruction)
        rewritten_text = writer_res.content.strip()

        if k > 0 and rewritten_text == current_text:
            print("       WARNING: Testo identico rilevato.")

        # FASE 2: SCORING-
        rewritten_metrics = compute_metrics(rewritten_text)
        rewritten_profile = [round(rewritten_metrics[m], 3) for m in METRICS_ORDER]
        diag_text = get_diagnostics(rewritten_metrics, target_map, rewritten_text, complex_ref_text)

        # FASE 3: CRITIC AGENT
        critic_instruction = (
            f"[TASK]: Review [CURRENT] against [TARGET COMPLEXITY PROFILE].\n"
            f"[CURRENT]: {rewritten_text}\n"
            f"[DIAGNOSTICS]: {diag_text}\n"
        )

        try:

            critic_res = critic_agent.run(critic_instruction, response_model=CriticResponse)

            # Debug per vedere l'output reale
            print(f"      [DEBUG CRITIC] RAW: {str(critic_res.content)[:100]}...")

            if isinstance(critic_res.content, CriticResponse):
                critic_output_obj = critic_res.content
            else:
                # Gestione stringa in caso Agno non faccia l'auto-cast
                import re
                clean_json = re.sub(r'```json\s*|```', '', str(critic_res.content)).strip()
                critic_output_obj = CriticResponse(**json.loads(clean_json))

            last_critic_json = critic_output_obj.model_dump(exclude_none=True)
            print(f"      [DEBUG CRITIC] Action Plan generato con successo.")

        except Exception as e:
            print(f"      [DEBUG CRITIC] ERRORE: {str(e)}")
            last_critic_json = {
                "status": "revision required",
                "action_plan": [{"id": 1, "type": "mixed", "target_metrics": [], "location": "all", "instruction": "Increase complexity."}]
            }

        # LOGGING E AGGIORNAMENTO STATO
        log_entry = {
            "run_id": run_id, "iteration": k, "timestamp": datetime.now().isoformat(),
            "source_text": source_text, "target_complexity_profile": target_profile,
            "complexification_output": {"rewritten_text": rewritten_text, "rewritten_text_profile": rewritten_profile},
            "critic_output": last_critic_json
        }
        output_file.write(json.dumps(log_entry) + "\n")
        output_file.flush()

        current_text = rewritten_text
        previous_text_profile = rewritten_profile
        status = last_critic_json.get("status", "revision required")
        print(f"      Status: {status}")

        if "satisfied" in status.lower():
            print("      Successo raggiunto.")
            break

# --- LOOP DI TEST ---
for model_id in MODELS_TO_TEST:
    print(f"\n CAMBIO MODELLO DI ALIMENTAZIONE: {model_id}")
    new_llm = Ollama(id=model_id, options={"temperature": 0.01, "num_predict": 2000, "num_ctx": 8192})
    writer_agent.model = new_llm
    critic_agent.model = new_llm

    output_filename = f"trace_{model_id.replace(':', '_')}.jsonl"
    with open(output_filename, "w", encoding="utf-8") as f_trace:
        for i in range(NUM_ROWS_TO_TEST):
            print(f"   Test riga {i}...")
            run_benchmark_session(i, df, f_trace, model_id, max_k=MAX_ITERATIONS)

print("\n TUTTI I TEST COMPLETATI.")"""

IN CASO DI ARRESTO MANUALE DELLA CELLA, RIAVVIARE OLLAMA DA QUI

In [None]:
# Child environment: the current one plus the address the Ollama server binds to.
env = {**os.environ, "OLLAMA_HOST": "127.0.0.1:11434"}

print(" Avvio server Ollama...")
# Launch the server without waiting for it; its stdout and stderr both go to ollama.log.
with open("ollama.log", "w") as f:
    subprocess.Popen(
        ["ollama", "serve"],
        env=env,
        stdout=f,
        stderr=f,
    )

Analisi massiva per campionamento manuale e automatico

In [None]:
from google.colab import drive

# --- 0. MOUNT DRIVE AND SET UP PATHS ---
drive.mount('/content/drive')

# Output folder on Google Drive that receives the trace files.
OUTPUT_FOLDER = "/content/drive/MyDrive/Tesi_Complexity_Results"
# exist_ok=True creates the folder (and any missing parents) when absent and
# does nothing when it is already there, so no separate existence check is needed.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [None]:
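# Test the Drive mount. A forced remount is performed for safety even though it is redundant.
# The code below is wrapped in a string literal, so it is disabled until the enclosing triple quotes are removed.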

"""import os
import json
from google.colab import drive

# Montaggio Drive
print(" Montaggio Google Drive...")
drive.mount('/content/drive', force_remount=True)

# Configurazione percorsi
# Assicuriamoci che il nome della cartella e quello del file siano distinti
OUTPUT_FOLDER = "/content/drive/MyDrive/Tesi_Complexity_Results"
test_file_name = "test_salvataggio_v2.jsonl"
test_file_path = os.path.join(OUTPUT_FOLDER, test_file_name)

# Creazione cartella se non esiste
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)
    print(f" Cartella creata: {OUTPUT_FOLDER}")
else:
    print(f" Cartella già esistente: {OUTPUT_FOLDER}")

# Test di scrittura (Specificando il file, non la cartella)
test_data = {
    "test_id": "check_final",
    "timestamp": "2024-05-20T10:30:00",
    "message": "Test di scrittura corretto per la tesi"
}

print(f" Scrittura file di test in: {test_file_path}...")
try:
    # Verifichiamo che il path non sia una cartella prima di scrivere
    if os.path.isdir(test_file_path):
        print(f" Errore: {test_file_path} è una directory, non un file!")
    else:
        with open(test_file_path, "w", encoding="utf-8") as f:
            f.write(json.dumps(test_data) + "\n")
        print(" Scrittura completata con successo.")
except Exception as e:
    print(f" Errore durante la scrittura: {e}")

# Verifica di lettura
print(" Verifica integrità dati...")
try:
    with open(test_file_path, "r", encoding="utf-8") as f:
        read_data = json.loads(f.readline())
        print(f" Test superato! Dati letti: {read_data}")
except Exception as e:
    print(f" Errore durante la lettura: {e}")"""

In [None]:
import json
import pandas as pd
import os
import re
from datetime import datetime
from pydantic import BaseModel, Field
from typing import List, Optional
from tqdm.auto import tqdm

# --- PYDANTIC SCHEMAS ---
# ActionItem: one editing instruction inside the Critic's action plan.
# Documented with `#` comments rather than a docstring on purpose:
# NOTE(review): a model docstring may be exported into the generated JSON
# schema and thus reach the LLM — confirm before adding one.
class ActionItem(BaseModel):
    id: int  # action number within the plan
    type: str = Field(description="lexical | syntactic | discourse | length | mixed")
    target_metrics: List[str]  # names of the metrics this action is meant to move
    location: str  # where in the current text to intervene
    instruction: str  # the concrete edit to apply

# CriticResponse: top-level object expected from the Critic; used as
# response_model when the Critic agent is run and as the parser for its raw JSON.
class CriticResponse(BaseModel):
    status: str = Field(description="revision required | objectives satisfied")
    # None (and dropped from the dumped dict) when the Critic sends no plan.
    action_plan: Optional[List[ActionItem]] = None

# DATASET AND PARAMETER CONFIGURATION
# Input files; the run loop below reads each of them with a tab separator.
DATASET_FILES = [
    "CLEANED_final_complexity_ose_adv_ele.csv",
    "CLEANED_final_complexity_ose_adv_int.csv",
    "CLEANED_final_complexity_swipe.csv",
    "CLEANED_final_complexity_vikidia.csv"
]

# Model identifiers handed to Ollama, one full pass over the datasets per model.
MODELS_TO_TEST = ["llama3.1:8b", "mistral:7b", "qwen2.5:7b-instruct-q4_K_M", "falcon:7b"]
# Passed as max_k to run_benchmark_session (refinement rounds after the first rewrite).
MAX_ITERATIONS = 10
# Fixed order of the seven metrics; profiles are lists of values in this order.
METRICS_ORDER = ['MTLD', 'LD', 'LS', 'MDD', 'CS', 'LC', 'CoH']
# OUTPUT_FOLDER = "/content/drive/MyDrive/Tesi/Risultati" # Or whichever folder was set earlier

def run_benchmark_session(row_index, df, output_file, model_id, max_k=10):
    """Run the Writer/Critic loop on one dataset row and log every iteration.

    Iteration 0 asks the Writer to rewrite the simple text towards the target
    profile; every later iteration asks it to apply the Critic's latest
    action plan to its previous output. After each rewrite the metrics are
    recomputed, the Critic is queried, and one JSON line is appended to
    ``output_file``. The loop ends when the Critic reports that the
    objectives are satisfied, or after ``max_k`` refinement rounds.

    Args:
        row_index: Positional index of the row in ``df``.
        df: DataFrame providing the ``Simple`` and ``Complex`` texts, the
            ``Complex_word_count`` column and one ``Complex_<metric>`` column
            per entry of ``METRICS_ORDER``.
        output_file: Writable text handle that receives the JSONL trace.
        model_id: Model identifier, used here only to build the ``run_id``.
        max_k: Maximum number of refinement rounds after the first rewrite.

    Returns:
        None. Results are reported through ``output_file`` and stdout.
    """
    row = df.iloc[row_index]
    source_text = row['Simple']
    complex_ref_text = row['Complex']
    target_map = {m: float(row[f'Complex_{m}']) for m in METRICS_ORDER}
    target_profile = [round(target_map[m], 3) for m in METRICS_ORDER]
    # Named version of the target profile, so the Critic can tell which value is which.
    target_profile_named = dict(zip(METRICS_ORDER, target_profile))

    ref_word_count = int(row['Complex_word_count'])
    min_w, max_w = int(ref_word_count * 0.8), int(ref_word_count * 1.2)
    objectives = f"Word count in [{min_w}, {max_w}]."

    run_id = f"exp_{model_id.replace(':', '_')}_row_{row_index}"
    current_text = source_text
    last_critic_json = {}

    for k in range(max_k + 1):
        print(f"       Iterazione {k}...", end=" ")

        # PHASE 1: WRITER AGENT
        if k == 0:
            instruction = (f"[TASK]: Rewrite [SOURCE TEXT] to match profile {target_profile}.\n"
                           f"[SOURCE TEXT]: {source_text}\n"
                           f"[OBJECTIVES]: {objectives}\n"
                           f"[OUTPUT FORMAT]: Return only the rewritten text.")
        else:
            plan_to_pass = json.dumps(last_critic_json, indent=2)
            print(f"\n      [DEBUG WRITER] Action Plan inviato:\n{plan_to_pass}")
            instruction = (f"Rewrite [PREVIOUS TEXT] applying ONLY the 'action plan' in [ACTION PLAN].\n"
                           f"[ACTION PLAN]: {plan_to_pass}\n"
                           f"[PREVIOUS TEXT]: {current_text}\n"
                           f"STRICT RULE: Do not truncate. Return FULL text.")

        writer_res = writer_agent.run(instruction)
        raw_output = writer_res.content
        if isinstance(raw_output, str):
            rewritten_text = raw_output.strip()
        else:
            # A failed generation may carry no text; carry the previous text
            # forward instead of aborting the whole benchmark on .strip().
            print("       WARNING: Nessun testo restituito dal Writer; si mantiene il testo precedente.")
            rewritten_text = current_text

        if k > 0 and rewritten_text == current_text:
            print("       WARNING: Testo identico rilevato.")

        # PHASE 2: SCORING
        rewritten_metrics = compute_metrics(rewritten_text)
        # float() keeps the values plain Python floats, so they print as bare
        # numbers in the prompt regardless of the numeric type returned.
        rewritten_profile = [round(float(rewritten_metrics[m]), 3) for m in METRICS_ORDER]
        rewritten_profile_named = dict(zip(METRICS_ORDER, rewritten_profile))
        diag_text = get_diagnostics(rewritten_metrics, target_map, rewritten_text, complex_ref_text)

        # PHASE 3: CRITIC AGENT
        # The prompt now carries every section the task line refers to; before,
        # it named a target profile that was never included in the message.
        critic_instruction = (f"[TASK]: Review [CURRENT] against [TARGET COMPLEXITY PROFILE].\n"
                              f"[SOURCE TEXT]: {source_text}\n"
                              f"[CURRENT]: {rewritten_text}\n"
                              f"[TEXT COMPLEXITY PROFILE]: {rewritten_profile_named}\n"
                              f"[TARGET COMPLEXITY PROFILE]: {target_profile_named}\n"
                              f"[OBJECTIVES]: {objectives}\n"
                              f"[DIAGNOSTICS]: {diag_text}")

        try:
            critic_res = critic_agent.run(critic_instruction, response_model=CriticResponse)
            print(f"      [DEBUG CRITIC] RAW: {str(critic_res.content)[:80]}...")

            if isinstance(critic_res.content, CriticResponse):
                critic_output_obj = critic_res.content
            else:
                # The framework returned raw text: strip Markdown code fences and parse it.
                clean_json = re.sub(r'```json\s*|```', '', str(critic_res.content)).strip()
                critic_output_obj = CriticResponse(**json.loads(clean_json))

            last_critic_json = critic_output_obj.model_dump(exclude_none=True)
            print(f"      [DEBUG CRITIC] Action Plan generato con successo.")
        except Exception as e:
            # Deliberately broad: any failure of the model call or of the
            # parsing is logged and replaced by a generic plan, so one bad
            # Critic answer does not end the session.
            print(f"      ❌ [DEBUG CRITIC] ERRORE: {str(e)}")
            # The fallback carries every field of the ActionItem schema so
            # trace consumers always see the same structure.
            last_critic_json = {
                "status": "revision required",
                "action_plan": [{
                    "id": 1,
                    "type": "mixed",
                    "target_metrics": list(diag_text["below_target"]),
                    "location": "all",
                    "instruction": "Increase complexity.",
                }],
            }

        # --- LOGGING ---
        log_entry = {
            "run_id": run_id, "iteration": k, "timestamp": datetime.now().isoformat(),
            "source_text": source_text, "human_reference": complex_ref_text,
            "complexification_output": {"rewritten_text": rewritten_text, "rewritten_text_profile": rewritten_profile},
            "critic_output": last_critic_json
        }
        output_file.write(json.dumps(log_entry) + "\n")
        # Flush after every iteration so an interrupted run keeps what was logged.
        output_file.flush()

        current_text = rewritten_text
        status = last_critic_json.get("status", "revision required")
        print(f"      Status: {status}")

        # A bare substring test would also accept "not satisfied"; rule out
        # the negated forms before declaring success.
        normalized_status = str(status).lower()
        negated = "not satisfied" in normalized_status or "unsatisfied" in normalized_status
        if "satisfied" in normalized_status and not negated:
            print("       Successo raggiunto.")
            break

# COMPARATIVE EXECUTION LOOP
# One progress-bar step per (dataset, model) pair.
pbar_global = tqdm(total=len(DATASET_FILES) * len(MODELS_TO_TEST), desc=" Esperimento")

for ds_path in DATASET_FILES:
    df_current = pd.read_csv(ds_path, sep='\t')
    ds_name = ds_path.replace('.csv', '')
    is_heavy = any(x in ds_path.lower() for x in ["swipe", "vikidia"])

    # DIRECT COMPARISON: the same rows for every model.
    # The range is capped at the dataset size so a short file cannot trigger
    # an out-of-bounds row lookup.
    rows_wanted = 3 if is_heavy else 12
    target_range = range(0, min(rows_wanted, len(df_current)))

    for model_id in MODELS_TO_TEST:
        print(f"\n MODELLO: {model_id} | DATASET: {ds_name}")

        # Generation budget and context window depend on how heavy the dataset is.
        target_predict = 12000 if is_heavy else 2500
        target_ctx = 16384 if is_heavy else 8192
        new_llm = Ollama(id=model_id, options={"temperature": 0.01, "num_predict": target_predict, "num_ctx": target_ctx})
        writer_agent.model = new_llm
        critic_agent.model = new_llm

        output_filename = os.path.join(OUTPUT_FOLDER, f"trace_{ds_name}_{model_id.replace(':', '_')}.jsonl")

        # Checkpoint: collect the row indices that already appear in the trace.
        # NOTE(review): a row counts as done as soon as one of its iterations
        # is logged, so a row interrupted midway is skipped on resume —
        # confirm this is the intended behaviour.
        done_indices = set()
        if os.path.exists(output_filename):
            with open(output_filename, "r", encoding="utf-8") as f_check:
                for line in f_check:
                    try:
                        data = json.loads(line)
                        done_indices.add(int(data['run_id'].split('_')[-1]))
                    except (ValueError, KeyError, TypeError, AttributeError):
                        # Malformed or foreign line: ignore it. Only parsing
                        # errors are caught, so a manual interrupt still
                        # stops the run (json.JSONDecodeError is a ValueError).
                        continue

        with open(output_filename, "a", encoding="utf-8") as f_trace:
            for i in target_range:
                if i in done_indices:
                    print(f"   Riga {i} già completata. Salto.")
                    continue
                print(f"   Elaborazione riga {i}...")
                run_benchmark_session(i, df_current, f_trace, model_id, max_k=MAX_ITERATIONS)

        pbar_global.update(1)

# Release the progress bar once every (dataset, model) pair has been processed.
pbar_global.close()

print(f"\n TUTTI I TEST COMPLETATI. Risultati pronti in: {OUTPUT_FOLDER}")