In [1]:
%pip install pandas ufal.udpipe scikit-learn seaborn matplotlib numpy tqdm

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import os
import re 
from ufal.udpipe import Model, Pipeline, ProcessingError 
import sys
import csv 
import tqdm
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import math
import numpy as np

Funzioni


In [3]:

# Function to evaluate predictions
def evaluate_predictions(predictions, gold_data):
    correct_lemmas = 0
    correct_pos = 0
    total = len(gold_data)
    wrong_lemmas = []
    wrong_postag = []

    for pred, gold in zip(predictions, gold_data):
        # Compare lemma
        if pred['LEMMA'] == gold['@lemma']:
            correct_lemmas += 1
        else:
            if gold['@lemma']!= 'punc1':
                wrong_lemmas.append([gold['@form'],gold['@lemma'],pred['LEMMA']])    
        # Compare POS tag
        if pred['XPOS'] == gold['@postag']:
            correct_pos += 1
        else:
            wrong_postag.append([gold['@form'],gold['@postag'],pred['XPOS']])    

    return total, correct_lemmas, correct_pos, pd.DataFrame(wrong_lemmas, columns = ['Form','Gold', 'Pred']), pd.DataFrame(wrong_postag, columns = ['Form','Gold', 'Pred']) 


# Function to evaluate predictions
def evaluate_predictions_per_position(predictions, gold_data):
    correct_lemmas = 0
    pos_accuracies = {i: [] for i in range(9)}  # Track accuracy for each position

    for pred, gold in zip(predictions, gold_data):
        # Compare lemma
        if pred['LEMMA'] == gold['@lemma']:
            correct_lemmas += 1

        # Compare POS tags by each character position, ensuring they are strings and of length 9
        gold_pos = str(gold['@postag'])
        pred_pos = str(pred['XPOS'])
        if type(gold_pos)==str:
          if len(gold_pos) == 9 and len(pred_pos) == 9:
              for i in range(9):
                  pos_accuracies[i].append(1 if pred_pos[i] == gold_pos[i] else 0)

    total = len(gold_data)
    return total, correct_lemmas, pos_accuracies

In [4]:
def convert_conllu_to_blank(input_path, output_path):
    """
    Converte un file .conllu annotato in una versione "vuota" per UDPipe, 
    mantenendo solo le colonne ID e FORM e salvando il risultato.
    
    Args:
        input_path (str): percorso del file .conllu di input.
        output_path (str): percorso del file .conllu da scrivere.
    """
    with open(input_path, 'r', encoding='utf-8') as infile:
        lines = infile.readlines()

    blank_lines = []
    for line in lines:
        line = line.rstrip('\n')
        if line.startswith('#') or line.strip() == "":
            blank_lines.append(line)
        else:
            cols = line.split('\t')
            if len(cols) >= 2:
                blank_line = f"{cols[0]}\t{cols[1]}\t" + "\t".join(["_"] * 8)
                blank_lines.append(blank_line)

    with open(output_path, 'w', encoding='utf-8') as outfile:
        outfile.write('\n'.join(blank_lines) + '\n')


Preprocessing

UDPipe

In [5]:
#Scaricare il modello linguistico: Da runnare una sola volta, LO DEVO RIRUNNARE PER SCARICARE PROIEL
#!curl -k https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3131/ancient_greek-proiel-ud-2.5-191206.udpipe -O ancient_greek-proiel-ud-2.5-191206.udpipe


Controllo importante! Dopo aver usato UDPipe, controllare che la lunghezza del dataframe finale corrisponda.

In [6]:
MODEL_PATHS = {
    'perseus': {
        'file': 'ancient_greek-perseus-ud-2.5-191206.udpipe',
        'url': 'https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3131/ancient_greek-perseus-ud-2.5-191206.udpipe'
    },
    'proiel': {
        'file': 'ancient_greek-proiel-ud-2.5-191206.udpipe',
        'url': 'https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3131/ancient_greek-proiel-ud-2.5-191206.udpipe'
    }
}

In [7]:
#load the model for ancient greek
#model = Model.load(MODEL_PATHS['perseus']['file'])
model = Model.load(MODEL_PATHS['proiel']['file'])

In [8]:
if not model:                                           #dice solo se il modello è stato caricato
    sys.stderr.write("Cannot load model from file")
    sys.exit(1)
sys.stderr.write('done\n')

done


5

In [9]:
# Create a pipeline
pipeline = Pipeline(model, "conllu", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")      #sto passando il modello, la modalità horizontal, le pipeline di lemma e pos, output in connllu

In [10]:

import glob
from conllu import parse

input_dir = "file udpipe input output con test stanza/input"        #connllu con solo frase e form ma colonne vuote, che il modello deve riempire
output_file = "file udpipe input output con test stanza/output/output_stanzatest_udpipe_proiel.conllu"            #Connllu in cui ha fatto lemmatizzazione e pos il modello 


for input_file in tqdm.tqdm(glob.glob(os.path.join(input_dir, "*.conllu"))):
    filename = os.path.basename(input_file)

    try: 
        with open(input_file, "r", encoding="utf-8") as f:
            conllu_text = f.read()

        sentences = parse(conllu_text)

        processed_sentences = []
        for i, sentence in enumerate(sentences):
            conllu_str = sentence.serialize()
            result = pipeline.process(conllu_str)

            if not result.strip(): #mancano degli id in output  no anomalies 
                print(f"⚠️ Empty output for sentence {i+1} in file {filename}")
                print(sentence)
            else:
                processed_sentences.append(result)

        final_output = "\n".join(processed_sentences)

        with open(output_file, "w", encoding="utf-8") as f:
            f.write(final_output)

    except Exception as e:
        print(f"❌ Failed on file: {filename}")
        print(f"Error: {e}")

100%|██████████| 1/1 [01:11<00:00, 71.12s/it]


Performance

In [11]:
def conllu_to_csv(input_file, output_file):
    # Assicurati che la directory di output esista
    output_dir = os.path.dirname(output_file)
    if output_dir: # Crea solo se non è vuoto
        os.makedirs(output_dir, exist_ok=True)

    with open(input_file, "r", encoding="utf-8") as conllu_file, open(output_file, "w", encoding="utf-8", newline="") as csv_file:
        writer = csv.writer(csv_file)
        
        # Scrive l'intestazione del CSV (adatta le colonne secondo il tuo caso)
        writer.writerow(["ID", "FORM", "LEMMA", "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"])
        
        for line in conllu_file:
            line = line.strip()
            if not line or line.startswith("#"):
                # Ignora righe vuote e commenti
                continue
            # Divide la riga in colonne basandosi sulla tabulazione
            columns = line.split("\t")
            writer.writerow(columns)

In [12]:
#convert conllu to csv 
#Conversione degli output UDPipe conllu to csv

conllu_to_csv('file udpipe input output con test stanza/output/output_stanzatest_udpipe_proiel.conllu','file udpipe input output con test stanza/output/output_stanzatest_udpipe_proiel.csv')

In [13]:
import pandas as pd

# Classe per il calcolo delle metriche
class Score:
    def __init__(self, correct, gold_total, system_total, aligned_total=None):
        self.correct = correct
        self.gold_total = gold_total
        self.system_total = system_total
        self.aligned_total = aligned_total or min(gold_total, system_total)
        self.precision = correct / system_total if system_total else 0.0
        self.recall = correct / gold_total if gold_total else 0.0
        self.f1 = 2 * correct / (system_total + gold_total) if (system_total + gold_total) else 0.0
        self.aligned_accuracy = correct / self.aligned_total if self.aligned_total else 0.0

    def as_dict(self):
        return {
            'correct': self.correct,
            'gold_total': self.gold_total,
            'system_total': self.system_total,
            'precision': self.precision,
            'recall': self.recall,
            'f1': self.f1,
            'aligned_accuracy': self.aligned_accuracy
        }

# Funzione di valutazione
def evaluate_predictions(predictions, gold_data):
    correct_lemmas = 0
    correct_pos = 0
    total = len(gold_data)
    wrong_lemmas = []
    wrong_postag = []

    for pred, gold in zip(predictions, gold_data):
        gold_lemma = gold['@lemma']
        gold_pos = gold['@postag']
        pred_lemma = pred['LEMMA']
        pred_pos = pred['XPOS']

        if pred_lemma == gold_lemma:
            correct_lemmas += 1
        else:
            if gold_lemma != 'punc1':
                wrong_lemmas.append([gold['@form'], gold_lemma, pred_lemma])

        if pred_pos == gold_pos:
            correct_pos += 1
        else:
            wrong_postag.append([gold['@form'], gold_pos, pred_pos])

    # Crea oggetti Score per lemma e POS
    lemma_score = Score(correct_lemmas, total, total)
    pos_score = Score(correct_pos, total, total)

    # Restituisce le metriche e gli errori come DataFrame
    return {
        'total': total,
        'lemma_score': lemma_score,
        'pos_score': pos_score,
        'wrong_lemmas': pd.DataFrame(wrong_lemmas, columns=['Form', 'Gold', 'Pred']),
        'wrong_postag': pd.DataFrame(wrong_postag, columns=['Form', 'Gold', 'Pred'])
    }
#ACCURACY: noi possiamo fare solo questa, non possiamo fare le altre metriche, perchè non distinguiamo tra classi. non abbiamo positivi e negativi!


In [14]:

def estrai_sent_id(file_path):
    """
    Estrae tutti i valori di `sent_id` da un file CoNLL-U.

    Parametri:
        file_path (str): Percorso al file CoNLL-U.

    Ritorna:
        list[str]: Lista di stringhe contenenti i sent_id trovati.
    """
    sent_ids = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line.startswith('# sent_id ='):
                sent_id = line[len('# sent_id ='):].strip()
                sent_ids.append(sent_id)
    return sent_ids

file = "C:/Users/lcolo/Desktop/TESI/GRCGorman/dati/train_dev_test/grc_gorman-ud-test.conllu"
lista_sent_id = estrai_sent_id(file)
print(lista_sent_id)
from pathlib import Path
import pandas as pd

df_list = []

for sent_id in lista_sent_id:
    try:
        dir_opera, indice = sent_id.split('#')
        csv_path = Path('output_gorman_no_anomalies_aggiornati_per_lemmi_e_postag') / dir_opera / f"{indice}.csv"
        csv_path = csv_path.resolve()

        print(f"🔍 Controllo file: {csv_path}")

        if not csv_path.exists():
            print(f"⚠️ File non trovato: {csv_path}")
            continue

        df = pd.read_csv(csv_path, sep=";")  # ← qui la correzione
        df_list.append(df)

    except Exception as e:
        print(f"❌ Errore con sent_id {sent_id}: {e}")

if df_list:
    df_concat = pd.concat(df_list, ignore_index=True)
    print(f"✅ CSV concatenato con {len(df_concat)} righe.")
else:
    print("❌ Nessun file CSV valido trovato.")


['aeschines-1-1-50-bu1.xml#164', 'aeschines-1-1-50-bu1.xml#165', 'aeschines-1-1-50-bu1.xml#166', 'aeschines-1-1-50-bu1.xml#167', 'aeschines-1-1-50-bu1.xml#168', 'aeschines-1-1-50-bu1.xml#169', 'aeschines-1-1-50-bu1.xml#170', 'aeschines-1-1-50-bu1.xml#171', 'aeschines-1-1-50-bu1.xml#172', 'aeschines-1-1-50-bu1.xml#173', 'aeschines-1-1-50-bu1.xml#174', 'aeschines-1-1-50-bu1.xml#175', 'aeschines-1-1-50-bu1.xml#176', 'aeschines-1-1-50-bu1.xml#177', 'aeschines-1-1-50-bu1.xml#178', 'aeschines-1-1-50-bu1.xml#179', 'aeschines-1-1-50-bu1.xml#180', 'aeschines-1-1-50-bu1.xml#181', 'aeschines-1-1-50-bu1.xml#182', 'aeschines-1-1-50-bu1.xml#183', 'aeschines-1-101-150-bu1.xml#146', 'aeschines-1-101-150-bu1.xml#147', 'aeschines-1-101-150-bu1.xml#148', 'aeschines-1-101-150-bu1.xml#149', 'aeschines-1-101-150-bu1.xml#150', 'aeschines-1-101-150-bu1.xml#151', 'aeschines-1-101-150-bu1.xml#152', 'aeschines-1-101-150-bu1.xml#153', 'aeschines-1-101-150-bu1.xml#154', 'aeschines-1-101-150-bu1.xml#155', 'aeschine

In [15]:


output_udpipe = pd.read_csv('file udpipe input output con test stanza/output/output_stanzatest_udpipe_proiel.csv')
opera = df_concat
predictions = output_udpipe[['FORM', 'LEMMA', 'XPOS']].to_dict('records')       
gold_data = opera [['@form', '@lemma', '@postag']].to_dict('records')
results = evaluate_predictions(predictions, gold_data)

print("Lemma Metrics:", results['lemma_score'].as_dict())
print("POS Metrics:", results['pos_score'].as_dict())

# Per visualizzare gli errori
print(results['wrong_lemmas'].head())
print(results['wrong_postag'].head())

Lemma Metrics: {'correct': 37744, 'gold_total': 53708, 'system_total': 53708, 'precision': 0.7027630892976837, 'recall': 0.7027630892976837, 'f1': 0.7027630892976837, 'aligned_accuracy': 0.7027630892976837}
POS Metrics: {'correct': 0, 'gold_total': 53708, 'system_total': 53708, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'aligned_accuracy': 0.0}
       Form   Gold   Pred
0        δ᾽     δέ     δ᾽
1       ἄρα    ἄρα    ἆρα
2         ,   punc      ,
3  τράπηται  τρέπω  τράπω
4         ,   punc      ,
       Form       Gold Pred
0       ἐὰν  c--------   G-
1        δ᾽  d--------   V-
2       ἄρα  d--------   Df
3  ὑπακούσῃ  v3sasa---   V-
4       μέν  d--------   Df


In [16]:
#correct sono quanti lemmi in questo caso udpipe ha azzeccato uguali a stanza rispetto al totale.
#gold total è il numero totale di token, di form di stanza del test. system total è uguale ma sono quelle di udpipe. 
#aligned acuracy è semplicemente l'accuracy. 

In [17]:
#questo codice è servito per creare un conllu vuoto con le frasi presenti nel test di Stanza. l'esigenza è dovuta al fatto che il conllu fornito da Giovanni non era coerente con i miei file. 
'''from pathlib import Path
import pandas as pd

output_lines = []

for sent_id in lista_sent_id:
    try:
        dir_opera, indice = sent_id.split('#')
        csv_path = Path('output_gorman_no_anomalies_aggiornati_per_lemmi_e_postag') / dir_opera / f"{indice}.csv"
        csv_path = csv_path.resolve()

        print(f"🔍 Controllo file: {csv_path}")
        if not csv_path.exists():
            print(f"⚠️ File non trovato: {csv_path}")
            continue

        df = pd.read_csv(csv_path, sep=';')
        df = df.reset_index(drop=True)
        df["@id"] = range(1, len(df) + 1)

        # Header
        output_lines.append(f"# sent_id = {sent_id}")
        text_line = " ".join(df["@form"].astype(str))
        output_lines.append(f"# text = {text_line}")

        # Riga per ogni token (solo form, gli altri 9 campi sono "_")
        for i, row in df.iterrows():
            token_line = f"{row['@id']}\t{row['@form']}\t_\t_\t_\t_\t_\t_\t_\t_"
            output_lines.append(token_line)

        output_lines.append("")  # Riga vuota tra frasi

    except Exception as e:
        print(f"❌ Errore con {sent_id}: {e}")

# Salva il file
with open("frasi_udpipe_vuote.conllu", "w", encoding="utf-8") as f:
    f.write("\n".join(output_lines))

print("✅ File conllu (vuoto) generato: frasi_udpipe_vuote.conllu")'''


'from pathlib import Path\nimport pandas as pd\n\noutput_lines = []\n\nfor sent_id in lista_sent_id:\n    try:\n        dir_opera, indice = sent_id.split(\'#\')\n        csv_path = Path(\'output_gorman_no_anomalies_aggiornati_per_lemmi_e_postag\') / dir_opera / f"{indice}.csv"\n        csv_path = csv_path.resolve()\n\n        print(f"🔍 Controllo file: {csv_path}")\n        if not csv_path.exists():\n            print(f"⚠️ File non trovato: {csv_path}")\n            continue\n\n        df = pd.read_csv(csv_path, sep=\';\')\n        df = df.reset_index(drop=True)\n        df["@id"] = range(1, len(df) + 1)\n\n        # Header\n        output_lines.append(f"# sent_id = {sent_id}")\n        text_line = " ".join(df["@form"].astype(str))\n        output_lines.append(f"# text = {text_line}")\n\n        # Riga per ogni token (solo form, gli altri 9 campi sono "_")\n        for i, row in df.iterrows():\n            token_line = f"{row[\'@id\']}\t{row[\'@form\']}\t_\t_\t_\t_\t_\t_\t_\t_"\n       