Analisi preliminare dei dati ottenuti da data_treatment

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import csv

# File configuration: the CSV paths that run_analysis() iterates over.
file_names = [
    "final_complexity_vikidia.csv",
    "final_complexity_swipe.csv",
    "final_complexity_ose_adv_int.csv",
    "final_complexity_ose_adv_ele.csv",
]

# Metric name labels.
metrics = ["MTLD", "LD", "LS", "MDD", "CS", "LC", "CoH"]

# Create every output directory up front (a no-op when it already exists).
for _output_dir in (
    "output/discarded_ids",
    "output/plots",
    "output/cleaned_data",
):
    os.makedirs(_output_dir, exist_ok=True)

def _raise_csv_field_size_limit():
    """Set the csv module's field size limit to the largest value it accepts.

    Starts from ``sys.maxsize`` and halves the candidate each time
    ``csv.field_size_limit`` rejects it with ``OverflowError``.
    """
    limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(limit)
            return
        except OverflowError:
            # Integer halving keeps the candidate exact (no float round-trip).
            limit //= 2


def run_analysis():
    """Check MDD dominance in every configured file and print a summary table.

    For each path in the module-level ``file_names`` that exists and parses as
    a tab-separated table, the function:

    * adds whitespace-based word counts and token counts (full stops treated
      as separators) for the ``Simple`` and ``Complex`` text columns;
    * keeps the rows where ``Simple_MDD < Complex_MDD`` and writes them, with
      their original index, to ``output/cleaned_data/CLEANED_<file>``;
    * writes the index of every other row (including rows whose MDD values
      cannot be compared, e.g. NaN) to
      ``output/discarded_ids/discarded_<file>.txt``;
    * saves a KDE plot of both MDD columns to ``output/plots/dist_<file>.png``.

    Files that are missing, unparsable, empty, or lack a required column are
    reported and skipped instead of aborting the whole run. One summary row
    per processed file is printed at the end. Returns ``None``.
    """
    text_columns = ['Simple', 'Complex']
    required_columns = text_columns + ['Simple_MDD', 'Complex_MDD']
    all_summaries = []

    # Presumably needed because read_csv below uses engine='python' and the
    # text fields can be very long -- TODO confirm.
    _raise_csv_field_size_limit()

    for file_path in file_names:
        if not os.path.exists(file_path):
            print(f"File non trovato, saltato: {file_path}")
            continue

        try:
            df = pd.read_csv(file_path, sep='\t', engine='python')
        except pd.errors.ParserError as e:
            # The parser's own message carries the failing position; do not
            # point the user at a fixed line number.
            print(f"Errore di parsing nel file '{file_path}': {e}. Controlla la riga indicata nel messaggio per virgolette non chiuse o formattazione errata.")
            continue  # skip this file and move on to the next one
        print(f"\n>>> Analizzando: {file_path}")

        missing_columns = [c for c in required_columns if c not in df.columns]
        if missing_columns:
            print(f"Colonne mancanti in '{file_path}': {missing_columns}. File saltato.")
            continue
        if df.empty:
            # Nothing to plot, and the dominance ratio would divide by zero.
            print(f"Nessuna riga in '{file_path}'. File saltato.")
            continue

        # Word and token counts.
        # NOTE(review): a missing text cell becomes the string 'nan' and is
        # counted as one word -- confirm whether that is intended.
        for col in text_columns:
            df[f'{col}_word_count'] = df[col].apply(lambda x: len(str(x).split()))
            # Rough token estimate: full stops act as extra separators.
            df[f'{col}_token_count'] = df[col].apply(lambda x: len(str(x).replace('.', ' ').split()))

        # Dominance check (MDD). Discarded rows are the exact complement of
        # the kept rows, so every input row lands in exactly one output.
        dominance_ok = df['Simple_MDD'] < df['Complex_MDD']
        df_cleaned = df[dominance_ok]
        violations = df[~dominance_ok]

        # Save the discarded ids (the DataFrame index is used as the id).
        txt_name = f"output/discarded_ids/discarded_{file_path.replace('.csv', '.txt')}"
        violations.index.to_series().to_csv(txt_name, index=False, header=False)

        # Cleaned dataset; index=True keeps the original row index in the file.
        df_cleaned.to_csv(f"output/cleaned_data/CLEANED_{file_path}", index=True, sep='\t')

        # MDD distribution plot.
        plt.figure(figsize=(10, 6))
        sns.kdeplot(df['Simple_MDD'], fill=True, label='Simple MDD', color='blue')
        sns.kdeplot(df['Complex_MDD'], fill=True, label='Complex MDD', color='orange')
        plt.title(f'Distribuzione MDD - {file_path}')
        plt.legend()
        plt.savefig(f"output/plots/dist_{file_path.replace('.csv', '.png')}")
        plt.close()

        # Per-file statistics.
        all_summaries.append({
            "File": file_path,
            "Righe Totali": len(df),
            "Scarti (Violazioni)": len(violations),
            "Media Parole (S)": f"{df['Simple_word_count'].mean():.1f}",
            "Media Parole (C)": f"{df['Complex_word_count'].mean():.1f}",
            "Media Token (S)": f"{df['Simple_token_count'].mean():.1f}",
            "Media Token (C)": f"{df['Complex_token_count'].mean():.1f}",
            "Mediana Token (S)": f"{df['Simple_token_count'].median():.1f}",
            "Mediana Token (C)": f"{df['Complex_token_count'].median():.1f}",
            "Token più lunghi (S)": f"{df['Simple_token_count'].max()}",
            "Token più lunghi (C)": f"{df['Complex_token_count'].max()}",
            "Dominanza OK": f"{(1 - len(violations)/len(df))*100:.2f}%"
        })

    # Final summary table.
    summary_df = pd.DataFrame(all_summaries)
    print("\n--- RIEPILOGO STATISTICO FINALE ---")
    print(summary_df.to_string(index=False))

if __name__ == "__main__":
    run_analysis()

Once the pipeline run has finished, load the JSONL trace files it produced here to compute the cosine similarity.

In [None]:
import pandas as pd
import json
import os

# JSONL trace files to combine; each line of a file is one JSON record.
jsonl_files = [
    '/content/trace_CLEANED_final_complexity_ose_adv_ele_falcon_7b.jsonl',
    '/content/trace_CLEANED_final_complexity_ose_adv_ele_llama3.1_8b.jsonl',
    '/content/trace_CLEANED_final_complexity_ose_adv_ele_mistral_7b.jsonl',
    '/content/trace_CLEANED_final_complexity_ose_adv_ele_qwen2.5_7b-instruct-q4_K_M.jsonl',
    '/content/trace_CLEANED_final_complexity_ose_adv_int_falcon_7b.jsonl',
    '/content/trace_CLEANED_final_complexity_ose_adv_int_llama3.1_8b.jsonl',
    '/content/trace_CLEANED_final_complexity_ose_adv_int_mistral_7b.jsonl',
    '/content/trace_CLEANED_final_complexity_ose_adv_int_qwen2.5_7b-instruct-q4_K_M.jsonl'
]

all_data = []

print("Caricamento dei file JSONL...")
for file_path in jsonl_files:
    if not os.path.exists(file_path):
        print(f"  File non trovato e skippato: {file_path}")
        continue

    print(f"  Caricamento: {file_path}")
    # Tag every record with its source file so later cells can group by it.
    source_name = os.path.basename(file_path)
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not records;
                # json.loads would raise on them and abort the whole load.
                continue
            record = json.loads(line)
            record['original_jsonl_file'] = source_name
            all_data.append(record)

# One row per JSONL record, across all files that were found.
df = pd.DataFrame(all_data)

print(f"\nCaricamento completato. Numero totale di record: {len(df)}")
print("Prime 5 righe del DataFrame combinato 'df':")
display(df.head())

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

results = []


def _extract_rewritten_text(raw_output):
    """Return the 'rewritten_text' field of a complexification output.

    *raw_output* may be a dict or a JSON string encoding one. Anything else
    (missing value, invalid JSON, JSON that is not an object) yields ''.
    """
    if isinstance(raw_output, str):
        try:
            raw_output = json.loads(raw_output)
        except json.JSONDecodeError:
            return ""
    if isinstance(raw_output, dict):
        return raw_output.get('rewritten_text', '')
    return ""


# One result per (source file, run_id) pair: grouping on run_id alone would
# merge rows from different files whenever they share a run_id.
for (original_jsonl_file, run_id), group in df.groupby(['original_jsonl_file', 'run_id']):
    # Reset per run so the error path below can never report texts left over
    # from the previous run (or hit an undefined name on the first one).
    source_text = ""
    rewritten_text = ""
    similarity_score = 0.0
    note = 'OK'
    max_iter = group['iteration'].max()

    try:
        # 1. Source text comes from iteration 0.
        source_row = group[group['iteration'] == 0]
        if not source_row.empty:
            source_text = source_row.iloc[0].get('source_text', '')

        # 2. Rewritten text comes from the last available iteration.
        target_row = group[group['iteration'] == max_iter]
        if not target_row.empty:
            rewritten_text = _extract_rewritten_text(
                target_row.iloc[0].get('complexification_output', '')
            )

        # 3. Normalise pandas missing values to empty strings.
        if pd.isna(source_text): source_text = ""
        if pd.isna(rewritten_text): rewritten_text = ""

        if not source_text or not rewritten_text:
            note = f"Testo mancante (Source: {bool(source_text)}, Rewritten: {bool(rewritten_text)})"
        else:
            # 4. TF-IDF cosine similarity, with the vocabulary fitted on this pair only.
            tfidf_vectorizer = TfidfVectorizer()
            tfidf_matrix = tfidf_vectorizer.fit_transform([str(source_text), str(rewritten_text)])
            similarity_score = round(
                cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0], 4
            )

    except Exception as e:
        # Per-run boundary: record the failure in the Note and keep going.
        similarity_score = 0.0
        note = f"Errore: {str(e)}"

    results.append({
        'Run ID': run_id,
        'Original File': original_jsonl_file,
        'Iterations': max_iter,
        'Cosine Similarity': similarity_score,
        'Source Text': source_text,
        'Rewritten Text': rewritten_text,
        'Note': note
    })

# Output
results_df = pd.DataFrame(results)

# Shorten long cell values so the printed table stays readable.
def truncate_text(text, length=100):
    """Return *text* cut to *length* characters, with '...' appended when cut.

    Strings no longer than *length* are returned unchanged. ``None`` and
    float NaN (missing table cells) become ''; any other non-string value is
    converted with ``str`` first instead of raising ``TypeError``.
    """
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    return (text[:length] + '...') if len(text) > length else text

# Work on a copy so results_df keeps the full texts; only the printed table
# shows shortened values.
display_df = results_df.copy()
for text_column in ('Source Text', 'Rewritten Text'):
    display_df[text_column] = display_df[text_column].apply(truncate_text)

print(display_df.to_markdown(index=False))

Salva i risultati

In [None]:

# Destination of the per-run results table (a relative path).
output_detailed_results_csv_path = 'detailed_cosine_similarity_results.csv'

# index=False leaves the DataFrame's row index out of the CSV.
results_df.to_csv(output_detailed_results_csv_path, index=False)

print(f"I risultati dettagliati dell'analisi sono stati salvati con successo in '{output_detailed_results_csv_path}'")

Medie similarità per file

In [None]:
def _file_name_from_run_id(run_id):
    """Build a file name from a run id.

    Takes the text before the first '_row_', removes every 'exp_' occurrence
    from it and appends '.jsonl'.
    """
    stem = run_id.split('_row_')[0]
    return stem.replace('exp_', '') + '.jsonl'


# Derive the grouping key from each 'Run ID'.
results_df['File Name'] = results_df['Run ID'].apply(_file_name_from_run_id)

# Mean cosine similarity per derived file name.
similarity_by_file = results_df.groupby('File Name')['Cosine Similarity']
average_cosine_similarity_per_file = similarity_by_file.mean().reset_index()

print("Similarità Cosine Media per File:")
print(average_cosine_similarity_per_file.to_markdown(index=False))

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Metric labels; position i labels element i of a 'rewritten_text_profile'.
metrics = ['MTLD', 'LD', 'LS', 'MDD', 'CS', 'LC', 'CoH']
results_with_metrics = []


def _parse_complexification_output(raw_output):
    """Return a complexification output as a dict.

    *raw_output* may already be a dict or a JSON string encoding one.
    Missing values, invalid JSON and JSON that is not an object yield {}.
    """
    if isinstance(raw_output, str):
        try:
            raw_output = json.loads(raw_output)
        except json.JSONDecodeError:
            return {}
    return raw_output if isinstance(raw_output, dict) else {}


# Iterate over each unique (source file, run_id) pair so that runs sharing a
# run_id across different files are handled separately.
for (original_jsonl_file, run_id_in_file), group in df.groupby(['original_jsonl_file', 'run_id']):
    # Identifier that is unique across all files.
    unique_run_identifier = f"{original_jsonl_file}_{run_id_in_file}"
    max_iter = group['iteration'].max()
    try:
        # Source data comes from iteration 0, the rewrite from the last
        # iteration available. Each row's output is parsed exactly once.
        source_row = group[group['iteration'] == 0]
        target_row = group[group['iteration'] == max_iter]

        source_text = ""
        source_output = {}
        if not source_row.empty:
            source_text = source_row.iloc[0].get('source_text', '')
            source_output = _parse_complexification_output(
                source_row.iloc[0].get('complexification_output', {})
            )

        target_output = {}
        if not target_row.empty:
            target_output = _parse_complexification_output(
                target_row.iloc[0].get('complexification_output', {})
            )
        rewritten_text = target_output.get('rewritten_text', '')

        # Normalise pandas missing values to empty strings.
        if pd.isna(source_text): source_text = ""
        if pd.isna(rewritten_text): rewritten_text = ""

        # TF-IDF cosine similarity. The Note records why a score stayed at
        # 0.0, so such runs are distinguishable from genuinely dissimilar texts.
        similarity_score = 0.0
        note = 'OK'
        texts_to_vectorize = [str(source_text), str(rewritten_text)]
        has_source = bool(source_text) and bool(texts_to_vectorize[0].strip())
        has_rewritten = bool(rewritten_text) and bool(texts_to_vectorize[1].strip())
        if has_source and has_rewritten:
            try:
                tfidf_matrix = TfidfVectorizer().fit_transform(texts_to_vectorize)
            except ValueError as err:
                # Raised when no vocabulary can be built from the two texts;
                # keep the run (and its metric flags) rather than failing it.
                note = f"Similarità non calcolabile: {err}"
            else:
                similarity_score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        else:
            note = f"Testo mancante (Source: {has_source}, Rewritten: {has_rewritten})"

        # Metric comparison between the two profiles.
        # NOTE(review): the "source" profile is iteration 0's
        # 'rewritten_text_profile' -- confirm that this is the intended baseline.
        source_profile = source_output.get('rewritten_text_profile')
        target_profile = target_output.get('rewritten_text_profile')

        metric_exceeded_flags = {}
        for i, metric_name in enumerate(metrics):
            metric_exceeded_flags[f'{metric_name}_exceeded'] = False

            if source_profile and target_profile and len(source_profile) > i and len(target_profile) > i:
                src_val = source_profile[i]
                tgt_val = target_profile[i]
                # Only numeric pairs are compared; anything else stays False.
                if isinstance(src_val, (int, float)) and isinstance(tgt_val, (int, float)):
                    metric_exceeded_flags[f'{metric_name}_exceeded'] = (tgt_val > src_val)

        result_entry = {
            'Run ID': unique_run_identifier,
            'Original File': original_jsonl_file,
            'Iterations': max_iter,
            'Cosine Similarity': round(similarity_score, 4),
            'Source Text': source_text,  # kept for debugging or later use
            'Rewritten Text': rewritten_text,  # kept for debugging or later use
            'Note': note
        }
        result_entry.update(metric_exceeded_flags)
        results_with_metrics.append(result_entry)

    except Exception as e:
        # Per-run boundary: record the failure and continue with the next run.
        error_entry = {
            'Run ID': unique_run_identifier,
            'Original File': original_jsonl_file,
            'Iterations': max_iter,
            'Cosine Similarity': 0.0,
            'Source Text': '',
            'Rewritten Text': '',
            'Note': f"Errore: {str(e)}"
        }
        for metric_name in metrics:
            error_entry[f'{metric_name}_exceeded'] = False
        results_with_metrics.append(error_entry)

# Turn the collected entries into a DataFrame.
metrics_summary_df = pd.DataFrame(results_with_metrics)

columns_to_average = ['Cosine Similarity'] + [f'{metric}_exceeded' for metric in metrics]

# The mean of a boolean flag column is the share of runs where the flag is True.
final_summary = metrics_summary_df.groupby('Original File')[columns_to_average].mean().reset_index()

print("Similarità Cosine Media e Percentuali di Superamento delle Metriche per File:")

# Format for Markdown display. This replaces the numeric columns of
# final_summary with strings in place.
final_summary['Cosine Similarity'] = final_summary['Cosine Similarity'].apply(lambda x: f'{x:.4f}')

for metric_name in metrics:
    col_name = f'{metric_name}_exceeded'
    final_summary[col_name] = final_summary[col_name].apply(lambda x: f'{x * 100:.2f}%')

print(final_summary.to_markdown(index=False))

In [None]:
def _extremes_record(frame, column, label, value_format):
    """Describe the rows of *frame* holding the largest and smallest *column* value.

    Returns a dict with the 'Original File' of each row and both values
    rendered with *value_format* (a ``str.format`` template).
    """
    top_row = frame.loc[frame[column].idxmax()]
    bottom_row = frame.loc[frame[column].idxmin()]
    return {
        'Metric': label,
        'Best Model': top_row['Original File'],
        'Best Value': value_format.format(top_row[column]),
        'Worst Model': bottom_row['Original File'],
        'Worst Value': value_format.format(bottom_row[column])
    }


# final_summary stores its values as formatted strings; turn them back into
# floats on a copy so they can be compared.
analysis_df = final_summary.copy()
percentage_columns = [name for name in analysis_df.columns if '_exceeded' in name]
for name in percentage_columns:
    analysis_df[name] = analysis_df[name].str.replace('%', '').astype(float)
if 'Cosine Similarity' in analysis_df.columns:
    analysis_df['Cosine Similarity'] = analysis_df['Cosine Similarity'].astype(float)

# Cosine similarity first, then one record per metric exceedance percentage.
analysis_results = [
    _extremes_record(analysis_df, 'Cosine Similarity', 'Cosine Similarity', '{:.4f}')
]

for metric_label in metrics:  # 'metrics' is defined in a previous cell
    flag_column = f'{metric_label}_exceeded'

    # Skip metrics whose column is absent or holds no values at all.
    if flag_column not in analysis_df.columns or analysis_df[flag_column].isnull().all():
        continue

    analysis_results.append(
        _extremes_record(analysis_df, flag_column, metric_label, '{:.2f}%')
    )

analysis_df_summary = pd.DataFrame(analysis_results)

print("\n--- Analisi dei Modelli Migliori e Peggiori per Categoria ---")
print(analysis_df_summary.to_markdown(index=False))

Ultimi Salvataggi

In [None]:
# Destination of the best/worst table (a relative path).
output_csv_path = 'model_analysis_summary.csv'
# index=False leaves the DataFrame's row index out of the CSV.
analysis_df_summary.to_csv(output_csv_path, index=False)

print(f"L'analisi dei modelli è stata salvata con successo in '{output_csv_path}'")

In [None]:
# Destination of the per-file summary (a relative path).
output_summary_csv_path = 'metric_summary_per_file.csv'

# final_summary was formatted in place by the cell that builds it, so its
# values are written as text (e.g. '12.34%'). index=False omits the row index.
final_summary.to_csv(output_summary_csv_path, index=False)

print(f"Il riepilogo delle metriche per file è stato salvato con successo in '{output_summary_csv_path}'")

Cosine Similarity avanzata:

# Task
Perform a semantic similarity analysis on the rewritten texts and human references using a pre-trained SBERT model. Calculate the cosine similarity for each `run_id`, then analyze the distribution of these scores, and summarize the findings to determine if the text complexification process preserved semantic content. The analysis should include calculating average semantic similarity scores across the dataset and per file, and visualizing the distribution of scores with a clear legend.

## Carica Modello Sentence Embedding

### Subtask:
Installa le librerie necessarie (ad es. `sentence-transformers` o `transformers` con un modello SBERT) e carica un modello pre-addestrato per la generazione di embedding di frasi.


**Reasoning**:
The first step is to install the `sentence-transformers` library as requested in the instructions.



In [None]:
import subprocess
import sys

# Run pip through the interpreter executing this code so the package is
# installed into the environment that will import it.
install_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "sentence-transformers"],
    capture_output=True,
    text=True,
)
print(install_result.stdout)

# Report success only when pip actually succeeded.
if install_result.returncode != 0:
    print(install_result.stderr)
    raise RuntimeError("Installazione di 'sentence-transformers' non riuscita.")

print("Libreria 'sentence-transformers' installata con successo.")

**Reasoning**:
The previous step installed the `sentence-transformers` library. The next instruction is to load a pre-trained Sentence-BERT model, specifically 'all-MiniLM-L6-v2', for generating sentence embeddings.



In [None]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained Sentence-BERT model by name; `model` is the object whose
# encode() method produces the text embeddings in the similarity cell.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

print(f"Modello SBERT '{model_name}' caricato con successo.")

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def _final_rewritten_text(run_rows):
    """Return 'rewritten_text' from the highest-iteration row of *run_rows*.

    The row's 'complexification_output' may be a dict or a JSON string.
    '' is returned when there is no such row, the value has another type, or
    the string is not valid JSON.
    """
    last_iteration = run_rows['iteration'].max()
    final_rows = run_rows[run_rows['iteration'] == last_iteration]
    if final_rows.empty:
        return ""

    payload = final_rows.iloc[0].get('complexification_output', '')
    if isinstance(payload, dict):
        return payload.get('rewritten_text', '')
    if not isinstance(payload, str):
        return ""
    try:
        decoded = json.loads(payload)
    except json.JSONDecodeError:
        return ""  # not valid JSON: no rewritten text
    return decoded.get('rewritten_text', '')


semantic_results = []

for (original_jsonl_file, run_id_in_file), group in df.groupby(['original_jsonl_file', 'run_id']):
    unique_run_identifier = f"{original_jsonl_file}_{run_id_in_file}"

    # Start from the "nothing computed" record and fill it in below.
    entry = {
        'Run ID': unique_run_identifier,
        'Original File': original_jsonl_file,
        'Semantic Cosine Similarity': 0.0,
        'Note': 'OK'
    }

    try:
        rewritten_text = _final_rewritten_text(group)

        # The human reference is read from the group's first row.
        human_reference = group.iloc[0].get('human_reference', '')

        # Normalise pandas missing values to empty strings.
        if pd.isna(rewritten_text): rewritten_text = ""
        if pd.isna(human_reference): human_reference = ""

        has_rewritten = bool(rewritten_text.strip())
        has_reference = bool(human_reference.strip())

        if has_rewritten and has_reference:
            # Embed both texts, then compare the two vectors.
            embeddings = model.encode([rewritten_text, human_reference])
            rewritten_vector = embeddings[0].reshape(1, -1)
            reference_vector = embeddings[1].reshape(1, -1)
            score = cosine_similarity(rewritten_vector, reference_vector)[0][0]
            entry['Semantic Cosine Similarity'] = round(score, 4)
        else:
            entry['Note'] = f"Testo mancante (Rewritten: {has_rewritten}, Human Ref: {has_reference})"

    except Exception as e:
        entry['Semantic Cosine Similarity'] = 0.0
        entry['Note'] = f"Errore: {str(e)}"

    semantic_results.append(entry)

semantic_df = pd.DataFrame(semantic_results)

print("Prime 5 righe del DataFrame con la similarità semantica:")
display(semantic_df.head())

# Mean semantic similarity for each original file.
semantic_by_file = semantic_df.groupby('Original File')['Semantic Cosine Similarity']
average_semantic_similarity_per_file = semantic_by_file.mean().reset_index()

print("\nSimilarità Cosine Semantica Media per File:")
print(average_semantic_similarity_per_file.to_markdown(index=False))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Overall distribution of the scores, with a legend identifying the
# histogram and the mean marker.
semantic_scores = semantic_df['Semantic Cosine Similarity']
mean_semantic_score = semantic_scores.mean()

plt.figure(figsize=(12, 7))
sns.histplot(
    semantic_scores,
    kde=True,
    bins=20,
    color='skyblue',
    label='Testo riscritto vs riferimento umano',
)
plt.axvline(
    mean_semantic_score,
    color='red',
    linestyle='--',
    label=f'Media = {mean_semantic_score:.4f}',
)
plt.title('Distribuzione della Similarità Cosine Semantica')
plt.xlabel('Similarità Cosine Semantica')
plt.ylabel('Frequenza')
plt.grid(axis='y', alpha=0.75)
plt.legend()
plt.show()

# Distribution per original file: one box per file, named on the x axis.
plt.figure(figsize=(14, 8))
sns.boxplot(x='Original File', y='Semantic Cosine Similarity', data=semantic_df)
plt.xticks(rotation=45, ha='right')
plt.title('Distribuzione della Similarità Cosine Semantica per File Originale')
plt.xlabel('File Originale')
plt.ylabel('Similarità Cosine Semantica')
plt.tight_layout()
plt.show()

print("Visualizzazione della distribuzione della Similarità Cosine Semantica completata.")

**Reasoning**:
The semantic similarity analysis and its interpretation have been completed. The final step is to save the detailed semantic similarity results to a CSV file for record-keeping and potential future analysis, as the task implies summarizing and saving findings.



In [None]:
# Destination of the per-run semantic similarity table (a relative path).
output_semantic_results_csv_path = 'detailed_semantic_similarity_results.csv'
# index=False leaves the DataFrame's row index out of the CSV.
semantic_df.to_csv(output_semantic_results_csv_path, index=False)

print(f"I risultati dettagliati della similarità semantica sono stati salvati con successo in '{output_semantic_results_csv_path}'")