In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# Make sure the NLTK 'punkt' tokenizer data is available
nltk.download('punkt')

def compute_statistics(file_path, column_name):
    """Print token statistics for one text column of a semicolon-separated CSV.

    The file is read with pandas (lines that fail to parse are skipped),
    missing values in the column are replaced by empty strings, and rows
    whose text is blank after stripping whitespace are discarded. Every
    remaining text is split with ``word_tokenize`` and a short report is
    printed.

    Args:
        file_path: Path of the CSV file to read (``;`` is the separator).
        column_name: Name of the column that holds the texts.

    Returns:
        None. The report is written to standard output.
    """
    frame = pd.read_csv(file_path, sep=";", on_bad_lines='skip', dtype={column_name: str})

    # Normalise the column to plain strings, then keep only non-blank texts.
    captions = frame[column_name].fillna("").astype(str)
    captions = captions[captions.str.strip() != ""]
    num_data_points = len(captions)

    # Single pass: accumulate the token total and the vocabulary together.
    num_tokens = 0
    vocabulary = set()
    for caption in captions:
        tokens = word_tokenize(caption)
        num_tokens += len(tokens)
        vocabulary.update(tokens)

    # Guard against an empty selection to avoid dividing by zero.
    if num_data_points > 0:
        avg_tokens_per_caption = num_tokens / num_data_points
    else:
        avg_tokens_per_caption = 0

    report_lines = [
        f"Statistics for {file_path} ({column_name} column):",
        f"Number of data points: {num_data_points}",
        f"Total number of tokens: {num_tokens}",
        f"Average number of tokens per caption: {avg_tokens_per_caption:.2f}",
        f"Number of unique words: {len(vocabulary)}",
    ]
    for line in report_lines:
        print(line)
    print("\n")

# File paths (change these to the correct locations)
goldcorpus_path = "goldcorpus_comparisons.csv"
europarl_path = "europarl_unito_500righe.csv"


[nltk_data] Downloading package punkt to /home/guest/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Disabled run for the gold corpus ("English" column); uncomment the next line to execute it.
#compute_statistics(goldcorpus_path, "English")

In [3]:
# Print the token statistics for the "Inglese" column of the Europarl CSV.
compute_statistics(europarl_path, "Inglese")


Statistics for europarl_unito_500righe.csv (Inglese column):
Number of data points: 499
Total number of tokens: 15356
Average number of tokens per caption: 30.77
Number of unique words: 2481




In [5]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from datasets import load_dataset

# Make sure the NLTK 'punkt' tokenizer data is available
nltk.download('punkt')

def compute_statistics_from_dataset(dataset_name, split, column_name, start, end):
    """Print token statistics for a row range of a streamed dataset.

    The dataset is opened with ``load_dataset(..., streaming=True)`` and only
    the rows with index ``start <= i < end`` are read. Rows whose value in
    ``column_name`` is ``None`` or blank after stripping whitespace are
    skipped, mirroring the empty-row filtering done by ``compute_statistics``;
    as a consequence the reported number of data points can be smaller than
    ``end - start``. Each remaining text is split with ``word_tokenize`` and a
    short report is printed.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        split: Split name passed to ``load_dataset``.
        column_name: Key of the text field in each streamed row.
        start: Index of the first row to include (inclusive).
        end: Index at which to stop (exclusive).

    Returns:
        None. The report is written to standard output.
    """
    # Local import so this block works without touching the notebook's import cell.
    from itertools import islice

    # Stream the dataset so that only the requested rows are read.
    dataset = load_dataset(dataset_name, split=split, streaming=True)

    # islice stops right after row ``end - 1`` instead of pulling one extra row
    # from the stream. It rejects negative bounds, so clamp them to 0, which
    # matches what the index comparison ``start <= i < end`` would select.
    selected_rows = islice(dataset, max(start, 0), max(end, 0))

    texts = []
    for row in selected_rows:
        value = row[column_name]
        # A missing value would make word_tokenize raise; skip it.
        if value is None:
            continue
        text = str(value)
        # Blank texts carry no tokens and would only inflate the row count.
        if text.strip() == "":
            continue
        texts.append(text)

    # Number of usable data points
    num_data_points = len(texts)

    # Tokenization and counts
    tokenized_texts = [word_tokenize(text) for text in texts]
    num_tokens = sum(len(tokens) for tokens in tokenized_texts)
    avg_tokens_per_caption = num_tokens / num_data_points if num_data_points > 0 else 0
    unique_words = {word for tokens in tokenized_texts for word in tokens}

    # Print the statistics
    print(f"Statistics for {dataset_name} ({column_name} column, rows {start}-{end}):")
    print(f"Number of data points: {num_data_points}")
    print(f"Total number of tokens: {num_tokens}")
    print(f"Average number of tokens per caption: {avg_tokens_per_caption:.2f}")
    print(f"Number of unique words: {len(unique_words)}")
    print("\n")

# Compute statistics for rows 7000-8000 of the "caption" column of laion/laion2B-en.
# NOTE(review): the original comment named the dataset "LAION-2B-EN-RESEARCHSAFE",
# but the identifier loaded here is "laion/laion2B-en" — confirm which one is intended.
compute_statistics_from_dataset("laion/laion2B-en", "train", "caption", 7000, 8000)


[nltk_data] Downloading package punkt to /home/guest/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Resolving data files:   0%|          | 0/128 [00:00<?, ?it/s]

Statistics for laion/laion2B-en (caption column, rows 7000-8000):
Number of data points: 1000
Total number of tokens: 11885
Average number of tokens per caption: 11.88
Number of unique words: 5999


