In [None]:
import pandas as pd
from tqdm import tqdm
from utils import (
    mixed_language_word_seg,
    random_translate,
    clean_mixed_language_sentence,
    compare_sentences,
    compute_token_nll,
    compute_word_nll,
    calculate_sentence_perplexity,
)

# Script purpose: for each of the first `max_sentences` sentences in the input
# CSV, build a segmented/cleaned "original" sentence, generate
# `n_translations` randomly translated variants, ask the comparison helper for
# a preference between each variant and the original, compute NLL/perplexity
# scores for both, and write one row per (original, variant) pair to a CSV.

# Parameters
input_csv = "data/ASDEND_filtered.csv"
output_csv = "processed_sentences.csv"
output_data = []
n_translations = 5  # Number of translated sentences to generate for each original sentence
max_sentences = 10  # Only the first `max_sentences` rows of the CSV are processed
# Second positional argument forwarded to random_translate.
# NOTE(review): presumably the number of words to translate -- confirm in utils.
translate_arg = 3

# Load the CSV file
df = pd.read_csv(input_csv)

# Process each sentence; tqdm shows progress because every iteration runs
# several model calls and can be slow.
for sentence in tqdm(df["sentence"][:max_sentences], desc="Sentences"):
    # Segment once per sentence: the result does not depend on the inner loop,
    # so recomputing it for every translation was wasted work.
    # Assumes mixed_language_word_seg returns an iterable of word strings
    # (it is fed to " ".join below) -- TODO confirm.
    segmented_words = list(mixed_language_word_seg(sentence))
    original_sentence = " ".join(segmented_words)
    cleaned_original = clean_mixed_language_sentence(original_sentence)

    # Compute properties of the original sentence (shared by all of its variants)
    original_token_nll = compute_token_nll(cleaned_original)
    original_word_nll = compute_word_nll(cleaned_original, mixed_language_word_seg)
    original_perplexity = calculate_sentence_perplexity(cleaned_original)

    # Generate and process n translated sentences
    for _ in range(n_translations):
        # Hand random_translate a fresh copy so that, if it modifies its input
        # in place, later iterations still start from the untouched words.
        translated_sentence = " ".join(
            random_translate(list(segmented_words), translate_arg)
        )
        cleaned_translated = clean_mixed_language_sentence(translated_sentence)

        # Compare sentences using Llama (translated variant is passed first)
        llama_choice = compare_sentences(cleaned_translated, cleaned_original)

        # Compute properties of the translated sentence
        translated_token_nll = compute_token_nll(cleaned_translated)
        translated_word_nll = compute_word_nll(cleaned_translated, mixed_language_word_seg)
        translated_perplexity = calculate_sentence_perplexity(cleaned_translated)

        # Store results in the output list
        output_data.append({
            "original": cleaned_original,
            "transformed": cleaned_translated,
            "llama_preference": llama_choice,
            "original_word_nll": original_word_nll,
            "original_token_nll": original_token_nll,
            "original_perplexity": original_perplexity,
            "transformed_word_nll": translated_word_nll,
            "transformed_token_nll": translated_token_nll,
            "transformed_perplexity": translated_perplexity,
        })

# Convert the output to a DataFrame and save to CSV
output_df = pd.DataFrame(output_data)
output_df.to_csv(output_csv, index=False)

print(f"Processing complete. Results saved to {output_csv}.")


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id

Processing complete. Results saved to processed_sentences.csv.
