In [3]:
%pip install nltk

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m83.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nltk
Successfully installed nltk-3.9.1
Note: you may need to restart the kernel to use updated packages.


In [4]:
import random
import pandas as pd
from transformers import pipeline
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Define a dataset generator
def generate_tagalog_dataset(size=100):
    templates = [
        "Ang panahon ngayon ay {}.",
        "Ako ay naglalakad sa {}.",
        "Kapag umuulan, gusto kong {}.",
        "Pumunta ako sa tindahan upang {}.",
        "Ang kanyang boses ay parang {}.",
    ]
    completions = [
        ["maganda", "mainit", "maalinsangan", "maulan", "mahangin"],
        ["park", "tabing-dagat", "kalsada", "gubat", "bundok"],
        ["uminom ng tsaa", "magbasa ng libro", "magpahinga", "manood ng sine", "kumain"],
        ["bumili ng pagkain", "maghanap ng trabaho", "makipag-usap", "maglakad-lakad", "mamalengke"],
        ["musika", "kaluluwa", "huni ng ibon", "tula", "bulong ng hangin"],
    ]
    data = []
    for _ in range(size):
        template_idx = random.randint(0, len(templates) - 1)
        template = templates[template_idx]
        completion_options = completions[template_idx]
        expected_output = random.choice(completion_options)
        input_text = template.format("")
        expected_sentence = template.format(expected_output)
        data.append({"input": input_text, "expected": expected_sentence})
    return pd.DataFrame(data)

# Generate dataset
dataset = generate_tagalog_dataset(size=100)

# Initialize models
models = {
    "dialogpt_tagalog_30": pipeline("text-generation", model="gabtan99/dialogpt-tagalog-medium-30"),
    "dialogpt_tagalog_20": pipeline("text-generation", model="gabtan99/dialogpt-tagalog-medium-20"),
    "dialogpt_tagalog_10": pipeline("text-generation", model="gabtan99/dialogpt-tagalog-medium-10"),
    "gpt2_tagalog": pipeline("text-generation", model="jcblaise/gpt2-tagalog"),
}

# Evaluate models using BLEU score
def evaluate_models_with_bleu(dataset, models):
    results = []
    bleu_scores = {model_name: [] for model_name in models.keys()}
    smoothing_function = SmoothingFunction().method1
    
    for _, row in dataset.iterrows():
        input_text = row["input"]
        expected_output = row["expected"]
        for model_name, model in models.items():
            generated_text = model(input_text, max_length=50, num_return_sequences=1)[0]["generated_text"]
            # Compute BLEU score
            reference = [expected_output.split()]  # Reference sentence
            candidate = generated_text.split()  # Candidate sentence
            bleu_score = sentence_bleu(reference, candidate, smoothing_function=smoothing_function)
            bleu_scores[model_name].append(bleu_score)
            results.append({
                "Input": input_text,
                "Expected Output": expected_output,
                "Generated Text": generated_text,
                "BLEU Score": bleu_score,
                "Model": model_name
            })
    # Calculate average BLEU scores for each model
    avg_bleu_scores = {model_name: sum(scores) / len(scores) for model_name, scores in bleu_scores.items()}
    return pd.DataFrame(results), avg_bleu_scores

# Run evaluation
results, avg_bleu_scores = evaluate_models_with_bleu(dataset, models)

# Display BLEU scores
print("\nModel Average BLEU Scores:")
for model_name, avg_bleu in avg_bleu_scores.items():
    print(f"{model_name}: {avg_bleu:.4f}")

# Display sample results
print("\nSample Results:")
print(results.head(10))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=Tr


Model Average BLEU Scores:
dialogpt_tagalog_30: 0.5399
dialogpt_tagalog_20: 0.4360
dialogpt_tagalog_10: 0.6187
gpt2_tagalog: 0.1584

Sample Results:
                    Input                Expected Output  \
0  Ako ay naglalakad sa .  Ako ay naglalakad sa kalsada.   
1  Ako ay naglalakad sa .  Ako ay naglalakad sa kalsada.   
2  Ako ay naglalakad sa .  Ako ay naglalakad sa kalsada.   
3  Ako ay naglalakad sa .  Ako ay naglalakad sa kalsada.   
4  Ako ay naglalakad sa .     Ako ay naglalakad sa park.   
5  Ako ay naglalakad sa .     Ako ay naglalakad sa park.   
6  Ako ay naglalakad sa .     Ako ay naglalakad sa park.   
7  Ako ay naglalakad sa .     Ako ay naglalakad sa park.   
8  Ako ay naglalakad sa .    Ako ay naglalakad sa gubat.   
9  Ako ay naglalakad sa .    Ako ay naglalakad sa gubat.   

                                      Generated Text  BLEU Score  \
0  Ako ay naglalakad sa . Natutuwa ako kung ano a...    0.234624   
1  Ako ay naglalakad sa . Me mga kasama kong kaib... 

In [6]:
%pip install rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=9be23121620651228dc76104a8f57c0394479713c608b9d9823356cc5cbbd5f1
  Stored in directory: /home/zeus/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [9]:
# Import necessary libraries
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

# Function to load the dataset
def load_dataset(file_path):
    return pd.read_csv(file_path)

# Function to perform model inference
def get_model_output(model, tokenizer, input_text, task="next_word"):
    inputs = tokenizer(input_text, return_tensors="pt")
    outputs = model.generate(**inputs, max_length=inputs['input_ids'].shape[1] + 1)
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if task == "next_word":
        return decoded_output.split()[-1]  # Return only the predicted next word
    return decoded_output

# Function to evaluate models on the dataset
def evaluate_models(dataset, models, tokenizers, task="next_word"):
    results = {model_name: {"BLEU": [], "ROUGE-1": [], "ROUGE-2": [], "ROUGE-L": []} for model_name in models}
    
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    for _, row in dataset.iterrows():
        partial_input = row['partial_sentence']
        ground_truth = row['next_word'] if task == "next_word" else row['full_sentence']
        
        for model_name, model in models.items():
            tokenizer = tokenizers[model_name]
            predicted_output = get_model_output(model, tokenizer, partial_input, task)
            
            # Calculate BLEU score
            bleu_score = sentence_bleu([ground_truth.split()], predicted_output.split())
            results[model_name]["BLEU"].append(bleu_score)
            
            # Calculate ROUGE scores
            rouge_scores = scorer.score(ground_truth, predicted_output)
            results[model_name]["ROUGE-1"].append(rouge_scores['rouge1'].fmeasure)
            results[model_name]["ROUGE-2"].append(rouge_scores['rouge2'].fmeasure)
            results[model_name]["ROUGE-L"].append(rouge_scores['rougeL'].fmeasure)
    
    # Aggregate scores
    aggregated_results = {
        model_name: {
            "BLEU": sum(scores["BLEU"]) / len(scores["BLEU"]),
            "ROUGE-1": sum(scores["ROUGE-1"]) / len(scores["ROUGE-1"]),
            "ROUGE-2": sum(scores["ROUGE-2"]) / len(scores["ROUGE-2"]),
            "ROUGE-L": sum(scores["ROUGE-L"]) / len(scores["ROUGE-L"]),
        }
        for model_name, scores in results.items()
    }
    
    return aggregated_results


In [10]:
# Example usage
if __name__ == "__main__":
    # Load the dataset
    dataset_path = '/teamspace/studios/this_studio/tagalog_dataset.csv'
    dataset = load_dataset(dataset_path)
    
    # Initialize models and tokenizers
    models = {
        "dialogpt_tagalog_30": AutoModelForCausalLM.from_pretrained("gabtan99/dialogpt-tagalog-medium-30"),
        "dialogpt_tagalog_20": AutoModelForCausalLM.from_pretrained("gabtan99/dialogpt-tagalog-medium-20"),
        "dialogpt_tagalog_10": AutoModelForCausalLM.from_pretrained("gabtan99/dialogpt-tagalog-medium-10"),
        "gpt2_tagalog": AutoModelForCausalLM.from_pretrained("jcblaise/gpt2-tagalog"),
    }
    tokenizers = {
        "dialogpt_tagalog_30": AutoTokenizer.from_pretrained("gabtan99/dialogpt-tagalog-medium-30"),
        "dialogpt_tagalog_20": AutoTokenizer.from_pretrained("gabtan99/dialogpt-tagalog-medium-20"),
        "dialogpt_tagalog_10": AutoTokenizer.from_pretrained("gabtan99/dialogpt-tagalog-medium-10"),
        "gpt2_tagalog": AutoTokenizer.from_pretrained("jcblaise/gpt2-tagalog"),
    }
    
    # Evaluate models
    results = evaluate_models(dataset, models, tokenizers)
    print(results)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

{'dialogpt_tagalog_30': {'BLEU': 1.821831989445342e-234, 'ROUGE-1': 0.006, 'ROUGE-2': 0.001, 'ROUGE-L': 0.006}, 'dialogpt_tagalog_20': {'BLEU': 0.0, 'ROUGE-1': 0.005, 'ROUGE-2': 0.001, 'ROUGE-L': 0.005}, 'dialogpt_tagalog_10': {'BLEU': 1.821831989445342e-234, 'ROUGE-1': 0.005, 'ROUGE-2': 0.001, 'ROUGE-L': 0.005}, 'gpt2_tagalog': {'BLEU': 1.0930991936672054e-233, 'ROUGE-1': 0.01, 'ROUGE-2': 0.001, 'ROUGE-L': 0.01}}
