In [None]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=03d2bb41384caa95129701b98851dd7c6065fb8af6bd511d145894a79ffb39c8
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [None]:
pip install --upgrade peft

Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.14.0-py3-none-any.whl (374 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.8/374.8 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
  Attempting uninstall: peft
    Found existing installation: peft 0.13.2
    Uninstalling peft-0.13.2:
      Successfully uninstalled peft-0.13.2
Successfully installed peft-0.14.0


In [None]:
import torch
import numpy as np
from rouge_score import rouge_scorer
from bert_score import score
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5Config
from safetensors.torch import load_file
import json
import os
from peft import PeftModel, PeftConfig

In [None]:
class ContextQAEvaluator:
    """Score how closely generated question/answer text matches its source context.

    For every context the wrapped model generates a question and an answer;
    the generated text is then compared with the context itself using
    ROUGE-L F-measure and BERTScore F1.
    """

    def __init__(self, tokenizer, model):
        """Store the tokenizer/model pair and build the ROUGE-L scorer.

        Args:
            tokenizer: Callable tokenizer that also provides ``decode`` and the
                ``pad_token``, ``eos_token`` and ``sep_token`` attributes.
            model: Model providing ``generate``.
        """
        self.tokenizer = tokenizer
        self.model = model
        self.rouge_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    def generate_qa(self, context):
        """Generate a question and an answer for ``context`` as a single string.

        Args:
            context: Source passage handed to the tokenizer.

        Returns:
            The decoded generation with pad/EOS tokens removed and every
            separator token replaced by a single space. For the usual
            ``question<sep>answer`` output this is ``"<question> <answer>"``.
            Output with no separator, or with several, is returned the same
            way instead of raising ``ValueError``.
        """
        inputs = self.tokenizer(context, return_tensors="pt")
        outputs = self.model.generate(**inputs, max_length=100, num_return_sequences=1, num_beams=4)
        # Special tokens are kept while decoding so the separator stays visible.
        decoded = self.tokenizer.decode(outputs[0], skip_special_tokens=False)
        decoded = decoded.replace(self.tokenizer.pad_token, "").replace(self.tokenizer.eos_token, "")
        # Replacing the separator (rather than unpacking a two-way split) gives
        # the same string for one separator and cannot fail for zero or many.
        return decoded.replace(self.tokenizer.sep_token, " ")

    def compute_rouge_l(self, reference, candidate):
        """Return the ROUGE-L F-measure of ``candidate`` against ``reference``."""
        rouge_result = self.rouge_scorer.score(reference, candidate)
        return rouge_result['rougeL'].fmeasure

    def compute_bert_score(self, reference, candidate):
        """Return the BERTScore F1 of ``candidate`` against ``reference``.

        The pair is scored with the English default of ``bert_score.score``.
        """
        # NOTE(review): score() is invoked once per context; presumably it
        # loads its scoring model on every call, so a cached scorer may be
        # cheaper for many contexts. Verify before changing.
        _, _, f1 = score([candidate], [reference], lang="en", verbose=False)
        return f1.mean().item()

    def evaluate_context_matching(self, contexts):
        """Generate Q/A text for each context and summarise the match scores.

        Args:
            contexts: Iterable of context strings. A single string is treated
                as one context rather than being iterated character by
                character.

        Returns:
            A dict with keys ``'rouge_l_scores'`` and ``'bert_scores'``, each
            mapping to a dict with the ``'mean'`` and ``'std'`` of that metric
            over all contexts.
        """
        if isinstance(contexts, str):
            contexts = [contexts]

        rouge_l_scores = []
        bert_scores = []

        for context in contexts:
            generated_qa = self.generate_qa(context)
            # The context is the reference; the generated text is the candidate.
            rouge_l_scores.append(self.compute_rouge_l(context, generated_qa))
            bert_scores.append(self.compute_bert_score(context, generated_qa))

        summary = {
            'rouge_l_scores': {
                'mean': np.mean(rouge_l_scores),
                'std': np.std(rouge_l_scores)
            },
            'bert_scores': {
                'mean': np.mean(bert_scores),
                'std': np.std(bert_scores)
            }
        }

        return summary

In [None]:
def generate_qa(context, model, tokenizer):
    """Generate a question and an answer for ``context`` as a single string.

    Args:
        context: Source passage handed to the tokenizer.
        model: Model providing ``generate``.
        tokenizer: Callable tokenizer that also provides ``decode`` and the
            ``pad_token``, ``eos_token`` and ``sep_token`` attributes.

    Returns:
        The decoded generation with pad/EOS tokens removed and every separator
        token replaced by a single space. For the usual
        ``question<sep>answer`` output this is ``"<question> <answer>"``.
        Output with no separator, or with several, is returned the same way
        instead of raising ``ValueError``.
    """
    inputs = tokenizer(context, return_tensors="pt")
    outputs = model.generate(**inputs, max_length=100, num_return_sequences=1, num_beams=4)
    # Special tokens are kept while decoding so the separator stays visible.
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)
    decoded = decoded.replace(tokenizer.pad_token, "").replace(tokenizer.eos_token, "")
    # Replacing the separator (rather than unpacking a two-way split) gives the
    # same string for one separator and cannot fail for zero or many.
    return decoded.replace(tokenizer.sep_token, " ")

def compute_rouge_l(reference, candidate):
    """Return the ROUGE-L F-measure of ``candidate`` against ``reference``.

    Args:
        reference: Reference text (the scorer's target).
        candidate: Generated text to evaluate (the scorer's prediction).

    Returns:
        The ``fmeasure`` field of the ``'rougeL'`` result, computed with
        stemming enabled.
    """
    # `rouge_scorer` is the imported module; scoring is done by a RougeScorer
    # instance, so build one instead of calling score() on the module.
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_result = scorer.score(reference, candidate)
    return rouge_result['rougeL'].fmeasure

def compute_bert_score(reference, candidate):
    """Return the BERTScore F1 of ``candidate`` measured against ``reference``.

    The single candidate/reference pair is scored with the English default of
    ``score``; the mean of the returned F1 values is converted to a Python
    number.
    """
    # score() returns three values; only the last one (F1) is used here.
    f1_values = score([candidate], [reference], lang="en", verbose=False)[2]
    return f1_values.mean().item()

def evaluate_context_matching(contexts, model, tokenizer):
    """Generate Q/A text for each context and summarise the match scores.

    Args:
        contexts: Iterable of context strings. A single string is treated as
            one context rather than being iterated character by character.
        model: Model passed through to ``generate_qa``.
        tokenizer: Tokenizer passed through to ``generate_qa``.

    Returns:
        A dict with keys ``'rouge_l_scores'`` and ``'bert_scores'``, each
        mapping to a dict with the ``'mean'`` and ``'std'`` of that metric over
        all contexts.
    """
    if isinstance(contexts, str):
        contexts = [contexts]

    rouge_l_scores = []
    bert_scores = []

    # Score the `contexts` argument itself; nothing is read from notebook globals.
    for context in contexts:
        generated_qa = generate_qa(context, model, tokenizer)
        # The context is the reference; the generated text is the candidate.
        rouge_l_scores.append(compute_rouge_l(context, generated_qa))
        bert_scores.append(compute_bert_score(context, generated_qa))

    summary = {
        'rouge_l_scores': {
            'mean': np.mean(rouge_l_scores),
            'std': np.std(rouge_l_scores)
        },
        'bert_scores': {
            'mean': np.mean(bert_scores),
            'std': np.std(bert_scores)
        }
    }

    return summary

In [None]:
def evaluate_model(
    context,
    pretrained_model_name="potsawee/t5-large-generation-squad-QuestionAnswer",
    finetuned_model_dir="/content/finetuned_squad",
):
    """Print context-matching metrics for a pretrained and a fine-tuned model.

    Both models are loaded, each is run through ``evaluate_context_matching``
    on ``context``, and the two metric summaries are printed.

    Args:
        context: Context passed to ``evaluate_context_matching``.
        pretrained_model_name: Name or path of the baseline checkpoint.
            Defaults to the value previously hard-coded here.
        finetuned_model_dir: Directory from which the fine-tuned tokenizer,
            model and PEFT adapter are loaded. Defaults to the value
            previously hard-coded here.

    Returns:
        None. Results are reported with ``print``.
    """
    pretrained_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
    pretrained_model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name)

    finetuned_tokenizer = AutoTokenizer.from_pretrained(finetuned_model_dir)
    finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(finetuned_model_dir)

    # Wrap the loaded model with the adapter stored in the same directory.
    finetuned_model = PeftModel.from_pretrained(finetuned_model, finetuned_model_dir)

    pretrained_metrics = evaluate_context_matching(context, pretrained_model, pretrained_tokenizer)
    finetuned_metrics = evaluate_context_matching(context, finetuned_model, finetuned_tokenizer)

    print("Pretrained Model Metrics:", pretrained_metrics)
    print("Fine-tuned Model Metrics:", finetuned_metrics)

In [None]:
# Sample passage used as the evaluation context in the cells below.
context = (
    "A frameshift mutation is a deletion or insertion of one or more nucleotides "
    "that changes the reading frame of the base sequence. "
    "Deletions remove nucleotides, and insertions add nucleotides. "
    "Consider the following sequence of bases in RNA:"
)

In [None]:
# Load both checkpoints and print their ROUGE-L / BERTScore summaries for the sample context.
evaluate_model(context)

tokenizer_config.json:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.23k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Pretrained Model Metrics: {'rouge_l_scores': {'mean': 0.6999999999999998, 'std': 0.0}, 'bert_scores': {'mean': 0.9287546873092651, 'std': 0.0}}
Fine-tuned Model Metrics: {'rouge_l_scores': {'mean': 0.6333333333333333, 'std': 0.0}, 'bert_scores': {'mean': 0.9184529781341553, 'std': 0.0}}


In [None]:
# Checkpoint name kept for reference only: nothing in this cell loads it.
pretrained_model_name = "potsawee/t5-large-generation-race-QuestionAnswer"

# Directory holding the fine-tuned tokenizer, model and adapter used below.
finetuned_model_dir = "/content/finetuned"

tokenizer = AutoTokenizer.from_pretrained(finetuned_model_dir)
# Load the model from the directory, then wrap it with the adapter stored alongside it.
model = PeftModel.from_pretrained(
    AutoModelForSeq2SeqLM.from_pretrained(finetuned_model_dir),
    finetuned_model_dir,
)

In [None]:
def generate_qa(context, model, tokenizer):
    """Generate a question and an answer for ``context`` as a single string.

    Args:
        context: Source passage handed to the tokenizer.
        model: Model providing ``generate``.
        tokenizer: Callable tokenizer that also provides ``decode`` and the
            ``pad_token``, ``eos_token`` and ``sep_token`` attributes.

    Returns:
        The decoded generation with pad/EOS tokens removed and every separator
        token replaced by a single space. For the usual
        ``question<sep>answer`` output this is ``"<question> <answer>"``.
        Output with no separator, or with several, is returned the same way
        instead of raising ``ValueError``.
    """
    inputs = tokenizer(context, return_tensors="pt")
    outputs = model.generate(**inputs, max_length=100, num_return_sequences=1, num_beams=4)
    # Special tokens are kept while decoding so the separator stays visible.
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)
    decoded = decoded.replace(tokenizer.pad_token, "").replace(tokenizer.eos_token, "")
    # Replacing the separator (rather than unpacking a two-way split) gives the
    # same string for one separator and cannot fail for zero or many.
    return decoded.replace(tokenizer.sep_token, " ")

In [None]:
# Spot-check: generate the question/answer string for the sample context with the model loaded above.
generate_qa(context, model, tokenizer)

' What is a deletion or insertion of one or more nucleotides that changes the reading frame of the base sequence called?  frameshift mutation'

In [None]:
# NOTE(review): `question` and `answer` are not assigned at top level in any
# cell shown here (they only exist as locals inside generate_qa), so this cell
# presumably depends on leftover interactive state. Confirm where they are
# meant to come from before relying on this output.
print(question)
print(answer)

 A frameshift mutation is _ .
 a deletion or an insertion of one or more nucleotides that changes the reading frame of the base sequence


In [None]:
# NOTE(review): same statements as the previous printing cell, yet the recorded
# output differs, so `question` / `answer` were presumably reassigned in between
# by code not shown here. TODO: confirm which model produced each output.
print(question)
print(answer)

 What is a deletion or insertion of one or more nucleotides that changes the reading frame of the base sequence called?
 frameshift mutation
