<a href="https://colab.research.google.com/github/arun-mishra22/Data-Science-Interview-Assistant/blob/main/model_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import torch
print(torch.cuda.is_available())


True


In [6]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [7]:
!pip install transformers peft datasets accelerate bitsandbytes evaluate rouge_score



Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=6f18c200e1786bb193f6b9bf9c85cdc93294fc8642688304d073dd257a4e0f0b
  Stored in directory: /root

In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig

from peft import PeftModel
import torch

base_path = "/content/drive/MyDrive/LLM_Base_Models/Llama3B_base"
adapter_path = "/content/drive/MyDrive/LLM_Base_Models/Llama3B_lora_adapter"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(base_path)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"


base_model = AutoModelForCausalLM.from_pretrained(
    base_path,
    quantization_config=bnb_config,
    device_map="auto"
)

model = PeftModel.from_pretrained(base_model, adapter_path)

print("Fine-tuned model loaded!")




Loading weights:   0%|          | 0/254 [00:00<?, ?it/s]

Fine-tuned model loaded!


In [11]:
from datasets import Dataset

test_dataset = Dataset.from_file("/content/test.arrow")

print(test_dataset)


Dataset({
    features: ['text'],
    num_rows: 479
})


In [12]:
print(test_dataset.column_names)
print(test_dataset[0])


['text']
{'text': '### Instruction:\nIs it possible to create a GPT model without a GPU?\n\n### Response:\nYes, it is possible to create a GPT model without a GPU. GPUs are used to improve the speed at which a model runs, however it is possible to use a lower-end processor or a much slower CPU to run a GPT model.'}


In [13]:
def split_prompt_and_answer(text):
    parts = text.split("### Response:")

    if len(parts) == 2:
        prompt = parts[0] + "### Response:"
        answer = parts[1].strip()
        return prompt, answer
    else:
        return text, ""


In [14]:
def generate_answer(prompt_list):

    inputs = tokenizer(
        prompt_list,
        return_tensors="pt",
        padding="longest",
        truncation=True
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=150
    )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)


In [15]:
prompts = []
references = []

for item in test_dataset:
    prompt, true_answer = split_prompt_and_answer(item["text"])
    prompts.append(prompt)
    references.append(true_answer)

predictions = []

batch_size = 8

for i in range(0, len(prompts), batch_size):

    batch = prompts[i : i + batch_size]

    preds = generate_answer(batch)

    predictions.extend(preds)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

In [26]:
import pandas as pd

df = pd.DataFrame({
    "prompt": prompts,
    "reference": references,
    "prediction": predictions
})

df.to_csv("model_predictions.csv", index=False)

print("Saved results to model_predictions.csv")


Saved results to model_predictions.csv


In [16]:
print(len(predictions))
print(len(references))


479
479


In [17]:
cleaned_predictions = []

for p in predictions:
    # If model returns prompt + answer together, keep only answer part
    if "### Response:" in p:
        cleaned_predictions.append(p.split("### Response:")[-1].strip())
    else:
        cleaned_predictions.append(p.strip())


In [18]:
correct = 0

for pred, ref in zip(cleaned_predictions, references):
    if pred.strip().lower() == ref.strip().lower():
        correct += 1

accuracy = correct / len(references)

print("Exact Match Accuracy:", accuracy)


Exact Match Accuracy: 0.006263048016701462


In [19]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

smoothie = SmoothingFunction().method4

scores = []

for pred, ref in zip(cleaned_predictions, references):
    score = sentence_bleu(
        [ref.split()],
        pred.split(),
        smoothing_function=smoothie
    )
    scores.append(score)

print("Average BLEU Score:", sum(scores) / len(scores))



Average BLEU Score: 0.05102039022656898


In [20]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

rouge_scores = []

for pred, ref in zip(cleaned_predictions, references):
    score = scorer.score(ref, pred)
    rouge_scores.append(score['rougeL'].fmeasure)

print("Average ROUGE-L Score:", sum(rouge_scores) / len(rouge_scores))


Average ROUGE-L Score: 0.1751907713106612


In [22]:
!pip install bert-score


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [24]:
!pip install --upgrade transformers tokenizers evaluate bert-score


Collecting transformers
  Downloading transformers-5.1.0-py3-none-any.whl.metadata (31 kB)
Downloading transformers-5.1.0-py3-none-any.whl (10.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 5.0.0
    Uninstalling transformers-5.0.0:
      Successfully uninstalled transformers-5.0.0
Successfully installed transformers-5.1.0


Direct manual testing

In [32]:
from transformers import pipeline, AutoTokenizer, GenerationConfig

tokenizer = AutoTokenizer.from_pretrained(adapter_path)

# FIX: Explicitly set pad token
tokenizer.pad_token = tokenizer.eos_token

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

gen_config = GenerationConfig(
    max_new_tokens=200,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id
)

question = "What is bias variance tradeoff? How can you avoid it?"

prompt = f"""### Instruction:
{question}

### Response:
"""

output = pipe(
    prompt,
    generation_config=gen_config,
    return_full_text=False   # VERY IMPORTANT
)

print(output[0]["generated_text"])



This is a core concept in ML theory and practice. Understanding it helps build robust systems and choose appropriate ML approaches. Key dimensions: computational cost, generalization performance, interpretability vs. lack of explanatory power. Alternatives include Bayesian approach (intractable), empirical risk minimization (ERM), early stopping, data augmentation, architecture search, multi-task learning, ensemble methods, model averaging, and other approaches.


In [33]:
!pip install bert-score




In [34]:
from evaluate import load

bertscore = load("bertscore")

bert_results = bertscore.compute(
    predictions=predictions,
    references=references,
    lang="en"
)

print("Average BERTScore F1:", sum(bert_results["f1"]) / len(bert_results["f1"]))


Loading weights:   0%|          | 0/389 [00:00<?, ?it/s]

RobertaModel LOAD REPORT from: roberta-large
Key                             | Status     | 
--------------------------------+------------+-
lm_head.bias                    | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
pooler.dense.bias               | MISSING    | 
pooler.dense.weight             | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


AttributeError: RobertaTokenizer has no attribute build_inputs_with_special_tokens

In [39]:
!pip uninstall -y transformers tokenizers bert-score evaluate



Found existing installation: transformers 4.38.2
Uninstalling transformers-4.38.2:
  Successfully uninstalled transformers-4.38.2
Found existing installation: tokenizers 0.15.2
Uninstalling tokenizers-0.15.2:
  Successfully uninstalled tokenizers-0.15.2
Found existing installation: bert-score 0.3.13
Uninstalling bert-score-0.3.13:
  Successfully uninstalled bert-score-0.3.13
Found existing installation: evaluate 0.4.6
Uninstalling evaluate-0.4.6:
  Successfully uninstalled evaluate-0.4.6


In [None]:
!pip install transformers==4.37.2
!pip install tokenizers==0.15.0
!pip install bert-score==0.3.13
!pip install evaluate==0.4.1


Collecting transformers==4.37.2
  Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers==4.37.2)
  Using cached tokenizers-0.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.37.2-py3-none-any.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached tokenizers-0.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
Installing collected packages: tokenizers, transformers
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the followi

Collecting tokenizers==0.15.0
  Downloading tokenizers-0.15.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tokenizers-0.15.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/3.8 MB[0m [31m45.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.8/3.8 MB[0m [31m57.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
[?25hTraceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_inter

In [7]:
import pandas as pd

df = pd.read_csv("model_predictions.csv")

predictions = df["prediction"].tolist()
references = df["reference"].tolist()

print("Loaded samples:", len(predictions))


Loaded samples: 479


In [8]:
from evaluate import load

bertscore = load("bertscore")

bert_results = bertscore.compute(
    predictions=predictions,
    references=references,
    lang="en"
)

print("Average BERTScore F1:", sum(bert_results["f1"]) / len(bert_results["f1"]))


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average BERTScore F1: 0.844624831557025


In [5]:
!pip install bert-score


Collecting bert-score
  Using cached bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Using cached bert_score-0.3.13-py3-none-any.whl (61 kB)
Installing collected packages: bert-score
Successfully installed bert-score-0.3.13


In [2]:
!pip install evaluate



Collecting evaluate
  Using cached evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Using cached evaluate-0.4.6-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.6


In [3]:
from evaluate import load


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]