<a href="https://colab.research.google.com/github/arun-mishra22/Data-Science-Interview-Assistant/blob/main/model_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
print(torch.cuda.is_available())


True


### Connect Google Drive to Colab environment to Load saved models .

In [3]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!pip install transformers peft datasets accelerate bitsandbytes evaluate rouge_score



Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=6f18c200e1786bb193f6b9bf9c85cdc93294fc8642688304d073dd257a4e0f0b
  Stored in directory: /root

## Loading the model for evaluation -->

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig

from peft import PeftModel
import torch

base_path = "/content/drive/MyDrive/LLM_Base_Models/Llama3B_base"
adapter_path = "/content/drive/MyDrive/LLM_Base_Models/Llama3B_lora_adapter"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(base_path)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"


base_model = AutoModelForCausalLM.from_pretrained(
    base_path,
    quantization_config=bnb_config,
    device_map="auto"
)

model = PeftModel.from_pretrained(base_model, adapter_path)

print("Fine-tuned model loaded!")




Loading weights:   0%|          | 0/254 [00:00<?, ?it/s]

Fine-tuned model loaded!


## Loading Test data for evaluation -->

In [None]:
from datasets import Dataset

test_dataset = Dataset.from_file("/content/test.arrow")

print(test_dataset)


Dataset({
    features: ['text'],
    num_rows: 479
})


In [None]:
print(test_dataset.column_names)
print(test_dataset[0])


['text']
{'text': '### Instruction:\nIs it possible to create a GPT model without a GPU?\n\n### Response:\nYes, it is possible to create a GPT model without a GPU. GPUs are used to improve the speed at which a model runs, however it is possible to use a lower-end processor or a much slower CPU to run a GPT model.'}


####Helper function for separating prompt and response .

In [None]:
def split_prompt_and_answer(text):
    parts = text.split("### Response:")

    if len(parts) == 2:
        prompt = parts[0] + "### Response:"
        answer = parts[1].strip()
        return prompt, answer
    else:
        return text, ""


In [None]:
def generate_answer(prompt_list):

    inputs = tokenizer(
        prompt_list,
        return_tensors="pt",
        padding="longest",
        truncation=True
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=150
    )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)


### Generate predictions from the test dataset in batches -->

In [None]:
prompts = []
references = []

for item in test_dataset:
    prompt, true_answer = split_prompt_and_answer(item["text"])
    prompts.append(prompt)
    references.append(true_answer)

predictions = []

batch_size = 8

for i in range(0, len(prompts), batch_size):

    batch = prompts[i : i + batch_size]

    preds = generate_answer(batch)

    predictions.extend(preds)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

### Saving the Results of the model on Test data -->

In [None]:
import pandas as pd

df = pd.DataFrame({
    "prompt": prompts,
    "reference": references,
    "prediction": predictions
})

df.to_csv("model_predictions.csv", index=False)

print("Saved results to model_predictions.csv")


Saved results to model_predictions.csv


In [None]:
print(len(predictions))
print(len(references))


479
479


### Clean the raw model outputs so that only the answer part remains .

In [None]:
cleaned_predictions = []

for p in predictions:
    # If model returns prompt + answer together, keep only answer part
    if "### Response:" in p:
        cleaned_predictions.append(p.split("### Response:")[-1].strip())
    else:
        cleaned_predictions.append(p.strip())


### Calculate Exact Match Accuracy between predictions and reference answers .


In [None]:
correct = 0

for pred, ref in zip(cleaned_predictions, references):
    if pred.strip().lower() == ref.strip().lower():
        correct += 1

accuracy = correct / len(references)

print("Exact Match Accuracy:", accuracy)


Exact Match Accuracy: 0.006263048016701462


Insight - For open-ended LLM tasks:

Exact match accuracy is usually very low

Because answers can be correct but phrased differently

So this metric is strict.

Evaluating BLEU score -->

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

smoothie = SmoothingFunction().method4

scores = []

for pred, ref in zip(cleaned_predictions, references):
    score = sentence_bleu(
        [ref.split()],
        pred.split(),
        smoothing_function=smoothie
    )
    scores.append(score)

print("Average BLEU Score:", sum(scores) / len(scores))



Average BLEU Score: 0.05102039022656898


A BLEU of ~0.05 means :

  Model responses have some overlap with references

  But wording is mostly different

  Which is expected for generative models

Evaluating Rouge_score -->

In [None]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

rouge_scores = []

for pred, ref in zip(cleaned_predictions, references):
    score = scorer.score(ref, pred)
    rouge_scores.append(score['rougeL'].fmeasure)

print("Average ROUGE-L Score:", sum(rouge_scores) / len(rouge_scores))


Average ROUGE-L Score: 0.1751907713106612


Insight->
So a score of 0.175 means :

 On average, about 17–18% structural overlap exists between your model answers and references.

In [None]:
!pip install bert-score


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [None]:
!pip install --upgrade transformers tokenizers evaluate bert-score


Collecting transformers
  Downloading transformers-5.1.0-py3-none-any.whl.metadata (31 kB)
Downloading transformers-5.1.0-py3-none-any.whl (10.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 5.0.0
    Uninstalling transformers-5.0.0:
      Successfully uninstalled transformers-5.0.0
Successfully installed transformers-5.1.0


### Manual testing of the fine-tuned model using a sample question -->


In [None]:
from transformers import pipeline, AutoTokenizer, GenerationConfig

tokenizer = AutoTokenizer.from_pretrained(adapter_path)

# FIX: Explicitly set pad token
tokenizer.pad_token = tokenizer.eos_token

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

gen_config = GenerationConfig(
    max_new_tokens=200,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id
)

question = "What is bias variance tradeoff? How can you avoid it?"

prompt = f"""### Instruction:
{question}

### Response:
"""

output = pipe(
    prompt,
    generation_config=gen_config,
    return_full_text=False   # VERY IMPORTANT
)

print(output[0]["generated_text"])



This is a core concept in ML theory and practice. Understanding it helps build robust systems and choose appropriate ML approaches. Key dimensions: computational cost, generalization performance, interpretability vs. lack of explanatory power. Alternatives include Bayesian approach (intractable), empirical risk minimization (ERM), early stopping, data augmentation, architecture search, multi-task learning, ensemble methods, model averaging, and other approaches.


In [None]:
!pip uninstall -y transformers tokenizers bert-score evaluate



Found existing installation: transformers 4.38.2
Uninstalling transformers-4.38.2:
  Successfully uninstalled transformers-4.38.2
Found existing installation: tokenizers 0.15.2
Uninstalling tokenizers-0.15.2:
  Successfully uninstalled tokenizers-0.15.2
Found existing installation: bert-score 0.3.13
Uninstalling bert-score-0.3.13:
  Successfully uninstalled bert-score-0.3.13
Found existing installation: evaluate 0.4.6
Uninstalling evaluate-0.4.6:
  Successfully uninstalled evaluate-0.4.6


Loading previously saved model predictions and references from a CSV file -->

In [None]:
import pandas as pd

df = pd.read_csv("model_predictions.csv")

predictions = df["prediction"].tolist()
references = df["reference"].tolist()

print("Loaded samples:", len(predictions))


Loaded samples: 479


### Evaluate semantic similarity between predictions and references using BERTScore -->

In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [1]:
!pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


### * BertScore  -->

In [None]:
from evaluate import load

bertscore = load("bertscore")

bert_results = bertscore.compute(
    predictions=predictions,
    references=references,
    lang="en"
)

print("Average BERTScore F1:", sum(bert_results["f1"]) / len(bert_results["f1"]))


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average BERTScore F1: 0.844624831557025


## Insight -->
Unlike BLEU/ROUGE, BERTScore measures :

    Semantic similarity, not word overlap

    So a score of 0.844 (~84%) means :

    Your model answers are semantically very close to reference answers.

Loading the fine tuned model in hugging face hub -->


In [1]:
from huggingface_hub import login
login()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
from huggingface_hub import whoami
whoami()


{'type': 'user',
 'id': '67dff5bc40f9ecf77e609313',
 'name': 'aruuuuuuuuunnnn',
 'fullname': 'arun msihra',
 'email': 'arun484116@gmail.com',
 'emailVerified': True,
 'canPay': False,
 'billingMode': 'prepaid',
 'periodEnd': 1772323200,
 'isPro': False,
 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/no-auth/YVGpLF1oaARnC1SmehYNl.png',
 'orgs': [],
 'auth': {'type': 'access_token',
  'accessToken': {'displayName': 'token_llm',
   'role': 'write',
   'createdAt': '2026-02-06T07:57:42.473Z'}}}

In [4]:
!pip install -U bitsandbytes accelerate transformers peft


Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting transformers
  Downloading transformers-5.1.0-py3-none-any.whl.metadata (31 kB)
Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-5.1.0-py3-none-any.whl (10.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m134.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes, transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 5.0.0
    Uninstalling transformers-5.0.0:
      Successfully uninstalled transformers-5.0.0
Successfully installed bitsandbytes-0.49.1 transformers-5.1.0


In [5]:
!pip install -U bitsandbytes
!pip install -U accelerate
!pip install -U transformers
!pip install -U peft




In [8]:
import json

adapter_path = "/content/drive/MyDrive/LLM_Base_Models/Llama3B_lora_adapter/adapter_config.json"

with open(adapter_path, "r") as f:
    data = json.load(f)

print(data)


{'alora_invocation_tokens': None, 'alpha_pattern': {}, 'arrow_config': None, 'auto_mapping': None, 'base_model_name_or_path': '/content/drive/MyDrive/LLM_Base_Models/Llama3B_base', 'bias': 'none', 'corda_config': None, 'ensure_weight_tying': False, 'eva_config': None, 'exclude_modules': None, 'fan_in_fan_out': False, 'inference_mode': True, 'init_lora_weights': True, 'layer_replication': None, 'layers_pattern': None, 'layers_to_transform': None, 'loftq_config': {}, 'lora_alpha': 16, 'lora_bias': False, 'lora_dropout': 0.05, 'megatron_config': None, 'megatron_core': 'megatron.core', 'modules_to_save': None, 'peft_type': 'LORA', 'peft_version': '0.18.1', 'qalora_group_size': 16, 'r': 16, 'rank_pattern': {}, 'revision': None, 'target_modules': ['v_proj', 'q_proj'], 'target_parameters': None, 'task_type': 'CAUSAL_LM', 'trainable_token_indices': None, 'use_dora': False, 'use_qalora': False, 'use_rslora': False}


In [10]:
import json

adapter_config_path = "/content/drive/MyDrive/LLM_Base_Models/Llama3B_lora_adapter/adapter_config.json"

# Load existing config
with open(adapter_config_path, "r") as f:
    data = json.load(f)

# Fix the base model reference
data["base_model_name_or_path"] = "meta-llama/Llama-3.2-3B"

# Save it back
with open(adapter_config_path, "w") as f:
    json.dump(data, f, indent=2)

print(" adapter_config.json successfully fixed!")


 adapter_config.json successfully fixed!


In [11]:
with open(adapter_config_path, "r") as f:
    print(json.load(f))


{'alora_invocation_tokens': None, 'alpha_pattern': {}, 'arrow_config': None, 'auto_mapping': None, 'base_model_name_or_path': 'meta-llama/Llama-3.2-3B', 'bias': 'none', 'corda_config': None, 'ensure_weight_tying': False, 'eva_config': None, 'exclude_modules': None, 'fan_in_fan_out': False, 'inference_mode': True, 'init_lora_weights': True, 'layer_replication': None, 'layers_pattern': None, 'layers_to_transform': None, 'loftq_config': {}, 'lora_alpha': 16, 'lora_bias': False, 'lora_dropout': 0.05, 'megatron_config': None, 'megatron_core': 'megatron.core', 'modules_to_save': None, 'peft_type': 'LORA', 'peft_version': '0.18.1', 'qalora_group_size': 16, 'r': 16, 'rank_pattern': {}, 'revision': None, 'target_modules': ['v_proj', 'q_proj'], 'target_parameters': None, 'task_type': 'CAUSAL_LM', 'trainable_token_indices': None, 'use_dora': False, 'use_qalora': False, 'use_rslora': False}


In [15]:
readme_path = "/content/drive/MyDrive/LLM_Base_Models/Llama3B_lora_adapter/README.md"

with open(readme_path, "r") as f:
    content = f.read()

# Replace local path with HF model ID
content = content.replace(
    "/content/drive/MyDrive/LLM_Base_Models/Llama3B_base",
    "meta-llama/Llama-3.2-3B"
)

with open(readme_path, "w") as f:
    f.write(content)

print(" README.md metadata fixed!")


 README.md metadata fixed!


In [16]:
with open(readme_path, "r") as f:
    print(f.read())


---
base_model: meta-llama/Llama-3.2-3B
library_name: peft
pipeline_tag: text-generation
tags:
- base_model:adapter:meta-llama/Llama-3.2-3B
- lora
- transformers
---

# Model Card for Model ID

<!-- Provide a quick summary of what the model is/does. -->



## Model Details

### Model Description

<!-- Provide a longer summary of what this model is. -->



- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]

### Model Sources [optional]

<!-- Provide the basic links for the model. -->

- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]

## Uses

<!-- Address questions around how the model is intende

In [19]:
from huggingface_hub import login, create_repo, upload_folder

login()

repo_name = "aruuuuuuuuunnnn/data-science-interview-assistant-adapter"

# Step 1: Create repo (only needs to be done once)
create_repo(repo_name, repo_type="model", exist_ok=True)

print(" Repository created successfully!")

# Step 2: Now upload adapter
adapter_path = "/content/drive/MyDrive/LLM_Base_Models/Llama3B_lora_adapter"

upload_folder(
    folder_path=adapter_path,
    repo_id=repo_name,
    repo_type="model"
)

print(" Adapter uploaded successfully!")


 Repository created successfully!


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ra_adapter/tokenizer.json:  48%|####8     | 8.33MB / 17.2MB            

  ...adapter_model.safetensors:  46%|####5     | 8.37MB / 18.4MB            

No files have been modified since last commit. Skipping to prevent empty commit.


 Adapter uploaded successfully!
