# TP 7: Finetune llama 3.2 on medical dataset with Hugging Face and peft for fine-tuning

In [3]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
import json
import random
import time


# ============================================================
# STEP: DETECT DEVICE (Works on Colab AND Mac)
# ============================================================

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"✅ Using CUDA device: {torch.cuda.get_device_name(0)}")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("✅ Using Apple Silicon (MPS)")
else:
    device = torch.device("cpu")
    print("⚠️ Using CPU (slow!)")


from huggingface_hub import login
login()

from huggingface_hub import whoami

try:
    user_info = whoami()
    print(f"✅ Logged in as: {user_info['name']}")
except Exception as e:
    print(f"❌ Not logged in: {e}")


# ============================================================
# STEP: LOAD MODEL AND TOKENIZER
# ============================================================

model_name = "meta-llama/Llama-3.2-1B-Instruct"

print("📥 Loading model...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map={"": device},
)
print(f"✅ Model loaded: {model_name}")


# ============================================================
# STEP: INITIALIZE LORA CONFIGURATION (IMPROVED)
# ============================================================

print("\n⚙ Configuring LoRA...")
lora_config = LoraConfig(
    r=32,  # Increased from 16 - more capacity
    lora_alpha=64,  # Increased from 32 - stronger adaptation
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.1,  # Increased from 0.05 - better regularization
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


# ============================================================
# STEP: LOAD AND FORMAT DATASET
# ============================================================

def format_prompt(example):
    """Format with CORRECT field names"""
    question = example.get('Open-ended Verifiable Question', '')
    answer = example.get('Ground-True Answer', '')

    if not question or len(question) < 10:
        return None
    if not answer or len(answer) < 2:
        return None

    text = f"""<|begin_of_text|>
<|start_header_id|>user<|end_header_id|>
{question}<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
The answer is: {answer}<|eot_id|>"""

    return {"text": text}

print("\n📊 Loading dataset...")
dataset = load_dataset("FreedomIntelligence/medical-o1-verifiable-problem")

print("🔄 Formatting dataset...")
# Use MORE examples - 500 is too few for good results
train_dataset = dataset['train'].select(range(2000)).map(
    format_prompt,
    remove_columns=dataset['train'].column_names
).filter(lambda x: x is not None)

print(f"✅ Training on {len(train_dataset)} examples")


# ============================================================
# STEP: TOKENIZE TRAIN DATASET
# ============================================================

def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

print("🔄 Tokenizing...")
train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names
)


# ============================================================
# STEP: SET UP TRAINING ARGUMENTS (IMPROVED)
# ============================================================

print("⚙ Setting up training...")
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,  # Increased from 3
    per_device_train_batch_size=2,  # Increased from 1
    gradient_accumulation_steps=8,  # Increased from 4 - effective batch size 16
    learning_rate=5e-5,  # Decreased from 2e-4 - more stable
    weight_decay=0.01,  # Added weight decay for regularization
    lr_scheduler_type="cosine",  # Better than default linear
    warmup_ratio=0.1,  # Changed from warmup_steps - 10% warmup
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    fp16=True,  # Changed to True - faster on T4
    logging_dir="./logs",
    report_to="none",
    use_mps_device=False,
    max_grad_norm=1.0,  # Gradient clipping for stability
)


# ============================================================
# STEP: USE DATA COLLATOR
# ============================================================

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


# ============================================================
# STEP: SET UP TRAINER
# ============================================================

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)


# ============================================================
# STEP: START TRAINING
# ============================================================

print("\n🚀 Starting training...")
print("="*60)
trainer.train()
print("="*60)
print("✅ Training complete!")


# ============================================================
# STEP: SAVE MODEL AND TOKENIZER
# ============================================================

print("\n💾 Saving model...")
model.save_pretrained("./llama3_medical_lora")
tokenizer.save_pretrained("./llama3_medical_lora")
print("✅ Model saved to: ./llama3_medical_lora")


# ============================================================
# CLEAR MEMORY (OPTIONAL)
# ============================================================

import gc
if torch.backends.mps.is_available():
    torch.mps.empty_cache()
gc.collect()
print("✅ Memory cleared!")

✅ Using CUDA device: Tesla T4


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

✅ Logged in as: amgzamgz
📥 Loading model...


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

✅ Model loaded: meta-llama/Llama-3.2-1B-Instruct

⚙ Configuring LoRA...
trainable params: 22,544,384 || all params: 1,258,358,784 || trainable%: 1.7916

📊 Loading dataset...


README.md: 0.00B [00:00, ?B/s]

medical_o1_verifiable_problem.json:   0%|          | 0.00/12.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/40644 [00:00<?, ? examples/s]

🔄 Formatting dataset...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1986 [00:00<?, ? examples/s]

✅ Training on 1986 examples
🔄 Tokenizing...


Map:   0%|          | 0/1986 [00:00<?, ? examples/s]

⚙ Setting up training...

🚀 Starting training...


Step,Training Loss
10,3.9699
20,3.2607
30,2.5281
40,2.1374
50,1.9653
60,1.8369
70,1.7969
80,1.7817
90,1.6985
100,1.6651


✅ Training complete!

💾 Saving model...
✅ Model saved to: ./llama3_medical_lora
✅ Memory cleared!


In [4]:
# ============================================================
# STEP 1: LOAD AND SPLIT THE DATASET
# ============================================================

print("\n📊 Loading dataset...")
dataset = load_dataset("FreedomIntelligence/medical-o1-verifiable-problem")

# Define train/test split
train_size = 1000
test_dataset = dataset['train'].select(range(train_size, len(dataset['train'])))

# Verify dataset size
print(f"Total dataset size: {len(dataset['train'])}")
print(f"Training set: 0 to {train_size}")
print(f"Test set: {train_size} to {len(dataset['train'])}")


📊 Loading dataset...
Total dataset size: 40644
Training set: 0 to 1000
Test set: 1000 to 40644


In [5]:
# ============================================================
# STEP 2: SAMPLE TEST EXAMPLES
# ============================================================

# Set random seed for reproducibility
random.seed(42)

# Randomly select 20 examples from test set
selected_indices = random.sample(range(len(test_dataset)), 20)

# Record indices
print(f"\n🎲 Randomly selected {len(selected_indices)} test examples")
print(f"Indices: {selected_indices[:5]}... (showing first 5)")


🎲 Randomly selected 20 test examples
Indices: [7296, 1639, 18024, 16049, 14628]... (showing first 5)


In [6]:
# ============================================================
# STEP 3: CREATE INFERENCE FUNCTION
# ============================================================

def get_prediction(question, max_tokens=50):
    """Generate prediction for a question"""
    # Format question with chat template
    prompt = f"""<|begin_of_text|>
<|start_header_id|>user<|end_header_id|>
{question}<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
"""

    # Tokenize and move to device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.3,
            top_p=0.9,
            do_sample=True
        )

    # Extract assistant's response
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)

    if "<|start_header_id|>assistant<|end_header_id|>" in full_response:
        answer = full_response.split("<|start_header_id|>assistant<|end_header_id|>")[-1]
        answer = answer.split("<|eot_id|>")[0].strip()
        return answer

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [7]:
# ============================================================
# STEP 4: IMPLEMENT ACCURACY CHECKING
# ============================================================

def check_accuracy(prediction, ground_truth):
    """
    Check if prediction is correct
    Returns: (is_correct, match_type)
    """
    pred_lower = prediction.lower()
    truth_lower = ground_truth.lower()

    # Exact match: Ground truth appears verbatim in prediction
    if truth_lower in pred_lower:
        return True, "exact_match"

    # Partial match: At least 70% of key medical terms appear
    # Filter out common stop words
    stop_words = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'of', 'in', 'to', 'for', 'with', 'on', 'at', 'by'}

    truth_words = [w for w in truth_lower.split() if w not in stop_words and len(w) > 2]

    if len(truth_words) == 0:
        return False, "no_match"

    matches = sum(1 for word in truth_words if word in pred_lower)
    match_ratio = matches / len(truth_words)

    if match_ratio >= 0.7:
        return True, "partial_match"

    return False, "no_match"

In [8]:

# ============================================================
# STEP 5: RUN EVALUATION LOOP
# ============================================================

print("\n" + "="*80)
print("EVALUATING MODEL")
print("=" * 80)

results = []
correct_exact = 0
correct_partial = 0
total = 0

start_time = time.time()

for i, idx in enumerate(selected_indices, 1):
    # Extract question and ground truth
    example = test_dataset[idx]
    question = example['Open-ended Verifiable Question']
    ground_truth = example['Ground-True Answer']

    # Display question (truncated if long)
    print(f"\n{'='*80}")
    print(f"TEST {i}/20")
    print(f"{'='*80}")
    print(f"Question: {question[:100]}...")
    print(f"Ground Truth: {ground_truth}")

    # Generate prediction
    print("⏳ Generating prediction...")
    prediction = get_prediction(question)
    print(f"Prediction: {prediction[:200]}")

    # Check if prediction is correct
    is_correct, match_type = check_accuracy(prediction, ground_truth)

    # Display result
    if is_correct:
        if match_type == "exact_match":
            correct_exact += 1
            print("✅ CORRECT (Exact match)")
        else:
            correct_partial += 1
            print("✅ CORRECT (Partial match)")
    else:
        print("❌ INCORRECT")

    # Track running accuracy
    total += 1
    current_accuracy = ((correct_exact + correct_partial) / total) * 100
    print(f"\nRunning accuracy: {current_accuracy:.1f}% ({correct_exact + correct_partial}/{total})")

    # Store result
    results.append({
        'question': question,
        'ground_truth': ground_truth,
        'prediction': prediction,
        'correct': is_correct,
        'match_type': match_type
    })

total_time = time.time() - start_time



Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



EVALUATING MODEL

TEST 1/20
Question: After a 60-year-old man underwent a successful orthotopic liver transplantation, the transplanted li...
Ground Truth: Reactive oxygen species
⏳ Generating prediction...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prediction: The answer is: Superoxide anion (O2-) and hydrogen peroxide (H2O2) are the substances most likely responsible for causing reperfusion injury in the transplanted liver. Superoxide anion and hydrogen
❌ INCORRECT

Running accuracy: 0.0% (0/1)

TEST 2/20
Question: In a 37-year-old female patient with a fractured clavicle where the junction of the inner and middle...
Ground Truth: Thrombosis of the subclavian vein, causing a pulmonary embolism
⏳ Generating prediction...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prediction: The answer is: Kienböck's disease.
❌ INCORRECT

Running accuracy: 0.0% (0/2)

TEST 3/20
Question: In which condition does the antagonism of histamine by H1 antihistaminics not afford any benefit?...
Ground Truth: Common cold
⏳ Generating prediction...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prediction: The answer is: Asthma attack in a child with bronchial asthma.
❌ INCORRECT

Running accuracy: 0.0% (0/3)

TEST 4/20
Question: A 74-year-old man has a 1.5-centimeter, faintly erythematous, raised lesion with irregular borders o...
Ground Truth: Irreversible nuclear changes in the stratum basale
⏳ Generating prediction...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prediction: The answer is: Intraepithelial carcinoma cells with keratinization and nuclear grooves and pseudoinclusions.
❌ INCORRECT

Running accuracy: 0.0% (0/4)

TEST 5/20
Question: A 24-year-old male presents to the psychiatry emergency department with symptoms of excitement, gran...
Ground Truth: Risperidone
⏳ Generating prediction...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prediction: The answer is: Risperidone
The reason is: Risperidone is an atypical antipsychotic that is effective in treating the symptoms of schizophrenia and can be used for mood stabilization in cases of acute 
✅ CORRECT (Exact match)

Running accuracy: 20.0% (1/5)

TEST 6/20
Question: An 18-year-old pregnant woman, who is 10 weeks along, presents at her first prenatal visit reporting...
Ground Truth: Treat with nitrofurantoin for seven days.
⏳ Generating prediction...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prediction: The answer is: Ceftriaxone and azithromycin for 7 days.
❌ INCORRECT

Running accuracy: 16.7% (1/6)

TEST 7/20
Question: A 40-year-old male presented with right loin pain referred to the right iliac fossa. After an ultras...
Ground Truth: Mid ureter
⏳ Generating prediction...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prediction: The answer is: Renal pelvis and calyces of the right kidney.
❌ INCORRECT

Running accuracy: 14.3% (1/7)

TEST 8/20
Question: What is the most common functioning pancreatic islet cell tumor?...
Ground Truth: Insulinoma
⏳ Generating prediction...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prediction: The answer is: Islet cell tumor of the pancreas (insulinoma) is the most common functioning pancreatic islet cell tumor.
✅ CORRECT (Exact match)

Running accuracy: 25.0% (2/8)

TEST 9/20
Question: In an MRI scan showing a transaxial section through the head, which structure may be obliterated by ...
Ground Truth: The optic chiasm.
⏳ Generating prediction...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prediction: The answer is: Inferior ophthalmic vein. A pituitary tumor may cause obliteration of the inferior ophthalmic vein. The other options are not correct. The optic chiasm is not obliterated by a pituitary
❌ INCORRECT

Running accuracy: 22.2% (2/9)

TEST 10/20
Question: What artery is a direct branch of the gastroduodenal artery?...
Ground Truth: Right gastroepiploic artery
⏳ Generating prediction...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prediction: The answer is: Common hepatic artery.
❌ INCORRECT

Running accuracy: 20.0% (2/10)

TEST 11/20
Question: A patient diagnosed with bronchiectasis has now presented with nephrotic syndrome. What is the most ...
Ground Truth: Amyloidosis
⏳ Generating prediction...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prediction: The answer is: IgA nephropathy with bronchial involvement and pulmonary complications.
❌ INCORRECT

Running accuracy: 18.2% (2/11)

TEST 12/20
Question: What is the most general term for the process by which the amount of active drugs in the body is red...
Ground Truth: Elimination
⏳ Generating prediction...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prediction: The answer is: Metabolism and Excretion (ME) process.
❌ INCORRECT

Running accuracy: 16.7% (2/12)

TEST 13/20
Question: A 7-year-old boy presents with developmental delay, intellectual disability, and a history of cerebr...
Ground Truth: Decreased methionine concentration
⏳ Generating prediction...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prediction: The answer is: Elevated homocysteine levels (hyperhomocystinemia) in serum.
❌ INCORRECT

Running accuracy: 15.4% (2/13)

TEST 14/20
Question: A 27-year-old male presents with a palpable mass in his scrotum and mild testicular pain. Upon physi...
Ground Truth: Compression of the left renal vein at the aortic origin of the superior mesenteric artery
⏳ Generating prediction...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prediction: The answer is: Spermatocele (a cyst containing sperm) in the scrotum.
❌ INCORRECT

Running accuracy: 14.3% (2/14)

TEST 15/20
Question: A farmer has a black mole on the cheek that has increased in size to more than 6mm with sharply defi...
Ground Truth: Superficial spreading melanoma
⏳ Generating prediction...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prediction: The answer is: Acne keloidus with central black lesion.
❌ INCORRECT

Running accuracy: 13.3% (2/15)

TEST 16/20
Question: Which viruses are known to cause hemorrhagic fever?...
Ground Truth: Lassa fever virus, Yellow fever virus, Crimean-Congo hemorrhagic fever virus
⏳ Generating prediction...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prediction: The answer is: Machupo virus, Lassa virus, Ebola virus, Marburg virus.
❌ INCORRECT

Running accuracy: 12.5% (2/16)

TEST 17/20
Question: What is the name of the vertical crest found in the fundus of the internal auditory canal?...
Ground Truth: Bill's bar
⏳ Generating prediction...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prediction: The answer is: Forssman's crest.
❌ INCORRECT

Running accuracy: 11.8% (2/17)

TEST 18/20
Question: What is the structure not involved in the Unhappy triad of O'Donoghue?...
Ground Truth: Fibular collateral ligament
⏳ Generating prediction...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prediction: The answer is: Antrum of Maxillary Digastric Arches.
❌ INCORRECT

Running accuracy: 11.1% (2/18)

TEST 19/20
Question: What is the most likely diagnosis for a 22-year-old woman who develops small itchy wheals after phys...
Ground Truth: Cholinergic urticaria
⏳ Generating prediction...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prediction: The answer is: Angioedema due to mast cell degranulation.
❌ INCORRECT

Running accuracy: 10.5% (2/19)

TEST 20/20
Question: What is the most reliable feature indicating the malignant transformation of pheochromocytoma?...
Ground Truth: Presence of metastasis to other organs
⏳ Generating prediction...
Prediction: The answer is: Metastasis to the liver and lungs.
❌ INCORRECT

Running accuracy: 10.0% (2/20)


In [9]:
# ============================================================
# STEP 6: CALCULATE FINAL METRICS
# ============================================================

print("\n" + "="*80)
print("FINAL RESULTS")
print("="*80)

accuracy = ((correct_exact + correct_partial) / total) * 100

print(f"Total examples evaluated: {total}")
print(f"Exact matches: {correct_exact} ({(correct_exact/total)*100:.1f}%)")
print(f"Partial matches: {correct_partial} ({(correct_partial/total)*100:.1f}%)")
print(f"Total correct: {correct_exact + correct_partial} ({accuracy:.1f}%)")
print(f"Incorrect: {total - correct_exact - correct_partial} ({((total - correct_exact - correct_partial)/total)*100:.1f}%)")
print(f"Total evaluation time: {total_time/60:.1f} minutes")
print(f"Average time per example: {total_time/total:.1f} seconds")


FINAL RESULTS
Total examples evaluated: 20
Exact matches: 2 (10.0%)
Partial matches: 0 (0.0%)
Total correct: 2 (10.0%)
Incorrect: 18 (90.0%)
Total evaluation time: 0.4 minutes
Average time per example: 1.3 seconds


In [10]:

# ============================================================
# STEP 7: ANALYZE DETAILED RESULTS
# ============================================================

print("\n" + "="*80)
print("DETAILED RESULTS")
print("="*80)

# Show incorrect examples
incorrect = [r for r in results if not r['correct']]
if incorrect:
    print(f"\n❌ INCORRECT EXAMPLES ({len(incorrect)}):")
    print("="*80)
    for i, r in enumerate(incorrect, 1):
        print(f"\n{i}. Question: {r['question']}")
        print(f"   Ground Truth: {r['ground_truth']}")
        print(f"   Prediction: {r['prediction'][:100]}...")
else:
    print("\n🎉 ALL EXAMPLES CORRECT!")

# Show correct examples (first 5)
correct = [r for r in results if r['correct']]
if correct:
    print(f"\n✅ CORRECT EXAMPLES ({len(correct)}):")
    print("="*80)
    for i, r in enumerate(correct[:5], 1):
        print(f"\n{i}. Question: {r['question']}")
        print(f"   Ground Truth: {r['ground_truth']}")
        print(f"   Prediction: {r['prediction'][:80]}...")
        print(f"   Match type: {r['match_type']}")
    if len(correct) > 5:
        print(f"\n... and {len(correct) - 5} more correct examples")




DETAILED RESULTS

❌ INCORRECT EXAMPLES (18):

1. Question: After a 60-year-old man underwent a successful orthotopic liver transplantation, the transplanted liver exhibited poor function and produced minimal bile for the first 3 days. This poor graft function is thought to result from 'reperfusion injury.' What substance is most likely responsible for causing reperfusion injury in the transplanted liver?
   Ground Truth: Reactive oxygen species
   Prediction: The answer is: Superoxide anion (O2-) and hydrogen peroxide (H2O2) are the substances most likely re...

2. Question: In a 37-year-old female patient with a fractured clavicle where the junction of the inner and middle third of the bone shows overriding of the medial and lateral fragments, and the arm is rotated medially but not laterally, what medical condition is likely to occur as a complication of this fracture?
   Ground Truth: Thrombosis of the subclavian vein, causing a pulmonary embolism
   Prediction: The answer is: Kien

In [11]:
# ============================================================
# STEP 8: ASSESS PERFORMANCE
# ============================================================

print("\n" + "="*80)
print("PERFORMANCE ASSESSMENT")
print("="*80)

if accuracy >= 80:
    print("🌟 EXCELLENT! Model is performing very well!")
    print("   Your fine-tuning was highly successful.")
elif accuracy >= 60:
    print("✅ GOOD! Model learned successfully!")
    print("   Consider training longer or with more data for improvement.")
elif accuracy >= 40:
    print("⚠ MODERATE. Model shows some learning.")
    print("   Recommend: Train for more epochs or increase dataset size.")
elif accuracy >= 20:
    print("⚠ POOR. Model needs significant improvement.")
    print("   Recommend: Check data quality, train longer, or use more examples.")
else:
    print("❌ VERY POOR. Model barely learned.")
    print("   Recommend: Verify data formatting and retrain from scratch.")




PERFORMANCE ASSESSMENT
❌ VERY POOR. Model barely learned.
   Recommend: Verify data formatting and retrain from scratch.


In [12]:
# ============================================================
# STEP 9: SAVE RESULTS
# ============================================================

print("\n" + "="*80)
print("SAVING RESULTS")
print("="*80)

results_summary = {
    'total_examples': total,
    'exact_matches': correct_exact,
    'partial_matches': correct_partial,
    'total_correct': correct_exact + correct_partial,
    'accuracy_percentage': accuracy,
    'incorrect': total - correct_exact - correct_partial,
    'total_time_minutes': total_time / 60,
    'avg_time_per_example': total_time / total,
    'selected_indices': selected_indices,
    'detailed_results': results
}

with open('evaluation_results.json', 'w') as f:
    json.dump(results_summary, f, indent=2)

print("✅ Results saved to: evaluation_results.json")

print("\n" + "="*80)
print("EVALUATION COMPLETE")
print("="*80)


SAVING RESULTS
✅ Results saved to: evaluation_results.json

EVALUATION COMPLETE
