In [None]:
# Authenticate this session with the Hugging Face Hub before any Hub downloads.
# NOTE(review): new_session=False presumably reuses an already-saved token instead of
# prompting again — confirm against the huggingface_hub docs.
from huggingface_hub import login
login(new_session=False)


In [None]:

!pip install  pip 
!pip install  torch
!pip install  transformers
!pip install  datasets
!pip install  accelerate
!pip install  bitsandbytes 
!pip install  peft
!pip install trl==0.9.4
!pip install  colored 

In [None]:
# Imports
import random
from textwrap import dedent
from typing import Dict, List

import matplotlib as mpl
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from colored import Back, Fore, Style
from datasets import Dataset, load_dataset
from matplotlib.ticker import PercentFormatter
from peft import (
    LoraConfig,
    PeftModel,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
from trl import SFTConfig, SFTTrainer
from trl.trainer.utils import DataCollatorForCompletionOnlyLM

# Plotting magic (for Jupyter Notebooks; remove if running as .py script)
# %matplotlib inline
# %config InlineBackend.figure_format='retina'

# Pastel palette shared by the notebook's figures
COLORS = ["#bae1ff", "#ffb3ba", "#ffdfba", "#ffffba", "#baffc9"]

# Apply seaborn's defaults first, then swap in the custom palette
sns.set(style="whitegrid", palette="muted", font_scale=1.2)
sns.set_palette(COLORS)

# Two-colour gradient built from the first two palette entries
cmap = colors.LinearSegmentedColormap.from_list(name="custom_cmap", colors=COLORS[:2])

# Dark-theme rcParams, grouped by the part of the figure they affect
MY_STYLE = {
    # canvas
    "figure.facecolor": "black",
    "axes.facecolor": "black",
    # axes frame, labels and title
    "axes.edgecolor": "white",
    "axes.linewidth": 0.5,
    "axes.labelcolor": "white",
    "axes.labelsize": "large",
    "axes.titlesize": "large",
    # text and ticks
    "text.color": "white",
    "xtick.color": "white",
    "ytick.color": "white",
    "xtick.labelsize": "medium",
    "ytick.labelsize": "medium",
    # grid
    "axes.grid": True,
    "grid.color": "gray",
    "grid.linestyle": "--",
    "grid.linewidth": 0.5,
    # artists
    "lines.color": COLORS[0],
    "patch.edgecolor": "white",
}
mpl.rcParams.update(MY_STYLE)

# Reproducibility: a single seed shared by every RNG seeded below
SEED = 42


def seed_everything(seed: int):
    """Seed Python's `random`, NumPy's global RNG and PyTorch with the same value."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)


seed_everything(SEED)

# Constants
PAD_TOKEN = "<|pad|>"
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
NEW_MODEL = "Llama-3-8B-Instruct-MedQuad-MedicalQna"


In [None]:
# Tokenizer: register a dedicated pad token and pad on the right-hand side
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})
tokenizer.padding_side = "right"

# 4-bit NF4 quantisation with double quantisation and a float16 compute dtype
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Per-device memory caps handed to the loader
max_mem = {0: "14GiB", "cpu": "32GiB"}  # leave some buffer
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    max_memory=max_mem,
    quantization_config=quantization_config,
)

# The tokenizer gained a pad token above, so resize the embeddings to the new
# vocabulary size (padded to a multiple of 8)
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)



# DATASET PREPROCESSING

In [None]:
# Load the MedQuad medical Q&A dataset by its Hub identifier.
dataset = load_dataset("keivalya/MedQuad-MedicalQnADataset")

In [None]:
# Show the loaded dataset object as the cell output.
dataset

In [None]:
# Flatten the train split into records with lower-cased column names
rows = [
    {
        "qtype": record["qtype"],
        "question": record["Question"],
        "answer": record["Answer"],
    }
    for record in dataset["train"]
]
df = pd.DataFrame(rows)

df.head()

In [None]:
# Count rows per distinct null/non-null pattern across the columns
# (value_counts over the boolean mask returned by isnull).
df.isnull().value_counts()

In [None]:
def format_example(row: dict):
    """Render one Q&A record as a complete chat-formatted training example.

    Args:
        row: Mapping (a dict, or a DataFrame row when used with
            `df.apply(..., axis=1)`) providing "qtype", "question" and "answer".

    Returns:
        The result of `tokenizer.apply_chat_template(messages, tokenize=False)` for a
        system / user / assistant conversation built from `row`.
    """
    # The previous version also assembled a dedent()-ed `prompt` string that was never
    # used; it is dropped because only `messages` is passed to the chat template.
    messages = [
        {
            # The question type is surfaced to the model through the system turn.
            "role": "system",
            "content": f"You are a helpful medical assistant. The question type is: {row['qtype']}."
        },
        {
            "role": "user",
            "content": row["question"]
        },
        {
            # Including the reference answer makes this a full training example.
            "role": "assistant",
            "content": row["answer"]
        }
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False)


In [None]:
# Build the chat-formatted text for every row (row-wise apply of format_example).
df["text"]=df.apply(format_example,axis=1)

In [None]:
df.head()

In [None]:
def count_tokens(row: Dict) -> int:
    """Return the number of input ids the tokenizer produces for `row["text"]`.

    The tokenizer is called with special tokens enabled and without an attention mask.
    """
    encoding = tokenizer(
        row["text"],
        add_special_tokens=True,
        return_attention_mask=False,
    )
    token_ids = encoding["input_ids"]
    return len(token_ids)

In [None]:
# Token length of every formatted example (row-wise apply of count_tokens).
df["token_count"] = df.apply(count_tokens, axis=1)

df.head()

In [None]:
# Number of examples under 512 tokens vs. the total number of examples.
len(df[df.token_count<512]) ,len(df) 

In [None]:
# Keep only the examples shorter than 512 tokens; `df` is rebound to the filtered frame.
df=df[df.token_count<512]
len(df)

In [None]:
# !pip install -q plotly
# import plotly.express as px
# import plotly.graph_objects as go


In [None]:
# fig = px.bar(
#     df["qtype"].value_counts().reset_index(),
#     x="count",
#     y="qtype",
#     orientation="h",
#     color="qtype",
#     title="Distribution of Question Types",
#     labels={"qtype":"Question Type", "count":"Count"}
# )
# fig.update_layout(showlegend=False)
# fig.show()


In [None]:
# Question types with a single example cannot be stratified, so set them aside first.
rare = df.groupby("qtype").filter(lambda x: len(x) < 2)
df_rest = df.drop(rare.index)

# First split: 80 % train / 20 % held-out, stratified by question type.
train, temp = train_test_split(
    df_rest,
    test_size=0.2,
    random_state=SEED,
    stratify=df_rest["qtype"],
)
# Second split: held-out -> validation / test. It is now seeded explicitly; previously
# it drew from NumPy's global RNG, so the val/test membership depended on whatever
# had consumed random numbers earlier in the session.
# NOTE(review): test_size=0.2 makes this an 80/20 val/test split — confirm a 50/50
# split was not intended.
val, test = train_test_split(temp, test_size=0.2, random_state=SEED)
# add rare categories back into train
train = pd.concat([train, rare]).reset_index(drop=True)

In [None]:
# Sizes of the filtered frame and of each split.
len(df) , len(train), len(val), len(test) 

In [None]:
# Question-type counts for each split.
print(train['qtype'].value_counts())
print(val['qtype'].value_counts())
print(test['qtype'].value_counts())


In [None]:
# Destination of each split, written as JSON Lines (one record per line)
split_files = {
    "train": "/kaggle/working/train.json",
    "validation": "/kaggle/working/val.json",
    "test": "/kaggle/working/test.json",
}

for split_name, frame in (("train", train), ("validation", val), ("test", test)):
    frame.to_json(split_files[split_name], orient="records", lines=True)

# Reload the three files through the "json" loader, rebinding `dataset`
dataset = load_dataset("json", data_files=split_files)


In [None]:
# Inspect the reloaded splits and the first training record.
print(dataset)
print(dataset["train"][0])


# BASELINE

In [None]:
# Text-generation pipeline around the current `model`; with return_full_text=False
# only the newly generated text is returned (up to 128 new tokens).
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=128,
)


In [None]:
def create_test_prompt(data_row: dict):
    """Build the inference prompt (system + user turns) for one record.

    Unlike a training example, no assistant turn is included; the chat template is
    applied with `add_generation_prompt=True` and `tokenize=False`.
    """
    system_turn = {
        "role": "system",
        "content": f"You are a helpful medical assistant. The question type is: {data_row['qtype']}.",
    }
    user_turn = {"role": "user", "content": data_row["question"]}

    return tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )


In [None]:
# Build the prompt for the first test record and show it.
row=dataset["test"][0]
prompt=create_test_prompt(row)
print(prompt)

In [None]:
%%time
# Run the pipeline on the sample prompt and print the returned text
# (the %%time magic above must stay on the first line of the cell).
outputs=pipe(prompt)
print(outputs[0]["generated_text"])

In [None]:
!pip install evaluate sacrebleu bert-score
!pip install rouge_score

In [None]:
# import evaluate

# # Load metrics
# bleu = evaluate.load("bleu")
# rouge = evaluate.load("rouge")
# bertscore = evaluate.load("bertscore")

# # Example: use small sample from test set
# predictions = []
# references = []

# for row in dataset["test"].select(range(100)):  # limit for speed
#     prompt = create_test_prompt(row)
#     output = pipe(prompt, max_new_tokens=128, do_sample=False)
#     pred = output[0]["generated_text"].split(prompt)[-1].strip()  # strip prompt from output
    
#     predictions.append(pred)
#     references.append(row["answer"])  # true answer

# # Compute BLEU
# bleu_result = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
# print("BLEU:", bleu_result)

# # Compute ROUGE
# rouge_result = rouge.compute(predictions=predictions, references=references)
# print("ROUGE:", rouge_result)

# # Compute BERTScore
# bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="en")
# print("BERTScore F1:", sum(bertscore_result["f1"]) / len(bertscore_result["f1"]))


In [None]:
# Baseline evaluation of the pipeline defined above.
# The metric objects and the prediction/reference lists used below were only defined
# in commented-out code, so this cell raised NameError on a fresh kernel. They are
# created here so the cell runs top to bottom.
import evaluate  # imported here because the package is pip-installed by an earlier cell

bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# Small sample from the test set (limit for speed)
eval_rows = dataset["test"].select(range(min(100, len(dataset["test"]))))

predictions, references = [], []
for row in tqdm(eval_rows, desc="Baseline generation"):
    prompt = create_test_prompt(row)
    output = pipe(prompt, max_new_tokens=128, do_sample=False)
    predictions.append(output[0]["generated_text"].strip())
    references.append(row["answer"])  # true answer

# Compute BLEU (expects a list of reference lists)
bleu_result = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
print("BLEU:", bleu_result)

# Compute ROUGE
rouge_result = rouge.compute(predictions=predictions, references=references)
print("ROUGE:", rouge_result)

# Compute BERTScore (mean F1 over the evaluated rows)
bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="en")
print("BERTScore F1:", sum(bertscore_result["f1"]) / len(bertscore_result["f1"]))


In [None]:
from peft import PeftModel

# Load the fine-tuned adapter from the Hub on top of the already-loaded base `model`
# and rebind `model` to the wrapped result.
# NOTE(review): PEFT presumably injects the adapter into the wrapped base model, so a
# pipeline created before this cell may no longer behave as a pure baseline — confirm.
#base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto")
model = PeftModel.from_pretrained(model, "Arushp1/llama3-medquad-qlora")


In [None]:
from transformers import pipeline

# Generation pipeline for the fine-tuned model.
# - return_full_text=False matches the baseline pipeline, so each output holds only
#   the generated answer rather than prompt + answer and nothing has to be stripped
#   back out before scoring.
# - device_map / torch_dtype are no longer passed: `model` is already instantiated
#   (quantised and placed at load time), and those options concern a model that the
#   pipeline loads itself.
# NOTE(review): the baseline pipeline generates at most 128 new tokens, this one 256;
# align the two if the metrics are meant to be directly comparable.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    do_sample=False,  # deterministic outputs for evaluation
    return_full_text=False,
)


In [None]:
from tqdm import tqdm

# Generate answers for the first 100 test records and collect the references.
preds, refs = [], []

# tqdm was imported but never used; wrapping the loop gives progress feedback for
# what is by far the slowest cell of the evaluation.
for row in tqdm(dataset["test"].select(range(100)), desc="Finetuned generation"):
    prompt = create_test_prompt(row)
    output = pipe(prompt, max_new_tokens=256)[0]["generated_text"]

    preds.append(output)
    refs.append(row["answer"])


In [None]:
def _strip_prompt(full_output: str, prompt_text: str) -> str:
    """Return `full_output` without a leading `prompt_text`, whitespace-stripped.

    Outputs that do not start with the prompt (already-clean generations) are
    returned unchanged apart from the strip, so re-running the cell is harmless.
    """
    if full_output.startswith(prompt_text):
        return full_output[len(prompt_text):].strip()
    return full_output.strip()


# Remove the echoed prompt, keeping only the generated continuation.
# Splitting on the raw question text (the previous approach) left the chat-template
# text that follows the question inside the prediction, and cut the answer short
# whenever the model repeated the question. Removing the exact prompt prefix avoids both.
clean_preds = [
    _strip_prompt(full_output, create_test_prompt(row))
    for row, full_output in zip(dataset["test"].select(range(100)), preds)
]

# Replace preds with cleaned predictions
preds = clean_preds


In [None]:
!pip install evaluate sacrebleu bert-score rouge_score


In [None]:
from evaluate import load

# Load the three metrics up front
bleu = load("bleu")
rouge = load("rouge")
bertscore = load("bertscore")

# Score the predictions against the references (BLEU takes a list of reference lists)
bleu_res = bleu.compute(predictions=preds, references=[[reference] for reference in refs])
rouge_res = rouge.compute(predictions=preds, references=refs)
bert_res = bertscore.compute(predictions=preds, references=refs, lang="en")

# BERTScore returns one F1 per example; report the mean
bert_f1_scores = bert_res["f1"]
mean_bert_f1 = sum(bert_f1_scores) / len(bert_f1_scores)

print("✅ Evaluation Results")
for label, value in (
    ("BLEU:", bleu_res["bleu"]),
    ("ROUGE-1:", rouge_res["rouge1"]),
    ("ROUGE-2:", rouge_res["rouge2"]),
    ("ROUGE-L:", rouge_res["rougeL"]),
    ("BERTScore F1:", mean_bert_f1),
):
    print(label, value)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# ----------------------------
# Step 1: Metrics
# ----------------------------

metrics = ["BLEU", "ROUGE-1", "ROUGE-2", "ROUGE-L", "BERTScore F1"]

# Baseline metrics.
# NOTE(review): presumably recorded from an earlier baseline run — confirm, or derive
# them from the baseline evaluation results instead of hard-coding.
baseline_values = [0.04865, 0.31446, 0.09346, 0.19653, 0.84862]

# Finetuned model: taken from the results computed in the evaluation cell
# (bleu_res / rouge_res / bert_res) instead of hand-typed "hypothetical" numbers, so
# the table and plot report what was actually measured.
finetuned_values = [
    float(bleu_res["bleu"]),
    float(rouge_res["rouge1"]),
    float(rouge_res["rouge2"]),
    float(rouge_res["rougeL"]),
    float(sum(bert_res["f1"]) / len(bert_res["f1"])),
]

DIFF_COL = "Difference (Finetuned - Baseline)"

# Create DataFrame
# NOTE(review): this rebinds `df`, which earlier held the Q&A examples; the name is
# kept for compatibility, but earlier cells cannot be re-run after this one.
df = pd.DataFrame({
    "Metric": metrics,
    "Baseline": baseline_values,
    "Finetuned": finetuned_values
})

df[DIFF_COL] = df["Finetuned"] - df["Baseline"]

# Print table
print("✅ Evaluation Comparison Table")
print(df)

# ----------------------------
# Step 2: Bar Plot
# ----------------------------

bar_width = 0.4
x = range(len(metrics))

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(x, baseline_values, width=bar_width, label="Baseline", align="center", color="skyblue")
ax.bar([i + bar_width for i in x], finetuned_values, width=bar_width, label="Finetuned", align="center", color="salmon")
# Centre each tick between the two bars of its metric
ax.set_xticks([i + bar_width / 2 for i in x])
ax.set_xticklabels(metrics)
ax.set_ylabel("Score")
ax.set_title("Baseline vs Finetuned Model Evaluation Metrics")
ax.set_ylim(0, 1.0)
ax.legend()
plt.show()

# ----------------------------
# Step 3: Highlight improvements
# ----------------------------

print("\n✅ Metric Improvements/Drops")
for metric, diff in zip(metrics, df[DIFF_COL]):
    if diff > 0:
        print(f"{metric}: Improved by {diff:.4f}")
    elif diff < 0:
        print(f"{metric}: Dropped by {abs(diff):.4f}")
    else:
        # A zero difference used to be reported as "Dropped by 0.0000"
        print(f"{metric}: Unchanged")
