<a href="https://colab.research.google.com/github/ars235546101/Mini/blob/main/train_lora_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets accelerate peft




In [None]:
# Mount Google Drive at /content/drive so that files stored on Drive can be
# read through the local filesystem by later cells. Colab-only: the
# `google.colab` package is not available outside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Import here so this cell also works on a fresh kernel; without it the call
# below raises NameError because `load_dataset` is otherwise only imported in
# a later cell.
from datasets import load_dataset

# Load the JSONL copy of the training data kept on Google Drive.
# Requires Drive to be mounted at /content/drive.
ds = load_dataset("json", data_files="/content/drive/MyDrive/Colab Notebooks/train_health.jsonl")


In [None]:
import json

# Hand-written training pairs. Both fields are JSON documents stored as plain
# strings: "prompt" carries the user profile and report values, "completion"
# carries the weekly plan the model should produce for that profile.
examples = [
    dict(
        prompt='{"basic":{"age":48,"weight_kg":92,"height_cm":168,"dietary_pref":"non-veg","activity_level":"sedentary","goals":["weight-loss"]}, "report_kv":{"systolic_bp":150,"diastolic_bp":95,"fasting_glucose_mg_dl":160}}',
        completion='{"week_start":"2025-09-22","week_end":"2025-09-28","safety_notes":["Hypertension: reduce salt"],"days":{"monday":{"diet":{"breakfast":{"time":"07:30","items":["Oats","Almonds"]}},"exercise":[{"time":"18:00","activity":"Brisk walk","duration_min":30}]}}}',
    ),
    dict(
        prompt='{"basic":{"age":32,"weight_kg":70,"height_cm":172,"dietary_pref":"veg","activity_level":"active","goals":["muscle-gain"]}, "report_kv":{"systolic_bp":120,"diastolic_bp":80,"fasting_glucose_mg_dl":95}}',
        completion='{"week_start":"2025-09-22","week_end":"2025-09-28","safety_notes":[],"days":{"monday":{"diet":{"breakfast":{"time":"08:00","items":["Paneer bhurji","Brown bread"]}},"exercise":[{"time":"18:00","activity":"Strength training","duration_min":60}]}}}',
    ),
]

# Serialise every record to one JSON line, then write the whole file in one go
# (JSON Lines: one object per line, each terminated by a newline).
jsonl_lines = [json.dumps(record) + "\n" for record in examples]
with open("train_health.jsonl", "w") as out_file:
    out_file.writelines(jsonl_lines)

print("✅ train_health.jsonl created with", len(examples), "examples")


✅ train_health.jsonl created with 2 examples


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model

# Load base GPT-2. EOS is reused as the padding token so that the
# padding="max_length" tokenization below has a token to pad with.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

# Load the locally written JSONL file as the "train" split.
ds = load_dataset("json", data_files={"train": "train_health.jsonl"})

def tokenize(batch):
    """Tokenize a batch of prompt/completion pairs for causal-LM training.

    Each pair is joined into one training text, ``prompt + "###" + completion``,
    then truncated and padded to exactly 512 tokens.

    Args:
        batch: Mapping with "prompt" and "completion" entries, each a list of
            strings of equal length (the batched form used by ``Dataset.map``).

    Returns:
        The tokenizer encoding, with an added "labels" entry that is a copy of
        "input_ids".
    """
    texts = [p + "###" + c for p, c in zip(batch["prompt"], batch["completion"])]
    enc = tokenizer(texts, truncation=True, max_length=512, padding="max_length")
    # NOTE(review): DataCollatorForLanguageModeling(mlm=False) appears to rebuild
    # "labels" from "input_ids" (masking pad-token positions) at collation time,
    # which would make this copy redundant - confirm for the installed version.
    enc["labels"] = enc["input_ids"].copy()
    return enc

# Replace the raw text columns with the tokenized features.
ds = ds["train"].map(tokenize, batched=True, remove_columns=ds["train"].column_names)

# Apply LoRA to GPT-2's attention projection ("c_attn").
# task_type="CAUSAL_LM" makes PEFT wrap the model with its causal-LM class and
# records the task in the saved adapter config.
lora = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora)

# Data collator for causal (non-masked) language modelling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training settings
training_args = TrainingArguments(
    output_dir="./health_lora",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    learning_rate=5e-5,
    # 2 examples / batch size 2 x 3 epochs gives only 3 optimizer steps, so a
    # logging interval of 10 never reported a training loss. Log every step.
    logging_steps=1,
    save_total_limit=1,
    # Disable WandB and other external loggers.
    report_to="none",
)

# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds,
    data_collator=data_collator
)

trainer.train()

# Save the LoRA adapter and the tokenizer next to each other.
model.save_pretrained("./health_lora")
tokenizer.save_pretrained("./health_lora")


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


('./health_lora/tokenizer_config.json',
 './health_lora/special_tokens_map.json',
 './health_lora/vocab.json',
 './health_lora/merges.txt',
 './health_lora/added_tokens.json',
 './health_lora/tokenizer.json')

In [None]:
# Prompt for a quick qualitative check: a profile JSON followed by the "###"
# separator that joins prompt and completion in the training texts.
# Defined here explicitly so the cell does not depend on a variable left over
# from an earlier, unsaved cell.
prompt = '{"basic":{"age":48,"weight_kg":92,"height_cm":168,"dietary_pref":"non-veg","activity_level":"sedentary","goals":["weight-loss"]},"report_kv":{"systolic_bp":150,"diastolic_bp":95,"fasting_glucose_mg_dl":160}}###'

# The model may still be in training mode after `trainer.train()`; switch to
# eval mode so dropout (including the LoRA dropout) is disabled while generating.
model.eval()

# Tokenize and move the tensors to the model's device; on a GPU runtime the
# trained model lives on the GPU while freshly tokenized inputs are on the CPU.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(
    **inputs,
    # Upper bound on the total sequence length (prompt tokens included).
    max_length=600,
    # Penalizes repeated tokens; typically between 1.0 and 2.0.
    repetition_penalty=1.2,
    # Same value generate() falls back to; passing it silences the warning.
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{"basic":{"age":48,"weight_kg":92,"height_cm":168,"dietary_pref":"non-veg","activity_level":"sedentary","goals":["weight-loss"]},"report_kv":{"systolic_bp":150,"diastolic_bp":95,"fasting_glucose_mg_dl":160}}###
The following table shows the average daily energy expenditure of a person who is overweight or obese. The weight loss rate for this group was 0·8% (0·9 kg/m2) and 1·4%, respectively, in men compared with women:


