In [1]:
# Install Libraries
!pip install -q transformers datasets peft loralib torch pandas pyarrow accelerate scikit-learn



In [2]:
# Load and Prepare the Dataset
import os
import inspect
import pandas as pd
from datasets import Dataset as HFDataset, load_dataset

# Populated either by the project-local loader or by the Hugging Face fallback below.
train_df = val_df = test_df = None

try:
    # Preferred path: a local data_loader module that returns (train, val[, test]).
    from data_loader import load_and_preprocess_data
    print("Found data_loader.py — calling load_and_preprocess_data()")
    outputs = load_and_preprocess_data()

    if isinstance(outputs, tuple) and len(outputs) >= 2:
        train_df, val_df = outputs[0], outputs[1]
        if len(outputs) > 2:
            test_df = outputs[2]
    else:
        raise ValueError("load_and_preprocess_data() returned unexpected format.")
except Exception as e:
    # Deliberate best-effort: any failure of the local loader (including a missing
    # module or an unexpected return format) switches to the public dataset.
    print("data_loader.py not found or failed. Falling back to Hugging Face load_dataset.", e)
    ds = load_dataset("lavita/AlpaCare-MedInstruct-52k")

    # Use the "train" split when present, otherwise whichever split comes first.
    if "train" in ds:
        full = ds["train"]
    else:
        full = ds[list(ds.keys())[0]]

    df = full.to_pandas()

    if "instruction" in df.columns and "input" in df.columns and "output" in df.columns:
        # Fold the optional input into the instruction. Missing values are treated
        # as empty text and the result is stripped, so an empty input does not
        # leave a dangling separator space (and a missing instruction does not
        # turn the whole cell into NaN).
        # NOTE(review): if the dataset marks "no input" with a sentinel string
        # rather than an empty/missing value, that sentinel is appended verbatim —
        # verify against the data.
        df['instruction'] = (
            df['instruction'].fillna('') + " " + df['input'].fillna('')
        ).str.strip()

        df['response'] = df['output']

        df = df[['instruction', 'response']]
    elif "instruction" in df.columns and "response" in df.columns:
        # Already in the two-column layout used downstream.
        pass
    else:
        raise ValueError("Dataset does not contain expected 'instruction', 'input', 'output' or 'instruction', 'response' columns.")

    # Seeded shuffle, then a 90% / 5% / remainder split into train / val / test.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    n = len(df)
    n_train = int(0.90 * n)
    n_val = int(0.05 * n)
    train_df = df.iloc[:n_train].reset_index(drop=True)
    val_df = df.iloc[n_train:n_train+n_val].reset_index(drop=True)
    test_df = df.iloc[n_train+n_val:].reset_index(drop=True)


print("Train size:", len(train_df))
print("Val size:", len(val_df))
# The local loader may return only (train, val), in which case there is no test frame.
print("Test size:", len(test_df) if test_df is not None else "No test set")

# Convert to Hugging Face datasets; the index is reset first so it is a plain 0..n-1 range.
train_dataset = HFDataset.from_pandas(train_df.reset_index(drop=True))
val_dataset = HFDataset.from_pandas(val_df.reset_index(drop=True))

data_loader.py not found or failed. Falling back to Hugging Face load_dataset. No module named 'data_loader'


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/944 [00:00<?, ?B/s]

data/train-00000-of-00001-297892d5d4e8a0(…):   0%|          | 0.00/36.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

Train size: 46801
Val size: 2600
Test size: 2601


In [3]:
# Load Base Model & Tokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint name shared by the tokenizer and the model.
model_name = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # No padding token defined on this tokenizer: reuse the end-of-sequence token.
    tokenizer.pad_token = tokenizer.eos_token
    print("Setting pad_token to eos_token.")

model = AutoModelForCausalLM.from_pretrained(model_name)

# Resize the embedding matrix only when its row count differs from the tokenizer size.
if tokenizer.pad_token_id is not None:
    if model.get_input_embeddings().weight.shape[0] != len(tokenizer):
        model.resize_token_embeddings(len(tokenizer))

def tokenize_fn(examples, tok=None, max_length=512):
    """Turn a batch of instruction/response pairs into causal-LM training features.

    Args:
        examples: Mapping with parallel "instruction" and "response" sequences
            (the batched layout passed in by `Dataset.map(..., batched=True)`).
        tok: Tokenizer callable to use. Defaults to the notebook-level
            `tokenizer`, so existing `map(tokenize_fn, ...)` calls are unchanged.
        max_length: Length every sequence is truncated and padded to.

    Returns:
        The tokenizer output with an added "labels" entry: a copy of
        "input_ids" in which padded positions (attention_mask == 0) are
        replaced by -100, the index ignored by the language-modeling loss.
    """
    tok = tokenizer if tok is None else tok

    prompts = [
        f"Instruction: {instr}\n\nResponse: {resp}"
        for instr, resp in zip(examples["instruction"], examples["response"])
    ]
    tokenized = tok(prompts, padding="max_length", truncation=True, max_length=max_length)

    # The pad token may share its id with EOS, so padding cannot be recognised by
    # token id; the attention mask is the reliable marker for padded positions.
    tokenized["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

def _tokenize_split(split):
    """Run tokenize_fn over `split` in batches, dropping the split's original columns."""
    return split.map(tokenize_fn, batched=True, remove_columns=split.column_names)

tokenized_train_dataset = _tokenize_split(train_dataset)
tokenized_val_dataset = _tokenize_split(val_dataset)

print("Tokenized datasets ready:", tokenized_train_dataset.column_names)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Setting pad_token to eos_token.


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/46801 [00:00<?, ? examples/s]

Map:   0%|          | 0/2600 [00:00<?, ? examples/s]

Tokenized datasets ready: ['input_ids', 'attention_mask', 'labels']


In [4]:
# Configure LoRA (Low-Rank Adaptation)
from peft import get_peft_model, LoraConfig, TaskType

# LoRA settings gathered in one mapping before the config object is built.
lora_settings = dict(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["c_attn"],
)
lora_config = LoraConfig(**lora_settings)

# Wrap the base model so that the adapter defined by lora_config is applied to it.
peft_model = get_peft_model(model, lora_config)

print("Trainable Parameters")
peft_model.print_trainable_parameters()
print("LoRA configured.")


Trainable Parameters
trainable params: 147,456 || all params: 82,060,032 || trainable%: 0.1797
LoRA configured.




In [5]:
# Set Up the Trainer
import torch
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# mlm=False: collate batches for causal language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# fp16 mixed precision needs a CUDA device; enable it only when one is present
# so this cell also runs on a CPU-only runtime.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",   # evaluate on the validation set once per epoch
    save_strategy="epoch",   # checkpoint on the same schedule as evaluation
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    fp16=use_fp16,
    push_to_hub=False,
    report_to="none" # Disable reporting to Weights & Biases
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
)
print("Trainer ready")

Trainer ready


In [6]:
# Train the Model and Save the Adapter

import os
# Keep Weights & Biases switched off for the training run.
os.environ.update({"WANDB_MODE": "disabled"})

print("\n--- Starting Model Fine-Tuning ---")
trainer.train()
print("--- Fine-Tuning Complete ---")

# Write the trained LoRA adapter first, then the tokenizer, into the same folder.
for _artifact in (peft_model, tokenizer):
    _artifact.save_pretrained("./lora_adapter")

print("\nTraining complete!")




--- Starting Model Fine-Tuning ---


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,2.6437,2.469096
2,2.614,2.43669
3,2.5601,2.428586


--- Fine-Tuning Complete ---

Training complete!


In [7]:
import shutil

# Zip the adapter folder; make_archive returns the path of the archive it wrote.
archive_path = shutil.make_archive("lora_adapter", "zip", "lora_adapter")

# Offer the archive as a browser download when running in Google Colab.
# Elsewhere the google.colab package is unavailable, so just report where
# the archive was saved instead of failing the cell.
try:
    from google.colab import files
except ImportError:
    print(f"Not running in Google Colab; archive saved at {archive_path}")
else:
    files.download(archive_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>