In [1]:
 #1. INSTALL DEPENDENCIES (RUN FIRST)
# ======================
!pip install -q torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118
!pip install -q transformers==4.40.1 accelerate==0.29.3
!pip install -q peft==0.5.0 datasets==2.14.5 sentencepiece

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 GB[0m [31m681.6 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.2/89.2 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m93.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m101.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver d

In [2]:
# 2. SETUP ENVIRONMENT
# ======================
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model
from datasets import Dataset
import json
import re
import os

# Verify GPU
print(f"PyTorch: {torch.__version__}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")
print(f"CUDA Available: {torch.cuda.is_available()}")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-package

PyTorch: 2.1.0+cu118
GPU: Tesla T4
CUDA Available: True


In [3]:
# 3. LOAD & PREPARE DATA
# ======================
def clean_text(text):
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'[ \t]+', ' ', text)
    return text.strip()

# Upload your JSON file
from google.colab import files
uploaded = files.upload()

# Get the uploaded filename dynamically
filename = list(uploaded.keys())[0]
print(f"Processing file: {filename}")

with open(filename, "r", encoding='utf-8') as f:
    data = json.load(f)

processed_data = []
for item in data:
    if "text" in item and len(item["text"].strip()) > 50:  # Filter out very short texts
        law_text = clean_text(item["text"])
        # Create better prompts for legal documents
        prompt = f"<|system|>\nYou are a legal expert assistant. Explain legal documents clearly and accurately.\n<|user|>\nExplain the content of {item.get('file_name', 'this legal document')}.\n<|assistant|>\n{law_text[:1500]}<|end|>"
        processed_data.append({"text": prompt})

print(f"Processed {len(processed_data)} legal documents")

# Create dataset with proper split
dataset = Dataset.from_list(processed_data)
if len(processed_data) > 10:
    dataset = dataset.train_test_split(test_size=0.1, seed=42)
else:
    # If dataset is too small, use all for training
    dataset = {"train": dataset, "test": dataset}

print(f"Training samples: {len(dataset['train'])}")
print(f"Test samples: {len(dataset['test'])}")

Saving pdf_data.json to pdf_data.json
Processing file: pdf_data.json
Processed 919 legal documents
Training samples: 827
Test samples: 92


In [4]:
# 4. INITIALIZE MODEL
# ======================
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    padding_side="right"
)

# Set pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    use_cache=False  # Disable cache for training
)

print(f"Model loaded: {model_name}")
print(f"Vocab size: {len(tokenizer)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model loaded: TinyLlama/TinyLlama-1.1B-Chat-v1.0
Vocab size: 32000


In [5]:

# 5. CONFIGURE LoRA
# ======================
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  # Added more target modules
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 4,505,600 || all params: 1,104,553,984 || trainable%: 0.40791125334440875


In [6]:
# 6. TOKENIZATION
# ======================
def tokenize_function(examples):
    # Tokenize the text
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        max_length=1024,
        padding=False,  # Don't pad here, let data collator handle it
        return_tensors=None
    )

    # For causal LM, labels are the same as input_ids
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Tokenize datasets
tokenized_train = dataset["train"].map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

tokenized_test = dataset["test"].map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["test"].column_names
)

print("Tokenization completed!")

Map:   0%|          | 0/827 [00:00<?, ? examples/s]

Map:   0%|          | 0/92 [00:00<?, ? examples/s]

Tokenization completed!


In [7]:
# 7. MANUAL TRAINING LOOP (NO TRAINER CLASS)
# ======================

# Disable wandb
os.environ["WANDB_DISABLED"] = "true"

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8
)

# Create simple DataLoader without problematic sampler
from torch.utils.data import DataLoader

# Simple function to create batches manually
def create_dataloader(dataset, batch_size=1):
    """Create a simple dataloader without sampling issues"""
    batches = []
    for i in range(0, len(dataset), batch_size):
        batch_data = []
        for j in range(i, min(i + batch_size, len(dataset))):
            batch_data.append(dataset[j])
        if batch_data:
            batches.append(data_collator(batch_data))
    return batches

# Create training batches
train_batches = create_dataloader(tokenized_train, batch_size=1)
print(f"Created {len(train_batches)} training batches")

# Training setup
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)
num_epochs = 5
gradient_accumulation_steps = 4

print("Starting manual training...")
print("=" * 50)

total_loss = 0
step = 0

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    epoch_loss = 0

    for batch_idx, batch in enumerate(train_batches):
        # Move batch to device
        batch = {k: v.to(model.device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss / gradient_accumulation_steps

        # Backward pass
        loss.backward()

        total_loss += loss.item()
        epoch_loss += loss.item()

        # Update weights every gradient_accumulation_steps
        if (batch_idx + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            step += 1

            if step % 10 == 0:
                avg_loss = total_loss / step
                print(f"Step {step}, Average Loss: {avg_loss:.4f}")

    # Final optimizer step for the epoch
    optimizer.step()
    optimizer.zero_grad()

    avg_epoch_loss = epoch_loss / len(train_batches)
    print(f"Epoch {epoch + 1} completed. Average Loss: {avg_epoch_loss:.4f}")

print("Training completed!")

Created 827 training batches
Starting manual training...
Epoch 1/5
Step 10, Average Loss: 1.4529
Step 20, Average Loss: 1.3615
Step 30, Average Loss: 1.2926
Step 40, Average Loss: 1.2549
Step 50, Average Loss: 1.2149
Step 60, Average Loss: 1.1969
Step 70, Average Loss: 1.1788
Step 80, Average Loss: 1.1765
Step 90, Average Loss: 1.1571
Step 100, Average Loss: 1.1431
Step 110, Average Loss: 1.1308
Step 120, Average Loss: 1.1200
Step 130, Average Loss: 1.1086
Step 140, Average Loss: 1.0981
Step 150, Average Loss: 1.0890
Step 160, Average Loss: 1.0813
Step 170, Average Loss: 1.0771
Step 180, Average Loss: 1.0702
Step 190, Average Loss: 1.0658
Step 200, Average Loss: 1.0621
Epoch 1 completed. Average Loss: 0.2646
Epoch 2/5
Step 210, Average Loss: 1.0618
Step 220, Average Loss: 1.0555
Step 230, Average Loss: 1.0511
Step 240, Average Loss: 1.0458
Step 250, Average Loss: 1.0426
Step 260, Average Loss: 1.0379
Step 270, Average Loss: 1.0354
Step 280, Average Loss: 1.0299
Step 290, Average Loss: 

In [21]:
# Save the model and tokenizer
peft_save_path = "./finetuned-tinyllama-law-lora"

# Create directory if it doesn't exist
os.makedirs(peft_save_path, exist_ok=True)

# Save with safe serialization (recommended)
model.save_pretrained(peft_save_path, safe_serialization=True)
tokenizer.save_pretrained(peft_save_path)

# Rename config file to adapter_config.json if needed
if os.path.exists(f"{peft_save_path}/config.json"):
    os.rename(f"{peft_save_path}/config.json", f"{peft_save_path}/adapter_config.json")

print(f"Model saved to {peft_save_path}")
print("Contents of saved directory:")
print(os.listdir(peft_save_path))

Model saved to ./finetuned-tinyllama-law-lora
Contents of saved directory:
['tokenizer.model', 'special_tokens_map.json', 'generation_config.json', 'model.safetensors', 'adapter_config.json', 'tokenizer.json', 'tokenizer_config.json']


In [24]:
import json
import os

peft_save_path = "./finetuned-tinyllama-law-lora"
config_path = os.path.join(peft_save_path, "adapter_config.json")

# 1. Verify and repair the config file
with open(config_path, "r") as f:
    config = json.load(f)

# Add missing required fields if they don't exist
required_fields = {
    "peft_type": "LORA",
    "task_type": "CAUSAL_LM",
    "inference_mode": True,
    "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
}

for field, default_value in required_fields.items():
    if field not in config:
        config[field] = default_value
        print(f"Added missing field: {field} = {default_value}")

# Save the repaired config
with open(config_path, "w") as f:
    json.dump(config, f, indent=2)

print("Config file verified and repaired")

Added missing field: peft_type = LORA
Added missing field: task_type = CAUSAL_LM
Added missing field: inference_mode = True
Added missing field: base_model_name_or_path = TinyLlama/TinyLlama-1.1B-Chat-v1.0
Config file verified and repaired


In [25]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
from safetensors.torch import load_file

# 1. Load the repaired config
peft_config = PeftConfig.from_pretrained(peft_save_path)
print("Loaded config:", peft_config)

# 2. Load base model
base_model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    torch_dtype=torch.float16
)

# 3. Manual loading as a last resort
try:
    model = PeftModel.from_pretrained(
        model,
        peft_save_path,
        is_trainable=False
    )
except Exception as e:
    print(f"Standard loading failed: {e}")
    print("Attempting manual loading...")

    # Manual injection of LoRA adapters
    from peft.tuners.lora import LoraModel

    # Create LoraModel manually
    lora_model = LoraModel(model, peft_config, "default")

    # Load weights manually
    weights_path = os.path.join(peft_save_path, "adapter_model.safetensors")
    adapter_weights = load_file(weights_path)

    # Load weights into model
    lora_model.load_adapter(adapter_weights, "default")
    model = lora_model

model.eval()
print("Model successfully loaded!")

Loaded config: LoraConfig(peft_type='LORA', auto_mapping=None, base_model_name_or_path='TinyLlama/TinyLlama-1.1B-Chat-v1.0', revision=None, task_type='CAUSAL_LM', inference_mode=True, r=8, target_modules=None, lora_alpha=8, lora_dropout=0.0, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)
Model successfully loaded!


In [28]:
def generate_legal_response(prompt, max_new_tokens=512):
    """
    Generates precise legal answers using the fine-tuned model

    Args:
        prompt: The legal question
        max_new_tokens: Maximum response length

    Returns:
        Well-formatted legal answer
    """
    # Format prompt exactly as during training
    system_msg = "You are a legal expert assistant specialized in Pakistani law. Provide accurate, concise answers with references to relevant laws when possible."
    formatted_prompt = f"<|system|>\n{system_msg}\n<|user|>\n{prompt}\n<|assistant|>\n"

    # Tokenize with training settings
    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024
    ).to(model.device)

    # Generate with legal-optimized parameters
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.3,  # Low for factual accuracy
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.2,  # Prevent repetition
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode and clean
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return full_response.split("<|assistant|>")[-1].strip()

# Test questions
test_prompts = [
    "What is the function of the National Committee for Pakistan Study Centres?",
    "What is the purpose of the Pakistan Study Centres Act, 1976?",
    "What is the purpose of the Privatisation programme under this Ordinance?",
    "What is the purpose of the Commission Account?",
    "What is the purpose of the Privatisation Commission Ordinance, 2000?"
]

# Generate and display answers
print("⚖️ Pakistani Legal Expert Answers ⚖️")
print("="*50 + "\n")

for i, prompt in enumerate(test_prompts, 1):
    response = generate_legal_response(prompt)
    print(f"\n🔹 Question {i}: {prompt}")
    print(f"\n💡 Expert Answer:\n{response}")
    print("\n" + "-"*80)

⚖️ Pakistani Legal Expert Answers ⚖️


🔹 Question 1: What is the function of the National Committee for Pakistan Study Centres?

💡 Expert Answer:
The National Committee for Pakistan Study Centers (NCPSC) was established by an Act of Parliament in 1976 as a statutory body under the Ministry of Education and Culture. Its main functions include:
- Developing and promoting educational programs related to Pakistan studies at all levels;
- Establishing study centers across the country to provide students with opportunities to learn about Pakistan's history, culture, language, and society;
- Conducting research on issues relating to Pakistan and disseminating its findings through publications, seminars, conferences, and other means;
- Promoting intercultural understanding and cooperation between Pakistanis and people from other countries.

--------------------------------------------------------------------------------

🔹 Question 2: What is the purpose of the Pakistan Study Centres Act, 1976

In [29]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [31]:
model_path = "/content/drive/MyDrive/tinyllama_models/finetuned-tinyllama-law-lora"
print(model_path)


/content/drive/MyDrive/tinyllama_models/finetuned-tinyllama-law-lora


In [32]:
import shutil
import os

# Local path where your model is saved
local_path = "./finetuned-tinyllama-law-lora"

# Destination in Google Drive
drive_path = "/content/drive/MyDrive/tinyllama_models/finetuned-tinyllama-law-lora"

# Make sure the parent dir exists
os.makedirs("/content/drive/MyDrive/tinyllama_models", exist_ok=True)

# Copy the full directory
shutil.copytree(local_path, drive_path)

print("✅ Model successfully copied to Google Drive!")


✅ Model successfully copied to Google Drive!
