In [None]:
import os
# Debugging aid: set CUDA_LAUNCH_BLOCKING for clearer CUDA error messages.
# NOTE(review): this is normally a debug-only setting — consider removing it
# once the notebook runs cleanly.
os.environ.update({'CUDA_LAUNCH_BLOCKING': '1'})

In [None]:
!pip install unsloth trl peft accelerate bitsandbytes pandas psutil psutils
!pip install "trl<0.15.0" psutil --upgrade

In [None]:
# GPU check: report whether CUDA is usable and the name of device 0.
import torch

cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")

gpu_label = torch.cuda.get_device_name(0) if cuda_ready else 'None'
print(f"GPU: {gpu_label}")

In [None]:
from unsloth import FastLanguageModel
import torch

# Checkpoint to fine-tune.
model_name = "unsloth/llama-3.2-3b-bnb-4bit"

# Sequence length handed to the loader here and to the trainer later on.
max_seq_length = 2048
# None lets the loader auto-detect the dtype.
dtype = None

# Gather the loader options in one place, then load model and tokenizer.
load_options = dict(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

In [None]:
# !pip install "trl<0.15.0" psutil --upgrade

In [None]:
from datasets import load_dataset
from unsloth.chat_templates import get_chat_template
import pandas as pd
from datasets import Dataset
from unsloth import FastLanguageModel
import torch

# tokenizer = get_chat_template(
#     tokenizer,
#     chat_template = "llama-3", # Use Llama-3 instruction format
# )

# 2. Add LoRA adapters
# NOTE(review): this cell rebinds `model`; re-running it without reloading the
# base model would wrap an already-adapted model — confirm before re-executing.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

lora_options = dict(
    r=64,                # LoRA rank - higher = more capacity, more memory
    target_modules=lora_target_modules,
    lora_alpha=128,      # LoRA scaling factor (usually 2x rank)
    lora_dropout=0,      # Supports any, but = 0 is optimized
    bias="none",         # Supports any, but = "none" is optimized
    use_gradient_checkpointing="unsloth",  # Unsloth's optimized version
    random_state=3407,
    use_rslora=False,    # Rank stabilized LoRA
    loftq_config=None,   # LoftQ
)

model = FastLanguageModel.get_peft_model(model, **lora_options)


In [None]:
from datasets import load_dataset
from unsloth.chat_templates import get_chat_template
import pandas as pd
import json
from datasets import Dataset
from unsloth import FastLanguageModel
import torch

# ============================================
# 1. LOAD JSON DATA
# ============================================
# Expected top-level structure: a non-empty JSON array of objects, e.g.
#   [{"instruction": "...", "input": "...", "output": "..."}, ...]
# or
#   [{"question": "...", "answer": "..."}, ...]

json_file_path = "solar_qa.json"  # Change this to your JSON file path

with open(json_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Fail early with a readable message: without this check an empty array or a
# non-array top level only surfaces as an IndexError/KeyError at data[0] below.
if not isinstance(data, list) or not data:
    raise ValueError(
        f"{json_file_path} must contain a non-empty JSON array of records "
        f"(got {type(data).__name__} with {len(data) if hasattr(data, '__len__') else 'no'} entries)"
    )

print(f"Loaded {len(data)} samples from JSON")
print(f"Sample: {data[0]}")

# ============================================
# 2. CONVERT JSON TO DATASET FORMAT
# ============================================

# Training prompt layout. The placeholders {instruction}, {input_text},
# {response} and {eos_token} are filled in per sample via str.format.
prompt_template = "\n".join([
    "Below is an instruction that describes a task, paired with an input that provides further context.",
    "",
    "### Instruction:",
    "{instruction}",
    "",
    "### Input:",
    "{input_text}",
    "",
    "### Response:",
    "{response}{eos_token}",
])

# Alternative simple template (uncomment if preferred):
# prompt_template = """Question: {question}
# Answer: {response}{eos_token}"""

def convert_json_to_dataset(data, template=None, eos_token=None):
    """Render raw JSON records into training prompt strings.

    Supported record layouts, checked in this order:
      1. "instruction" + "output", with an optional "input"
         (a missing input is rendered as an empty string)
      2. "question" + "answer" (uses the record's "instruction" when it is
         present and not null, otherwise a generic instruction)

    Args:
        data: Iterable of records loaded from the JSON file.
        template: Format string with {instruction}, {input_text}, {response}
            and {eos_token} placeholders. Defaults to the module-level
            ``prompt_template``.
        eos_token: Marker appended after the response. Defaults to
            ``tokenizer.eos_token``.

    Returns:
        list[str]: One formatted prompt per usable record. Records that are
        not JSON objects, match no layout, or have a null response are
        reported with a printed warning and skipped.
    """
    # Resolve the defaults lazily so the function can also be called with
    # explicit values, without relying on notebook globals.
    if template is None:
        template = prompt_template
    if eos_token is None:
        eos_token = tokenizer.eos_token

    default_instruction = "Please answer the following question"
    texts = []

    for item in data:
        # Only JSON objects can carry the expected keys; for a plain string
        # the `in` checks below would be substring matches.
        if not isinstance(item, dict):
            print(f"Warning: Unknown format for item: {item}")
            continue

        if "instruction" in item and "output" in item:
            # Instruction/output format, with or without an input
            instruction = item["instruction"]
            input_text = item.get("input", "")
            output = item["output"]
        elif "question" in item and "answer" in item:
            # Question-answer format
            instruction = item.get("instruction")
            if instruction is None:
                instruction = default_instruction
            input_text = item["question"]
            output = item["answer"]
        else:
            print(f"Warning: Unknown format for item: {item}")
            continue

        # A null response gives the model nothing to learn from.
        if output is None:
            print(f"Warning: Skipping item with no response: {item}")
            continue

        # JSON null would otherwise be formatted as the literal text "None".
        if instruction is None:
            instruction = ""
        if input_text is None:
            input_text = ""

        texts.append(
            template.format(
                instruction=instruction,
                input_text=input_text,
                response=output,
                eos_token=eos_token,
            )
        )

    return texts

# Render every loaded record into a prompt string
texts = convert_json_to_dataset(data)

# Wrap the prompts in a Dataset with a single "text" column
dataset_dict = dict(text=texts)
dataset = Dataset.from_dict(dataset_dict)

sample_count = len(dataset)
print(f"Dataset created with {sample_count} samples")

# Show at most the first 500 characters of the first prompt
first_prompt_preview = texts[0][:500]
print(f"\nSample prompt:\n{first_prompt_preview}...")


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
import psutil



# Probe bf16 support once; fp16 is used only when bf16 is unavailable.
bf16_ok = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=10,
    # max_steps=60,  # limit for quick testing; disabled for the full fine-tune
    num_train_epochs=50,  # full fine-tuning run
    learning_rate=2e-4,
    fp16=not bf16_ok,
    bf16=bf16_ok,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",  # alternative: "linear"
    seed=3407,
    output_dir="outputs",
    report_to="none",
)

# Supervised fine-tuning on the "text" column of the prepared dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=1,  # Changed from 2 to 1 to fix psutil issues
    args=training_args,
)

trainer_stats = trainer.train()

# model = model.merge_and_unload()


In [None]:
# Commented out so the response can be checked before merging; the adapter may later need to be merged back into the model.
# import torch
# import gc

# print("Step 1: Merging LoRA adapters...")

# # Clear cache
# if torch.cuda.is_available():
#     torch.cuda.empty_cache()
# gc.collect()

# # Merge adapters
# model = model.merge_and_unload()

# # Save merged model temporarily
# print("Step 2: Saving merged model...")
# model.save_pretrained("merged_model_temp")
# tokenizer.save_pretrained("merged_model_temp")

# # Clear everything
# del model
# gc.collect()
# if torch.cuda.is_available():
#     torch.cuda.empty_cache()

# # Reload the merged model fresh
# print("Step 3: Reloading merged model...")
# from unsloth import FastLanguageModel

# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name="merged_model_temp",
#     max_seq_length=2048,
#     dtype=None,
#     load_in_4bit=False,  # Load full precision for inference
# )

# Prepare for inference
# FastLanguageModel.for_inference(model)

In [None]:
# ============================================
# 4. TEST THE FINE-TUNED MODEL
# ============================================

# Switch the model into inference mode
FastLanguageModel.for_inference(model)

# Test cases based on the training data: one shared instruction, three questions
shared_instruction = "Explain the following content clearly and accurately."
test_questions = [
    "What type of inverter is proposed for the system?",
    "How is GST applied to the solar project cost?",
    "What warranty coverage is provided for the solar inverter?",
]
test_cases = [
    {"instruction": shared_instruction, "input": question}
    for question in test_questions
]

# Sampling settings shared by every test prompt.
# (An earlier experiment used temperature=0.7, top_p=0.9 and use_cache=True.)
generation_settings = dict(
    max_new_tokens=128,
    temperature=0.2,  # Very low = more deterministic (reduce hallucination)
    top_p=0.9,
    top_k=40,  # Add top_k for more control
    do_sample=True,
    repetition_penalty=1.1,  # Reduce repetition
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    no_repeat_ngram_size=3,  # Prevent 3-gram repetition
)


def answer_test_case(instruction, input_text):
    """Generate the model's reply for one instruction/input pair.

    Builds the prompt up to (and excluding) the response text, generates with
    ``generation_settings``, and returns only the newly generated text with
    surrounding whitespace stripped.
    """
    prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context.

### Instruction:
{instruction}

### Input:
{input_text}

### Response:
"""
    encoded = tokenizer([prompt], return_tensors="pt").to("cuda")
    generated = model.generate(**encoded, **generation_settings)

    # Drop the prompt tokens so only the continuation is decoded
    prompt_length = encoded.input_ids.shape[1]
    completion = tokenizer.batch_decode(
        generated[:, prompt_length:],
        skip_special_tokens=True,
    )[0]
    return completion.strip()


print("--- Model Predictions ---\n")

for case in test_cases:
    case_instruction = case["instruction"]
    case_input = case["input"]
    reply = answer_test_case(case_instruction, case_input)

    print(f"Instruction: {case_instruction}")
    print(f"Input: {case_input}")
    print(f"Response: {reply}")
    print("-" * 50)

In [None]:
# Export to GGUF with q4_k_m quantization (balanced quality; per the original
# note the file is about 2 GB).
model.save_pretrained_gguf(
    "gguf_model",
    tokenizer,
    quantization_method="q4_k_m",
)
# Alternative for better quality (size will be about 3 GB):
# model.save_pretrained_gguf("gguf_model", tokenizer, quantization_method="Q8_0")

In [None]:
# STEPS TO PROCESS
# Download the file Llama-3.2-3B.Q4_K_M.gguf and copy it into the project root folder.
# Make the following changes:

# In the docker-compose file:
#       - ./models:/root/.ollama/models   # store models locally for reuse
#       - .:/workspace                    # mount root directory to access custom GGUF files
#     environment:

# Create a Modelfile (or add to the existing one) with:
#   FROM /workspace/Llama-3.2-3B.Q4_K_M.gguf
#
#   # Optional: Set model parameters
#   PARAMETER temperature 0.7
#   PARAMETER top_p 0.9
#   PARAMETER top_k 40


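# Stop the running containers:
# cd /Users/blpancholi/innovatechs_work/dev/ollama_test && docker-compose down
#
# Start them again in the background:
# cd /Users/blpancholi/innovatechs_work/dev/ollama_test && docker-compose up -d
#
# Create the Ollama model from the Modelfile
# (interactive variant: docker exec -it ollama ollama create llama3.2-3b -f /workspace/Modelfile):
# docker exec ollama ollama create llama3.2-3b -f /workspace/Modelfile
#
# Check that the model is listed:
# docker exec ollama ollama list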



# API call (request body):
#
# {
#   "model": "llama3.2-3b",
#   "messages": [
#     {"role": "user", "content": "Tell me about APN company?"}
#   ]
# }

In [None]:

# from google.colab import files
# import os

# gguf_files = [f for f in os.listdir("gguf_model") if f.endswith(".gguf")]
# if gguf_files:
#     gguf_file = os.path.join("gguf_model", gguf_files[0])
#     print(f"Downloading: {gguf_file}")
#     files.download(gguf_file)

# import shutil
# from google.colab import files

# # Create a zip archive of the folder
# shutil.make_archive("gguf_model", "zip", "gguf_model")

# # Download the zip
# files.download("gguf_model.zip")
