In [1]:
!pip install trl

Collecting trl
  Downloading trl-0.27.0-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.27.0-py3-none-any.whl (532 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m532.5/532.5 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.27.0


In [2]:
!pip install unsloth

Collecting unsloth
  Downloading unsloth-2026.1.3-py3-none-any.whl.metadata (66 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.6/66.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2026.1.3 (from unsloth)
  Downloading unsloth_zoo-2026.1.3-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth)
  Downloading tyro-1.0.5-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.33.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting bitsandbytes!=0.46.0,!=0.48.0,>=0.45.5 (from unsloth)
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets!=4.0.*,!=4.1.0,<4.4.0,>=3.4.1 (from unsloth)
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting trl!=0.19.0,<=0.24.0,>=0.18.2 (from unsl

In [3]:
import json
import torch
from datasets import Dataset, load_dataset
from peft import LoraConfig
from transformers import AutoTokenizer, TextStreamer

BASE_SAMPLES = [
    {
        "dealer_name": "CHARBHUJA TRACTOR & COMPRESSOR",
        "model_name": "POWERTRAC EURO 42 PLUS",
        "horse_power": 42,
        "asset_cost": 800000,
        "raw_text": """CHARBHUJA TRACTOR & COMPRESSOR
ESCORTS POWERTRAC TRACTORS
Near Sharma Hospital KANKROLI
Quotation No 426
POWERTRAC EURO 42 PLUS
42 HP
Amount Rs 8,00,000"""
    },
    {
        "dealer_name": "AUTO UNION JODHPUR",
        "model_name": "ME/TAFE TRACTOR ME 241",
        "horse_power": 42,
        "asset_cost": 590000,
        "raw_text": """MASSEY FERGUSON
AUTO UNION JODHPUR
ME/TAFE TRACTOR ME 241
(42 HP)
TOTAL 590000"""
    },
    {
        "dealer_name": "SHREE BALAJI TRACTORS",
        "model_name": "SWARAJ 744 FE",
        "horse_power": 48,
        "asset_cost": 720000,
        "raw_text": """SHREE BALAJI TRACTORS
AUTH DEALER SWARAJ
MODEL SWARAJ 744 FE
48 HP
PRICE 7,20,000"""
    },
    {
        "dealer_name": "MAHINDRA TRACTORS",
        "model_name": "MAHINDRA 275 DI TU",
        "horse_power": 39,
        "asset_cost": 650000,
        "raw_text": """MAHINDRA & MAHINDRA LTD
MAHINDRA 275 DI TU
39 HP
TOTAL AMOUNT 6,50,000"""
    }
]
import random
import re

def add_ocr_noise(text):
    replacements = {
        "O": "0",
        "I": "1",
        "S": "5",
        "E": "F",
        "&": "AND",
        "HP": random.choice(["HP", "H P", "H.P", "H P."]),
    }

    for k, v in replacements.items():
        if random.random() < 0.3:
            text = text.replace(k, v)

    # Random casing
    if random.random() < 0.4:
        text = text.lower()
    elif random.random() < 0.4:
        text = text.upper()

    # Random spacing noise
    text = re.sub(r"\s+", lambda m: " " * random.randint(1, 3), text)

    # Random numeric separators
    text = text.replace("800000", random.choice(["8,00,000", "800000", "8.00.000"]))
    text = text.replace("590000", random.choice(["5,90,000", "590000"]))

    return text
def build_prompt(raw_text):
    return (
        "Extract the following fields as JSON from this noisy OCR text of a tractor quotation: "
        "dealer_name, model_name, horse_power, asset_cost. "
        "Return only valid JSON, use null for missing values.\n\n"
        f"Raw text:\n{raw_text}"
    )


def build_completion(sample):
    return json.dumps({
        "dealer_name": sample["dealer_name"],
        "model_name": sample["model_name"],
        "horse_power": sample["horse_power"],
        "asset_cost": sample["asset_cost"]
    })
def generate_synthetic_data(n_samples=100):
    data = []

    for i in range(n_samples):
        base = BASE_SAMPLES[i % len(BASE_SAMPLES)]

        noisy_text = add_ocr_noise(base["raw_text"])

        data.append({
            "prompt": build_prompt(noisy_text),
            "completion": build_completion(base)
        })

    return data


In [4]:
synthetic_data = generate_synthetic_data(100)

with open("tractor_data.jsonl", "w") as f:
    for row in synthetic_data:
        f.write(json.dumps(row) + "\n")


In [5]:
from trl import SFTTrainer
from unsloth import FastLanguageModel



Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [18]:
from trl import SFTConfig

# Load dataset
dataset = load_dataset("json", data_files="tractor_data.jsonl", split="train")

# Step 2: Load Base Model with 4-bit Quantization
max_seq_length = 2048
model, tokenizer = FastLanguageModel.from_pretrained(
    "microsoft/Phi-3.5-mini-instruct",
    max_seq_length=max_seq_length,
    dtype=None,  # Auto-detect
    load_in_4bit=True,
)

# Step 3: Configure LoRA
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Step 4: Fine-Tune
from transformers import TrainingArguments

# ─────────────────────────────────────────────────────────────
# Pre-format dataset → avoid TRL's automatic EOS / formatting issues
# ─────────────────────────────────────────────────────────────

def format_for_training(example):
    text = f"{example['prompt']}{tokenizer.eos_token}{example['completion']}"
    return {"text": text}

# Apply formatting once, before creating trainer
formatted_dataset = dataset.map(
    format_for_training,
    num_proc=2,           # safe on Colab T4
    desc="Formatting dataset with prompt + completion + EOS"
)

# ─────────────────────────────────────────────────────────────
# Now train with dataset_text_field instead of formatting_func
# ─────────────────────────────────────────────────────────────

from trl import SFTTrainer, SFTConfig

trainer = SFTTrainer(
    model=model,
    train_dataset=formatted_dataset,

    dataset_text_field="text",          # ← tell trainer where the full text is

    args=SFTConfig(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,                   # testing → change to num_train_epochs=3 later
        # num_train_epochs=3,
        learning_rate=2e-4,
        fp16=True,
        bf16=False,
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",

        # Important: keep this false since we're pre-formatting
        completion_only_loss=False,
    ),
)

trainer.train()

model.save_pretrained("custom_phi3_tractor_extractor")         # saves LoRA adapters + config
tokenizer.save_pretrained("custom_phi3_tractor_extractor")

# Optional: merge LoRA into base model (16-bit) for easier inference later
model.save_pretrained_merged(
    "custom_phi3_tractor_extractor_merged",
    tokenizer,
    save_method="merged_16bit"   # or "merged_4bit" if you want smaller size
)

==((====))==  Unsloth 2026.1.3: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Formatting dataset with prompt + completion + EOS (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


🦥 Unsloth: Padding-free auto-enabled, enabling faster training.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100 | Num Epochs = 5 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 29,884,416 of 3,850,963,968 (0.78% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.6754
2,2.6058
3,2.5989
4,2.6944
5,2.4936
6,2.3655
7,2.4069
8,2.016
9,1.9547
10,1.9512


config.json: 0.00B [00:00, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00002.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  50%|█████     | 1/2 [02:28<02:28, 148.00s/it]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.65G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 2/2 [04:20<00:00, 130.40s/it]
Unsloth: Merging weights into 16bit: 100%|██████████| 2/2 [05:12<00:00, 156.32s/it]


Unsloth: Merge process complete. Saved to `/content/custom_phi3_tractor_extractor_merged`


In [19]:
# Step 6: Load the Model and Infer
# Reload (simulating separate run)
model, tokenizer = FastLanguageModel.from_pretrained(
    "custom_phi3_tractor_extractor_merged",  # Or original + adapters
    dtype=None,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)  # Enables faster inference



==((====))==  Unsloth 2026.1.3: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32009)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): Llam

In [23]:
# ─────────────────────────────────────────────────────────────
# Fixed Inference: Get ONLY the generated JSON (no prompt repeat)
# ─────────────────────────────────────────────────────────────

from transformers import TextStreamer
import torch

# Your sample raw OCR (unchanged)
sample_raw = """GST - 08AAPPC9391K121\nQuotation\n* (02952) 222298\nMob. 9414172521\nCHARBHUJA TRACTOR & COMPRESSOR\n9929179791\nESCORTS\nAuth. Dealers : Escort Powertrac Tractors\nPOWERTRAC\nTRACTORS\nNear Sharma Hospital, TVS Chouraha, Nathdwara Road, KANKROLI - 313324\nNo.\n426\nDate.\n28|09|25\nM/s./Shri_\nDear Sir,\nunder :\nWe are pleased to submit our quotation for Tractor and Implementsyas per standard specification of the manufactures, as\nKAJSAMAMD\nDescription\nQty.\nPOWERTRAL CURD 42+\n1\nRate\n8,00,000\nAmount\n8,00,000-00\n45 MP\nHPH →\nIDFC FIRLT BANK 4O.\nTOTAL\n(Rs.)\n8,00,000-00\nTerms & Conditions:\nPrices:\nQuoted above are Ex-Godown Rajsamand, inclusive of taxex, duties and additional road transportation charges etc\nThe rates are strictly subjects to change without notice and shal be charged as ruling at the time of Delivery.\nDelivery Period\nSubject to Circumstances beyond our control\nPayment: Rs.\nsecurity deposit and balance full payment against delivery. Altematively. full payment by Demand Draft in\nadvance to be remitted to our Principals\nwhen demanded.\nTechnical Specifications: Manufacturers reserves the right to despatch the tractors from Faridabad or from any other assembly unit by road\nTransport to Rajsamand of the their\nmer Depot. The freight and other incidental charges are to be bome by the customer\nManufactures reserve the\nht to change any specification of the Tractors, Implements and accessones\nAn illustrate iterature of the\nTralcor and the Implements ordered by the customer is enclosed.\n13.00\nHopping to be favoured with valued orders\nThanking You!\nCustomer's Signature\nFor: CHARBHUJA TRACTOR & COMF\nsards niger"""

prompt_template = """Extract the following fields as JSON from this noisy OCR text of a tractor quotation: dealer_name, model_name, horse_power, asset_cost. Return only valid JSON, use null for missing values.

Raw text: {}"""

full_prompt = prompt_template.format(sample_raw)

# Tokenize prompt
inputs = tokenizer(full_prompt, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")

# Get length of prompt tokens (for slicing later)
input_length = inputs.input_ids.shape[1]

# Enable fast inference mode (Unsloth)
FastLanguageModel.for_inference(model)

# Streamer: skips prompt, streams only new tokens
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Generate
outputs = model.generate(
    **inputs,
    max_new_tokens=128,          # enough for JSON
    do_sample=False,             # deterministic for debugging
    temperature=0.0,             # no randomness
    streamer=streamer,           # live stream ONLY new text
    eos_token_id=tokenizer.eos_token_id,
)

# Clean final output: slice only generated tokens + decode
generated_tokens = outputs[0, input_length:]  # everything after prompt
clean_output = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

print("\n\n=== Clean Generated Text (only model response) ===")
print(clean_output)

# If you want to force JSON parsing/debug (optional)
try:
    import json
    parsed = json.loads(clean_output)
    print("\nParsed JSON:", json.dumps(parsed, indent=2))
except json.JSONDecodeError as e:
    print("\nNot valid JSON yet:", e)

ia POWERTRAC TRACTORS
Model: POWERTRAC CURD 42+   HP 45  MF  Amount 8,00,000


=== Clean Generated Text (only model response) ===
ia POWERTRAC TRACTORS
Model: POWERTRAC CURD 42+   HP 45  MF  Amount 8,00,000

Not valid JSON yet: Expecting value: line 1 column 1 (char 0)
