In [None]:
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch

# Pre-quantized 4bit checkpoints (4x faster downloading + no OOMs).
# This list is a reference catalogue only; the checkpoint that is actually
# loaded is named in the from_pretrained call below.
# More models at https://huggingface.co/unsloth
fourbit_models = [
    # Llama 3.2 vision support
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
    # Can fit in a 80GB card!
    "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-bnb-4bit",

    # Pixtral fits in 16GB!
    "unsloth/Pixtral-12B-2409-bnb-4bit",
    # Pixtral base model
    "unsloth/Pixtral-12B-Base-2409-bnb-4bit",

    # Qwen2 VL support
    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-72B-Instruct-bnb-4bit",

    # Any Llava variant works!
    "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit",
    "unsloth/llava-1.5-7b-hf-bnb-4bit",
]

# Load the base vision-language model together with its tokenizer/processor.
model, tokenizer = FastVisionModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit",
    # 4bit reduces memory use; set to False for 16bit LoRA.
    load_in_4bit = True,
    # True or "unsloth" for long context.
    use_gradient_checkpointing = "unsloth",
)

In [None]:
# LoRA adapter settings, collected in one mapping so they can be inspected
# (or logged) before the adapters are attached.
lora_settings = dict(
    # Which parts of the network receive adapters (False = leave untouched).
    finetune_vision_layers     = False,
    finetune_language_layers   = True,
    finetune_attention_modules = True,
    finetune_mlp_modules       = True,
    # Rank: the larger, the higher the accuracy, but might overfit.
    r = 16,
    # Recommended alpha == r at least.
    lora_alpha = 16,
    # NOTE(review): 0.3 is a high dropout for LoRA; confirm it is intentional.
    lora_dropout = 0.3,
    bias = "none",
    random_state = 3407,
    # Rank stabilized LoRA and LoftQ are supported but switched off here.
    use_rslora = False,
    loftq_config = None,
    # target_modules = "all-linear",  # optional; a list can be given if needed
)

# Wrap the loaded model with the adapters described above.
model = FastVisionModel.get_peft_model(model, **lora_settings)

In [None]:
import json
# Load the training examples from disk.
# The encoding is given explicitly: JSON text is UTF-8, and relying on the
# platform default can mis-decode non-ASCII characters on some systems.
input_file = "qwen_dataset/train.json"
with open(input_file, "r", encoding="utf-8") as f:
    dataset = json.load(f)

# Display only a short preview instead of dumping every record into the
# notebook output. Non-list payloads are shown as-is because their structure
# is not known at this point.
dataset[:3] if isinstance(dataset, list) else dataset

In [None]:
from PIL import Image
# Sample picture fed to the model in the inference cell that follows.
sample_path = "watermark_new/wikiart_01817.png"
image = Image.open(sample_path)
image

In [None]:
from pydantic import BaseModel
import json
from transformers import TextStreamer

# Schema the model's JSON answer is expected to follow.
class Validate(BaseModel):
    """Structured description of one analysed image."""

    watermarks: int   # integer watermark value (0 if none), per the prompt below
    text: str         # text detected in the image ("" if none)
    main_object: str  # primary subject in plain English
    style: str        # one label from the style list in the prompt


# JSON Schema rendering of the model above. It is not injected into the
# prompt; it is kept available for inspection.
schema_as_str = json.dumps(Validate.model_json_schema())

# Instruction with strict JSON requirement
instruction = """
You are an image analysis system. Analyze the provided image and return only a valid JSON object that exactly follows this schema:
{
  "watermarks": integer,
  "text": string,
  "main_object": string,
  "style": string
}
Rules:
Output must be strictly valid JSON, with no explanations or extra text.
"watermarks" = integer (use 0 if none).
"text" = any detected text in the image (empty string if none).
"main_object" = the primary subject of the image in plain English.
"style" = choose exactly one from the following list:
"Abstract_Expressionism"
"Action_painting"
"Analytical_Cubism"
"Art_Nouveau"
"Baroque"
"Color_Field_Painting"
"Contemporary_Realism"
"Cubism"
"Early_Renaissance"
"Expressionism"
"Fauvism"
"High_Renaissance"
"Impressionism"
"Mannerism_Late_Renaissance"
"Minimalism"
"Naive_Art_Primitivism"
"New_Realism"
"Northern_Renaissance"
"Pointillism"
"Pop_Art"
"Post_Impressionism"
"Realism"
"Rococo"
"Romanticism"
"Symbolism"
"Synthetic_Cubism"
"Ukiyo_e"
"""

# Switch the model to inference mode before generating. The training cell
# switches it back with FastVisionModel.for_training(model).
FastVisionModel.for_inference(model)

# Compose chat input: one user turn holding the image placeholder and the prompt.
messages = [
    {"role": "user", "content": [
        {"type": "image"},  # <-- your image placeholder token
        {"type": "text", "text": instruction}
    ]}
]

input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt",
).to("cuda")

# Stream tokens to the cell output as they are generated. Streaming does not
# constrain the output; schema conformance is checked after generation.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

output = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
    # Sampling settings; lower temperature = more reliable structured JSON.
    # NOTE(review): presumably these only apply when sampling is enabled in
    # the model's generation config - confirm.
    temperature=0.7,
    min_p=0.1
)

# The returned sequence begins with the prompt tokens; keep only the new ones.
prompt_length = inputs["input_ids"].shape[1]
generated_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

# Actually check the answer against the schema instead of trusting the prompt.
try:
    validated = Validate.model_validate_json(generated_text)
except ValueError as err:  # pydantic's ValidationError is a ValueError subclass
    validated = None
    print(f"Model output does not match the Validate schema:\n{err}")
validated

In [None]:
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

# Put the model into training mode before building the trainer.
FastVisionModel.for_training(model)

# Collator that prepares image+text batches; required for vision finetuning.
vision_collator = UnslothVisionDataCollator(model, tokenizer)

# Optimisation and bookkeeping settings for the supervised finetuning run.
training_args = SFTConfig(
    # 2 samples per device x 4 accumulation steps per optimizer update.
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    max_steps = 100,
    # num_train_epochs = 1,  # use instead of max_steps for full training runs
    learning_rate = 2e-4,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none",  # no experiment tracker (e.g. Weights and Biases)
    # The four items below MUST be set for vision finetuning:
    remove_unused_columns = False,
    dataset_text_field = "",
    dataset_kwargs = {"skip_prepare_dataset": True},
    max_length = 2048,
)

# NOTE(review): `converted_dataset` is not assigned in any cell visible in
# this notebook (the loading cell creates `dataset`). On a fresh kernel this
# raises NameError - confirm where the conversion step lives.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = vision_collator,
    train_dataset = converted_dataset,
    args = training_args,
)

In [None]:
# Run the finetuning loop; whatever train() returns is kept in trainer_stats.
trainer_stats = trainer.train()

In [None]:
# Local saving: model and tokenizer go into the same directory, which the
# reload cell later reads back by the same name.
adapter_dir = "lora_model"
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)
# Online saving (fill in your own repository name and token):
# model.push_to_hub("your_name/lora_model", token = "...")
# tokenizer.push_to_hub("your_name/lora_model", token = "...")

In [None]:
from PIL import Image
# Picture used to try out the finetuned model in the next cell.
eval_image_path = "watermark_new/wikiart_01244.png"
image = Image.open(eval_image_path)
image

In [None]:
# Toggle: True reloads the saved model from disk; False skips the reload and
# keeps whatever `model` / `tokenizer` are already in memory.
reload_saved_model = True
if reload_saved_model:
    from unsloth import FastVisionModel

    model, tokenizer = FastVisionModel.from_pretrained(
        model_name = "lora_model",  # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit = True,  # Set to False for 16bit LoRA
    )
    # Enable for inference!
    FastVisionModel.for_inference(model)

from pydantic import BaseModel
import json
from transformers import TextStreamer

# Schema the model's JSON answer is expected to follow.
class Validate(BaseModel):
    """Structured description of one analysed image."""

    watermarks: int   # integer watermark value (0 if none), per the prompt below
    text: str         # text detected in the image ("" if none)
    main_object: str  # primary subject in plain English
    style: str        # one label from the style list in the prompt


# JSON Schema rendering of the model above. It is not injected into the
# prompt; it is kept available for inspection.
schema_as_str = json.dumps(Validate.model_json_schema())

# Instruction with strict JSON requirement
instruction = """
You are an image analysis system. Analyze the provided image and return only a valid JSON object that exactly follows **this schema including the exact key names**:
{
  "watermarks": integer,
  "text": string,
  "main_object": string,
  "style": string
}

Rules:
- Output must be strictly valid JSON (no comments, no explanations, no text outside braces).
- "watermarks" = integer (use 0 if none).
- "text" = any detected text in the image (empty string if none).
- "main_object" = the primary subject of the image in plain English.
- "style" = choose exactly one from the following list:
["Abstract_Expressionism","Action_painting","Analytical_Cubism","Art_Nouveau","Baroque","Color_Field_Painting","Contemporary_Realism","Cubism","Early_Renaissance","Expressionism","Fauvism","High_Renaissance","Impressionism","Mannerism_Late_Renaissance","Minimalism","Naive_Art_Primitivism","New_Realism","Northern_Renaissance","Pointillism","Pop_Art","Post_Impressionism","Realism","Rococo","Romanticism","Symbolism","Synthetic_Cubism","Ukiyo_e"]

Examples of correct outputs:
{
  "watermarks": 0,
  "text": "",
  "main_object": "Woman with a parasol",
  "style": "Impressionism"
}
{
  "watermarks": 1,
  "text": "COPYRIGHT",
  "main_object": "Landscape with mountains",
  "style": "Post_Impressionism"
}
{
  "watermarks": 2,
  "text": "VOID 4",
  "main_object": "City buildings",
  "style": "Cubism"
}
"""

# Compose chat input: one user turn holding the image placeholder and the prompt.
messages = [
    {"role": "user", "content": [
        {"type": "image"},  # <-- your image placeholder token
        {"type": "text", "text": instruction}
    ]}
]

input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt",
).to("cuda")

# Stream tokens to the cell output as they are generated. Streaming does not
# constrain the output; schema conformance is checked after generation.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

output = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
    # Sampling settings; lower temperature = more reliable structured JSON.
    # NOTE(review): presumably these only apply when sampling is enabled in
    # the model's generation config - confirm.
    temperature=0.7,
    min_p=0.1
)

# The returned sequence begins with the prompt tokens; keep only the new ones.
prompt_length = inputs["input_ids"].shape[1]
generated_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

# Actually check the answer against the schema instead of trusting the prompt.
try:
    validated = Validate.model_validate_json(generated_text)
except ValueError as err:  # pydantic's ValidationError is a ValueError subclass
    validated = None
    print(f"Model output does not match the Validate schema:\n{err}")
validated