In [None]:
! pip install transformers boto3 "Pillow>=9.4.0" "sagemaker-core==1.0.41" "datasets[s3]==2.18.0" "sagemaker>=2.190.0" --upgrade --quiet

In [4]:
from huggingface_hub import login
from datasets import load_dataset
import base64
from io import BytesIO
import os


In [None]:
from getpass import getpass

# Credentials are read from the environment when already exported and are otherwise
# requested interactively. Nothing secret is typed into the notebook source, and an
# already-exported token is no longer overwritten with an empty string.
# The lower-case 'hf_token' name is kept because a later cell reads os.environ['hf_token'].
if not os.environ.get('hf_token'):
    os.environ['hf_token'] = getpass("Hugging Face token: ")
if not os.environ.get('WANDB_API_KEY'):
    # The key must exist in os.environ because a later cell reads it unconditionally;
    # an empty answer keeps the previous behaviour (empty string).
    os.environ['WANDB_API_KEY'] = getpass("Weights & Biases API key (press Enter to leave empty): ")

# Authenticate this kernel against the Hugging Face Hub.
login(
  token=os.environ['hf_token']
)

In [6]:
# # Note: the image is not provided in the prompt; it's included as part of the "processor".
# prompt= """Create a Short Product description based on the provided ##PRODUCT NAME## and ##CATEGORY## and image. 
# Only return description. The description should be SEO optimized and for a better mobile search experience.
 
# ##PRODUCT NAME##: {product_name}
# ##CATEGORY##: {category}"""
 
# system_message = "You are an expert product description writer for Amazon."




# # Convert dataset to OAI messages       
# def format_data(sample):
#     # buffer = BytesIO()
#     # sample["image"].save(buffer, format="JPEG")
#     # base64_img = base64.b64encode(buffer.getvalue())
#     return {"messages": [
#                 {
#                     "role": "system",
#                     "content": [{"index": None, "type": "text", "text": system_message}],
#                 },
#                 {
#                     "role": "user",
#                     "content": [
#                         {
#                             "index": None,
#                             "type": "text",
#                             "text": prompt.format(product_name=sample["Product Name"], category=sample["Category"]),
#                         },{
#                             "index": None,
#                             "type": "image",
#                             "image": None,#base64_img,
#                         }
#                     ],
#                 },
#                 {
#                     "role": "assistant",
#                     "content": [{"index": None,"type": "text", "text": sample["description"]}],
#                 },
#             ],
#             "images":sample["image"]
#         }
 
# # Load dataset from the hub
# dataset_id = "philschmid/amazon-product-descriptions-vlm"
# dataset = load_dataset("philschmid/amazon-product-descriptions-vlm", split="train")
 
# # Convert dataset to OAI messages
# # Need to use a list comprehension to keep the PIL.Image type; .map() converts images to bytes
# dataset = dataset.map(format_data, remove_columns= ['Uniq Id', 'Product Name', 'Category', 'Selling Price', 'Model Number', 'About Product', 'Product Specification', 'Technical Details', 'Shipping Weight', 'Variants', 'Product Url', 'Is Amazon Seller', 'description'])
# #dataset = dataset.rename_column("image", "images")
# print(dataset[0]["messages"])
# print(dataset[0]["images"])

In [7]:
from datasets import load_dataset, load_from_disk
# Alternative source kept for reference: load_dataset("HuggingFaceH4/llava-instruct-mix-vsft")
# Pull the dataset from the Hugging Face Hub (all available splits).
dataset = load_dataset(
    path="unsloth/llava-instruct-mix-vsft-mini",
)

In [8]:
# Show the loaded dataset's summary (split names, feature columns, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['messages', 'images'],
        num_rows: 8552
    })
    test: Dataset({
        features: ['messages', 'images'],
        num_rows: 1364
    })
})

In [9]:
#dataset.to_json("data/train.jsonl", orient="records")
# Persist the dataset locally so it can be uploaded to S3 in a later cell.
# Despite the "train.hf" name, the directory holds every split of `dataset`
# (both train and test are written, as the progress output shows).
dataset.save_to_disk("data/train.hf")

Saving the dataset (0/1 shards):   0%|          | 0/8552 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1364 [00:00<?, ? examples/s]

In [10]:
import sagemaker
from datasets import load_dataset
import pandas as pd
from transformers import AutoTokenizer
import boto3
import os

# Create the SageMaker session and look up its default S3 bucket; both are reused by the
# upload, checkpoint and model-artifact cells below.
sagemaker_session = sagemaker.Session()
bucket_name = sagemaker_session.default_bucket()

In [18]:
# Upload the locally saved dataset directory to the session bucket under
# <prefix>/vlm-sample.hf and keep the resulting S3 URI for the training job.
prefix = 'datasets'

input_source = sagemaker_session.upload_data(
    path='data/train.hf',
    bucket=bucket_name,
    key_prefix=f'{prefix}/vlm-sample.hf',
)

print("Training data uploaded to:", input_source, sep="\n")


Training data uploaded to:
s3://sagemaker-us-east-1-783764584149/datasets/vlm-sample.hf


In [19]:
# List the uploaded prefix with the AWS CLI to confirm the dataset files reached S3.
!aws s3 ls $input_source/ 

                           PRE test/
                           PRE train/
2025-08-22 22:27:17         29 dataset_dict.json


In [20]:
#dats_1 = load_from_disk('data/train.hf')

In [21]:
from sagemaker.config import load_sagemaker_config
# Load the SageMaker SDK configuration. NOTE(review): `configs` is not referenced by any
# later cell visible in this notebook — confirm whether it is still needed.
configs = load_sagemaker_config()

In [32]:
from sagemaker.modules.train import ModelTrainer
from sagemaker.modules.configs import Compute, SourceCode, InputData, StoppingCondition, CheckpointConfig

# Environment variables handed to the training job (passed as `environment=env` below).
env = {
    # Networking / NCCL settings
    "FI_PROVIDER": "efa",
    "NCCL_PROTO": "simple",
    "NCCL_SOCKET_IFNAME": "eth0",
    "NCCL_IB_DISABLE": "1",
    "NCCL_DEBUG": "WARN",
    # Credentials collected earlier in the notebook
    "HF_token": os.environ['hf_token'],
    "WANDB_API_KEY": os.environ['WANDB_API_KEY'],
    # S3 URI of the uploaded dataset
    "data_location": input_source,
}
# MLFlow tracker
#tracking_server_arn = ""
#env["MLFLOW_TRACKING_ARN"] = tracking_server_arn
instance_type = "ml.g6e.48xlarge"  # Override the instance type if you want to get a different container version

# Single-instance compute with a 96 GB volume; keep-alive period is 3600 seconds.
compute = Compute(
    instance_type=instance_type,
    instance_count=1,
    volume_size_in_gb=96,
    keep_alive_period_in_seconds=3600,
)

# image_uri = sagemaker.image_uris.retrieve(
#     framework="pytorch",
#     region=sagemaker_session.boto_session.region_name,
#     version="2.6",
#     instance_type=instance_type,
#     image_scope="training"
# )

# image_uri

In [33]:
# Training container image: the smdistributed-modelparallel ECR image, with the registry
# host resolved for the region of the current SageMaker session.
image_uri = "658645717510.dkr.ecr.{region}.amazonaws.com/smdistributed-modelparallel:2.4.1-gpu-py311-cu121".format(
    region=sagemaker_session.boto_session.region_name
)

image_uri

'658645717510.dkr.ecr.us-east-1.amazonaws.com/smdistributed-modelparallel:2.4.1-gpu-py311-cu121'

In [34]:
# Base S3 location (in the session bucket) under which the job's checkpoint prefix is built.
checkpoint_s3_path = "s3://{}/vlm-sample-checkpoints/checkpoints".format(bucket_name)
checkpoint_s3_path

's3://sagemaker-us-east-1-783764584149/vlm-sample-checkpoints/checkpoints'

In [35]:
# Base name for the training job; also used to build the checkpoint and model-artifact paths.
job_prefix = "vlm-sft-model-trainer-distributed"

In [36]:
# Hyperparameters handed to the training entry script.
# `dataset_path` has to point at the directory of the input channel that carries the
# dataset. The channel attached to this job is named "training_dataset", and SageMaker
# exposes each channel under /opt/ml/input/data/<channel_name>; the previous
# "/opt/ml/input/data/dataset" value referred to a channel that is never created.
# NOTE(review): confirm the training script reads `dataset_path` rather than only the
# `data_location` environment variable.
hyperparameters = {
    "dataset_path": "/opt/ml/input/data/training_dataset",
    "model_dir": "/opt/ml/model",
}

In [37]:
# Training code bundle: the local ./scripts directory, its shell entry script and the
# requirements file that accompanies it.
source_code = SourceCode(
    entry_script="run_training.sh",
    requirements="requirements.txt",
    source_dir="./scripts",
)

In [38]:
# Runtime cap (90000 s = 25 h) and checkpoint destination, built up front so the
# trainer call below stays flat.
stopping_condition = StoppingCondition(max_runtime_in_seconds=90000)
checkpoint_config = CheckpointConfig(s3_uri=f"{checkpoint_s3_path}/{job_prefix}")

# Assemble the trainer from the pieces defined in the preceding cells.
model_trainer = ModelTrainer(
    base_job_name=job_prefix,
    training_image=image_uri,
    source_code=source_code,
    compute=compute,
    hyperparameters=hyperparameters,
    environment=env,
    stopping_condition=stopping_condition,
    checkpoint_config=checkpoint_config,
)

In [39]:
# Input channel "training_dataset", fed from the S3 location the dataset was uploaded to.
training_data = InputData(data_source=input_source, channel_name="training_dataset")

In [None]:
# Start the training job with the dataset channel attached. `wait=True` is passed so the
# cell does not return until the job has finished.
model_trainer.train(input_data_config=[training_data], wait=True)

In [41]:
def get_last_job_name(job_name_prefix):
    """Return the name of the newest completed training job with the given name prefix.

    The SageMaker Search API is queried for training jobs whose name contains
    ``job_name_prefix`` and whose status is "Completed", sorted by creation time
    (newest first). Result pages are walked until one of them contains a job whose
    name actually *starts with* the prefix.

    Args:
        job_name_prefix: Prefix the training job name must start with.

    Returns:
        The name of the first matching job in the newest-first result order.

    Raises:
        ValueError: If no completed job with that prefix exists.
    """
    client = boto3.client('sagemaker')

    # Request shared by every page; only NextToken is added for follow-up pages.
    request = {
        'Resource': 'TrainingJob',
        'SearchExpression': {
            'Filters': [
                {'Name': 'TrainingJobName', 'Operator': 'Contains', 'Value': job_name_prefix},
                {'Name': 'TrainingJobStatus', 'Operator': 'Equals', 'Value': "Completed"},
            ]
        },
        'SortBy': 'CreationTime',
        'SortOrder': 'Descending',
        'MaxResults': 100,
    }

    while True:
        page = client.search(**request)

        # "Contains" may also match the prefix in the middle of a name, so re-check
        # with startswith; results are newest-first, so the first hit is the answer.
        for hit in page['Results']:
            candidate = hit['TrainingJob']['TrainingJobName']
            if candidate.startswith(job_name_prefix):
                return candidate

        token = page.get('NextToken')
        if not token:
            # Every page has been inspected without a match.
            raise ValueError(f"No completed training jobs found starting with prefix '{job_name_prefix}'")
        request['NextToken'] = token

In [42]:
# Look up the newest completed training job whose name starts with `job_prefix`.
job_name = get_last_job_name(job_prefix)

job_name

'vlm-sft-model-trainer-distributed-20250822111245'

In [43]:
# S3 URI of the trained model artifact for `job_name`.
# The bucket prefix is read from the SageMaker session instead of being hard-coded to
# None, which previously made the prefixed branch unreachable and produced a wrong URI
# whenever a default bucket prefix is configured.
# NOTE(review): assumes the job's output location is
# s3://<bucket>/[<default prefix>/]<base_job_name>/<job_name>/output — confirm for your setup.
default_prefix = getattr(sagemaker_session, "default_bucket_prefix", None)
if default_prefix:
    model_data = f"s3://{bucket_name}/{default_prefix}/{job_prefix}/{job_name}/output/model.tar.gz"
else:
    model_data = f"s3://{bucket_name}/{job_prefix}/{job_name}/output/model.tar.gz"


In [44]:
# Show the computed S3 URI of the model artifact.
model_data

's3://sagemaker-us-east-1-783764584149/vlm-sft-model-trainer-distributed/vlm-sft-model-trainer-distributed-20250822111245/output/model.tar.gz'

# Test Model and run Inference

In [45]:
# Print the model artifact URI as plain text (no quotes).
print(model_data)

s3://sagemaker-us-east-1-783764584149/vlm-sft-model-trainer-distributed/vlm-sft-model-trainer-distributed-20250822111245/output/model.tar.gz


In [None]:
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from qwen_vl_utils import process_vision_info
# Hub id of the base checkpoint and local directory of the fine-tuned adapter.
model_id = "HuggingFaceTB/SmolVLM-Instruct"
adapter_path = "../models/vlm-sft-SmolVLM-Instruct/"

# Processor for the base checkpoint (independent of the model weights).
processor = AutoProcessor.from_pretrained(model_id)

# Base model in float16 with automatic device placement.
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [None]:
# Prompt template and system message used by this cell and by generate_description().
# They are defined here because the only other definition in the notebook lives in a
# commented-out cell, which left `prompt` / `system_message` undefined (NameError) on a
# fresh kernel.
prompt = """Create a Short Product description based on the provided ##PRODUCT NAME## and ##CATEGORY## and image.
Only return description. The description should be SEO optimized and for a better mobile search experience.

##PRODUCT NAME##: {product_name}
##CATEGORY##: {category}"""

system_message = "You are an expert product description writer for Amazon."

# sample from amazon.com
# NOTE: the "catergory" key spelling is kept as-is because generate_description() reads it.
sample = {
  "product_name": "Hasbro Marvel Avengers-Serie Marvel Assemble Titan-Held, Iron Man, 30,5 cm Actionfigur",
  "catergory": "Toys & Games | Toy Figures & Playsets | Action Figures",
  "image": "https://m.media-amazon.com/images/I/81+7Up7IWyL._AC_SY300_SX300_.jpg"
}


# prepare message
messages = [{
        "role": "user",
        "content": [
            {
                "index": None,
                "type": "image",
                "image": sample["image"],
            },
            {"index": None,"type": "text", "text": prompt.format(product_name=sample["product_name"], category=sample["catergory"])},
        ],
    }
]

# Resolve the image reference in the message into model-ready image inputs and show them.
image_inputs, video_inputs = process_vision_info(messages)
print(image_inputs)
def generate_description(sample, model, processor):
    """Generate a product description for one sample with the given model.

    Relies on the module-level ``prompt`` template, ``system_message`` string and
    ``process_vision_info`` helper.

    Args:
        sample: Mapping with the keys "product_name", "catergory" (sic) and "image".
        model: Model object exposing ``device`` and ``generate``.
        processor: Processor exposing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.

    Returns:
        The decoded text of the first (only) generated sequence, with the prompt
        tokens removed. Generation samples (``do_sample=True``), so repeated calls
        may return different text.
    """
    system_turn = {
        "role": "system",
        "content": [{"index": None, "type": "text", "text": system_message}],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"index": None, "type": "image", "image": sample["image"]},
            {
                "index": None,
                "type": "text",
                "text": prompt.format(product_name=sample["product_name"], category=sample["catergory"]),
            },
        ],
    }
    conversation = [system_turn, user_turn]

    # Render the chat into a single prompt string and extract the image inputs.
    chat_text = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    images, _unused_videos = process_vision_info(conversation)

    batch = processor(
        images=images,
        text=[chat_text],
        #videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    # Sample up to 256 new tokens.
    output_ids = model.generate(**batch, max_new_tokens=256, top_p=1.0, do_sample=True, temperature=0.8)

    # Drop the prompt tokens at the front of every generated sequence.
    completions = []
    for prompt_ids, full_ids in zip(batch.input_ids, output_ids):
        completions.append(full_ids[len(prompt_ids):])

    decoded = processor.batch_decode(
        completions, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return decoded[0]
 

In [None]:
# let's generate the description with the model as loaded above (the adapter is only
# loaded in the next cell), to have a baseline for comparison
base_description = generate_description(sample, model, processor)
print(base_description)
#

In [None]:
# Attach the fine-tuned adapter to the already loaded model, then generate again for the
# same sample so the output can be compared with `base_description`.
model.load_adapter(adapter_path) # load the adapter and activate
 
ft_description = generate_description(sample, model, processor)
print(ft_description)

In [None]:

from peft import PeftModel
from transformers import AutoProcessor, AutoModelForImageTextToText
 
# Base checkpoint, trained adapter directory, and output directory for the merged model.
model_id = "HuggingFaceTB/SmolVLM-Instruct"
adapter_path = "../models/vlm-sft-SmolVLM-Instruct/"
merged_path = "../models/merged"

# Load a fresh copy of the base model.
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
)

# Attach the LoRA adapter to the base model and fold its weights in.
peft_model = PeftModel.from_pretrained(model, adapter_path)
merged_model = peft_model.merge_and_unload()

# Save the merged weights (safe serialization, shards of at most 2GB).
merged_model.save_pretrained(
    merged_path,
    safe_serialization=True,
    max_shard_size="2GB",
)

# Save the base model's processor into the same directory as the merged weights.
processor = AutoProcessor.from_pretrained(model_id)
processor.save_pretrained(merged_path)

In [None]:
# Leftover scratch expression; its value is not used by any other cell.
2+2