In [1]:
import os
import torch
import torch.nn as nn
from datasets import Dataset
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig
from janus.janusflow.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["TORCH_USE_CUDA_DSA"] = "1"

model_path = "deepseek-ai/JanusFlow-1.3B"
vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer



Python version is above 3.10, patching the collections module.
Python version is above 3.10, patching the collections module.




preprocessor_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/2.94k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/525 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/369 [00:00<?, ?B/s]

Some kwargs in processor config are unused and will not have any effect: image_end_tag, image_tag, image_gen_tag, mask_prompt, ignore_id, image_start_tag, sft_format, add_special_token, num_image_tokens. 


In [2]:
class JanusForCausalLMWrapper(nn.Module):
    def __init__(self, model, tokenizer):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        if (hasattr(model, "language_model") and 
            hasattr(model.language_model, "get_input_embeddings") and 
            callable(model.language_model.get_input_embeddings)):
            self.embed = model.language_model.get_input_embeddings()
        else:
            raise NotImplementedError("The provided model does not have language_model.get_input_embeddings.")
    
    @property
    def config(self):
        if hasattr(self.model, "config"):
            return self.model.config
        elif hasattr(self.model, "language_model") and hasattr(self.model.language_model, "config"):
            return self.model.language_model.config
        else:
            raise AttributeError("No config attribute found in the model.")
    
    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        return {"input_ids": input_ids, **kwargs}
    
    def _get_lm_head(self):
        if hasattr(self.model, "gen_head") and self.model.gen_head is not None:
            return self.model.gen_head
        elif hasattr(self.model, "lm_head") and self.model.lm_head is not None:
            return self.model.lm_head
        elif hasattr(self.model.language_model, "get_output_embeddings") and callable(self.model.language_model.get_output_embeddings):
            head = self.model.language_model.get_output_embeddings()
            if head is not None:
                return head
        hidden_dim = self.model.config.hidden_size
        vocab_size = len(self.tokenizer)
        print(f"Creating a new LM head with hidden_dim {hidden_dim} and vocab_size {vocab_size}.")
        new_head = nn.Linear(hidden_dim, vocab_size, bias=False).to(next(self.model.parameters()).device)
        nn.init.normal_(new_head.weight, mean=0.0, std=0.02)
        self.model.gen_head = new_head
        return new_head
    
    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        kwargs.pop("inputs_embeds", None)
        if input_ids is None:
            raise ValueError("input_ids must be provided")
        inputs_embeds = self.embed(input_ids)
        outputs = self.model.language_model.model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            **kwargs
        )
        if isinstance(outputs, dict):
            hidden_states = outputs.get("last_hidden_state", outputs.get("hidden_states", outputs[0]))
        else:
            hidden_states = outputs[0]
        B, T, D = hidden_states.shape
        lm_head = self._get_lm_head()
        logits = lm_head(hidden_states.view(-1, D)).view(B, T, -1)
        lm_head_dim = logits.size(-1)
        tokenizer_vocab_size = len(self.tokenizer)
        if tokenizer_vocab_size != lm_head_dim:
            pass
            # print(f"Tokenizer vocab size {tokenizer_vocab_size} != LM head output dim {lm_head_dim}.")
            # print("Using LM head output dimension as the valid range.")
        vocab_size = lm_head_dim
        if labels is not None:
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = labels[:, 1:].contiguous()
            # print(f"Effective vocabulary size (from LM head): {vocab_size}")
            # print(f"Max label value before replacement: {shift_labels.max().item()}")
            shift_labels = torch.where(
                shift_labels >= vocab_size,
                torch.tensor(-100, device=shift_labels.device),
                shift_labels
            )
            if torch.any(shift_labels >= vocab_size):
                raise ValueError(f"Invalid label values found: {shift_labels[shift_labels >= vocab_size]}")
            else:
                pass
                # print("All labels are within bounds.")
            loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            return {"loss": loss, "logits": logits, "hidden_states": hidden_states, "past_key_values": outputs.get("past_key_values", None)}
        else:
            return {"logits": logits, "hidden_states": hidden_states, "past_key_values": outputs.get("past_key_values", None)}
    
    def generate(self, input_ids, attention_mask=None, max_new_tokens=20, temperature=1.0):
        self.eval()
        generated = input_ids.clone()
        device = next(self.parameters()).device
        if attention_mask is not None:
            attn_mask = attention_mask.clone()
        else:
            attn_mask = None
        for _ in range(max_new_tokens):
            if attn_mask is not None:
                new_mask = torch.ones((attn_mask.size(0), 1), device=device, dtype=attn_mask.dtype)
                attn_mask = torch.cat([attn_mask, new_mask], dim=1)
            outputs = self.forward(input_ids=generated, attention_mask=attn_mask)
            logits = outputs["logits"]
            next_token_logits = logits[:, -1, :] / temperature
            next_token_probs = torch.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(next_token_probs, num_samples=1)
            generated = torch.cat([generated, next_token], dim=1)
            if next_token.item() == self.tokenizer.eos_token_id:
                break
        return generated

vl_gpt = MultiModalityCausalLM.from_pretrained(model_path, trust_remote_code=True)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
base_model = vl_gpt.base_model
wrapped_model = JanusForCausalLMWrapper(base_model, tokenizer)
wrapped_model = wrapped_model.to(next(wrapped_model.parameters()).device)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.09G [00:00<?, ?B/s]

In [5]:
import json

def main():
    # Open and parse the JSON file
    with open('dataset.json', 'r') as f:
        data = json.load(f)

    print(len(data))
    # Create a list to store the pairs
    pairs = []
    
    # For each entry, extract both English and Romanian pairs
    for entry in data:
        # English pair
        en_pair = {
            'instruction': entry['instruction']['en'],
            'response': entry['response']['en']
        }
        pairs.append(en_pair)
        
        # Romanian pair
        ro_pair = {
            'instruction': entry['instruction']['ro'],
            'response': entry['response']['ro']
        }
        pairs.append(ro_pair)
    
    return pairs


In [6]:
train_data = main()

336


In [7]:
len(train_data)

672

In [8]:
# %% [markdown]
# ## Prepare the Training Data
#
# We define two instruction–response pairs and format them as conversations using the SFT template.
#
# The formatting function creates a conversation with a `<|User|>` message and a `<|Assistant|>` reply.

# %% [code]
# Define our tiny training set

def format_conversation(instruction: str, response: str) -> str:
    conversation = [
        {"role": "<|User|>", "content": instruction},
        {"role": "<|Assistant|>", "content": response},
    ]
    # Apply the SFT template from the VLChatProcessor.
    # This returns a formatted text prompt that the model was originally trained with.
    formatted = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(
        conversations=conversation,
        sft_format=vl_chat_processor.sft_format,
        system_prompt="",
    )
    return formatted

# Format each training example
formatted_texts = [format_conversation(item["instruction"], item["response"]) for item in train_data]


In [9]:

# %% [markdown]
# ## Tokenize the Data
#
# We tokenize the formatted conversations and set the `labels` equal to the input IDs for causal language modeling.
#
# (In a more advanced setup you might choose to mask parts of the prompt.)

# %% [code]
def tokenize_function(text):
    tokenized = tokenizer(text, truncation=True, max_length=64)
    # Replace pad token ids with -100 in the labels
    tokenized["labels"] = [
        -100 if token_id == tokenizer.pad_token_id else token_id
        for token_id in tokenized["input_ids"]
    ]
    return tokenized

# Tokenize each example
tokenized_data = [tokenize_function(txt) for txt in formatted_texts]

# Create a Hugging Face Dataset
dataset = Dataset.from_list(tokenized_data)

print("Number of training examples:", len(dataset))


Number of training examples: 672


In [None]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(wrapped_model, lora_config)
print("Trainable parameters:")
model.print_trainable_parameters()

training_args = TrainingArguments(
    output_dir="./janus_peft_big",
    num_train_epochs=35,
    per_device_train_batch_size=1,
    learning_rate=5e-5,
    logging_steps=1,
    save_steps=10,
    fp16=True,
    report_to="none",
    deepspeed=None,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)
trainer.train()

Trainable parameters:
trainable params: 1,572,864 || all params: 2,047,940,368 || trainable%: 0.0768
[2025-02-08 14:14:01,469] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/opt/conda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/opt/conda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/opt/conda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/opt/conda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::substr(unsigned long, unsigned long) const@GLIBCXX_3.4'
/opt/conda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'
/opt/conda/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlopen'
/opt/conda/compiler_compat/ld: /usr/local/cuda/l

Step,Training Loss
1,4.3945
2,4.0342
3,3.2575
4,3.988
5,3.9378
6,3.8646
7,4.007
8,4.2027
9,3.3773
10,3.8546


In [11]:
adapter_save_path = "./janus_peft_finetuned_big_1"
model.save_pretrained(adapter_save_path)
print(f"Adapter saved to {adapter_save_path}")

Adapter saved to ./janus_peft_finetuned_big_1


In [18]:


def generate_text(prompt: str, max_new_tokens: int = 100, temperature: float = 0.7) -> str:
    inputs = tokenizer(prompt, return_tensors="pt")
    device = next(wrapped_model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    generated_ids = wrapped_model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs.get("attention_mask", None),
        max_new_tokens=max_new_tokens,
        temperature=temperature,
    )
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

test_instruction = "Cum mă autentific ca admin pentru a gestiona utilizatorii?"
test_formatted = format_conversation(test_instruction, "")
print("Test prompt (formatted):")
print(test_formatted)
print("\nGenerated response:")
print(generate_text(test_formatted))


Test prompt (formatted):
<|User|>: Cum mă autentific ca admin pentru a gestiona utilizatorii?

<|Assistant|>:

Generated response:
<|User|>: Cum mă autentific ca admin pentru a gestiona utilizatorii?

<|Assistant|>: Accesează pagina de autentificare (http://supremejobs.test/auth), introdu emailul și parola de admin, apoi apasă pe 'Continue' pentru a-l conecta. Sistemul va verificaedată apoi apasă un badge dând rolul de admin, iar o notificare va confirma accesul.


In [16]:
train_data

[{'instruction': 'How do I log in as an admin to manage users?',
  'response': "Visit the login page (http://supremejobs.test/auth), enter your admin email and password, then click 'Continue'. Once logged in, you'll be directed to your admin dashboard where you can manage users."},
 {'instruction': 'Cum mă autentific ca admin pentru a gestiona utilizatorii?',
  'response': "Accesează pagina de autentificare (http://supremejobs.test/auth), introdu emailul și parola de admin, apoi apasă 'Continue'. Odată autentificat, vei fi direcționat către dashboard-ul de admin, unde poți gestiona utilizatorii."},
 {'instruction': 'How do I open the Add Admin User form?',
  'response': "After logging in, navigate to the user management section by clicking on the 'Add Admin User' link. This will open the form for creating a new admin user."},
 {'instruction': 'Cum deschid formularul de adăugare a unui utilizator admin?',
  'response': "După autentificare, mergi la secțiunea de administrare a utilizator