In [None]:
# Mount Google Drive at /content/drive so later cells can read the VHDL JSON
# file from MyDrive and write the trained adapter back to it.
# google.colab is supplied by the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
from unsloth import FastLanguageModel
import torch

# Loading configuration. `max_seq_length` is reused later by the length
# analysis, the length filter and the trainer, so change it here only.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Load Llama 3.2 model - choose between 1B or 3B.
# Returns both the model and its tokenizer; every later cell uses these two names.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)


ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.12: Fast Llama patching. Transformers: 4.53.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

In [None]:
# Wrap the loaded model with LoRA adapters. `model` is rebound to the wrapped
# model, so re-running this cell without reloading the base model would wrap it again.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # LoRA rank. Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    # Adapters are attached to the attention projections (q/k/v/o) and the MLP
    # projections (gate/up/down).
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # same value as the trainer seed used later in the notebook
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)


Unsloth 2025.6.12 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [None]:
from datasets import load_dataset, Dataset, concatenate_datasets
import json

# Load both datasets
# Location of the VHDL examples on the mounted Google Drive.
VHDL_JSON_PATH = "/content/drive/MyDrive/2025/PEP/VHDL_data_1.json"

print("Loading MG-Verilog dataset...")
mg_dataset = load_dataset("GaTech-EIC/MG-Verilog", split="train")

print("Loading VHDL dataset...")
# Read the JSON explicitly as UTF-8 instead of relying on the platform default encoding.
with open(VHDL_JSON_PATH, "r", encoding="utf-8") as f:
    vhdl_data = json.load(f)
# The JSON file is expected to hold a list of row dicts (that is what Dataset.from_list takes).
vhdl_dataset = Dataset.from_list(vhdl_data)

print(f"MG-Verilog dataset size: {len(mg_dataset)}")
print(f"VHDL dataset size: {len(vhdl_dataset)}")
print(f"Total original examples: {len(mg_dataset) + len(vhdl_dataset)}")

Loading MG-Verilog dataset...


README.md: 0.00B [00:00, ?B/s]

data-00000-of-00001.arrow:   0%|          | 0.00/61.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11144 [00:00<?, ? examples/s]

Loading VHDL dataset...
MG-Verilog dataset size: 11144
VHDL dataset size: 2102
Total original examples: 13246


In [None]:
def convert_multigrained_to_conversations(examples, language="Verilog"):
    """
    Convert a batch of rows into chat conversations using the multigrained approach.

    Each row yields up to 3 training examples, one per description level, all
    sharing the same code as the assistant answer:
    1. block_summary (most detailed)
    2. detailed_global_summary (medium detail)
    3. high_level_global_summary (minimal detail)

    A level is skipped when it is missing, is not a string, or is blank. A row
    whose description is not a mapping contributes no examples instead of
    raising.

    Args:
        examples: column-oriented batch exposing a 'code' column and a
            'description' column of equal length; each description is a
            mapping from level name to instruction text.
        language: HDL name inserted into the system prompt.

    Returns:
        {"conversations": [...]} where every conversation is a list of three
        messages (system, user, assistant), each a {"role", "content"} dict.
    """
    # Built once per call: the prompt depends only on `language`.
    system_prompt = (
        f"You are an expert {language} designer. You write syntactically correct "
        f"and functionally accurate {language} code based on the given specifications."
    )
    # Ordered from most to least detailed; output order follows this order.
    description_levels = (
        "block_summary",
        "detailed_global_summary",
        "high_level_global_summary",
    )

    conversations = []

    for code, description in zip(examples['code'], examples['description']):
        if not isinstance(description, dict):
            # Missing or malformed description: nothing usable in this row.
            continue

        for level in description_levels:
            instruction = description.get(level)
            # Skip absent, non-text, and whitespace-only descriptions.
            if not isinstance(instruction, str) or not instruction.strip():
                continue

            conversations.append([
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": instruction},
                {"role": "assistant", "content": code},
            ])

    return {"conversations": conversations}

# Test the conversion function on the first two rows of each dataset
print("Testing multigrained conversion...")
sample_mg = mg_dataset.select(range(2))
sample_vhdl = vhdl_dataset.select(range(2))

converted_mg = convert_multigrained_to_conversations(sample_mg, language="Verilog")
converted_vhdl = convert_multigrained_to_conversations(sample_vhdl, language="VHDL")

print(f"MG-Verilog: {len(sample_mg)} rows → {len(converted_mg['conversations'])} conversations")
print(f"VHDL: {len(sample_vhdl)} rows → {len(converted_vhdl['conversations'])} conversations")

# Only report success when the 3x expansion actually happened; a lower count
# means some description levels were empty or missing and were skipped.
if (len(converted_mg['conversations']) == 3 * len(sample_mg)
        and len(converted_vhdl['conversations']) == 3 * len(sample_vhdl)):
    print("✅ Expected: 2 rows → 6 conversations each (3x multigrained)")
else:
    print("⚠️ Expected 3 conversations per row (3x multigrained); some description levels were skipped.")

Testing multigrained conversion...
MG-Verilog: 2 rows ‚Üí 6 conversations
VHDL: 2 rows ‚Üí 6 conversations
‚úÖ Expected: 2 rows ‚Üí 6 conversations each (3x multigrained)


In [None]:
def _preview_conversations(conversations, language, limit=3, width=100):
    """Print the first `width` characters of the user and assistant turns
    for the first `limit` conversations.

    Each conversation is indexed as [system, user, assistant], so the user
    turn is at index 1 and the assistant turn at index 2.
    """
    for i, conv in enumerate(conversations[:limit]):
        print(f"\n--- Conversation {i+1} ({language}) ---")
        print(f"User: {conv[1]['content'][:width]}...")
        print(f"Assistant: {conv[2]['content'][:width]}...")


print("=== Sample MG-Verilog Conversations ===")
_preview_conversations(converted_mg['conversations'], "Verilog")

print("\n" + "="*60)
print("=== Sample VHDL Conversations ===")
_preview_conversations(converted_vhdl['conversations'], "VHDL")

=== Sample MG-Verilog Conversations ===

--- Conversation 1 (Verilog) ---
User: 
    <s>[INST] <<SYS>>
    You only complete chats with syntax correct Verilog code. End the Verilog...
Assistant:  
 assign enabled = enable; 
 dmac_data_mover # (.ID_WIDTH(ID_WIDTH),.DATA_WIDTH(S_AXIS_DATA_WIDTH),...

--- Conversation 2 (Verilog) ---
User: 
    <s>[INST] <<SYS>>
    You only complete chats with syntax correct Verilog code. End the Verilog...
Assistant:  
 assign enabled = enable; 
 dmac_data_mover # (.ID_WIDTH(ID_WIDTH),.DATA_WIDTH(S_AXIS_DATA_WIDTH),...

--- Conversation 3 (Verilog) ---
User: 
    <s>[INST] <<SYS>>
    You only complete chats with syntax correct Verilog code. End the Verilog...
Assistant:  
 assign enabled = enable; 
 dmac_data_mover # (.ID_WIDTH(ID_WIDTH),.DATA_WIDTH(S_AXIS_DATA_WIDTH),...

=== Sample VHDL Conversations ===

--- Conversation 1 (VHDL) ---
User: 
 <s>[INST] <<SYS>>
 You only complete chats with syntax-correct VHDL code. End the VHDL architectur...
Assista

In [None]:
print("Converting full datasets using multigrained approach...")

# Convert MG-Verilog dataset.
# The original columns are removed because the converter returns a different
# number of rows than it receives (up to 3 conversations per input row).
print("Converting MG-Verilog...")
mg_conversations = mg_dataset.map(
    lambda examples: convert_multigrained_to_conversations(examples, language="Verilog"),
    batched=True,
    remove_columns=mg_dataset.column_names,
    desc="Converting MG-Verilog to multigrained conversations"
)

# Convert VHDL dataset
print("Converting VHDL...")
vhdl_conversations = vhdl_dataset.map(
    lambda examples: convert_multigrained_to_conversations(examples, language="VHDL"),
    batched=True,
    remove_columns=vhdl_dataset.column_names,
    desc="Converting VHDL to multigrained conversations"
)

# Combine both datasets
print("Combining datasets...")
combined_dataset = concatenate_datasets([mg_conversations, vhdl_conversations])

# Guard the ratio so two empty inputs report 0.0x instead of dividing by zero.
total_rows = len(mg_dataset) + len(vhdl_dataset)
expansion_factor = len(combined_dataset) / total_rows if total_rows else 0.0

print("\n=== Dataset Statistics ===")
print(f"Original MG-Verilog: {len(mg_dataset)} → {len(mg_conversations)} conversations")
print(f"Original VHDL: {len(vhdl_dataset)} → {len(vhdl_conversations)} conversations")
print(f"Combined total: {len(combined_dataset)} conversations")
print(f"Expansion factor: ~{expansion_factor:.1f}x")

Converting full datasets using multigrained approach...
Converting MG-Verilog...


Converting MG-Verilog to multigrained conversations:   0%|          | 0/11144 [00:00<?, ? examples/s]

Converting VHDL...


Converting VHDL to multigrained conversations:   0%|          | 0/2102 [00:00<?, ? examples/s]

Combining datasets...

=== Dataset Statistics ===
Original MG-Verilog: 11144 ‚Üí 33432 conversations
Original VHDL: 2102 ‚Üí 6306 conversations
Combined total: 39738 conversations
Expansion factor: ~3.0x


In [None]:
# Inspect one complete MG-Verilog row to see how the instructions are formatted
print("=== Complete Example Analysis ===")
example = mg_dataset[5]  # row index 5
summaries = example['description']

# The block summary and the code are printed in full.
print("\n--- Block Summary (Instruction) ---")
print(summaries['block_summary'])

print("\n--- Expected Code Output ---")
print(example['code'])

# The two global summaries are cut to their first 500 characters.
for heading, level in (
    ("\n--- Detailed Global Summary ---", 'detailed_global_summary'),
    ("\n--- High Level Global Summary ---", 'high_level_global_summary'),
):
    print(heading)
    print(summaries[level][:500] + "...")


=== Complete Example Analysis ===

--- Block Summary (Instruction) ---

    <s>[INST] <<SYS>>
    You only complete chats with syntax correct Verilog code. End the Verilog module code completion with 'endmodule'. Do not include module, input and output definitions.
    <</SYS>>

    Implement the Verilog module based on the following block level summaries. Assume that signals are positive clock/clk edge triggered unless otherwise stated.
Here are block level summaries:

block_0: This code block defines the module inputs and outputs, including the 4-bit encoded binary output, the enable signal for the encoder, and the 16-bit input for the encoder. It also declares a register to store the encoded data.
block_1: This code block contains an always block that triggers when there is a change in the enable signal or the input to the encoder. Inside the block, the binary output is initialized to zero, and the encoder is enabled by checking the enable signal. It also includes a case statement f

In [None]:
from unsloth.chat_templates import get_chat_template

# Rebind `tokenizer` to one that renders conversations with Unsloth's
# "llama-3.1" chat template.
# NOTE(review): the model loaded earlier is Llama-3.2-3B-Instruct; presumably
# the "llama-3.1" template is the intended one for 3.2 as well — confirm.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

def formatting_prompts_func(examples):
    """Render every conversation of a batch into one training string.

    Reads the "conversations" column and returns a new "text" column produced
    by the tokenizer's chat template (no tokenization, no generation prompt).
    """
    rendered = []
    for convo in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False)
        )
    return {"text": rendered}

# Apply formatting.
# NOTE: the combined-dataset variant is commented out, so only the VHDL
# conversations are formatted and trained on in this run.
# dataset = combined_dataset.map(formatting_prompts_func, batched = True)
dataset = vhdl_conversations.map(formatting_prompts_func, batched = True)


print(f"Final formatted dataset size: {len(dataset)}")
print(f"Dataset features: {dataset.features}")

# Verify mixed content: count which language each system prompt (message 0)
# mentions within the first 100 rows (or fewer if the dataset is smaller).
print("\n=== Verifying Mixed Content ===")
probe_size = min(100, len(dataset))
system_prompts = [dataset[i]["conversations"][0]["content"] for i in range(probe_size)]
verilog_count = sum(1 for prompt in system_prompts if "Verilog" in prompt)
vhdl_count = sum(1 for prompt in system_prompts if "VHDL" in prompt)
print(f"In first 100 examples: {verilog_count} Verilog, {vhdl_count} VHDL")

Map:   0%|          | 0/6306 [00:00<?, ? examples/s]

Final formatted dataset size: 6306
Dataset features: {'conversations': [{'content': Value(dtype='string', id=None), 'role': Value(dtype='string', id=None)}], 'text': Value(dtype='string', id=None)}

=== Verifying Mixed Content ===
In first 100 examples: 0 Verilog, 100 VHDL


In [None]:
# Convert the entire MG-Verilog dataset on its own.
# This cell used to call `convert_mg_verilog_to_conversations`, which is not
# defined anywhere in this notebook (it raised NameError). It now uses the
# multigrained converter defined earlier.
# The result gets its own name so it does not replace `dataset`, which already
# holds the formatted conversations consumed by the cells below.
print("Converting entire MG-Verilog dataset to conversation format...")
mg_only_dataset = mg_dataset.map(
    lambda examples: convert_multigrained_to_conversations(examples, language="Verilog"),
    batched=True,
    remove_columns=mg_dataset.column_names,  # Remove original columns
    desc="Converting to conversations"
)

print(f"Converted dataset size: {len(mg_only_dataset)}")
print(f"Dataset features: {mg_only_dataset.features}")


Converting entire MG-Verilog dataset to conversation format...


NameError: name 'convert_mg_verilog_to_conversations' is not defined

In [None]:
# Apply chat template formatting.
# NOTE: a function with this name and the same behavior is already defined in
# an earlier cell; this definition simply rebinds the name.
def formatting_prompts_func(examples):
    """Return {"text": [...]} with each conversation rendered by the chat template."""
    def _render(convo):
        return tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False)

    return {"text": list(map(_render, examples["conversations"]))}

dataset = dataset.map(formatting_prompts_func, batched = True)

# Verify the formatting on the first row: the raw messages next to the rendered text
first_row = dataset[0]
print("=== Formatted Example ===")
print("Raw conversation:")
print(json.dumps(first_row["conversations"][:2], indent=2))  # first 2 messages only
print("\n" + "="*50)
print("Formatted text:")
print(first_row["text"][:1000] + "...")


Map:   0%|          | 0/6306 [00:00<?, ? examples/s]

=== Formatted Example ===
Raw conversation:
[
  {
    "content": "You are an expert VHDL designer. You write syntactically correct and functionally accurate VHDL code based on the given specifications.",
    "role": "system"
  },
  {
    "content": "\n <s>[INST] <<SYS>>\n You only complete chats with syntax-correct VHDL code. End the VHDL architecture code with 'end Behavioral;'. Include all required entities, ports, signals, and logic to ensure compilable code.\n <</SYS>>\n\n{\n    \"block_summary\": [\n        {\n            \"block_id\": \"block_0\",\n            \"description\": \"The VHDL entity `v3e6c24_v68c173` has one output port `v`. The port `v` is assigned a constant value of '1'.\"\n        }\n    ],\n    \"detailed_global_summary\": \"The VHDL entity `v3e6c24_v68c173` defines a single output `v`. This output is assigned a constant value of '1'. The entity uses standard logic (`std_logic`) and contains no internal logic beyond assigning this constant value.\",\n    \"high_l

In [None]:
# Analyze the tokenized length to ensure it fits within max_seq_length
import numpy as np  # was never imported: the cell failed with NameError on np.mean

# Sample up to 100 examples; bounded by the dataset size so small datasets
# do not raise IndexError.
sample_count = min(100, len(dataset))
sample_texts = [dataset[i]["text"] for i in range(sample_count)]
tokenized_lengths = [len(tokenizer.encode(text)) for text in sample_texts]
num_too_long = sum(1 for l in tokenized_lengths if l > max_seq_length)

print("=== Tokenization Analysis ===")
print(f"Max sequence length setting: {max_seq_length}")
print(f"Sample size: {len(tokenized_lengths)}")

if tokenized_lengths:
    print(f"Mean tokens: {np.mean(tokenized_lengths):.1f}")
    print(f"Median tokens: {np.median(tokenized_lengths):.1f}")
    print(f"Max tokens: {np.max(tokenized_lengths)}")
    print(f"95th percentile: {np.percentile(tokenized_lengths, 95):.1f}")
else:
    # np.max raises on an empty sequence, so skip the statistics entirely.
    print("No examples to analyze.")
print(f"Examples exceeding max_seq_length: {num_too_long}")

if num_too_long:
    print("\n⚠️  Warning: Some examples exceed max_seq_length. Consider:")
    print("   1. Increasing max_seq_length")
    print("   2. Filtering out long examples")
    print("   3. Truncating long examples")
else:
    # Only the sampled rows were checked, not the whole dataset.
    print("\n✅ All sampled examples fit within max_seq_length!")


=== Tokenization Analysis ===
Max sequence length setting: 2048
Sample size: 100


NameError: name 'np' is not defined

In [None]:
from trl import SFTConfig, SFTTrainer
from transformers import DataCollatorForSeq2Seq

# Optional: Filter out examples that are too long
def filter_by_length(example):
    """Return True when the example's "text" tokenizes to at most max_seq_length tokens."""
    token_count = len(tokenizer.encode(example["text"]))
    return not (token_count > max_seq_length)

# Drop over-long rows, remembering the size before and after for the report
original_size = len(dataset)
dataset = dataset.filter(filter_by_length)
filtered_size = len(dataset)

print(f"Original dataset size: {original_size}")
print(f"Filtered dataset size: {filtered_size}")
removed_count = original_size - filtered_size
removed_pct = removed_count / original_size * 100
print(f"Removed {removed_count} examples ({removed_pct:.1f}%)")

# Setup trainer: supervised fine-tuning on the "text" column of `dataset`
# (the chat-template-rendered conversations, already filtered by length).
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text", # column produced by formatting_prompts_func
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = SFTConfig(
        # 2 sequences per step x 4 accumulation steps = 8 sequences per optimizer update.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 100, # Short run: 100 steps. Increase this for longer training
        learning_rate = 2e-4,
        logging_steps = 1, # log the training loss at every step
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)


Filter:   0%|          | 0/6306 [00:00<?, ? examples/s]

Original dataset size: 6306
Filtered dataset size: 6300
Removed 6 examples (0.1%)


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/6300 [00:00<?, ? examples/s]

In [None]:
# Use train_on_responses_only so the loss is computed only on the assistant
# turns (the generated HDL code), not on the system/user prompt.
# The two marker strings are the headers that open the user and the assistant
# turn in the chat-template-rendered text.
from unsloth.chat_templates import train_on_responses_only

trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)


Map (num_proc=2):   0%|          | 0/6300 [00:00<?, ? examples/s]

In [None]:
# Verify masking is working correctly by decoding one tokenized training row
masked_sample = trainer.train_dataset[5]

print("=== Verifying Training Masking ===")
print("Full input:")
print(tokenizer.decode(masked_sample["input_ids"]))

print("\n" + "="*50)
print("Masked labels (what the model will be trained on):")
# Labels equal to -100 are swapped for a space token so the row can be decoded;
# whatever text remains visible is what contributes to the loss.
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
visible_labels = [space if label == -100 else label for label in masked_sample["labels"]]
print(tokenizer.decode(visible_labels))


=== Verifying Training Masking ===
Full input:
<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

You are an expert VHDL designer. You write syntactically correct and functionally accurate VHDL code based on the given specifications.<|eot_id|><|start_header_id|>user<|end_header_id|>


 <s>[INST] <<SYS>>
 You only complete chats with syntax-correct VHDL code. End the VHDL architecture code with 'end Behavioral;'. Include all required entities, ports, signals, and logic to ensure compilable code.
 <</SYS>>

{
    "block_summary": [
        {
            "block_id": "block_0",
            "description": "The VHDL entity `data_mapper` maps the bits of the input signal `idat` to specific positions in the output signal `odat`. The mapping is defined by a fixed pattern, where each bit of `idat` is assigned to a specific bit in `odat`."
        }
    ],
    "detailed_global_summary": "The VHDL module `da

In [None]:
# Record the GPU memory baseline before training; `start_gpu_memory` and
# `max_memory` are read again by the final-stats cell.
gpu_stats = torch.cuda.get_device_properties(0)
reserved_bytes = torch.cuda.max_memory_reserved()
total_bytes = gpu_stats.total_memory
# Bytes -> GB (three divisions by 1024), rounded to 3 decimals.
start_gpu_memory = round(reserved_bytes / 1024 / 1024 / 1024, 3)
max_memory = round(total_bytes / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")


GPU = Tesla T4. Max memory = 14.741 GB.
3.441 GB of memory reserved.


In [None]:
# Start training.
# The message no longer names MG-Verilog: the trainer runs on whatever
# `dataset` was passed to it, which in this notebook is the VHDL conversations.
print("Starting training on the prepared dataset...")
trainer_stats = trainer.train()


Starting training on MG-Verilog dataset...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 6,300 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856 of 3,000,000,000 (0.81% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,0.4404
2,0.3708
3,0.3515
4,0.2723
5,0.2382
6,0.1964
7,0.4127
8,0.3981
9,0.3469
10,0.4254


In [None]:
# Show final memory and time stats.
# Memory figures are in GB; the "for training" figures subtract the baseline
# recorded before training (start_gpu_memory).
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
train_runtime = trainer_stats.metrics['train_runtime']
print("\n".join([
    f"{train_runtime} seconds used for training.",
    f"{round(train_runtime/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]))


727.3244 seconds used for training.
12.12 minutes used for training.
Peak reserved memory = 4.111 GB.
Peak reserved memory for training = 0.67 GB.
Peak reserved memory % of max memory = 27.888 %.
Peak reserved memory for training % of max memory = 4.545 %.


In [None]:
# Enable inference mode
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

from transformers import TextStreamer

# Test with a Verilog generation task.
# The system prompt uses the same wording as the training conversations
# ("expert Verilog designer"), so inference matches the fine-tuning format.
messages = [
    {
        "role": "system",
        "content": "You are an expert Verilog designer. You write syntactically correct and functionally accurate Verilog code based on the given specifications."
    },
    {
        "role": "user",
        "content": "Write a Verilog module for a simple 4-bit counter with synchronous reset. The counter should increment on each positive clock edge when enable is high, and reset to 0 when reset is asserted."
    }
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

# A single, un-padded prompt: every position is a real token, so the mask is
# all ones. Passing it explicitly avoids the "attention mask is not set"
# warning, which appears because the pad token is the same as the eos token.
attention_mask = torch.ones_like(inputs)

# Streams generated tokens to stdout as they are produced; also reused by the
# next test cell.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)

print("=== Generated Verilog Code ===")
_ = model.generate(
    input_ids = inputs,
    attention_mask = attention_mask,
    streamer = text_streamer,
    max_new_tokens = 512,
    use_cache = True,
    temperature = 0.7,
    min_p = 0.1
)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


=== Generated Verilog Code ===


LlamaForCausalLM has no `_prepare_4d_causal_attention_mask_with_cache_position` method defined in its base modeling class. Compiled forward passes will be sub-optimal. If you're writing code, see Llama for an example implementation. If you're a user, please report this issue on GitHub.


library IEEE;
use IEEE.STD_LOGIC_1164.ALL;

entity counter is
    port (
        clk   : in  STD_LOGIC;
        reset : in  STD_LOGIC;
        enable : in  STD_LOGIC;
        q     : out STD_LOGIC_VECTOR(3 downto 0)
    );
end entity counter;

architecture behavior of counter is
begin
    process(clk, reset)
    begin
        if reset = '1' then
            q <= "0000";
        elsif rising_edge(clk) then
            if enable = '1' then
                q <= std_logic_vector(to_unsigned(to_integer(unsigned(q(3)))+1, 4)));
            end if;
        end if;
    end process;
end architecture behavior;<|eot_id|>


In [None]:
# Test with a more complex example taken from the MG-Verilog dataset.
# Note: whether this row was seen during training depends on which dataset was
# passed to the trainer (the VHDL-only dataset in this notebook).
test_example = mg_dataset[10]  # Pick a different example
test_instruction = test_example['description']['high_level_global_summary']

# Same system prompt wording as the training conversations.
messages = [
    {
        "role": "system",
        "content": "You are an expert Verilog designer. You write syntactically correct and functionally accurate Verilog code based on the given specifications."
    },
    {
        "role": "user",
        "content": test_instruction
    }
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
).to("cuda")

# Single un-padded prompt, so the attention mask is all ones; passing it
# explicitly avoids the missing-attention-mask warning.
attention_mask = torch.ones_like(inputs)

print("=== Test Instruction ===")
print(test_instruction)
print("\n=== Expected Code ===")
print(test_example['code'])
print("\n=== Generated Code ===")

# `text_streamer` is the TextStreamer created in the previous inference cell.
_ = model.generate(
    input_ids = inputs,
    attention_mask = attention_mask,
    streamer = text_streamer,
    max_new_tokens = 512,
    use_cache = True,
    temperature = 0.3,  # Lower temperature for more deterministic output
    min_p = 0.1
)


=== Test Instruction ===

    <s>[INST] <<SYS>>
    You only complete chats with syntax correct Verilog code. End the Verilog module code completion with 'endmodule'. Do not include module, input and output definitions.
    <</SYS>>

    Implement the Verilog module based on the following description. Assume that signals are positive clock/clk edge triggered unless otherwise stated.

This Verilog module implements a binary multiplier. It consists of registers k, i, j, and accum for controlling the multiplication operation. The multiplication is performed on the input signals rx_a and rx_b and the result is accumulated in the accum register. The module uses the clk signal for synchronization and the reset signal for resetting all the registers to their initial states. The module updates the registers based on certain conditions, and updates the output tx_r when the multiplication operation is completed.

 Module header:

module bn_mul (input clk,input reset,input [255:0] rx_a,input [255

In [None]:
# Save the LoRA adapters and the tokenizer side by side on Google Drive.
# The tokenizer used to be written to a separate local "mg_verilog_lora_model"
# folder, so the Drive directory had no tokenizer and the local copy was lost
# with the Colab session.
LORA_OUTPUT_DIR = "/content/drive/MyDrive/2025/PEP/finetuning_2/mg_verilog_lora_model"
model.save_pretrained(LORA_OUTPUT_DIR)
tokenizer.save_pretrained(LORA_OUTPUT_DIR)

print("Model saved successfully!")
print("To upload to Hugging Face Hub, uncomment and use:")
print('# model.push_to_hub("your_username/mg-verilog-llama3.2-lora", token="...")')
print('# tokenizer.push_to_hub("your_username/mg-verilog-llama3.2-lora", token="...")')


Model saved successfully!
To upload to Hugging Face Hub, uncomment and use:
# model.push_to_hub("your_username/mg-verilog-llama3.2-lora", token="...")
# tokenizer.push_to_hub("your_username/mg-verilog-llama3.2-lora", token="...")


In [None]:
# Optional exports to other formats (uncomment as needed).
# Nothing in this cell runs by default except the reminder print at the end.

# Merge to 16bit
# model.save_pretrained_merged("mg_verilog_merged_16bit", tokenizer, save_method = "merged_16bit")

# Save to GGUF format
# model.save_pretrained_gguf("mg_verilog_gguf", tokenizer, quantization_method = "q4_k_m")

print("Uncomment the lines above to save in different formats.")
