In [2]:
# Install and update necessary libraries
!pip install --upgrade datasets transformers torch

import pandas as pd
import numpy as np
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling

# Load the dataset from Hugging Face
print("Loading dataset...")
dataset = load_dataset("rtl-llm/vhdl_github")
print("Dataset loaded successfully!")

# Display dataset information
print(f"Dataset features: {dataset['train'].features}")
print(f"Total number of samples: {len(dataset['train'])}")

# Extract the first 5000 samples from the content column
samples = dataset['train'].select(range(5000))

# Data preprocessing function
def preprocess_function(examples):
    """Preprocess the data samples for training."""
    # Check for empty content
    contents = [text if text and len(text) > 0 else "-- Empty VHDL file" for text in examples["content"]]

    # Add a prefix to help the model understand the task
    processed_texts = [f"# VHDL Code:\n{content}" for content in contents]

    return {"processed_text": processed_texts}

# Apply preprocessing
print("\nPreprocessing data...")
processed_dataset = samples.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset['train'].column_names
)

# Create training and validation splits
split_dataset = processed_dataset.train_test_split(test_size=0.1)
print(f"Training samples: {len(split_dataset['train'])}")
print(f"Validation samples: {len(split_dataset['test'])}")

# Initialize tokenizer (using a publicly available model instead of gated one)
model_name = "gpt2"  # Using GPT-2 as a freely available alternative
print(f"\nInitializing tokenizer from {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the text samples."""
    return tokenizer(examples["processed_text"], padding="max_length", truncation=True, max_length=1024)

print("Tokenizing dataset...")
tokenized_datasets = split_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["processed_text"]
)

# Prepare for training
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Use causal language modeling, not masked
)

# Set up SIMPLIFIED training arguments - minimizing potential compatibility issues
print("\nSetting up training arguments...")
training_args = TrainingArguments(
    output_dir="./vhdl-code-generator",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_steps=1000,
    logging_dir='./logs',
    logging_steps=100,
)

# Load pre-trained model
print(f"\nLoading model from {model_name}...")
model = AutoModelForCausalLM.from_pretrained(model_name)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)

# Save some examples from the dataset
def save_examples(num_examples=5):
    """Save some examples from the dataset to a file."""
    examples = []
    for i in range(num_examples):
        examples.append(samples[i]['content'][:1000])  # First 1000 chars of each example

    with open("vhdl_examples.txt", "w") as f:
        for i, example in enumerate(examples):
            f.write(f"=== Example {i+1} ===\n\n")
            f.write(example)
            f.write("\n\n" + "="*50 + "\n\n")

    print(f"{num_examples} examples saved to vhdl_examples.txt")

# Save examples
save_examples()

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nUsing device: {device}")
if device.type == "cuda":
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"Memory allocated: {torch.cuda.memory_allocated(0)/1024**2:.2f} MB")
    print(f"Memory reserved: {torch.cuda.memory_reserved(0)/1024**2:.2f} MB")
else:
    print("No GPU available, using CPU. Training will be significantly slower.")

# Training function
def start_training():
    """Start the training process."""
    print("Starting training...")
    trainer.train()
    print("Training completed!")

    # Save the model
    print("Saving the model...")
    trainer.save_model("./vhdl-generator-final")
    tokenizer.save_pretrained("./vhdl-generator-final")
    print("Model saved to ./vhdl-generator-final")

# Generate VHDL code function (for after training)
def generate_vhdl_code(prompt, max_length=512):
    """Generate VHDL code from a prompt."""
    # Ensure the model is on the right device
    model.to(device)

    # Add a prefix to help guide generation
    full_prompt = f"Generate VHDL code for: {prompt}\n# VHDL Code:\n"

    # Tokenize and generate
    inputs = tokenizer(full_prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        inputs.input_ids,
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Return only the generated code part (after the prompt)
    return generated_text[len(full_prompt):]

# Print instructions for the user
print("\n" + "="*50)
print("INSTRUCTIONS:")
print("1. To start training, run: start_training()")
print("2. After training, to generate code, run: generate_vhdl_code('description of your circuit')")
print("3. Example: generate_vhdl_code('4-bit counter with reset and enable signals')")
print("="*50)

Loading dataset...


Repo card metadata block was not found. Setting CardData to empty.


Dataset loaded successfully!
Dataset features: {'content': Value(dtype='string', id=None)}
Total number of samples: 100932

Preprocessing data...
Training samples: 4500
Validation samples: 500

Initializing tokenizer from gpt2...
Tokenizing dataset...


Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]


Setting up training arguments...

Loading model from gpt2...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

5 examples saved to vhdl_examples.txt

Using device: cuda
GPU name: Tesla T4
Memory allocated: 487.47 MB
Memory reserved: 542.00 MB

INSTRUCTIONS:
1. To start training, run: start_training()
2. After training, to generate code, run: generate_vhdl_code('description of your circuit')
3. Example: generate_vhdl_code('4-bit counter with reset and enable signals')


In [3]:
start_training()
print("\nExample usage after training:")
prompt = 'Generate VHDL code for a 4-bit counter with reset'
vhdl_code = generate_vhdl_code(prompt)
print(vhdl_code)

def export_model():
    model.save_pretrained("./vhdl-generator-model")
    tokenizer.save_pretrained("./vhdl-generator-model")
    print("Model and tokenizer saved for inference.")

export_model()



Starting training...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mamujalo1[0m ([33mamujalo1-etf-database[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,2.0503
200,1.5912
300,1.4253
400,1.44
500,1.3233
600,1.1878
700,1.19
800,1.2452


Step,Training Loss
100,2.0503
200,1.5912
300,1.4253
400,1.44
500,1.3233
600,1.1878
700,1.19
800,1.2452
900,1.1531
1000,1.0496


Training completed!
Saving the model...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Model saved to ./vhdl-generator-final

Example usage after training:
library IEEE;
use IEEE.numeric_std.all;
use IEEE.std_logic_1164.all;
use IEEE.std_logic_arith.all;

entity counter_gen is
  port (
      clk        : in  std_logic;
      reset        : in  std_logic;
      data_out      : in  std_logic_vector(31 downto 0);
      reset_n       : in  std_logic;
       data_in      : in  std_logic_vector(31 downto 0);
      data_out     : out std_logic_vector(31 downto 0);
      data_out     : out std_logic_vector(31 downto 0);
      data_in      : out std_logic_vector(31 downto 0)
     );
end entity counter_gen;

architecture rtl of counter_gen is
    -- Generate the output
    signal clk            : std_logic;
    signal reset         : std_logic;
    signal data_out      : std_logic_vector(31 downto 0);
    signal data_in      : std_logic_vector(31 downto 0);
    signal data_in      : std_logic_vector(31 downto 0);
begin

   -- Generate the output
    counter_gen  : counter_gen := '

In [5]:
prompt = 'his component implements a clock crossing with AXI-S handshaking for transferring data from one clock domain to another one that runs at an integer fraction of the frequency of the input clock frequency. It can for example be used to transfer data from a 100 MHz clock domain to a 50 MHz clock domain (both generated by the same PLL). Note that the two clocks must be phase aligned. Note that the clock crossing **does not work if the two clocks have the same frequency**. This block follows the general [clock-crossing principles](clock_crossing_principles.md). Read through them for more information.'
vhdl_code = generate_vhdl_code(prompt)
print(vhdl_code)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


-- ---------------------------------------------------
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

library unisim;
use unisim.vcomponents.all;

entity clock is
  port (
     clock_clk : in std_logic;
     clock_reset : in std_logic;
     reset_delay : in std_logic;
      data_out_delay : out std_logic_vector(3 downto 0);
     data_in_delay : out std_logic_vector(3 downto 0);
     data_out_delay : out std_logic_vector(3 downto 0);
      data_out_data_delay : out std_logic_vector(3 downto 0);
      data_out_data_delay : out std_logic_vector(3 downto 0)
    );
end clock;

architecture rtl of clock is

  constant C_COMMON_CLOCK_SIZE : integer := 100;
  constant C_COMMON_SHIFT : integer := 0;
  constant C_COMMON_CLOCK_DIVIDER : integer := 0;
  constant C_COMMON_CLOCK_DIVIDER : integer := 0;
  constant C_COMMON_CLOCK_DIVIDER : integer := 0;
  constant C_COMMON_CLOCK_DIVIDER : integer := 0;
  constant C_COMMON_
