In [14]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)

# 1. Učitavanje dataseta
print("Učitavanje VHDL dataseta...")
dataset = load_dataset("amujalo1/vhdl-ETF")
print(f"Dataset učitan sa {len(dataset['train'])} primera")

# Provera strukture podataka
print("\nStruktura primera:")
print(dataset["train"][0])  # Prikaz prvog primera da vidimo strukturu

# Pregled nekoliko primera
print("\nPrimer podataka:")
for i in range(2):
    example = dataset["train"][i]
    print(f"Primer {i+1}:")
    print(f"Opis: {example['input']}")
    print(f"VHDL kod (prvih 100 karaktera): {example['output'][:100]}...\n")

Učitavanje VHDL dataseta...
Dataset učitan sa 78 primera

Struktura primera:
{'input': "This entity measures the frequency of a clock under the assumption that the frequency of the main-clock is exactly correct. Generally the system clock comes from PS, the block is useful to verify if other clocks are set to the correct frequency. This block contains _Open Logic_ clock crossings, refer to [clock crossing principles](../base/clock_crossing_principles.md) regarding timing constraints.\n\n# VHDL Analysis for 'olo_intf_clk_meas.vhd'\n\n## File Comments\n- -------------------------------------------------------------------------------------------------\n- Copyright (c) 2018 by Paul Scherrer Institute, Switzerland\n- Copyright (c) 2024 by Oliver Bründler\n- All rights reserved.\n- Authors: Oliver Bruendler\n- -------------------------------------------------------------------------------------------------\n- -----------------------------------------------------------------------------------

In [15]:
# 2. Priprema podataka za trening
def prepare_data_for_training(examples):
    inputs = []
    for input_text, output_text in zip(examples["input"], examples["output"]):
        # Format input-output parova za trening
        # Dodajemo specijalan format koji će model naučiti
        formatted_text = f"Opis: {input_text}\nVHDL kod:\n{output_text}"
        inputs.append(formatted_text)

    return {"text": inputs}  # Vraćamo rečnik sa ključem "text"

In [16]:
# 3. Izbor modela
model_name = "EleutherAI/gpt-neo-1.3B"  # Možete koristiti i manje modele kao "EleutherAI/gpt-neo-125M"
print(f"Učitavanje baznog modela: {model_name}")

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Dodavanje posebnih tokena za VHDL
tokenizer.add_special_tokens({
    'additional_special_tokens': ['<VHDL>', '</VHDL>']
})

# Ako tokenizer nema pad_token, postavite ga
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Priprema podataka
tokenized_datasets = dataset.map(
    prepare_data_for_training,
    batched=True,
    remove_columns=dataset["train"].column_names
)

# Tokenizacija
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=1024,
        padding="max_length"
    )

tokenized_datasets = tokenized_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)

# Trening/validacija split
tokenized_datasets = tokenized_datasets["train"].train_test_split(test_size=0.1)

Učitavanje baznog modela: EleutherAI/gpt-neo-1.3B


In [31]:
# 4. Podešavanje parametara treninga
training_args = TrainingArguments(
    output_dir="./vhdl_code_generator",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    save_total_limit=2,
    load_best_model_at_end=True,
    push_to_hub=False,
)

# Učitavanje modela
model = AutoModelForCausalLM.from_pretrained(model_name)
# Redimenzioniranje embedding sloja za nove tokene
model.resize_token_embeddings(len(tokenizer))

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Koristimo causal language modeling (ne masked)
)


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)

In [None]:
# 6. Trening modela
print("Početak treninga modela...")
trainer.train()


In [None]:
# 7. Čuvanje modela
print("Čuvanje istreniranog modela...")
model_path = "./vhdl_code_generator_trained"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)
print(f"Model sačuvan u: {model_path}")

In [None]:
# 8. Test generisanja koda
def generate_vhdl_code(description, max_length=512):
    input_text = f"Opis: {description}\nVHDL kod:"
    inputs = tokenizer(input_text, return_tensors="pt")

    # Generisanje
    outputs = model.generate(
        inputs.input_ids,
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )

    # Dekodiranje
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Izdvajanje samo VHDL koda (nakon "VHDL kod:")
    vhdl_code = generated_text.split("VHDL kod:")[1].strip()

    return vhdl_code

# Testiranje sa nekoliko primera
test_descriptions = [
    "D flip-flop sa asinhronim resetom",
    "8-bitni brojač sa enable signalom",
    "Jednostavan ALU koji podržava sabiranje i oduzimanje"
]

print("\nTestiranje generisanja VHDL koda:")
for desc in test_descriptions:
    print(f"\nOpis: {desc}")
    print("Generisani VHDL kod:")
    print("-" * 50)
    try:
        vhdl_code = generate_vhdl_code(desc)
        print(vhdl_code)
    except Exception as e:
        print(f"Greška prilikom generisanja: {e}")
    print("-" * 50)

print("\nProces treninga završen!")