In [7]:
# VHDL Dataset Input Generator - Google Colab
# Generiše nove input opise za postojeći VHDL kod iz dataseta

# Instalacija potrebnih paketa
!pip install requests datasets groq openai

import requests
import json
import pandas as pd
from datasets import load_dataset
import time
import random
from groq import Groq
import os

# =============================================================================
# KONFIGURACIJA - Odaberi koji API želiš da koristiš
# =============================================================================

# Option 1: Groq (free tier - recommended).
# SECURITY: API keys must never be stored in source or notebooks. The key is
# read from the GROQ_API_KEY environment variable (create one at
# https://console.groq.com/). Any key that was previously hard-coded in this
# file must be treated as leaked and revoked.
USE_GROQ = True
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")


# Option 2: OpenAI-compatible API.
# The key is read from the OPENAI_API_KEY environment variable; an empty
# string means "not configured".
USE_OPENAI_COMPATIBLE = False
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
OPENAI_BASE_URL = "https://api.openai.com/v1"

# =============================================================================
# UČITAJ DATASET
# =============================================================================

# Read the dataset CSV through the hf:// path and report how many rows it has.
print("Učitavam dataset...")
df = pd.read_csv("hf://datasets/amujalo1/vhdl-ETF/vhdl_etf.csv")
print(f"Dataset učitan: {len(df)} redova")

# Preview the first 200 characters of both columns of the first row.
print("\nPrimer postojeće strukture:")
for label, column in (("Input", "input"), ("Output", "output")):
    print(f"{label}: {df[column].iloc[0][:200]}...")

# =============================================================================
# AI MODEL FUNKCIJE
# =============================================================================

def generate_with_groq(vhdl_code, api_key):
    """Ask a Groq-hosted chat model to describe a piece of VHDL code.

    Args:
        vhdl_code: VHDL source text that is embedded in the prompt.
        api_key: Groq API key used to construct the client.

    Returns:
        The stripped text of the first completion choice, or None when the
        request (or reading its result) raises an exception. The error is
        printed before None is returned.
    """
    groq_client = Groq(api_key=api_key)

    # The prompt is assembled piecewise; the resulting text is exactly the
    # instruction paragraph, the code, and the closing request.
    user_prompt = (
        "Analyze this VHDL code and provide a concise technical description (2-3 sentences) of what this component does. Focus on its functionality, inputs, outputs, and purpose. Be specific but not overly verbose.\n"
        "\n"
        "VHDL Code:\n"
        f"{vhdl_code}\n"
        "\n"
        "Provide a detailed component specification that would result in this file:"
    )

    request_kwargs = {
        "messages": [{"role": "user", "content": user_prompt}],
        "model": "meta-llama/llama-4-scout-17b-16e-instruct",
        "temperature": 0.7,
        "max_tokens": 300,
    }

    try:
        completion = groq_client.chat.completions.create(**request_kwargs)
        first_choice = completion.choices[0]
        return first_choice.message.content.strip()
    except Exception as err:
        print(f"Greška sa Groq: {err}")
        return None


def generate_with_openai_compatible(vhdl_code, api_key, base_url):
    """Generate a description of VHDL code via an OpenAI-compatible API.

    Sends a single-message chat completion request to
    ``<base_url>/chat/completions`` using bearer-token authentication.

    Args:
        vhdl_code: VHDL source text that is embedded in the prompt.
        api_key: API key sent in the ``Authorization: Bearer`` header.
        base_url: API root, e.g. ``https://api.openai.com/v1``. A trailing
            slash is tolerated.

    Returns:
        The stripped content of the first choice on HTTP 200, otherwise None.
        Non-200 responses and any raised exception are printed and reported
        as None rather than propagated.
    """
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    prompt = f"""Analyze this VHDL code and provide a concise technical description (2-3 sentences) of what this component does. Focus on its functionality, inputs, outputs, and purpose. Be specific but not overly verbose.

VHDL Code:
{vhdl_code}

Provide a detailed component specification that would result in this file:"""

    payload = {
        "model": "gpt-3.5-turbo",  # or any other model the endpoint serves
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.7,
        "max_tokens": 300
    }

    try:
        # Normalise the base URL so "https://host/v1/" does not produce a
        # "//chat/completions" path, which some servers reject.
        endpoint = f"{str(base_url).rstrip('/')}/chat/completions"
        response = requests.post(endpoint, headers=headers, json=payload,
                                 timeout=120)
        if response.status_code == 200:
            return response.json()["choices"][0]["message"]["content"].strip()
        else:
            print(f"OpenAI API greška: {response.status_code}")
            return None
    except Exception as e:
        print(f"Greška sa OpenAI API: {e}")
        return None

# =============================================================================
# GLAVNA FUNKCIJA ZA GENERISANJE
# =============================================================================

def generate_input_description(vhdl_code):
    """Generate a new input description for the given VHDL code.

    Dispatches to the first enabled backend that has a usable API key:
    Groq when ``USE_GROQ`` is set, otherwise the OpenAI-compatible endpoint
    when ``USE_OPENAI_COMPATIBLE`` is set.

    Args:
        vhdl_code: VHDL source text to describe.

    Returns:
        The description returned by the selected backend (which may itself
        be None on failure), or None when no backend is enabled with a
        configured key.
    """
    # Values that mean "no key has been configured". The previous check
    # compared each key against its own literal value, which is always equal,
    # so no backend was ever selected.
    # NOTE(review): the placeholder spellings follow the usage instructions
    # printed by this script; extend the tuple if other templates are in use.
    placeholder_keys = ("", "your_groq_api_key_here", "your_openai_api_key_here")

    def _is_configured(key):
        # Treat None, non-strings, blank strings and placeholders as unset.
        return isinstance(key, str) and key.strip() not in placeholder_keys

    if USE_GROQ and _is_configured(GROQ_API_KEY):
        return generate_with_groq(vhdl_code, GROQ_API_KEY)

    if USE_OPENAI_COMPATIBLE and _is_configured(OPENAI_API_KEY):
        return generate_with_openai_compatible(vhdl_code, OPENAI_API_KEY, OPENAI_BASE_URL)

    print("Molim te konfiguriši API ključeve!")
    return None

# =============================================================================
# PROCESIRANJE DATASETA
# =============================================================================

def process_dataset(df, start_idx=0, batch_size=10, delay_range=(1, 3)):
    """Process a slice of the dataset and generate new input descriptions.

    Rows ``start_idx`` up to (but not including)
    ``min(start_idx + batch_size, len(df))`` are processed in order.

    Args:
        df: DataFrame with an 'input' column (existing description) and an
            'output' column (VHDL code).
        start_idx: Positional index of the first row to process.
        batch_size: Maximum number of rows to process.
        delay_range: ``(low, high)`` bounds in seconds for the random pause
            inserted between consecutive API requests.

    Returns:
        A list of dicts with keys 'original_input', 'new_input', 'output'
        and 'index', one per row for which a description was generated.
        Rows that fail or have no VHDL code are omitted.
    """
    results = []
    total = len(df)
    stop_idx = min(start_idx + batch_size, total)
    request_sent = False

    print(f"Počinjem procesiranje od reda {start_idx}...")

    for i in range(start_idx, stop_idx):
        print(f"\nProcesuiram red {i+1}/{total}")

        vhdl_code = df['output'].iloc[i]  # the VHDL code lives in the 'output' column
        original_input = df['input'].iloc[i]

        # Missing cells (None/NaN) or blank strings would otherwise be sent
        # to the API as literal "nan"/empty prompts; skip them instead.
        if not isinstance(vhdl_code, str) or not vhdl_code.strip():
            print(f"✗ Preskačem red {i+1}: nedostaje VHDL kod")
            continue

        # Pause only between requests (rate limiting); there is nothing to
        # wait for before the first request or after the last one.
        if request_sent:
            time.sleep(random.uniform(*delay_range))
        request_sent = True

        # Generate the new description.
        new_description = generate_input_description(vhdl_code)

        if new_description:
            results.append({
                'original_input': original_input,
                'new_input': new_description,
                'output': vhdl_code,
                'index': i
            })
            print(f"✓ Uspešno generisan opis: {new_description[:100]}...")
        else:
            print(f"✗ Greška kod reda {i+1}")

    return results

# =============================================================================
# POKRETANJE
# =============================================================================

# Batch window: rows START_INDEX .. START_INDEX + BATCH_SIZE - 1 are processed.
# NOTE: use a small BATCH_SIZE (e.g. 5) for a first trial run.
START_INDEX = 0
BATCH_SIZE = 78

print("=== VHDL Dataset Input Generator ===")
backend_name = 'Groq' if USE_GROQ else 'OpenAI-compatible'
print(f"Konfigurisano za: {backend_name}")

# Run the batch.
results = process_dataset(df, START_INDEX, BATCH_SIZE)

if not results:
    print("Nema rezultata. Proveri konfiguraciju API-ja.")
else:
    results_df = pd.DataFrame(results)

    print("\n=== REZULTATI ===")
    print(f"Uspešno procesiran {len(results)} redova")

    # Show up to three truncated samples.
    for sample_no, sample in enumerate(results[:3], start=1):
        print(f"\n--- Primer {sample_no} ---")
        print(f"Originalni input: {sample['original_input'][:150]}...")
        print(f"Novi input: {sample['new_input'][:150]}...")
        print(f"VHDL kod: {sample['output'][:100]}...")

    # Persist the full result table (original input, new input, code, index).
    results_df.to_csv('vhdl_new_inputs.csv', index=False)
    print("\nRezultati sačuvani u 'vhdl_new_inputs.csv'")

    # Build the regenerated dataset: the new description becomes 'input'.
    new_dataset_df = results_df[['new_input', 'output']].rename(
        columns={'new_input': 'input'}
    )
    new_dataset_df.to_csv('vhdl_dataset_regenerated.csv', index=False)
    print("Novi dataset sačuvan u 'vhdl_dataset_regenerated.csv'")

# =============================================================================
# INSTRUKCIJE ZA UPOTREBU
# =============================================================================

# Print the usage notes; the leading and trailing empty entries reproduce the
# blank line before the heading and the newline after the last bullet.
print("\n".join([
    "",
    "=== INSTRUKCIJE ZA DALJU UPOTREBU ===",
    "",
    "1. GROQ SETUP (PREPORUČENO - BESPLATNO):",
    "   - Idi na https://console.groq.com/",
    "   - Napravi nalog i dobij API ključ",
    "   - Zameniti 'your_groq_api_key_here' sa pravim ključem",
    "   - Postavi USE_GROQ = True",
    "",
    "3. BATCH PROCESIRANJE:",
    "   - Povećaj BATCH_SIZE na veći broj (npr. 50, 100)",
    "   - Pokreći u manjim batch-ovima da izbegnuš rate limiting",
    "   - Koristi START_INDEX da nastavis odakle si stao",
    "",
    "4. POBOLJŠANJA:",
    "   - Možeš modifikovati prompt za drugačiji stil opisa",
    "   - Dodaj filtriranje za određene tipove VHDL komponenti",
    "   - Implementiraj retry logiku za neuspešne zahteve",
    "",
]))

Učitavam dataset...
Dataset učitan: 78 redova

Primer postojeće strukture:
Input: This entity measures the frequency of a clock under the assumption that the frequency of the main-clock is exactly correct. Generally the system clock comes from PS, the block is useful to verify if o...
Output: 

library ieee;
    use ieee.std_logic_1164.all;
    use ieee.numeric_std.all;

library work;
    use work.olo_base_pkg_math.all;

entity olo_intf_clk_meas is
    generic (
        ClkFrequency_g     ...
=== VHDL Dataset Input Generator ===
Konfigurisano za: Groq
Počinjem procesiranje od reda 0...

Procesuiram red 1/78
✓ Uspešno generisan opis: **Component Description**

The `olo_intf_clk_meas` component measures the frequency of an input cloc...

Procesuiram red 2/78
✓ Uspešno generisan opis: **Component Specification: Debouncer Interface (olo_intf_debounce)**

**Component Description:**
The...

Procesuiram red 3/78
✓ Uspešno generisan opis: **Component Specification: I2C Master Interface**

**Ov