In [7]:
!pip install requests datasets groq openai

import requests
import json
import pandas as pd
from datasets import load_dataset
import time
import random
from groq import Groq
import os

# =============================================================================
# CONFIGURATION
# =============================================================================

# SECURITY: API keys must never be written into a notebook or committed to
# source control. They are read from the environment instead. Any key that was
# ever hard-coded in this cell should be treated as leaked and revoked.
#
# The fallback strings are the placeholder sentinels that
# generate_input_description() compares against to detect a missing key, so an
# unset environment variable produces the "please configure" message instead
# of a failing API call.

USE_GROQ = True
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "your_groq_api_key_here")

USE_OPENAI_COMPATIBLE = False
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "your_openai_key_here")
OPENAI_BASE_URL = "https://api.openai.com/v1"

# =============================================================================
# LOAD DATASET
# =============================================================================

# Read the dataset CSV into the module-level DataFrame `df` and report its row
# count. The path uses the hf:// scheme (presumably resolved through the
# Hugging Face Hub filesystem integration - confirm it is installed).
# Later code in this file reads the 'input' and 'output' columns of `df`.
print("Učitavam dataset...")
df = pd.read_csv("hf://datasets/amujalo1/vhdl-ETF/vhdl_etf.csv")
print(f"Dataset učitan: {len(df)} redova")

# =============================================================================
# AI MODEL FUNKCIJE - OPTIMIZOVANE ZA DUGAČKE OPISE
# =============================================================================

def generate_with_groq(vhdl_code, api_key):
    """Request a long prose description of VHDL code from a Groq chat model.

    Args:
        vhdl_code: VHDL source text; it is embedded verbatim in the prompt.
        api_key: Groq API key used to construct the client.

    Returns:
        The model's reply with surrounding whitespace stripped, or ``None``
        if the request or reading the reply raises (the error is printed).
    """
    groq_client = Groq(api_key=api_key)

    # Instruction text asking for 2500-5000 characters of flowing prose.
    instructions = f"""Analyze this VHDL code and provide a comprehensive, detailed description (2500-5000 characters). This should be a thorough technical analysis covering all aspects of the design.

Requirements:
- Target length: 2500-5000 characters (approximately 15-30 sentences)
- Write in paragraph form with natural flow - NO bullet points or lists
- Cover ALL aspects thoroughly:

1. COMPONENT OVERVIEW: Start with what the component is, its primary purpose, and its role in larger systems
2. DETAILED INPUT/OUTPUT ANALYSIS: Describe each port in detail - its purpose, bit width, timing requirements, and how it affects operation
3. ARCHITECTURAL DESCRIPTION: Explain the internal structure, state machines, counters, registers, and their relationships
4. FUNCTIONAL BEHAVIOR: Describe step-by-step operation, timing relationships, and behavioral patterns
5. CONTROL LOGIC: Explain control signals, enable conditions, reset behavior, and state transitions
6. DATA FLOW: Describe how data moves through the component and any transformations
7. TIMING AND SYNCHRONIZATION: Clock domains, setup/hold requirements, and timing constraints
8. SPECIAL FEATURES: Any unique aspects, modes of operation, or advanced functionality
9. APPLICATIONS AND USE CASES: Where this component would typically be used
10. IMPLEMENTATION CONSIDERATIONS: Performance, resource usage, and design trade-offs

Write as flowing technical prose, not as a list. Use professional VHDL terminology. Be thorough and comprehensive - aim for maximum technical detail while maintaining readability.

VHDL Code:
{vhdl_code}

Provide a comprehensive, detailed component analysis:"""

    request_messages = [{"role": "user", "content": instructions}]

    try:
        completion = groq_client.chat.completions.create(
            messages=request_messages,
            model="meta-llama/llama-4-scout-17b-16e-instruct",
            temperature=0.3,  # lower temperature for a more consistent analysis
            max_tokens=1200,  # output budget raised for the long descriptions
        )
        return completion.choices[0].message.content.strip()
    except Exception as e:
        # Best effort: report the failure and let the caller handle None.
        print(f"Greška sa Groq: {e}")
        return None


def generate_with_openai_compatible(vhdl_code, api_key, base_url, model="gpt-3.5-turbo"):
    """Request a long prose description of VHDL code from an OpenAI-compatible API.

    Sends a single-message chat completion request to
    ``<base_url>/chat/completions``.

    Args:
        vhdl_code: VHDL source text; it is embedded verbatim in the prompt.
        api_key: Bearer token placed in the ``Authorization`` header.
        base_url: API root, e.g. ``https://api.openai.com/v1``. A trailing
            slash is tolerated.
        model: Model identifier sent in the payload. Defaults to the
            previously hard-coded ``"gpt-3.5-turbo"`` so existing callers
            are unaffected.

    Returns:
        The reply text with surrounding whitespace stripped, or ``None`` when
        the server answers with a non-200 status or the request/parsing
        raises (a message is printed in both cases).
    """
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    prompt = f"""Analyze this VHDL code and provide a comprehensive, detailed description (2500-5000 characters). This should be a thorough technical analysis covering all aspects of the design.

Requirements:
- Target length: 2500-5000 characters (approximately 15-30 sentences)
- Write in paragraph form with natural flow - NO bullet points or lists
- Cover ALL aspects thoroughly:

1. COMPONENT OVERVIEW: Start with what the component is, its primary purpose, and its role in larger systems
2. DETAILED INPUT/OUTPUT ANALYSIS: Describe each port in detail - its purpose, bit width, timing requirements, and how it affects operation
3. ARCHITECTURAL DESCRIPTION: Explain the internal structure, state machines, counters, registers, and their relationships
4. FUNCTIONAL BEHAVIOR: Describe step-by-step operation, timing relationships, and behavioral patterns
5. CONTROL LOGIC: Explain control signals, enable conditions, reset behavior, and state transitions
6. DATA FLOW: Describe how data moves through the component and any transformations
7. TIMING AND SYNCHRONIZATION: Clock domains, setup/hold requirements, and timing constraints
8. SPECIAL FEATURES: Any unique aspects, modes of operation, or advanced functionality
9. APPLICATIONS AND USE CASES: Where this component would typically be used
10. IMPLEMENTATION CONSIDERATIONS: Performance, resource usage, and design trade-offs

Write as flowing technical prose, not as a list. Use professional VHDL terminology. Be thorough and comprehensive - aim for maximum technical detail while maintaining readability.

VHDL Code:
{vhdl_code}

Provide a comprehensive, detailed component analysis:"""

    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.3,  # lower temperature for a more consistent analysis
        "max_tokens": 1200   # output budget raised for the long descriptions
    }

    try:
        # Strip a trailing slash so a base URL such as ".../v1/" does not
        # produce ".../v1//chat/completions". Built inside the try so a bad
        # base_url is reported like any other request failure.
        endpoint = f"{base_url.rstrip('/')}/chat/completions"
        response = requests.post(endpoint, headers=headers, json=payload, timeout=120)
        if response.status_code == 200:
            content = response.json()["choices"][0]["message"]["content"].strip()
            return content
        else:
            print(f"OpenAI API greška: {response.status_code}")
            return None
    except Exception as e:
        # Best effort: report the failure and let the caller handle None.
        print(f"Greška sa OpenAI API: {e}")
        return None

# =============================================================================
# FUNKCIJE ZA POST-PROCESIRANJE - BEZ PRESECANJA
# =============================================================================

def post_process_description(description):
    """Clean a generated description and report how its length compares to the target.

    Every occurrence of a fixed set of boilerplate phrases is removed
    (case-sensitive, anywhere in the text), then all runs of whitespace,
    newlines included, are collapsed to single spaces. A message about the
    resulting length relative to the 2500-5000 character target is printed.
    The text is never truncated.

    Args:
        description: Generated text, or a falsy value (``None`` / ``""``).

    Returns:
        The cleaned text; a falsy input is returned unchanged.
    """
    if not description:
        return description

    boilerplate_openers = (
        "This VHDL code implements",
        "This component represents",
        "The provided VHDL code",
        "This VHDL design",
        "The code describes",
        "This implementation",
    )

    cleaned = description
    for opener in boilerplate_openers:
        cleaned = cleaned.replace(opener, "")

    # Normalise whitespace: any run of spaces/newlines/tabs becomes one space.
    cleaned = " ".join(cleaned.split())

    # Report the length; the four ranges below do not overlap.
    n_chars = len(cleaned)
    if 2500 <= n_chars <= 5000:
        print(f"  ✅ Optimalna dužina ({n_chars} kar.)!")
    elif n_chars < 2000:
        print(f"  ⚠️  Opis kratak ({n_chars} kar.) - možda treba regenerisati")
    elif n_chars > 5500:
        print(f"  ⚠️  Opis dugačak ({n_chars} kar.) - AI je generisao previše")
    else:
        print(f"  📊 Dužina: {n_chars} karaktera")

    # Deliberately no truncation: the whole description is returned.
    return cleaned.strip()

# =============================================================================
# GLAVNA FUNKCIJA
# =============================================================================

def generate_input_description(vhdl_code):
    """Generate and post-process a long description for the given VHDL code.

    The backend is chosen from the module-level configuration: Groq is used
    when it is enabled and its key is not the placeholder; otherwise the
    OpenAI-compatible backend is used under the same conditions.

    Args:
        vhdl_code: VHDL source text to describe.

    Returns:
        The post-processed description (without truncation), or ``None`` when
        no backend is configured. A ``None`` from the backend is passed
        through ``post_process_description`` unchanged.
    """
    if USE_GROQ and GROQ_API_KEY != "your_groq_api_key_here":
        return post_process_description(
            generate_with_groq(vhdl_code, GROQ_API_KEY)
        )

    if USE_OPENAI_COMPATIBLE and OPENAI_API_KEY != "your_openai_key_here":
        return post_process_description(
            generate_with_openai_compatible(vhdl_code, OPENAI_API_KEY, OPENAI_BASE_URL)
        )

    # Neither backend is usable with the current configuration.
    print("Molim te konfiguriši API ključeve!")
    return None

# =============================================================================
# PROCESIRANJE DATASETA
# =============================================================================

def process_dataset(df, start_idx=0, batch_size=10):
    """Generate new long descriptions for a slice of the dataset.

    For every row in ``[start_idx, start_idx + batch_size)`` (clamped to the
    number of rows) the VHDL code in the ``output`` column is passed to
    ``generate_input_description``. The existing ``input`` column is kept
    only for comparison.

    Rows whose ``output`` is not a non-empty string (for example NaN from an
    empty CSV cell) are skipped instead of raising, so one bad row cannot
    discard the results collected so far. A missing ``input`` value is
    treated as an empty string.

    Args:
        df: DataFrame with ``input`` and ``output`` columns.
        start_idx: Positional index of the first row to process.
        batch_size: Maximum number of rows to process.

    Returns:
        A list with one dict per successfully generated description, with
        keys ``old_input``, ``new_input``, ``output``, ``index``,
        ``old_input_length``, ``new_input_length``, ``length_change``,
        ``optimal_length`` and ``generated_from_code``.
    """

    results = []
    total = len(df)
    stop_idx = min(start_idx + batch_size, total)

    print(f"Počinjem procesiranje od reda {start_idx}...")
    print("CILJ: Generiši potpuno NOVE dugačke, sveobuhvatne opise (2500-5000 karaktera)")
    print("PRISTUP: Uzimam samo VHDL kod (output) i generiram novi input opis!")

    for i in range(start_idx, stop_idx):
        print(f"\nProcesuiram red {i+1}/{total}")

        # Only the VHDL code drives generation; the old input is for comparison.
        vhdl_code = df['output'].iloc[i]
        original_input = df['input'].iloc[i]

        # Missing cells come back from pandas as NaN (a float), on which
        # len() would raise. No request is made for such rows, so no pause
        # is needed either.
        if not isinstance(vhdl_code, str) or not vhdl_code.strip():
            print(f"✗ Red {i+1} preskočen - VHDL kod nedostaje ili nije tekst")
            continue
        if not isinstance(original_input, str):
            original_input = ""

        print(f"📋 Analiziram VHDL kod ({len(vhdl_code)} karaktera)...")
        print(f"🎯 Generiram potpuno NOVI opis na osnovu koda...")

        # Generate a brand-new long description from the VHDL code alone.
        new_description = generate_input_description(vhdl_code)

        if new_description:
            original_length = len(original_input)
            new_length = len(new_description)

            results.append({
                'old_input': original_input,   # previous input, for comparison only
                'new_input': new_description,  # newly generated input
                'output': vhdl_code,           # unchanged VHDL code
                'index': i,
                'old_input_length': original_length,
                'new_input_length': new_length,
                'length_change': new_length - original_length,
                'optimal_length': 2500 <= new_length <= 5000,
                'generated_from_code': True
            })

            print(f"✓ NOVI dugačak opis generisan na osnovu VHDL koda:")
            print(f"  - Stari input: {original_length} karaktera")
            print(f"  - NOVI input: {new_length} karaktera")
            print(f"  - Promena: {'+' if new_length > original_length else ''}{new_length - original_length} karaktera")
            if 2500 <= new_length <= 5000:
                print(f"  ✅ PERFEKTNA dužina!")
            elif new_length < 2500:
                print(f"  ⚠️  Kraće od cilja (2500+)")
            else:
                print(f"  📏 Duže od cilja (5000+) ali zadržano")
            print(f"  - Preview NOVOG opisa: {new_description[:200]}...")

        else:
            print(f"✗ Greška kod reda {i+1} - nije moguće generisati novi opis")

        # Pause between requests only; there is nothing to wait for after
        # the final row of the batch.
        if i + 1 < stop_idx:
            time.sleep(random.uniform(2, 4))

    return results

# =============================================================================
# RUN
# =============================================================================

# Slice of the dataset to process in this run.
# NOTE(review): the original comment called this a "smaller batch", but 78
# matches the row count printed when the dataset is loaded in the captured
# output, so the whole dataset is processed in one go - confirm intent.
START_INDEX = 0
BATCH_SIZE = 78  # Smaller batch because of the long descriptions

print("=== VHDL Dataset Generator - Dugački sveobuhvatni opisi (2500-5000 chars) ===")
print(f"Konfigurisano za: {'Groq' if USE_GROQ else 'OpenAI-compatible'}")

# Processing: one API request per row, results collected as a list of dicts.
results = process_dataset(df, START_INDEX, BATCH_SIZE)

# RESULT ANALYSIS (skipped entirely when nothing was generated)
if results:
    results_df = pd.DataFrame(results)

    print(f"\n=== REZULTATI POTPUNO NOVIH DUGAČKIH OPISA ===")
    print(f"Uspešno generisan {len(results)} potpuno NOVIH opisa na osnovu VHDL koda")
    
    # Length statistics. `acceptable` counts descriptions LONGER than 5000
    # characters (they are kept, not truncated).
    avg_old_length = results_df['old_input_length'].mean()
    avg_new_length = results_df['new_input_length'].mean()
    optimal_count = results_df['optimal_length'].sum()
    too_short = len(results_df[results_df['new_input_length'] < 2500])
    acceptable = len(results_df[results_df['new_input_length'] > 5000])
    
    print(f"Prosečna dužina STAROG input-a: {avg_old_length:.0f} karaktera")
    print(f"Prosečna dužina NOVOG input-a: {avg_new_length:.0f} karaktera")
    print(f"Poboljšanje: {'+' if avg_new_length > avg_old_length else ''}{avg_new_length - avg_old_length:.0f} karaktera u proseku")
    print(f"Optimalna dužina (2500-5000): {optimal_count}/{len(results)} ({optimal_count/len(results)*100:.1f}%)")
    print(f"Kraći od cilja (<2500): {too_short}")
    print(f"Duži od cilja (>5000): {acceptable} (zadržano bez presecanja)")
    
    # Length distribution of the NEW descriptions, as (label, count) pairs
    # over contiguous, non-overlapping character ranges.
    print(f"\n=== DISTRIBUCIJA DUŽINA NOVIH OPISA ===")
    length_ranges = [
        ("<2000", len(results_df[results_df['new_input_length'] < 2000])),
        ("2000-2499", len(results_df[(results_df['new_input_length'] >= 2000) & (results_df['new_input_length'] < 2500)])),
        ("2500-3499", len(results_df[(results_df['new_input_length'] >= 2500) & (results_df['new_input_length'] < 3500)])),
        ("3500-4499", len(results_df[(results_df['new_input_length'] >= 3500) & (results_df['new_input_length'] < 4500)])),
        ("4500-5000", len(results_df[(results_df['new_input_length'] >= 4500) & (results_df['new_input_length'] <= 5000)])),
        (">5000", len(results_df[results_df['new_input_length'] > 5000]))
    ]
    
    for range_name, count in length_ranges:
        print(f"{range_name} karaktera: {count}")
    
    # Show up to two examples whose length falls inside the 2500-5000 target.
    optimal_examples = results_df[results_df['optimal_length'] == True].head(2)
    
    print(f"\n=== NAJBOLJI PRIMERI NOVIH OPISA (2500-5000 karaktera) ===")
    for idx, row in optimal_examples.iterrows():
        print(f"\n--- NOVI OPIS: Dužina {row['new_input_length']} karaktera ---")
        print(f"STARI INPUT ({row['old_input_length']} kar.): {row['old_input'][:100]}...")
        print(f"\nNOVI INPUT ({row['new_input_length']} kar.):")
        # Only the first 500 characters are previewed; the next line reports
        # how many characters were left out.
        print(f"{row['new_input'][:500]}...")
        print(f"[...ostatak od {row['new_input_length']-500} karaktera...]")
        print("-" * 80)

    # Save the full results table (old/new inputs, lengths, flags) to CSV.
    results_df.to_csv('vhdl_long_inputs.csv', index=False)
    print(f"\nRezultati sačuvani u 'vhdl_long_inputs.csv'")

    # Build the new dataset: generated description as 'input', original VHDL
    # code as 'output'.
    new_dataset_df = pd.DataFrame({
        'input': results_df['new_input'],
        'output': results_df['output']
    })

    new_dataset_df.to_csv('vhdl_dataset_long.csv', index=False)
    print("Novi dataset sa dugačkim opisima sačuvan u 'vhdl_dataset_long.csv'")

else:
    print("Nema rezultata. Proveri konfiguraciju API-ja.")

# Closing summary of the approach; printed whether or not results exist.
print("""
=== OPTIMIZACIJE ZA POTPUNO NOVE DUGAČKE OPISE (2500-5000 KARAKTERA) ===

1. ✅ Uzima samo VHDL kod (output kolonu) kao ulaz
2. ✅ Generiše potpuno NOVI input opis na osnovu koda
3. ✅ Prompt traži 2500-5000 karaktera (15-30 rečenica)
4. ✅ Sveobuhvatan pristup: 10 različitih aspekata analize
5. ✅ Max_tokens povećan na 1200 za dugačke opise
6. ✅ Temperatura 0.3 za konzistentnu tehničku analizu
7. ✅ BEZ PRESECANJA TEKSTA - ceo sadržaj se zadržava
8. ✅ Detaljno praćenje distribucije dužina
9. ✅ Tehnička proza umesto lista za prirodan tok
10. ✅ Poređenje starih i novih input opisa

REZULTAT: Potpuno novi, dugački, sveobuhvatni tehnički opisi generisani iz VHDL koda!
""")

Učitavam dataset...
Dataset učitan: 78 redova
=== VHDL Dataset Generator - Dugački sveobuhvatni opisi (2500-5000 chars) ===
Konfigurisano za: Groq
Počinjem procesiranje od reda 0...
CILJ: Generiši potpuno NOVE dugačke, sveobuhvatne opise (2500-5000 karaktera)
PRISTUP: Uzimam samo VHDL kod (output) i generiram novi input opis!

Procesuiram red 1/78
📋 Analiziram VHDL kod (3933 karaktera)...
🎯 Generiram potpuno NOVI opis na osnovu koda...
  📊 Dužina: 5410 karaktera
✓ NOVI dugačak opis generisan na osnovu VHDL koda:
  - Stari input: 3558 karaktera
  - NOVI input: 5410 karaktera
  - Promena: +1852 karaktera
  📏 Duže od cilja (5000+) ali zadržano
  - Preview NOVOG opisa: The `olo_intf_clk_meas` component is a clock measurement module designed to calculate the frequency of an input clock signal, `ClkTest`, relative to a reference clock signal, `Clk`. Its primary purpos...

Procesuiram red 2/78
📋 Analiziram VHDL kod (4412 karaktera)...
🎯 Generiram potpuno NOVI opis na osnovu koda...
  📊 Dužina