In [1]:
import requests
import re
import csv
from time import sleep

# Configuration: GitHub contents-API URL used to list the directory, base URL
# for downloading each raw file, and the path of the CSV that main() writes.
GITHUB_API_URL = "https://api.github.com/repos/lxp32/lxp32-cpu/contents/verify/lxp32/src/platform"
RAW_BASE_URL = "https://raw.githubusercontent.com/lxp32/lxp32-cpu/develop/verify/lxp32/src/platform/"
OUTPUT_CSV = "vhdl_dataset.csv"

def get_vhdl_files(timeout=30):
    """Return the names of the ``.vhd`` entries listed at ``GITHUB_API_URL``.

    Args:
        timeout: Seconds to wait for the listing request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A list of file names ending in ``.vhd``. An empty list is returned
        (after printing the reason) when the request fails at the transport
        level or the server answers with a non-200 status.
    """
    try:
        response = requests.get(GITHUB_API_URL, timeout=timeout)
    except requests.RequestException as exc:
        # Treat transport failures like a bad status: report and return nothing.
        print(f"Error fetching files: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error fetching files: {response.status_code}")
        return []

    return [entry["name"] for entry in response.json() if entry["name"].endswith(".vhd")]

def clean_description(desc):
    """Flatten a VHDL header comment block into a single line of prose.

    Each line is stripped; blank lines, lines mentioning "copyright"
    (case-insensitive) and decorative rules starting with ``---`` are dropped.
    A leading ``--`` comment marker is removed from the remaining lines.
    Lines that are empty once the marker is removed (bare ``--`` spacers) are
    dropped as well, so they do not leave double spaces in the result.

    Args:
        desc: Raw header text, lines separated by ``\\n``.

    Returns:
        The surviving lines joined with single spaces.
    """
    kept = []
    for raw_line in desc.split('\n'):
        text = raw_line.strip()
        if not text or 'copyright' in text.lower() or text.startswith('---'):
            continue
        if text.startswith('--'):
            text = text[2:].strip()
        if text:  # a bare "--" spacer line carries no content
            kept.append(text)
    return ' '.join(kept).strip()

def extract_content(vhdl_content):
    """Split VHDL source text into a header description and the code.

    The header is the span matched at the very start of the text from a run
    of three or more dashes up to the next such run. It is passed to
    ``clean_description``; everything after it is treated as code.

    Args:
        vhdl_content: Full text of a VHDL file.

    Returns:
        A ``(description, code)`` tuple. ``description`` is ``""`` when no
        header is found. ``code`` has every line whose first non-blank
        characters are ``--`` removed; other lines keep their formatting.
    """
    # Locate the leading comment banner, if any, to find where the code begins.
    header = re.search(r'^-{3,}.*?-{3,}', vhdl_content, re.DOTALL)
    split_at = header.end() if header else 0

    if split_at > 0:
        description = clean_description(vhdl_content[:split_at])
    else:
        description = ""

    # Drop whole-line comments; trailing comments on code lines are kept.
    kept_lines = []
    for source_line in vhdl_content[split_at:].strip().split('\n'):
        if source_line.strip().startswith('--'):
            continue
        kept_lines.append(source_line)

    return description, '\n'.join(kept_lines)

def main():
    """Download every listed VHDL file and write (description, code) rows to CSV.

    The file list comes from ``get_vhdl_files()``; each file is fetched from
    ``RAW_BASE_URL``, split with ``extract_content`` and written to
    ``OUTPUT_CSV`` under the header ``input, output``. A file whose download
    fails (transport error, timeout or non-200 status) is reported and
    skipped so the remaining files are still processed.
    """
    request_timeout = 30  # seconds; avoids hanging forever on a stalled download
    vhdl_files = get_vhdl_files()

    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as csvfile:
        # QUOTE_ALL with the writer's default quote doubling is enough to embed
        # any text. No escapechar is set: it would make the writer escape
        # backslashes in the code, which readers using default settings would
        # not undo.
        writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        writer.writerow(['input', 'output'])  # Header

        for filename in vhdl_files:
            print(f"Processing {filename}...")
            raw_url = RAW_BASE_URL + filename

            response = None
            try:
                response = requests.get(raw_url, timeout=request_timeout)
            except requests.RequestException as exc:
                print(f"  Error fetching {filename}: {exc}")

            if response is not None and response.status_code == 200:
                description, code = extract_content(response.text)

                # Fall back to a generic label built from the file's base name.
                if not description:
                    description = f"VHDL component {filename.split('.')[0]}"

                # The code is stored with its original formatting.
                writer.writerow([description, code])
                print(f"  Successfully processed {filename}")
            else:
                print(f"  Failed to fetch {filename}")

            sleep(1)  # Pause between requests to avoid rate limiting

    print(f"\nDone! Results saved to {OUTPUT_CSV}")

# Entry point: run the scraper when this code executes as the main module.
if __name__ == "__main__":
    main()

Processing coprocessor.vhd...
  Successfully processed coprocessor.vhd
Processing dbus_monitor.vhd...
  Successfully processed dbus_monitor.vhd
Processing generic_dpram.vhd...
  Successfully processed generic_dpram.vhd
Processing ibus_adapter.vhd...
  Successfully processed ibus_adapter.vhd
Processing intercon.vhd...
  Successfully processed intercon.vhd
Processing platform.vhd...
  Successfully processed platform.vhd
Processing program_ram.vhd...
  Successfully processed program_ram.vhd
Processing scrambler.vhd...
  Successfully processed scrambler.vhd
Processing timer.vhd...
  Successfully processed timer.vhd

Done! Results saved to vhdl_dataset.csv


In [2]:
import requests
import re
import csv
from time import sleep
from urllib.parse import unquote

# Configuration: GitHub contents-API URL used to list the directory, base URLs
# for the raw VHDL files and their markdown docs, and the CSV path main() writes.
GITHUB_API_URL = "https://api.github.com/repos/open-logic/open-logic/contents/src/base/vhdl"
RAW_BASE_URL = "https://raw.githubusercontent.com/open-logic/open-logic/main/src/base/vhdl/"
DOCS_BASE_URL = "https://raw.githubusercontent.com/open-logic/open-logic/main/doc/base/"
OUTPUT_CSV = "open_logic_vhdl_dataset.csv"

def get_vhdl_files(timeout=30):
    """Return the names of the ``.vhd`` entries listed at ``GITHUB_API_URL``.

    Args:
        timeout: Seconds to wait for the listing request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A list of file names ending in ``.vhd``. An empty list is returned
        (after printing the reason) when the request fails at the transport
        level or the server answers with a non-200 status.
    """
    try:
        response = requests.get(GITHUB_API_URL, timeout=timeout)
    except requests.RequestException as exc:
        # Treat transport failures like a bad status: report and return nothing.
        print(f"Error fetching files: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error fetching files: {response.status_code}")
        return []

    return [entry["name"] for entry in response.json() if entry["name"].endswith(".vhd")]

def extract_md_description(md_content):
    """Extract the ``## Description`` section of a markdown document as one line.

    The section runs from a ``## Description`` heading (case-insensitive) to
    the next ``## `` heading or the end of the text. Image tags are removed,
    inline code spans keep their text but lose the backticks, heading lines
    inside the section are dropped, and all whitespace runs are collapsed to
    single spaces.

    Args:
        md_content: Full markdown text.

    Returns:
        The cleaned description, or ``""`` when there is no such section.
    """
    section = re.search(
        r'^##\s*Description\s*$(.*?)(?=^##\s|\Z)',
        md_content,
        re.DOTALL | re.MULTILINE | re.IGNORECASE
    )
    if not section:
        return ""

    text = section.group(1).strip()

    # Images carry no prose, so drop them entirely.
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)
    # Keep the identifier inside `code` spans; deleting it would leave
    # sentences with missing words.
    text = re.sub(r'`(.*?)`', r'\1', text)

    kept = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if line and not line.startswith('#'):
            kept.append(line)

    # split()/join collapses repeated spaces left inside lines by the removals.
    return ' '.join(' '.join(kept).split())

def get_description_from_docs(vhdl_filename, timeout=30):
    """Fetch the description for a VHDL file from its markdown documentation.

    The documentation file name is the VHDL file name with its trailing
    ``.vhd`` extension swapped for ``.md`` (only the extension is replaced,
    not other occurrences of ``.vhd`` in the name). It is fetched from
    ``DOCS_BASE_URL`` and parsed with ``extract_md_description``.

    Args:
        vhdl_filename: Name of the VHDL file, e.g. ``"olo_base_cam.vhd"``.
        timeout: Seconds to wait for the request before giving up.

    Returns:
        The extracted description, or ``""`` when the document is missing or
        the lookup fails. Failures are printed, not raised: this lookup is
        best-effort.
    """
    md_filename = re.sub(r'\.vhd\Z', '.md', vhdl_filename)
    doc_url = DOCS_BASE_URL + md_filename

    try:
        response = requests.get(doc_url, timeout=timeout)
        if response.status_code == 200:
            return extract_md_description(response.text)
    except Exception as e:
        print(f"  Error fetching docs: {e}")

    return ""

def extract_vhdl_description(vhdl_content):
    """Extract the text under the "Description" banner of a VHDL header.

    Two banner layouts are recognised: a single line such as
    ``--- Description ---``, and a three-line banner where the title sits on
    its own comment line between two dash rules::

        -----------------
        -- Description
        -----------------

    The description is everything after the banner up to the next line that
    starts with three or more dashes (or the end of the text). Leading ``--``
    comment markers are stripped; empty lines, ``Documentation:`` / ``Note:``
    lines and lines that are only a URL are dropped.

    Args:
        vhdl_content: Full text of a VHDL file.

    Returns:
        The description as a single space-joined line, or ``""`` when no
        Description banner is found.
    """
    desc_match = re.search(
        # (?:--\s*)? lets the title carry its own comment marker, which is
        # what the three-line banner layout looks like after the first rule.
        r'^-{3,}\s*(?:--\s*)?Description\s*-{3,}\s*(.*?)(?=^-{3,}|\Z)',
        vhdl_content,
        re.DOTALL | re.MULTILINE
    )

    if not desc_match:
        return ""

    skipped_prefixes = ('Documentation:', 'Note:', 'http://', 'https://')
    clean_lines = []
    for line in desc_match.group(1).split('\n'):
        line = line.strip()
        if line.startswith('--'):
            line = line[2:].strip()
        if not line or line.startswith(skipped_prefixes):
            continue
        clean_lines.append(line)

    return ' '.join(clean_lines).strip()

def extract_content(vhdl_filename, vhdl_content):
    """Produce the (description, code) pair for one VHDL file.

    Description sources, in order of preference:
      1. the header description from ``extract_vhdl_description`` when it has
         at least 10 words;
      2. the markdown documentation via ``get_description_from_docs``;
      3. a header description shorter than 10 words, if the docs gave nothing;
      4. a generic ``"VHDL component <base name>"`` label.

    The code starts after the "Libraries" banner when one is present (either
    on one line or as a dash rule / ``-- Libraries`` / dash rule triple),
    otherwise at the start of the file. Lines whose first non-blank
    characters are ``--`` are removed.

    Args:
        vhdl_filename: File name, used for the docs lookup and the fallback.
        vhdl_content: Full text of the VHDL file.

    Returns:
        A ``(description, code)`` tuple of strings.
    """
    header_description = extract_vhdl_description(vhdl_content)
    description = header_description

    # A missing or very short header description is replaced by the docs text,
    # but a short header text is still better than nothing if the docs lookup
    # comes back empty.
    if not description or len(description.split()) < 10:
        description = get_description_from_docs(vhdl_filename) or header_description

    if not description:
        description = f"VHDL component {vhdl_filename.split('.')[0]}"

    # Locate the end of the header so the code excludes it.
    code_start = 0
    header_end = re.search(
        r'^-{3,}\s*(?:--\s*)?Libraries\s*-{3,}',
        vhdl_content,
        re.MULTILINE | re.IGNORECASE
    )
    if header_end:
        code_start = header_end.end()

    code = vhdl_content[code_start:].strip()
    code = '\n'.join(line for line in code.split('\n') if not line.strip().startswith('--'))

    return description, code

def main():
    """Download every listed VHDL file and write (description, code) rows to CSV.

    The file list comes from ``get_vhdl_files()``; each file is fetched from
    ``RAW_BASE_URL``, processed with ``extract_content`` and written to
    ``OUTPUT_CSV`` under the header ``input, output``. A file whose download
    fails (transport error, timeout or non-200 status) is reported and
    skipped so the remaining files are still processed.
    """
    request_timeout = 30  # seconds; avoids hanging forever on a stalled download
    vhdl_files = get_vhdl_files()

    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        writer.writerow(['input', 'output'])  # Header

        for filename in vhdl_files:
            print(f"Processing {filename}...")
            raw_url = RAW_BASE_URL + filename

            response = None
            try:
                response = requests.get(raw_url, timeout=request_timeout)
            except requests.RequestException as exc:
                print(f"  Error fetching {filename}: {exc}")

            if response is not None and response.status_code == 200:
                description, code = extract_content(filename, response.text)
                writer.writerow([description, code])
                print(f"  Successfully processed {filename}")
            else:
                print(f"  Failed to fetch {filename}")

            sleep(1)  # Pause between requests to avoid rate limiting

    print(f"\nDone! Results saved to {OUTPUT_CSV}")

# Entry point: run the scraper when this code executes as the main module.
if __name__ == "__main__":
    main()

Processing olo_base_arb_prio.vhd...
  Successfully processed olo_base_arb_prio.vhd
Processing olo_base_arb_rr.vhd...
  Successfully processed olo_base_arb_rr.vhd
Processing olo_base_cam.vhd...
  Successfully processed olo_base_cam.vhd
Processing olo_base_cc_bits.vhd...
  Successfully processed olo_base_cc_bits.vhd
Processing olo_base_cc_handshake.vhd...
  Successfully processed olo_base_cc_handshake.vhd
Processing olo_base_cc_n2xn.vhd...
  Successfully processed olo_base_cc_n2xn.vhd
Processing olo_base_cc_pulse.vhd...
  Successfully processed olo_base_cc_pulse.vhd
Processing olo_base_cc_reset.vhd...
  Successfully processed olo_base_cc_reset.vhd
Processing olo_base_cc_simple.vhd...
  Successfully processed olo_base_cc_simple.vhd
Processing olo_base_cc_status.vhd...
  Successfully processed olo_base_cc_status.vhd
Processing olo_base_cc_xn2n.vhd...
  Successfully processed olo_base_cc_xn2n.vhd
Processing olo_base_crc.vhd...
  Successfully processed olo_base_crc.vhd
Processing olo_base_de

In [3]:
# Re-point the module-level configuration at the open-logic "intf" sources and
# docs, then run main() again. This relies on the functions defined in the
# previous cell, which read these globals when they are called.
GITHUB_API_URL = "https://api.github.com/repos/open-logic/open-logic/contents/src/intf/vhdl"
RAW_BASE_URL = "https://raw.githubusercontent.com/open-logic/open-logic/main/src/intf/vhdl/"
DOCS_BASE_URL = "https://raw.githubusercontent.com/open-logic/open-logic/main/doc/intf/"
OUTPUT_CSV = "open_logic_intf_vhdl_dataset.csv"  # New CSV file name
main()

Processing olo_intf_clk_meas.vhd...
  Successfully processed olo_intf_clk_meas.vhd
Processing olo_intf_debounce.vhd...
  Successfully processed olo_intf_debounce.vhd
Processing olo_intf_i2c_master.vhd...
  Successfully processed olo_intf_i2c_master.vhd
Processing olo_intf_spi_master.vhd...
  Successfully processed olo_intf_spi_master.vhd
Processing olo_intf_spi_slave.vhd...
  Successfully processed olo_intf_spi_slave.vhd
Processing olo_intf_sync.vhd...
  Successfully processed olo_intf_sync.vhd
Processing olo_intf_uart.vhd...
  Successfully processed olo_intf_uart.vhd

Done! Results saved to open_logic_intf_vhdl_dataset.csv


In [4]:
# Re-point the module-level configuration at the open-logic "axi" sources and
# docs, then run main() again. This relies on the functions defined in an
# earlier cell, which read these globals when they are called.
GITHUB_API_URL = "https://api.github.com/repos/open-logic/open-logic/contents/src/axi/vhdl"
RAW_BASE_URL = "https://raw.githubusercontent.com/open-logic/open-logic/main/src/axi/vhdl/"
DOCS_BASE_URL = "https://raw.githubusercontent.com/open-logic/open-logic/main/doc/axi/"
OUTPUT_CSV = "open_logic_axi_vhdl_dataset.csv"
main()

Processing olo_axi_lite_slave.vhd...
  Successfully processed olo_axi_lite_slave.vhd
Processing olo_axi_master_full.vhd...
  Successfully processed olo_axi_master_full.vhd
Processing olo_axi_master_simple.vhd...
  Successfully processed olo_axi_master_simple.vhd
Processing olo_axi_pkg_protocol.vhd...
  Successfully processed olo_axi_pkg_protocol.vhd
Processing olo_axi_pl_stage.vhd...
  Successfully processed olo_axi_pl_stage.vhd

Done! Results saved to open_logic_axi_vhdl_dataset.csv
