given an svd file, generate questions based on the tags 

In [95]:
import os 
import json
import io
import random

from tqdm import tqdm
from collections import defaultdict
from cmsis_svd.parser import SVDParser

In [32]:
config = {
    'svd_files' : 'cmsis-svd-data/data/',
    'db' : './chroma_db'
}

In [33]:
vendor_to_svd = defaultdict(list)
svd_to_vendor = {}
svd_to_content = {}

for root, dirs, files in os.walk(config['svd_files']):
    subdirs = root.split(os.sep)
    if len(subdirs) > 2:  # Ensure it's at least within a vendor's subdirectory
        vendor = subdirs[2]  # Get the vendor name
        # Collect full file paths for the vendor
        for file in files:
            if 'svd' in file:
                file_path = os.path.join(root, file)
                vendor_to_svd[vendor].append(file)
                with open(file_path, 'rb') as f:
                    content = f.read()
                
                svd_to_content[file] = content
            svd_to_vendor[file] = vendor

In [34]:
# convert svd to content to json or text using svd parser 
def svd_to_json(svd_content):
    if isinstance(svd_content, bytes):
        svd_content = svd_content.decode("utf-8")
    else:
        # saved as json string already in case accidentally rerun
        return svd_content

    # try:
    #     svd_file = io.StringIO(svd_content)
    #     parser = SVDParser.for_xml_file(svd_file)
    #     device = parser.get_device()
    # except Exception as e:
    #     # raise ValueError(f"Failed to parse SVD content: {e}")
    #     return None

    svd_file = io.StringIO(svd_content)
    parser = SVDParser.for_xml_file(svd_file)
    device = parser.get_device()

    device_dict = {
        "device": {
            "name": device.name,
            "description": device.description,
            "peripherals": []
        }
    }

    for peripheral in device.peripherals:
        peripheral_info = {
            "name": peripheral.name,
            "description": peripheral.description,
            "base_address": peripheral.base_address,
            "registers": []
        }

        for register in peripheral.registers:
            register_info = {
                "name": register.name,
                "address_offset": register.address_offset,
                "size": register.size,
                "description": register.description,
                "fields": []
            }

            for field in register.fields:
                field_info = {
                    "name": field.name,
                    "bit_offset": field.bit_offset,
                    "bit_width": field.bit_width,
                    "description": field.description
                }
                register_info["fields"].append(field_info)

            peripheral_info["registers"].append(register_info)

        device_dict["device"]["peripherals"].append(peripheral_info)

    return str(json.dumps(device_dict, indent=4))


In [35]:
test_svd = svd_to_json(svd_to_content['nrf52840.svd'])

In [20]:
test_svd_json = json.loads(test_svd)

In [36]:
test_svd_json['device'].keys()

dict_keys(['name', 'description', 'peripherals'])

In [38]:
peripherals = test_svd_json['device']['peripherals']

In [39]:
peripherals[0]['name']

'FICR'

{"prompt": "Generate the SVD file for the nRF52840 microcontroller.", "completion": "<SVD XML content here>"}
{"prompt": "List all peripherals and their base addresses for the STM32F103 microcontroller.", "completion": "Peripheral base addresses: GPIOA - 0x40010800, GPIOB - 0x40010C00, ..."}

In [64]:
peripherals_template = {
    "prompt": "List all the {tag}s for {name} microcontroller."
}

list_all_tags_from_parent_template = {
    "prompt": "List all the {tag}s of {parent_tag} for {name} microcontroller."
}

specific_tag_template = {
    "prompt": "What is the {tag} of {parent_tag} for {name} microcontroller?"
}


In [83]:
# device level questions 
def generate_template_questions(device):
    qa = []
    if "name" in device:
        name = device['name']
    else:
        return qa

    # basic: name, description, peripherals 
    qa.append({
        "prompt": "What is the description of {name} microcontroller".format(name=name),
        "completion": device['description']
    })
    
    qa.append({
        "prompt": peripherals_template["prompt"].format(tag="peripheral", name=name),
        "completion": ', '.join([peripheral['name'] for peripheral in device['peripherals']])
    })
    
    for peripheral in device['peripherals']:
        # for every peripheral, list names, descriptions, base_address, registers
        p_name = peripheral['name'] + " peripheral"
        qa.append({
            "prompt": specific_tag_template["prompt"].format(tag="description", parent_tag=p_name, name=name),
            "completion": peripheral['description']
        })
        qa.append({
            "prompt": specific_tag_template["prompt"].format(tag="base address", parent_tag=p_name, name=name),
            "completion": peripheral['base_address']
        })
        qa.append({
            "prompt": list_all_tags_from_parent_template["prompt"].format(tag="register", parent_tag=p_name, name=name),
            "completion": ', '.join([register['name'] for register in peripheral['registers']])
        })
        
        for register in peripheral["registers"]:
            r_name = register["name"] + " register from " + p_name
            
            qa.append({
                "prompt": specific_tag_template["prompt"].format(tag="description", parent_tag=r_name, name=name),
                "completion": register['description']
            })
            qa.append({
                "prompt": specific_tag_template["prompt"].format(tag="size", parent_tag=r_name, name=name),
                "completion": register['size']
            })
            qa.append({
                "prompt": specific_tag_template["prompt"].format(tag="address_offset", parent_tag=r_name, name=name),
                "completion": register['address_offset']
            })
            qa.append({
                "prompt": list_all_tags_from_parent_template["prompt"].format(tag="fields", parent_tag=r_name, name=name),
                "completion": ', '.join([field['name'] for field in register['fields']])
            })
            
            for field in register['fields']:
                f_name = field["name"] + " field from " + r_name
                
                qa.append({
                    "prompt": specific_tag_template["prompt"].format(tag="description", parent_tag=f_name, name=name),
                    "completion": register['description']
                })
                if "bit_offset" in register:
                    qa.append({
                        "prompt": specific_tag_template["prompt"].format(tag="bit_offset", parent_tag=f_name, name=name),
                        "completion": register['bit_offset']
                    })
                if "bit_width" in register:
                    qa.append({
                        "prompt": specific_tag_template["prompt"].format(tag="bit_width", parent_tag=f_name, name=name),
                        "completion": register['bit_width']
                    })
                
    return qa

In [86]:
test_out = generate_template_questions(test_svd_json['device'])

In [87]:
print(len(test_out))

12111


In [94]:
all_svd_qa = []

1798


In [103]:
print(len((list(svd_to_content.keys()))))
# choose 10 random svds to test the template qa generation
random_keys = random.sample(list(svd_to_content.keys()), 5)
subset_svds = {}

# Print the selected items
for key in random_keys:
    print(key)
    subset_svds[key] = svd_to_content[key]
    # print(f"Content: {svd_to_content[key]}")

1798
MB9AF31xK.svd
TM4C1231H6PZ.svd
EFM32LG900F256.svd
EFM32GG11B820F2048GQ100.svd
ATSAMD21J17A.svd


In [105]:
for svd_name, svd in tqdm(subset_svds.items()):
    # print(svd_name)
    svd_content = svd_to_json(svd)
    svd_json = json.loads(svd_content)
    svd_qa = generate_template_questions(svd_json['device'])
    all_svd_qa.extend(svd_qa)
    

100%|██████████| 5/5 [00:01<00:00,  3.72it/s]


In [107]:
len(all_svd_qa)

53988

In [108]:
all_svd_qa

[{'prompt': 'What is the description of MB9AF31xK microcontroller',
  'completion': 'MB9AF31xK'},
 {'prompt': 'List all the peripherals for MB9AF31xK microcontroller.',
  'completion': 'WORKFLASH_IF, FLASH_IF, CRG, CRTRIM, SWWDT, HWWDT, DTIM, MFT0, BTIOSEL03, BTIOSEL47, SBSSR, BT0, BT1, BT2, BT3, BT4, BT5, BT6, BT7, QPRC0, WC, MFT_PPG, ADC0, ADC1, EXTI, INTREQ, GPIO, LVD, DS, MFS0, MFS1, MFS3, MFS5, MFS_NFC, RTC, CRC, USBCLK, USB0, DMAC'},
 {'prompt': 'What is the description of WORKFLASH_IF peripheral for MB9AF31xK microcontroller?',
  'completion': 'WorkFlash Memory'},
 {'prompt': 'What is the base address of WORKFLASH_IF peripheral for MB9AF31xK microcontroller?',
  'completion': 537788416},
 {'prompt': 'List all the registers of WORKFLASH_IF peripheral for MB9AF31xK microcontroller.',
  'completion': 'WFASZR, WFRWTR, WFSTR'},
 {'prompt': 'What is the description of WFASZR register from WORKFLASH_IF peripheral for MB9AF31xK microcontroller?',
  'completion': 'WorkFlash Access Size R