In [None]:
!unzip /content/drive/MyDrive/circuit-digest-dataset/hand.v1i.voc-new.zip -d hand1

In [None]:
import os
import shutil
import xml.etree.ElementTree as ET

# Root folder of the source dataset
original_dataset_path = '/content/hand1/hand.v1i.voc'  # Update this path

# Split sub-folders looked up directly under the dataset root
dataset_folders = ['test', 'train', 'valid']

# Output roots: images and annotations go to two parallel trees
img_root = '/content/xml_format/images'
xml_root = '/content/xml_format/xml'

# Make sure both output roots exist before anything is copied into them
for _output_root in (img_root, xml_root):
    os.makedirs(_output_root, exist_ok=True)

def modify_xml_structure(xml_file):
    """Write a cleaned copy of an annotation file without location metadata.

    The top-level ``<folder>``, ``<path>`` and ``<source>`` elements are
    removed and the result is saved next to the input as
    ``<stem>_modified.xml``. The input file itself is not modified.

    Args:
        xml_file: Path to the annotation XML file to clean.

    Returns:
        The path of the newly written file, or ``None`` if the input could
        not be parsed or the output could not be written. The error is
        printed instead of raised so that a batch run can continue.
    """
    try:
        tree = ET.parse(xml_file)
        root = tree.getroot()

        # Only direct children of the root are considered. findall() returns
        # a list, so removing elements while looping over it is safe.
        for tag in ('folder', 'path', 'source'):
            for element in root.findall(tag):
                root.remove(element)

        # Swap only the trailing extension. A plain str.replace would also
        # rewrite any ".xml" that happens to occur earlier in the path
        # (for example inside a directory name) and yield a bogus location.
        stem, _ext = os.path.splitext(xml_file)
        new_xml_file = f"{stem}_modified.xml"
        with open(new_xml_file, 'wb') as f:
            tree.write(f, encoding='utf-8', xml_declaration=True)

        return new_xml_file
    except Exception as e:
        # Deliberately broad: one bad annotation must not abort the whole run.
        print(f"Error modifying XML file {xml_file}: {e}")
        return None

def get_label_from_xml(xml_file):
    """Return the class label of the first ``<object>`` in an annotation file.

    Args:
        xml_file: Path to the annotation XML file to read.

    Returns:
        The text of the first ``<object>`` element's ``<name>`` child with
        surrounding whitespace stripped. ``'unknown'`` is returned when there
        is no ``<object>``, no ``<name>``, the name is empty or blank, or the
        file cannot be parsed (the error is printed, not raised).
    """
    try:
        root = ET.parse(xml_file).getroot()
    except Exception as e:
        # Deliberately broad: an unreadable annotation degrades to 'unknown'.
        print(f"Error parsing XML file {xml_file}: {e}")
        return 'unknown'

    object_tag = root.find('object')
    if object_tag is None:
        return 'unknown'

    name_tag = object_tag.find('name')
    if name_tag is None:
        return 'unknown'

    # Element.text is None for an empty <name/>. Callers use the label as a
    # directory name, so None or blank text must fall back as well.
    label = (name_tag.text or '').strip()
    return label or 'unknown'

def copy_file(src_file, dest_folder):
    """Copy ``src_file`` into ``dest_folder``, creating the folder on demand.

    The outcome is reported with ``print`` either way: a confirmation on
    success, or the error text when creating the folder or copying raised
    an ``Exception``. Nothing is returned.
    """
    try:
        os.makedirs(dest_folder, exist_ok=True)
        shutil.copy(src_file, dest_folder)
        outcome = f"Copied {src_file} to {dest_folder}"
    except Exception as err:
        outcome = f"Error copying file {src_file} to {dest_folder}: {err}"
    print(outcome)

# Walk every split folder and sort each image/annotation pair into
# <img_root>/<label>/ and <xml_root>/<label>/, where the label is read from
# the cleaned annotation.
for folder in dataset_folders:
    folder_path = os.path.join(original_dataset_path, folder)
    if not os.path.exists(folder_path):
        print(f"Warning: Folder {folder_path} does not exist.")
        continue

    for file_name in os.listdir(folder_path):
        if file_name.endswith('.xml'):
            continue  # XML files are handled when processing images
        if not file_name.endswith('.jpg'):
            print(f"Skipping unknown file type: {file_name}")
            continue

        file_path = os.path.join(folder_path, file_name)

        # Swap only the trailing ".jpg". str.replace would also rewrite a
        # ".jpg" occurring earlier in the name and miss the annotation.
        xml_file_name = file_name[:-len('.jpg')] + '.xml'
        xml_file_path = os.path.join(folder_path, xml_file_name)
        if not os.path.exists(xml_file_path):
            print(f"Warning: No corresponding XML file found for {file_name}")
            continue

        # Modify the XML structure to remove unwanted tags; a falsy result
        # means the helper already reported a failure for this file.
        new_xml_file = modify_xml_structure(xml_file_path)
        if not new_xml_file:
            continue

        # The label becomes a directory name, so never let an empty or
        # missing value reach os.path.join.
        label = get_label_from_xml(new_xml_file) or 'unknown'

        # Copy the image file to the corresponding label folder in img
        copy_file(file_path, os.path.join(img_root, label))

        # Copy the modified XML file to the corresponding label folder in xml
        # NOTE(review): the copied annotation keeps the name returned by
        # modify_xml_structure, so it may not share the image's base name.
        # Confirm downstream tooling does not pair files by base name.
        copy_file(new_xml_file, os.path.join(xml_root, label))

print("Dataset restructuring complete.")


In [None]:
import os

# Folder whose regular files are counted
folder_path = '/content/yess/xml_format/images/hand1'

# Every entry name directly inside the folder (files and sub-directories)
files_and_dirs = os.listdir(folder_path)

# Keep only the entries that are regular files; directories are dropped
files = list(
    filter(
        lambda name: os.path.isfile(os.path.join(folder_path, name)),
        files_and_dirs,
    )
)

# Report how many files were found
print(f"Number of files in the folder: {len(files)}")
