In [45]:
import os
import json
import shutil

# Source dataset layout (inputs)
model_folder = 'clothing-detection-ecommerce-dataset-master'  # root of the downloaded dataset
annotation_dir = os.path.join(model_folder, 'annotations')  # per-image annotation JSON files
dataset_dir = os.path.join(model_folder, 'dataset')  # original images

# Converted dataset layout (outputs)
labels_dir = 'ecommerce_dataset/labels'  # YOLO label .txt files
images_dir = 'ecommerce_dataset/images'  # copies of the images that received labels
log_file_path = 'missing_categories_log.txt'  # categories that had no class mapping

# Make sure both output folders are present before anything is written to them
for output_dir in (labels_dir, images_dir):
    os.makedirs(output_dir, exist_ok=True)

# Tally the files found (recursively) under the annotation, label and image folders
file_totals = []
for counted_dir in (annotation_dir, labels_dir, images_dir):
    running_total = 0
    for _root, _subdirs, filenames in os.walk(counted_dir):
        running_total += len(filenames)
    file_totals.append(running_total)
annotation_files_count, label_files_count, image_files_count = file_totals

print(f"Number of annotation files: {annotation_files_count}")
print(f"Number of label files: {label_files_count}")
print(f"Number of image files: {image_files_count}")


Number of annotation files: 65166
Number of label files: 64639
Number of image files: 64639


In [43]:
# Define category mapping
# Every fine-grained dataset category name (Spanish and English) is collapsed
# into one of three coarse detection classes. A category that appears in none
# of these lists has no class id and is therefore treated as unmapped by any
# code that looks it up here.
category_mapping = {
    # NOTE(review): accessories such as 'reloj', 'gafas', 'paraguas' and
    # 'bolsos-y-mochilas' are labelled as topwear here - confirm that is intended.
    # NOTE(review): 'banadores' sits in topwear while its English counterpart
    # 'swimwear' sits in bottomwear - confirm which group swimwear belongs to.
    'topwear': [
        'camisetas', 'bodies', 'jerseys', 'camisas', 'camiseta-interior',
        'polos', 'sudaderas', 'vestidos', 'cardigans',
        'camisas-y-blusas', 'lenceria', 'banadores', 'bufandas', 'trajes', 'reloj',
        'sombreros', 'gorras', 'gorros', 'guantes', 'gafas', 'paraguas', 'cinturones',
        'corbatas', 'bisuteria', 'medias', 'calcetines', 'panuelos', 'bolsos-y-mochilas',
        'carteras', 'abrigos', 'shirts', 'jackets', 'dresses', 'vestes', 'coats'
    ],
    'bottomwear': [
        'monos', 'jeans', 'pantalones', 'braguitas', 'shorts', 'faldas', 'calzoncillos',
        'trousers', 'skirts', 'jumpsuits', 'swimwear'
    ],
    # 'botines' (ankle boots) belongs with the other boots and shoes; it was
    # previously listed under topwear.
    'footwear': [
        'botas', 'botines', 'zapatos', 'tacones', 'chanclas', 'sandalias', 'zapatillas',
        'alpargatas', 'shoes', 'boots', 'sportshoes'
    ]
}

# Map original categories to class IDs
# These integers are the class indices written as the first field of every
# YOLO label line.
class_id_mapping = {
    'topwear': 0,
    'bottomwear': 1,
    'footwear': 2,
}

# Sanity checks: a category listed in two groups would be labelled according to
# whichever group is found first, and a group without a class id would fail
# with KeyError when it is looked up.
_all_subcategories = [name for names in category_mapping.values() for name in names]
assert len(_all_subcategories) == len(set(_all_subcategories)), \
    "a category is listed in more than one group of category_mapping"
assert set(category_mapping) == set(class_id_mapping), \
    "category_mapping and class_id_mapping must define the same groups"

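# NOTE: underwear categories ('lenceria', 'camiseta-interior', 'braguitas', 'calzoncillos')
# have no class of their own; they are folded into topwear/bottomwear in category_mapping.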

In [44]:
from PIL import Image

# Log missing categories
# One entry is appended per bounding box whose category has no class mapping;
# the list is de-duplicated only when it is written to the log file.
missing_categories_log = []

# Flatten category_mapping into a direct subcategory -> class-id lookup once,
# instead of scanning every group again for every bounding box.
subcategory_to_class_id = {}
for main_category, subcategories in category_mapping.items():
    for subcategory in subcategories:
        # setdefault keeps the first group that lists a subcategory, which is
        # the group a first-match scan over category_mapping would pick.
        # The id is stored as a string because it is written into a text label.
        subcategory_to_class_id.setdefault(
            subcategory, str(class_id_mapping[main_category])
        )

# Process each annotation file
for annotation_file in os.listdir(annotation_dir):
    if not annotation_file.endswith('.json'):
        continue

    # Load the annotation and release its file handle before the image work.
    with open(os.path.join(annotation_dir, annotation_file), encoding='utf-8') as f:
        data = json.load(f)

    # 'file_name' is joined onto the dataset root to locate the image.
    img_path = data['file_name']
    img_full_path = os.path.join(model_folder, img_path)

    if not os.path.exists(img_full_path):
        print(f"Image file {img_full_path} not found, skipping.")
        continue

    # Only the pixel dimensions are needed; the context manager closes the
    # image even if reading its size raises.
    with Image.open(img_full_path) as img:
        img_width, img_height = img.size

    yolo_annotations = []
    for box in data['arr_boxes']:
        category = box['class']
        class_id = subcategory_to_class_id.get(category)

        if class_id is None:
            # Unmapped category: remember it for the log and drop this box.
            missing_categories_log.append(category)
            continue

        # Convert the box to YOLO's normalized (x_center, y_center, width, height).
        # The arithmetic treats box['x'] / box['y'] as the box's left / top edge.
        # NOTE(review): confirm that against the dataset's annotation format.
        x_center = (box['x'] + box['width'] / 2) / img_width
        y_center = (box['y'] + box['height'] / 2) / img_height
        width = box['width'] / img_width
        height = box['height'] / img_height

        yolo_annotations.append(f"{class_id} {x_center} {y_center} {width} {height}")

    # Images with no mapped box get neither a label file nor an image copy.
    if yolo_annotations:
        base_name = os.path.splitext(annotation_file)[0]
        output_file_path = os.path.join(labels_dir, base_name + '.txt')
        with open(output_file_path, 'w') as out_f:
            out_f.write('\n'.join(yolo_annotations))

        # Copy image to new directory, named after the annotation file.
        # NOTE(review): the bytes are copied unchanged but the copy is always
        # named '.png', whatever the source extension is. The name is kept
        # because the later split step selects images by the '.png' suffix.
        new_img_path = os.path.join(images_dir, base_name + '.png')
        shutil.copy(img_full_path, new_img_path)

# Save missing categories to log file
# Sorted so the log content is identical from run to run.
if missing_categories_log:
    with open(log_file_path, 'w') as log_f:
        log_f.write('\n'.join(sorted(set(missing_categories_log))))

print("Conversion completed!")


Conversion completed!


In [47]:
import os
import shutil
import random

# Define directories
images_dir = 'ecommerce_dataset/images'
labels_dir = 'ecommerce_dataset/labels'

train_images_dir = 'ecommerce_dataset/train/images'
val_images_dir = 'ecommerce_dataset/val/images'
test_images_dir = 'ecommerce_dataset/test/images'

train_labels_dir = 'ecommerce_dataset/train/labels'
val_labels_dir = 'ecommerce_dataset/val/labels'
test_labels_dir = 'ecommerce_dataset/test/labels'

# Create directories if they don't exist
for split_dir in (
    train_images_dir, val_images_dir, test_images_dir,
    train_labels_dir, val_labels_dir, test_labels_dir,
):
    os.makedirs(split_dir, exist_ok=True)

# Seed for the shuffle below, so the same files land in the same split on every run
split_seed = 42

# Get list of all image files
# os.listdir returns names in arbitrary order, so sort first: the seeded
# shuffle is only reproducible when it starts from a fixed order.
image_files = sorted(f for f in os.listdir(images_dir) if f.endswith('.png'))

# Shuffle the image files with a private, seeded generator (leaves the global
# random state untouched)
random.Random(split_seed).shuffle(image_files)

# Define split ratios
train_ratio = 0.7
val_ratio = 0.2
test_ratio = 0.1
# The test split is "whatever is left", so the three ratios must add up to 1.
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, \
    "train/val/test ratios must sum to 1"

# Calculate split indices
# The products are rounded to 6 decimals before truncation: 0.7 + 0.2 is
# 0.8999999999999999 in floating point, so a plain int() would turn e.g.
# 0.9 * 10 into 8 and silently shift one file from validation to test.
num_images = len(image_files)
train_split = int(round(train_ratio * num_images, 6))
val_split = int(round((train_ratio + val_ratio) * num_images, 6))

# Split the data
train_files = image_files[:train_split]
val_files = image_files[train_split:val_split]
test_files = image_files[val_split:]

# Function to move files
def move_files(files, src_image_dir, dest_image_dir, src_label_dir, dest_label_dir):
    """Move images and their same-named ``.txt`` label files to new folders.

    Args:
        files: Image file names (not paths) located in ``src_image_dir``.
        src_image_dir: Folder the images are moved out of.
        dest_image_dir: Folder the images are moved into.
        src_label_dir: Folder holding one ``<image stem>.txt`` label per image.
        dest_label_dir: Folder the label files are moved into.

    Raises:
        FileNotFoundError: If an image or its label file does not exist in the
            source folders. Both are checked before either is moved, so an
            image is never moved without its label.
    """
    for file in files:
        base_name = os.path.splitext(file)[0]
        label_file = base_name + '.txt'

        src_image_path = os.path.join(src_image_dir, file)
        src_label_path = os.path.join(src_label_dir, label_file)

        # Validate the whole pair up front; failing after the image has been
        # moved would leave it in the destination without a label.
        for required_path in (src_image_path, src_label_path):
            if not os.path.exists(required_path):
                raise FileNotFoundError(
                    f"Cannot move {file!r}: {required_path} does not exist"
                )

        # Move image file
        shutil.move(src_image_path, os.path.join(dest_image_dir, file))

        # Move corresponding label file
        shutil.move(src_label_path, os.path.join(dest_label_dir, label_file))

# Relocate each split's images, together with their labels, into its own folders
split_plan = (
    (train_files, train_images_dir, train_labels_dir),
    (val_files, val_images_dir, val_labels_dir),
    (test_files, test_images_dir, test_labels_dir),
)
for split_files, split_images_dir, split_labels_dir in split_plan:
    move_files(split_files, images_dir, split_images_dir, labels_dir, split_labels_dir)

print("Dataset split completed!")


Dataset split completed!


In [48]:
# Split folders whose contents are counted below
train_images_dir = 'ecommerce_dataset/train/images'
val_images_dir = 'ecommerce_dataset/val/images'
test_images_dir = 'ecommerce_dataset/test/images'

train_labels_dir = 'ecommerce_dataset/train/labels'
val_labels_dir = 'ecommerce_dataset/val/labels'
test_labels_dir = 'ecommerce_dataset/test/labels'


def count_files(directory):
    """Return the number of files under ``directory``, subfolders included.

    A directory that does not exist counts as 0, because os.walk ignores
    listing errors by default and yields nothing for it.
    """
    return sum(len(files) for _root, _dirs, files in os.walk(directory))


# Count the image and label files in each of the train / val / test splits
train_image_files_count = count_files(train_images_dir)
val_image_files_count = count_files(val_images_dir)
test_image_files_count = count_files(test_images_dir)

train_labels_files_count = count_files(train_labels_dir)
val_labels_files_count = count_files(val_labels_dir)
test_labels_files_count = count_files(test_labels_dir)

# One line per split and file kind; within a split the image and label counts
# are expected to match, since files are moved in image/label pairs.
split_report_lines = [
    f"Number of train images: {train_image_files_count}",
    f"Number of train labels: {train_labels_files_count}",
    f"Number of val images: {val_image_files_count}",
    f"Number of val labels: {val_labels_files_count}",
    f"Number of test images: {test_image_files_count}",
    f"Number of test labels: {test_labels_files_count}",
]
print('\n'.join(split_report_lines))

Number of train images: 45247
Number of train lables: 45247
Number of val images: 12928
Number of val lables: 12928
Number of test images: 6464
Number of test lables: 6464
