# Labelbox Data handling scripts

## convert labelbox ndjson to YOLO format 
where each corresponding image has a .txt file with the same name and the label

In [3]:
import json
import os
import shutil
# Specify the path to your ndjson file
input_path = 'export_20240404.ndjson' #change these paths
images_dir = "../New dataset/elkasr_elaini_20230315/"
output_dir = 'kasr_aini_20240404_full'


# Function to convert Roman numerals to Arabic numerals as per your mapping
def roman_to_arabic(roman):
    mapping = {'I': 0, 'i':0, 
               'IIb': 1, 'iia':1,
               'IIa': 1, 'iib':1,
                'III': 2, 'iii':2}
    return mapping.get(roman, None)

# Open the ndjson file
with open(input_path, 'r') as f:
    lines = f.readlines()

# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Process each line (which is a separate JSON object)
for line in lines:
    data = json.loads(line)

    # Extract the required values
    external_id = data['data_row']['external_id']
    degree = None

    # The "degree" value is nested, so we need to dig a bit deeper
    for project in data['projects'].values():
        for label in project['labels']:
            for classification in label['annotations']['classifications']:
                if classification['name'] == 'degree':
                    degree = roman_to_arabic(classification['radio_answer']['name'])
                    break
                else:
                    print(f"Skipping annotation with name: {classification['name']}")

    # Check if the filename ends with .jpg
    if external_id.endswith('.jpg'):
        # find the file in the images directory or its subdirectories
        for root, dirs, files in os.walk(images_dir):
            if external_id in files:
                print ("found file", external_id, "with label", degree)
                # Copy the file to the output directory
                os.system(f'cp "{os.path.join(root, external_id)}" "{output_dir}"')
                shutil.copy(os.path.join(root, external_id), output_dir)                
                break
        # Create a .txt file with the same name
        filename = os.path.join(output_dir, external_id.replace('.jpg', '.txt'))
        # Write the label to the .txt file
        with open(filename, 'w') as txtfile:
            txtfile.write(str(degree))
print("Done")

found file img (1).jpg with label 2
found file img (2).jpg with label 2
found file img (3).jpg with label 2
found file img (9).jpg with label 2
found file img (10).jpg with label 2
found file img (11).jpg with label 1
found file img (12).jpg with label 2
found file img (13).jpg with label 2
found file img (14).jpg with label 2
found file img (16).jpg with label 1
found file img (17).jpg with label 2
found file img (18).jpg with label 2
found file img (19).jpg with label 2
found file img (20).jpg with label 2
found file img (21).jpg with label 2
found file img (22).jpg with label 2
found file img (23).jpg with label 2
found file img (24).jpg with label 1
found file img (25).jpg with label 1
found file img (26).jpg with label 2
found file img (27).jpg with label 2
found file img (28).jpg with label 2
found file img (35).jpg with label 1
found file img (36).jpg with label 2
Skipping annotation with name: Needs graft?
found file img (37).jpg with label 2
found file img (38).jpg with label 

## Handling mask data

In [5]:
import requests
from PIL import Image
import numpy as np
import io
import json
import os

headers = {
    'Authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VySWQiOiJjbHBoZXdxbHMwNGdzMDd5czU1ejA5bWZhIiwib3JnYW5pemF0aW9uSWQiOiJjbHBoZXdxbDgwNGdyMDd5czlmODRmOWNzIiwiYXBpS2V5SWQiOiJjbHVsejVmNzYwNGx6MDd3N2d0bHFlbmw3Iiwic2VjcmV0IjoiZjc2OGNhODQ2YzI0OGFhNzFmODk3NWU0YjdkOWYwYzUiLCJpYXQiOjE3MTIyNzk3NDUsImV4cCI6MjM0MzQzMTc0NX0.-COC3GIHm5oG9Kp1wfNG2R9ZsPVlX6rMMJJyBU_9PzE'
}

# Function to fetch mask image from URL with authentication
def fetch_mask_image(url, headers=None):
    response = requests.get(url, headers=headers, stream=True)
    if response.status_code == 200:
        content_type = response.headers.get('content-type')
        print("Content Type:", content_type)
        if 'image' in content_type:
            try:
                img = Image.open(response.raw)
                return img
            except Exception as e:
                print("Error:", e)
                return None
        else:
            print("Error: Response is not an image.")
            return None
    else:
        print(f"Error: Unable to fetch image. Status code: {response.status_code}")
        return None

# Function to combine masks with similar labels
def combine_masks(masks):
    combined_mask = np.zeros_like(masks[0], dtype=np.uint8)
    for mask in masks:
        combined_mask += mask
    combined_mask = np.clip(combined_mask, 0, 255)
    return combined_mask

# Parse the provided JSON data
json_data = '''
{"data_row": {"id": "cltsuivlt0y6b084095ayxe92", "external_id": "img (35).jpg", "row_data": "https://storage.labelbox.com/clphewql804gr07ys9f84f9cs%2F4f5fda11-0dc3-fd85-5838-74aed3bdb6d3-img%20(35).jpg?Expires=1712279243174&KeyName=labelbox-assets-key-3&Signature=iHP0cfdd5tk2ZSr0giP6kz7pm90", "details": {"dataset_id": "cltsu739100240840e0mn9n78", "dataset_name": "KA_20240315", "created_at": "2024-03-15T16:00:56.389+00:00", "updated_at": "2024-03-15T16:00:57.581+00:00", "last_activity_at": "2024-04-01T23:16:18.000+00:00", "created_by": "ahmed.elsarta00@eng-st.cu.edu.eg"}}, "media_attributes": {"height": 1600, "width": 483, "mime_type": "image/jpeg", "exif_rotation": "1"}, "attachments": [], "projects": {"cltspurvz006k070i059o144r": {"name": "Elkasr ElAini burn images dataset", "labels": [{"label_kind": "Default", "version": "1.0.0", "id": "cltyyo3ig0dci07c0c7jr142o", "label_details": {"created_at": "2024-03-19T23:02:19.000+00:00", "updated_at": "2024-03-19T23:02:19.000+00:00", "created_by": "ahmed.elsarta00@eng-st.cu.edu.eg", "content_last_updated_at": "2024-03-19T23:02:19.044+00:00", "reviews": []}, "annotations": {"objects": [{"feature_id": "cltyyu3dr002w2a6ehax4o6eh", "feature_schema_id": "clphfkl3n08cy070b9gnb9fjn", "name": "III", "value": "iii", "annotation_kind": "ImageSegmentationMask", "classifications": [], "mask": {"url": "https://api.labelbox.com/api/v1/projects/cltspurvz006k070i059o144r/annotations/cltyyu3dr002w2a6ehax4o6eh/index/1/mask"}, "composite_mask": {"url": "https://api.labelbox.com/api/v1/tasks/clukjervj017w074v3q3g62nc/masks/cltyyo9o7002q2a6etlvcrgti/index/1", "color_rgb": [153, 73, 245]}}, {"feature_id": "cltyywdyg00382a6eg1l3k4p5", "feature_schema_id": "clphfkl3n08cu070b6aa93xqm", "name": "IIa", "value": "i_ia", "annotation_kind": "ImageSegmentationMask", "classifications": [], "mask": {"url": "https://api.labelbox.com/api/v1/projects/cltspurvz006k070i059o144r/annotations/cltyywdyg00382a6eg1l3k4p5/index/1/mask"}, "composite_mask": {"url": "https://api.labelbox.com/api/v1/tasks/clukjervj017w074v3q3g62nc/masks/cltyyo9o7002q2a6etlvcrgti/index/1", "color_rgb": [200, 126, 137]}}, {"feature_id": "cltyywunw003c2a6epk2lpygh", "feature_schema_id": "clphfkl3n08cu070b6aa93xqm", "name": "IIa", "value": "i_ia", "annotation_kind": "ImageSegmentationMask", "classifications": [], "mask": {"url": "https://api.labelbox.com/api/v1/projects/cltspurvz006k070i059o144r/annotations/cltyywunw003c2a6epk2lpygh/index/1/mask"}, "composite_mask": {"url": "https://api.labelbox.com/api/v1/tasks/clukjervj017w074v3q3g62nc/masks/cltyyo9o7002q2a6etlvcrgti/index/1", "color_rgb": [63, 173, 118]}}, {"feature_id": "cltyyzo6r003l2a6eup91h0q7", "feature_schema_id": "clphfkl3n08cy070b9gnb9fjn", "name": "III", "value": "iii", "annotation_kind": "ImageSegmentationMask", "classifications": [], "mask": {"url": "https://api.labelbox.com/api/v1/projects/cltspurvz006k070i059o144r/annotations/cltyyzo6r003l2a6eup91h0q7/index/1/mask"}, "composite_mask": {"url": "https://api.labelbox.com/api/v1/tasks/clukjervj017w074v3q3g62nc/masks/cltyyo9o7002q2a6etlvcrgti/index/1", "color_rgb": [212, 41, 81]}}], "classifications": [{"feature_id": "cltyz2ece00412a6e3wzfri29", "feature_schema_id": "clphfkl3n08d0070bg7dx7f4a", "name": "degree", "value": "degree", "radio_answer": {"feature_id": "cltyz2ece00402a6eb5oecqjz", "feature_schema_id": "clphfkl3n08d3070b4h6iez8x", "name": "IIa", "value": "i_ia", "classifications": []}}], "relationships": []}}]}}}

'''

data = json.loads(json_data)

# Create a directory to save mask images
output_dir = "mask_images"
os.makedirs(output_dir, exist_ok=True)

# Iterate over annotations
for annotation in data["projects"]["cltspurvz006k070i059o144r"]["labels"][0]["annotations"]["objects"]:
    if "mask" in annotation:  # Check if "mask" key exists
        mask_url = annotation["mask"]["url"]
        label_name = annotation["name"]
        mask_image = fetch_mask_image(mask_url, headers)
        if mask_image is not None:
            mask_np = np.array(mask_image)

            # Save mask image
            mask_filename = f"{data['data_row']['external_id'][:-4]}_{label_name}.jpg"
            mask_filepath = os.path.join(output_dir, mask_filename)
            Image.fromarray(mask_np).save(mask_filepath)
            print(f"Mask image saved successfully: {mask_filepath}")
        else:
            print("Failed to fetch mask image.")
    else:
        print("No mask data found for this annotation.")


Content Type: image/png
Mask image saved successfully: mask_images\img (35)_III.jpg
Content Type: image/png
Mask image saved successfully: mask_images\img (35)_IIa.jpg
Content Type: image/png
Mask image saved successfully: mask_images\img (35)_IIa.jpg
Content Type: image/png
Mask image saved successfully: mask_images\img (35)_III.jpg


## Convert labelbox export to CSV file with filename and label

In [3]:
import json
import csv

# Specify the path to your ndjson file
ndjson_file_path = 'export_20240404.ndjson' #change these paths
csv_output_path = '../New dataset/kasr_aini_20240404.csv'

# Open the ndjson file
with open(ndjson_file_path, 'r') as f:
    lines = f.readlines()

# Prepare for CSV output
with open(csv_output_path, 'w', newline='') as csvfile:
    fieldnames = ['filename', 'label']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()

    # Process each line (which is a separate JSON object)
    for line in lines:
        data = json.loads(line)

        # Extract the required values
        external_id = data['data_row']['external_id']
        degree = None

        # The "degree" value is nested, so we need to dig a bit deeper
        for project in data['projects'].values():
            for label in project['labels']:
                for classification in label['annotations']['classifications']:
                    if classification['name'] == 'degree':
                        degree = classification['radio_answer']['name']
                        break
        # Write to the CSV file
        writer.writerow({'filename': external_id, 'label': degree})


In [16]:
import os

directory = "./kasr_aini_20240404"  # Replace with the actual directory path

deleted_files = []

# Iterate over files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r') as file:
            content = file.read()
            if 'None' in content or content.strip() == '':
                # close the file
                file.close()
                # Delete the file
                os.remove(file_path)
                deleted_files.append({'filename': filename, 'content': content})

print("Deleted files:")
for deleted_file in deleted_files:
    print(deleted_file)


Deleted files:
{'filename': 'img (54).txt', 'content': 'None'}
