In [3]:
from google.cloud import storage
from pathlib import Path
import os

# Notebook-wide GCP settings. The bucket name is derived from the project ID,
# so changing PROJECT_ID changes the bucket this notebook targets as well.
PROJECT_ID = "dental-treatment-detection"
REGION = "us-central1"
BUCKET_NAME = f"{PROJECT_ID}-dental-data"

# Echo the effective configuration, one setting per line.
print(
    f"Project ID: {PROJECT_ID}",
    f"Bucket Name: {BUCKET_NAME}",
    f"Region: {REGION}",
    sep="\n",
)

Project ID: dental-treatment-detection
Bucket Name: dental-treatment-detection-dental-data
Region: us-central1


In [7]:
# Initialize storage client
storage_client = storage.Client(project=PROJECT_ID)

# Create the bucket only when it is genuinely absent. lookup_bucket returns
# None for a missing bucket and lets every other failure (credentials,
# permissions, network) propagate, instead of a bare `except:` that would
# treat any error as "bucket missing" and attempt to create it.
bucket = storage_client.lookup_bucket(BUCKET_NAME)
if bucket is None:
    bucket = storage_client.create_bucket(
        BUCKET_NAME,
        location=REGION
    )
    print(f"Created bucket: {BUCKET_NAME}")
else:
    print(f"Bucket {BUCKET_NAME} already exists")

# Verify bucket
print(f"Bucket location: {bucket.location}")

Created bucket: dental-treatment-detection-dental-data
Bucket location: US-CENTRAL1


In [8]:
# Root directory of the local dataset; upload_split_to_gcs joins a split name
# ('train', 'valid', 'test') onto this path to locate each split's files.
# NOTE(review): absolute, machine-specific path — confirm it matches the
# environment before re-running.
dataset_path = Path("/home/jupyter/DentAi-2")

def upload_split_to_gcs(local_path, bucket, split_name):
    """Upload one dataset split (images plus cleaned annotations) to GCS.

    Every ``*.jpg`` directly inside ``local_path / split_name`` is uploaded to
    ``dental-dataset/<split_name>/<file name>``. The split's
    ``_annotations_cleaned.coco.json`` is then uploaded under the standard
    name ``_annotations.coco.json`` in the same GCS folder.

    Args:
        local_path: ``pathlib.Path`` of the dataset root directory.
        bucket: Bucket-like object exposing ``blob(name)``; the returned blob
            must provide ``upload_from_filename(path)``.
        split_name: Name of the split sub-directory (e.g. ``"train"``).

    Returns:
        Number of files uploaded: the image count plus one for the
        annotation file.

    Raises:
        FileNotFoundError: If the cleaned annotation file is missing. This is
            checked before any upload so a bad split is never left
            half-uploaded.
    """
    split_path = local_path / split_name
    cleaned_json = split_path / "_annotations_cleaned.coco.json"

    # Fail fast: without this check a missing annotation file would only
    # surface after every image in the split had already been uploaded.
    if not cleaned_json.is_file():
        raise FileNotFoundError(
            f"Cleaned annotations not found for split '{split_name}': {cleaned_json}"
        )

    # Sorted so the upload order (and progress output) is deterministic.
    images = sorted(split_path.glob("*.jpg"))

    uploaded = 0

    # Upload images
    for img_path in images:
        blob_name = f"dental-dataset/{split_name}/{img_path.name}"
        blob = bucket.blob(blob_name)
        blob.upload_from_filename(str(img_path))
        uploaded += 1

        # Progress line every 500 images so large splits show activity.
        if uploaded % 500 == 0:
            print(f"  {split_name}: {uploaded}/{len(images)} images...")

    # Upload cleaned annotations as _annotations.coco.json (standard name)
    blob_name = f"dental-dataset/{split_name}/_annotations.coco.json"
    blob = bucket.blob(blob_name)
    blob.upload_from_filename(str(cleaned_json))

    print(f"  {split_name}: Complete - {len(images)} images + annotations")
    # +1 accounts for the annotation file.
    return uploaded + 1

# Push every split to the bucket and report how many files went up overall.
print("Uploading dataset to GCS...")
files_per_split = []
for split in ('train', 'valid', 'test'):
    # Each call returns the split's image count plus one annotation file.
    files_per_split.append(upload_split_to_gcs(dataset_path, bucket, split))
total = sum(files_per_split)

print(f"\nUpload complete: {total} total files uploaded")

Uploading dataset to GCS...
  train: 500/9159 images...
  train: 1000/9159 images...
  train: 1500/9159 images...
  train: 2000/9159 images...
  train: 2500/9159 images...
  train: 3000/9159 images...
  train: 3500/9159 images...
  train: 4000/9159 images...
  train: 4500/9159 images...
  train: 5000/9159 images...
  train: 5500/9159 images...
  train: 6000/9159 images...
  train: 6500/9159 images...
  train: 7000/9159 images...
  train: 7500/9159 images...
  train: 8000/9159 images...
  train: 8500/9159 images...
  train: 9000/9159 images...
  train: Complete - 9159 images + annotations
  valid: Complete - 370 images + annotations
  test: Complete - 243 images + annotations

Upload complete: 9775 total files uploaded


In [10]:
import json

def convert_coco_to_jsonl(coco_json_path, output_jsonl_path, gcs_image_prefix, split_name):
    """Convert a COCO annotation file into a bounding-box JSONL file.

    One JSON object is written per image that has at least one annotation;
    images without annotations are skipped. COCO ``[x, y, width, height]``
    pixel boxes are converted to corner coordinates relative to the image
    size and clipped to the ``[0, 1]`` range, so a box that overhangs the
    image edge cannot produce an out-of-range coordinate.

    Args:
        coco_json_path: Path of the COCO JSON file to read.
        output_jsonl_path: Path of the JSONL file to write. Lines are joined
            with ``"\\n"`` and no trailing newline is written.
        gcs_image_prefix: Prefix joined with ``/`` and each image's
            ``file_name`` to build ``imageGcsUri``.
        split_name: Value stored under ``dataItemResourceLabels["split"]``.

    Returns:
        Number of JSONL lines (annotated images) written.

    Raises:
        KeyError: If an annotation references a category id that is not in
            the file's ``categories`` list.
    """
    with open(coco_json_path, 'r', encoding='utf-8') as f:
        coco_data = json.load(f)

    category_map = {cat['id']: cat['name'] for cat in coco_data['categories']}
    image_map = {img['id']: img for img in coco_data['images']}

    # Group annotations by the image they belong to.
    image_annotations = {}
    for ann in coco_data['annotations']:
        image_annotations.setdefault(ann['image_id'], []).append(ann)

    def _clamp01(value):
        # Clip a relative coordinate into [0, 1].
        return min(1.0, max(0.0, value))

    jsonl_lines = []
    for img_id, img_info in image_map.items():
        if img_id not in image_annotations:
            continue

        gcs_uri = f"{gcs_image_prefix}/{img_info['file_name']}"

        bboxes = []
        for ann in image_annotations[img_id]:
            x, y, w, h = ann['bbox']
            x_norm = x / img_info['width']
            y_norm = y / img_info['height']
            w_norm = w / img_info['width']
            h_norm = h / img_info['height']

            # Max corners are computed from the unclipped origin and then
            # clipped, which trims the box to the image instead of shifting it.
            bboxes.append({
                "displayName": category_map[ann['category_id']],
                "xMin": _clamp01(x_norm),
                "yMin": _clamp01(y_norm),
                "xMax": _clamp01(x_norm + w_norm),
                "yMax": _clamp01(y_norm + h_norm)
            })

        # NOTE(review): Vertex AI appears to read the train/validation/test
        # assignment from the "aiplatform.googleapis.com/ml_use" label; a
        # plain "split" label is presumably informational only — confirm.
        jsonl_entry = {
            "imageGcsUri": gcs_uri,
            "boundingBoxAnnotations": bboxes,
            "dataItemResourceLabels": {"split": split_name}
        }
        jsonl_lines.append(json.dumps(jsonl_entry))

    with open(output_jsonl_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(jsonl_lines))

    return len(jsonl_lines)

In [11]:
# Repeated from the configuration cell so this cell can run on its own.
PROJECT_ID = "dental-treatment-detection"
BUCKET_NAME = f"{PROJECT_ID}-dental-data"

# Cleaned COCO annotation files, one per split.
train_coco = "/home/jupyter/DentAi-2/train/_annotations_cleaned.coco.json"
valid_coco = "/home/jupyter/DentAi-2/valid/_annotations_cleaned.coco.json"
test_coco = "/home/jupyter/DentAi-2/test/_annotations_cleaned.coco.json"

# Local JSONL files written by the conversion.
train_jsonl = "/home/jupyter/train_annotations.jsonl"
valid_jsonl = "/home/jupyter/valid_annotations.jsonl"
test_jsonl = "/home/jupyter/test_annotations.jsonl"

# GCS folders holding each split's images.
gcs_train_prefix = f"gs://{BUCKET_NAME}/dental-dataset/train"
gcs_valid_prefix = f"gs://{BUCKET_NAME}/dental-dataset/valid"
gcs_test_prefix = f"gs://{BUCKET_NAME}/dental-dataset/test"

# Convert each split and report how many images it produced. Note that the
# 'valid' folder is labelled "validation" in the generated records.
train_count = convert_coco_to_jsonl(train_coco, train_jsonl, gcs_train_prefix, "train")
print(f"Train: {train_count} images")

valid_count = convert_coco_to_jsonl(valid_coco, valid_jsonl, gcs_valid_prefix, "validation")
print(f"Valid: {valid_count} images")

test_count = convert_coco_to_jsonl(test_coco, test_jsonl, gcs_test_prefix, "test")
print(f"Test: {test_count} images")

print(f"\nTotal: {train_count + valid_count + test_count} images")

Train: 9159 images
Valid: 370 images
Test: 243 images

Total: 9772 images


In [12]:
from google.cloud import storage

combined_jsonl = "/home/jupyter/all_annotations.jsonl"

# Collect the non-blank records of every split file. Joining records (rather
# than concatenating raw file contents with fixed '\n' separators) keeps the
# combined file free of blank lines even when a split file is empty or already
# ends with a newline.
combined_records = []
for part_path in (train_jsonl, valid_jsonl, test_jsonl):
    with open(part_path, 'r', encoding='utf-8') as f:
        for line in f:
            record = line.rstrip('\n')
            if record.strip():
                combined_records.append(record)

# One record per line, no trailing newline (same layout as the per-split files).
with open(combined_jsonl, 'w', encoding='utf-8') as outfile:
    outfile.write('\n'.join(combined_records))

# Upload the combined file next to the split folders in the dataset prefix.
storage_client = storage.Client(project=PROJECT_ID)
bucket = storage_client.bucket(BUCKET_NAME)
blob = bucket.blob("dental-dataset/all_annotations.jsonl")
blob.upload_from_filename(combined_jsonl)

print(f"Uploaded: gs://{BUCKET_NAME}/dental-dataset/all_annotations.jsonl")

Uploaded: gs://dental-treatment-detection-dental-data/dental-dataset/all_annotations.jsonl
