In [11]:
import os
import lmdb
from PIL import Image
from io import BytesIO
import pickle

def image_to_byte_array(image: "Image.Image") -> bytes:
    """Encode a PIL image as JPEG and return the encoded bytes.

    The image is written into an in-memory buffer rather than to disk, so the
    result can be stored directly as a value in a key/value store.

    Args:
        image: The image to encode. ``Image`` in this notebook is the
            ``PIL.Image`` *module*; the image class is ``Image.Image``, which
            is what the annotation refers to.

    Returns:
        The complete JPEG-encoded content of ``image`` as ``bytes``.
    """
    buffer = BytesIO()
    # The format is given explicitly because a BytesIO has no file name from
    # which it could be inferred.
    image.save(buffer, format='JPEG')
    return buffer.getvalue()

def create_separate_lmdbs(input_folder, output_folder, map_size=193000 * 32):
    """Pack a class-per-folder image dataset into two parallel LMDB stores.

    Every sub-directory of ``input_folder`` is one class. Classes are numbered
    0, 1, 2, ... in sorted-name order and the images inside each class are
    visited in sorted-name order. Each image is converted to RGB, serialised
    with ``image_to_byte_array`` and stored under the key ``str(idx).encode()``
    in ``<output_folder>/train_imgs/data.lmdb``; its pickled integer label is
    stored under the same key in ``<output_folder>/train_labels/data.lmdb``.

    Args:
        input_folder: Directory containing one sub-directory per class.
        output_folder: Directory under which ``train_imgs`` and
            ``train_labels`` are created.
        map_size: Maximum size in bytes of each LMDB environment. The default
            (193000 * 32 = 6,176,000 bytes, roughly 5.9 MiB) is the value that
            used to be hard-coded here. It is small: LMDB raises
            ``lmdb.MapFullError`` once the stored data outgrows it, so pass a
            larger value for any real dataset.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')

    # Create the parent folders for the image and label stores.
    img_lmdb_path = os.path.join(output_folder, "train_imgs", "data.lmdb")
    label_lmdb_path = os.path.join(output_folder, "train_labels", "data.lmdb")
    os.makedirs(os.path.dirname(img_lmdb_path), exist_ok=True)
    os.makedirs(os.path.dirname(label_lmdb_path), exist_ok=True)

    # Keep only directories *before* numbering them. Enumerating the raw
    # listing would let stray files (e.g. a README) consume a label and leave
    # gaps in the class ids.
    class_names = [
        entry for entry in sorted(os.listdir(input_folder))
        if os.path.isdir(os.path.join(input_folder, entry))
    ]

    # try/finally guarantees both environments are closed even when reading an
    # image or writing a record fails part-way through.
    img_lmdb_env = lmdb.open(img_lmdb_path, map_size=map_size)
    try:
        label_lmdb_env = lmdb.open(label_lmdb_path, map_size=map_size)
        try:
            with img_lmdb_env.begin(write=True) as img_txn, \
                    label_lmdb_env.begin(write=True) as label_txn:
                idx = 0
                for label, class_name in enumerate(class_names):
                    class_path = os.path.join(input_folder, class_name)

                    for img_name in sorted(os.listdir(class_path)):
                        # Case-insensitive so that e.g. "IMG_01.JPG" is not
                        # silently dropped from the dataset.
                        if not img_name.lower().endswith(image_suffixes):
                            continue
                        img_path = os.path.join(class_path, img_name)

                        # Release the source file handle as soon as the pixel
                        # data has been converted.
                        with Image.open(img_path) as source:
                            img_bytes = image_to_byte_array(source.convert('RGB'))

                        # Image and label share one key so they can be joined
                        # again when reading.
                        key = f"{idx}".encode()
                        img_txn.put(key, img_bytes)
                        label_txn.put(key, pickle.dumps(label))

                        idx += 1
        finally:
            label_lmdb_env.close()
    finally:
        img_lmdb_env.close()

    print(f"Image LMDB file created at: {img_lmdb_path}")
    print(f"Label LMDB file created at: {label_lmdb_path}")




In [21]:
import os
import lmdb
import psutil
from PIL import Image
from io import BytesIO
import pickle

def image_to_byte_array(image: "Image.Image") -> bytes:
    """Serialise a PIL image to JPEG bytes held in memory.

    Args:
        image: Image to encode. The annotation names the ``Image.Image`` class;
            the bare name ``Image`` imported in this cell is the ``PIL.Image``
            module and is not a valid type for an image instance.

    Returns:
        The JPEG-encoded bytes produced by ``image.save``.
    """
    jpeg_buffer = BytesIO()
    # An in-memory buffer has no file extension, so the format must be stated.
    image.save(jpeg_buffer, format='JPEG')
    return jpeg_buffer.getvalue()

def get_available_memory():
    """Report the currently available system memory and return half of it.

    The available amount reported by ``psutil`` is printed in megabytes, and
    50% of it (truncated to an integer number of bytes) is returned for use as
    an LMDB ``map_size``.

    Returns:
        int: Half of the available memory, in bytes.
    """
    free_bytes = psutil.virtual_memory().available
    print(f"Available memory: {free_bytes / 1024 / 1024} MB")
    # Only half of what is free right now is handed out as the map size.
    half_of_free = free_bytes * 0.5
    return int(half_of_free)

def create_separate_lmdbs_in_batches(input_folder, output_folder, batch_size=50000, map_size=None):
    """Pack a class-per-folder image dataset into per-batch LMDB store pairs.

    Every sub-directory of ``input_folder`` is one class. Classes are numbered
    0, 1, 2, ... in sorted-name order and images are visited in sorted-name
    order within each class, which gives every image a global index ``idx``.
    Images are written in consecutive groups of ``batch_size``. For the group
    starting at index ``s`` two environments are created:

    * ``<output_folder>/img{s}-{s + batch_size - 1}_imgs/data.lmdb`` holding
      the RGB-converted image serialised by ``image_to_byte_array``;
    * ``<output_folder>/img{s}-{s + batch_size - 1}_labels/data.lmdb`` holding
      the pickled integer label.

    Both use the key ``str(idx).encode()``. The folder name always advertises a
    full ``batch_size`` range, even when the last batch holds fewer images.

    Args:
        input_folder: Directory containing one sub-directory per class.
        output_folder: Directory under which the batch folders are created.
        batch_size: Number of images per LMDB pair; must be positive.
        map_size: Maximum size in bytes of each LMDB environment. When ``None``
            (the default) the value returned by ``get_available_memory()`` is
            used.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if map_size is None:
        map_size = get_available_memory()

    image_suffixes = ('.jpg', '.jpeg', '.png')

    # Keep only directories *before* numbering them. Enumerating the raw
    # listing would let stray files consume a label and leave gaps in the ids.
    class_names = [
        entry for entry in sorted(os.listdir(input_folder))
        if os.path.isdir(os.path.join(input_folder, entry))
    ]

    # Collect (label, path) pairs up front. This is only path strings, so it is
    # cheap, and it lets batches be cut by plain slicing.
    samples = []
    for label, class_name in enumerate(class_names):
        class_path = os.path.join(input_folder, class_name)
        for img_name in sorted(os.listdir(class_path)):
            # Case-insensitive so that e.g. "IMG_01.JPG" is not silently dropped.
            if img_name.lower().endswith(image_suffixes):
                samples.append((label, os.path.join(class_path, img_name)))

    if not samples:
        # Nothing was opened, so there is nothing to close or report as saved.
        print(f"No images found under {input_folder}; nothing was written.")
        return

    total = len(samples)
    for batch_num, batch_start in enumerate(range(0, total, batch_size), start=1):
        batch_end = batch_start + batch_size - 1
        img_lmdb_path = os.path.join(output_folder, f"img{batch_start}-{batch_end}_imgs", "data.lmdb")
        label_lmdb_path = os.path.join(output_folder, f"img{batch_start}-{batch_end}_labels", "data.lmdb")
        os.makedirs(os.path.dirname(img_lmdb_path), exist_ok=True)
        os.makedirs(os.path.dirname(label_lmdb_path), exist_ok=True)

        # try/finally guarantees both environments are closed even when an
        # image cannot be read or a record cannot be written.
        img_lmdb_env = lmdb.open(img_lmdb_path, map_size=map_size)
        try:
            label_lmdb_env = lmdb.open(label_lmdb_path, map_size=map_size)
            try:
                # One write transaction per batch instead of one per image:
                # every commit is a separate round-trip to disk.
                with img_lmdb_env.begin(write=True) as img_txn, \
                        label_lmdb_env.begin(write=True) as label_txn:
                    batch = samples[batch_start:batch_start + batch_size]
                    for idx, (label, img_path) in enumerate(batch, start=batch_start):
                        # Release the source file handle as soon as the pixel
                        # data has been converted.
                        with Image.open(img_path) as source:
                            img_bytes = image_to_byte_array(source.convert('RGB'))

                        key = f"{idx}".encode()
                        img_txn.put(key, img_bytes)
                        label_txn.put(key, pickle.dumps(label))
            finally:
                label_lmdb_env.close()
        finally:
            img_lmdb_env.close()

        if batch_start + batch_size >= total:
            print(f"Final batch {batch_num} saved with images and labels.")
        else:
            print(f"Batch {batch_num} saved with images and labels.")



In [22]:
# Source dataset root; create_separate_lmdbs_in_batches treats each
# sub-directory of it as one class.
# NOTE(review): absolute, account-specific path — consider moving it to a
# configurable base directory.
input_folder = "/home/hk-project-p0021769/hgf_grc7525/workspace/hkfswork/hgf_grc7525-nick/kaggle/train"
# Destination root; the per-batch "img<start>-<end>_imgs" and
# "img<start>-<end>_labels" LMDB folders are created underneath it.
output_folder = "/home/hk-project-p0021769/hgf_grc7525/workspace/hkfswork/hgf_grc7525-nick/kaggle/train_lmdb"
# Runs the conversion with the function's default batch_size.
create_separate_lmdbs_in_batches(input_folder, output_folder)

Available memory: 494154.73828125 MB
Final batch 1 saved with images and labels.


In [23]:
!python Masterproject-plankton-dinov2/dinov2/run/eval/knn.py \
--config-file Masterproject-plankton-dinov2/dinov2/configs/eval/vits14_pretrain.yaml \
--pretrained-weights 'checkpoints/dinov2_vits14_pretrain.pth' --output-dir \
/home/hk-project-p0021769/hgf_grc7525/output/ \
--train-dataset='LMDBDataset:split=TEST:root=/home/hk-project-p0021769/hgf_grc7525/workspace/hkfswork/hgf_grc7525-nick/kaggle/train_lmdb:extra=*' \


Traceback (most recent call last):
  File "/hkfs/home/project/hk-project-p0021769/hgf_grc7525/Masterproject-plankton-dinov2/dinov2/run/eval/knn.py", line 10, in <module>
    from dinov2.eval.knn import get_args_parser
ModuleNotFoundError: No module named 'dinov2'


NameError: name 'node' is not defined