In [None]:
import os
import lmdb
import psutil
from PIL import Image
from io import BytesIO
import pickle

def image_to_byte_array(image: "Image.Image") -> bytes:
    """Encode a PIL image as JPEG and return the encoded bytes.

    Args:
        image: The PIL image to encode. Images whose mode the JPEG writer
            cannot store (for example ``RGBA``, ``LA`` or ``P``) are converted
            to ``RGB`` first instead of raising during ``save``.

    Returns:
        The JPEG-encoded image data.
    """
    # Modes Pillow's JPEG encoder accepts as-is; anything else (alpha channels,
    # palettes, 16/32-bit data) must be converted before saving.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if image.mode not in jpeg_modes:
        image = image.convert("RGB")

    buffer = BytesIO()
    image.save(buffer, format='JPEG')
    return buffer.getvalue()

def get_available_memory():
    """Print the currently available RAM and return half of it in bytes.

    The returned integer is used by the caller as the LMDB ``map_size``.
    """
    free_bytes = psutil.virtual_memory().available
    print(f"Available memory: {free_bytes / 1024 / 1024} MB")
    # Only half of what is free right now is handed out as the map size.
    half_of_free = free_bytes * 0.5
    return int(half_of_free)

def create_separate_lmdbs_in_batches(input_folder, output_folder, batch_size=500):
    """Convert a folder-per-class image dataset into batched LMDB pairs.

    Every sub-directory of ``input_folder`` is one class; classes are numbered
    0..N-1 in sorted name order. Each image is re-encoded as JPEG and stored
    under the key ``str(idx)`` (a running index over all images) in an LMDB
    named ``img{start}-{end}_imgs``; its pickled integer label is stored under
    the same key in ``img{start}-{end}_labels``. A new pair of LMDBs is started
    every ``batch_size`` images.

    Args:
        input_folder: Directory containing one sub-directory per class.
        output_folder: Directory that receives the LMDB batches. If it lies
            directly inside ``input_folder`` it is not treated as a class.
        batch_size: Number of images per LMDB pair.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')

    # Set available memory for LMDB map_size
    map_size = get_available_memory()

    # Only real class directories take part in label numbering; stray files
    # would otherwise leave gaps in the label ids, and an output folder nested
    # in the input folder would shift labels on every re-run.
    output_root = os.path.abspath(output_folder)
    class_names = [
        name
        for name in sorted(os.listdir(input_folder))
        if os.path.isdir(os.path.join(input_folder, name))
        and os.path.abspath(os.path.join(input_folder, name)) != output_root
    ]

    idx = 0
    batch_num = 0
    img_lmdb_env = None
    label_lmdb_env = None

    try:
        for label, class_name in enumerate(class_names):
            class_path = os.path.join(input_folder, class_name)

            for img_name in sorted(os.listdir(class_path)):
                # Case-insensitive so files such as "IMG_001.JPG" are not skipped.
                if not img_name.lower().endswith(image_extensions):
                    continue
                img_path = os.path.join(class_path, img_name)

                # Load the image and convert it to bytes; the context manager
                # releases the file handle as soon as the bytes are produced.
                with Image.open(img_path) as img:
                    img_bytes = image_to_byte_array(img.convert('RGB'))

                # If starting a new batch, rotate to fresh LMDB environments.
                if idx % batch_size == 0:
                    if img_lmdb_env is not None:
                        img_lmdb_env.close()
                        label_lmdb_env.close()
                        img_lmdb_env = None
                        label_lmdb_env = None
                        print(f"Batch {batch_num} saved with images and labels.")

                    batch_start = idx
                    batch_end = idx + batch_size - 1
                    img_lmdb_path = os.path.join(output_folder, f"img{batch_start}-{batch_end}_imgs")
                    label_lmdb_path = os.path.join(output_folder, f"img{batch_start}-{batch_end}_labels")
                    # Both paths share the same parent directory.
                    os.makedirs(os.path.dirname(img_lmdb_path), exist_ok=True)

                    img_lmdb_env = lmdb.open(img_lmdb_path, map_size=map_size)
                    label_lmdb_env = lmdb.open(label_lmdb_path, map_size=map_size)
                    batch_num += 1

                # Store image bytes and the pickled label under the same key.
                with img_lmdb_env.begin(write=True) as img_txn, label_lmdb_env.begin(write=True) as label_txn:
                    img_txn.put(f"{idx}".encode(), img_bytes)
                    label_txn.put(f"{idx}".encode(), pickle.dumps(label))

                idx += 1
    finally:
        # Close whatever is still open, including when an error interrupts the
        # conversion. The environments stay None if no image was ever found.
        if img_lmdb_env is not None:
            img_lmdb_env.close()
        if label_lmdb_env is not None:
            label_lmdb_env.close()

    if batch_num == 0:
        print("No images found; no LMDB batches were created.")
    else:
        print(f"Final batch {batch_num} saved with images and labels.")



In [None]:
# Convert the class-folder dataset under input_folder into batched LMDB pairs
# written to output_folder.
# NOTE(review): output_folder is a sub-directory of input_folder, and the
# converter scans input_folder's entries as class folders — confirm that a
# re-run does not pick up "train" as an additional class.
input_folder = "/home/nick/Documents/ws24/lmdb/"
output_folder = "/home/nick/Documents/ws24/lmdb/train"
create_separate_lmdbs_in_batches(input_folder, output_folder)

In [None]:
import os
import lmdb
import pickle
from PIL import Image
from io import BytesIO
from sklearn.model_selection import train_test_split

def byte_array_to_image(byte_array):
    """Open encoded image bytes as a PIL image backed by an in-memory buffer."""
    return Image.open(BytesIO(byte_array))

def load_data_from_lmdb(lmdb_path):
    """Load every record of one LMDB batch as either images or labels.

    The kind of database is taken from the LMDB directory's own name (not from
    its parent directories): a name containing ``imgs`` is an image database,
    otherwise a name containing ``labels`` is a label database. Anything else
    is not opened at all.

    Args:
        lmdb_path: Path to the LMDB directory.

    Returns:
        A tuple ``(images, labels)``. For an image database ``images`` is a
        list of ``(key, encoded_bytes)`` and ``labels`` is empty; for a label
        database ``labels`` is a list of ``(key, unpickled_label)`` and
        ``images`` is empty. Keys are decoded to ``str``.
    """
    images = []
    labels = []

    # normpath drops a trailing separator so basename is never empty.
    db_name = os.path.basename(os.path.normpath(lmdb_path))
    is_image_db = 'imgs' in db_name
    is_label_db = not is_image_db and 'labels' in db_name
    if not (is_image_db or is_label_db):
        return images, labels

    with lmdb.open(lmdb_path, readonly=True) as env:
        with env.begin() as txn:
            for key, value in txn.cursor():
                if is_image_db:
                    images.append((key.decode(), value))  # Keep the encoded bytes
                else:
                    # NOTE: pickle.loads executes arbitrary code from the
                    # payload; only read LMDBs produced by trusted code.
                    labels.append((key.decode(), pickle.loads(value)))
    return images, labels

def save_images_and_labels(data, folder_path, subfolder_name):
    """Write one split to disk as JPEG files plus one pickle file per label.

    ``data['images']`` holds ``(key, encoded_bytes)`` pairs and
    ``data['labels']`` holds ``(key, label)`` pairs in the same order. Images
    go to ``<folder_path>/<subfolder_name>_images/<key>.jpg`` and labels to
    ``<folder_path>/<subfolder_name>_labels/<key>.pkl``.
    """
    image_dir = os.path.join(folder_path, f"{subfolder_name}_images")
    label_dir = os.path.join(folder_path, f"{subfolder_name}_labels")
    for directory in (image_dir, label_dir):
        os.makedirs(directory, exist_ok=True)

    for position, (key, img_bytes) in enumerate(data['images']):
        # Decode the stored bytes and write them back out as a JPEG file.
        decoded = byte_array_to_image(img_bytes)
        decoded.save(os.path.join(image_dir, f"{key}.jpg"), format='JPEG')

        # The label is matched purely by list position, not by key.
        label_value = data['labels'][position][1]
        with open(os.path.join(label_dir, f"{key}.pkl"), 'wb') as handle:
            pickle.dump(label_value, handle)

def process_lmdb_files(input_folder, output_folder, test_size=0.2):
    """Collect all LMDB batches in a folder, split the samples and export them.

    Every entry of ``input_folder`` whose name contains ``img`` is read with
    ``load_data_from_lmdb``. Images and labels are sorted by their integer key,
    split together with a fixed seed, and written to ``output_folder`` through
    ``save_images_and_labels`` as a ``train`` and a ``test`` set.
    """
    batch_paths = sorted(
        os.path.join(input_folder, entry)
        for entry in os.listdir(input_folder)
        if 'img' in entry
    )

    all_images, all_labels = [], []
    for batch_path in batch_paths:
        batch_images, batch_labels = load_data_from_lmdb(batch_path)
        if 'imgs' in batch_path:
            all_images += batch_images
        elif 'labels' in batch_path:
            all_labels += batch_labels

    def numeric_key(entry):
        # Keys are decimal strings; compare them as numbers, not as text.
        return int(entry[0])

    # Put both lists in the same key order so position i refers to one sample.
    all_images.sort(key=numeric_key)
    all_labels.sort(key=numeric_key)

    split = train_test_split(
        all_images, all_labels, shuffle= True, test_size=test_size, random_state=42
    )
    train_images, test_images, train_labels, test_labels = split

    subsets = (
        ('train', train_images, train_labels),
        ('test', test_images, test_labels),
    )
    for subset_name, subset_images, subset_labels in subsets:
        save_images_and_labels(
            {'images': subset_images, 'labels': subset_labels}, output_folder, subset_name
        )
    print("Data has been split and saved into train and test folders.")

# Folder holding the batched "*_imgs" / "*_labels" LMDB directories to read,
# and the folder that receives the exported train/test images and labels.
input_folder = "/home/nick/Documents/ws24/lmdb/test1"
output_folder = "/home/nick/Documents/ws24/lmdb/out1"

# Load every batch, split 80/20 (default test_size) and write the files.
process_lmdb_files(input_folder, output_folder)


In [None]:
import lmdb
import json
import os
import random
from sklearn.model_selection import train_test_split
from PIL import Image
from io import BytesIO

def load_lmdb_data(lmdb_path):
    """Load ``(image, label)`` pairs from a single LMDB environment.

    Each value is opened as a PIL image and each key is expected to look like
    ``<index>_<label>``, the format written by ``save_to_lmdb``.

    Args:
        lmdb_path: Path to the LMDB directory.

    Returns:
        A list of ``(PIL image, label string)`` tuples.

    Raises:
        IndexError: If a key contains no underscore.
        PIL.UnidentifiedImageError: If a value is not a decodable image.
    """
    data = []
    env = lmdb.open(lmdb_path, readonly=True, lock=False)
    try:
        with env.begin() as txn:
            for key, value in txn.cursor():
                image = Image.open(BytesIO(value))  # Adjust if images are stored differently

                # Split only on the first underscore so labels that contain
                # underscores themselves are returned in full.
                label = key.decode("utf-8").split("_", 1)[1]  # Adjust if label extraction is different
                data.append((image, label))
    finally:
        # Release the environment even when a record fails to decode.
        env.close()
    return data

def split_data(data, test_ratio=0.2):
    """Split ``(image, label)`` pairs into train and test sets class by class.

    Samples are grouped by label in a single pass. Classes are processed in
    the order their label first appears in ``data``, so the result order is
    reproducible from run to run. Each class is split separately with a fixed
    seed.

    Args:
        data: Iterable of ``(image, label)`` pairs.
        test_ratio: Fraction of each class that goes to the test set.

    Returns:
        A tuple ``(train_data, test_data)`` of lists of ``(image, label)``.
    """
    # dict preserves insertion order, unlike iterating a set of labels.
    by_class = {}
    for img, label in data:
        by_class.setdefault(label, []).append((img, label))

    train_data, test_data = [], []
    for cls_data in by_class.values():
        if len(cls_data) < 2:
            # A single sample cannot be divided; keep it for training rather
            # than letting train_test_split raise on it.
            train_data.extend(cls_data)
            continue
        train, test = train_test_split(cls_data, test_size=test_ratio, random_state=42)
        train_data.extend(train)
        test_data.extend(test)
    return train_data, test_data

def save_to_lmdb(data, lmdb_path):
    """Save a list of ``(image, label)`` pairs to an LMDB database.

    Each image is encoded as PNG and stored under the key ``<index>_<label>``,
    where ``index`` is the position of the pair in ``data``.

    Args:
        data: Iterable of ``(PIL image, label)`` pairs.
        lmdb_path: Directory of the LMDB environment; created together with
            any missing parent directories.
    """
    os.makedirs(lmdb_path, exist_ok=True)
    # map_size is a byte count; pass an int rather than the float literal 1e12.
    env = lmdb.open(lmdb_path, map_size=int(1e12))  # Adjust map_size as needed
    try:
        with env.begin(write=True) as txn:
            for i, (img, label) in enumerate(data):
                img_byte_arr = BytesIO()
                img.save(img_byte_arr, format='PNG')
                key = f"{i}_{label}".encode("utf-8")  # Key format as needed
                txn.put(key, img_byte_arr.getvalue())
    finally:
        # Always release the environment, even if encoding or writing fails.
        env.close()

# Example usage: read one or more LMDB environments, split them per class and
# write the two splits back out as new LMDB environments.
# NOTE(review): the path below is used elsewhere in this notebook as the folder
# that *contains* the "*_imgs" / "*_labels" LMDB directories — confirm it is
# itself an LMDB environment whose values are encoded images.
lmdb_paths = ["/home/nick/Documents/ws24/lmdb/"]
output_path = "/home/nick/Documents/ws24/lmdb/out" # Define your desired output path

# Load data from each LMDB file into one combined list of (image, label) pairs
all_data = []
for lmdb_path in lmdb_paths:
    all_data.extend(load_lmdb_data(lmdb_path))

# Split data into class-balanced train and test sets (20% of each class to test)
train_data, test_data = split_data(all_data, test_ratio=0.2)

# Save train and test data to LMDB databases under output_path
save_to_lmdb(train_data, os.path.join(output_path, "train_images.lmdb"))
save_to_lmdb(test_data, os.path.join(output_path, "test_images.lmdb"))


UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x71d602541cb0>

In [8]:
# Build the image LMDB with the external script save_cpics_pngs_to_lmdb.py
# (its source is not part of this notebook).
# NOTE(review): --dataset_path points at FlowCamNet while --dataset_name is
# "ZooScan" — confirm the name is intended.
!python save_cpics_pngs_to_lmdb.py --dataset_path="/home/nick/Downloads/113201/FlowCamNet/imgs" --lmdb_dir_name="/home/nick/Documents/ws24/lmdb/" --min_size=128 --dataset_name="ZooScan"

wtf
TOTAL #images 301247 FROM /home/nick/Downloads/113201/FlowCamNet/imgs
100%|█████████████████████████████████| 301247/301247 [01:47<00:00, 2804.08it/s]
Finished importing from /home/nick/Downloads/113201/FlowCamNet/imgs and subdirectories, saved at: /home/nick/Documents/ws24/lmdb/ZooScan_imgs
Finished importing from /home/nick/Downloads/113201/FlowCamNet/imgs and subdirectories, saved at: /home/nick/Documents/ws24/lmdb/ZooScan_imgs


In [11]:
import os
import lmdb
import imageio.v3 as iio
import numpy as np

# LMDB environments to verify (image DB and its label DB) and the folder that
# receives the decoded sample images. The trailing comments keep the
# alternative ZooScan paths for quick switching.
LMDB_IMGS_PATH = "/home/nick/Documents/ws24/out/TEST_imgs" #"/home/nick/Documents/ws24/lmdb/ZooScan_imgs"
LMDB_LABELS_PATH = "/home/nick/Documents/ws24/out/TEST_labels" #"/home/nick/Documents/ws24/lmdb/ZooScan_labels"
DEBUG_OUTPUT_DIR = "./debug_output"

# Ensure the debug output directory exists before anything is written to it
os.makedirs(DEBUG_OUTPUT_DIR, exist_ok=True)
def count_entries(lmdb_path):
    """Return the number of entries in an LMDB environment's default database.

    The count is read from the environment statistics instead of walking every
    record with a cursor.

    Args:
        lmdb_path: Path to the LMDB directory.

    Returns:
        The number of key/value entries as an int.
    """
    env = lmdb.open(lmdb_path, readonly=True)
    try:
        return env.stat()["entries"]
    finally:
        # Close the environment even if reading the statistics fails.
        env.close()

def load_and_verify_lmdb(lmdb_imgs_path, lmdb_labels_path, debug_output_dir, max_images=10):
    """Decode a few images from an LMDB pair and save them for visual checking.

    Prints the entry counts of both databases, then walks the image database
    in key order. For each image the label stored under the *same key* in the
    label database is looked up, the image is decoded and written to
    ``debug_output_dir`` as ``debug_<label>_<n>.png``.

    Args:
        lmdb_imgs_path: Path to the LMDB holding encoded images.
        lmdb_labels_path: Path to the LMDB holding UTF-8 encoded labels.
        debug_output_dir: Directory for the decoded images; created if missing.
        max_images: Maximum number of images to write; ``0`` writes none.
    """
    num_images = count_entries(lmdb_imgs_path)
    num_labels = count_entries(lmdb_labels_path)

    print(f"Number of images in LMDB: {num_images}")
    print(f"Number of labels in LMDB: {num_labels}")

    os.makedirs(debug_output_dir, exist_ok=True)

    env_imgs = lmdb.open(lmdb_imgs_path, readonly=True)
    try:
        env_labels = lmdb.open(lmdb_labels_path, readonly=True)
        try:
            with env_imgs.begin() as txn_imgs, env_labels.begin() as txn_labels:
                count = 0
                for img_key, img_value in txn_imgs.cursor():
                    if count >= max_images:  # Limit the number of images to verify
                        break

                    # Fetch the label by key rather than by cursor position so
                    # a missing or extra entry cannot shift the pairing.
                    label_value = txn_labels.get(img_key)
                    if label_value is None:
                        print(f"Warning: no label stored for image key {img_key!r}; skipping.")
                        continue

                    # Decode the image
                    img_decoded = iio.imread(img_value)  # Read the encoded image

                    # Decode the label
                    label_decoded = label_value.decode("utf-8")

                    # Save the image for verification
                    output_path = os.path.join(debug_output_dir, f"debug_{label_decoded}_{count}.png")
                    iio.imwrite(output_path, img_decoded)

                    print(f"Verified Image Key: {img_key.decode('utf-8')}, Label: {label_decoded}, Saved to: {output_path}")

                    count += 1
        finally:
            env_labels.close()
    finally:
        # Both environments are released even when decoding or writing fails.
        env_imgs.close()
    print(f"Verification complete. Images saved to {debug_output_dir}")

# Run the verification on the configured image/label LMDB pair, writing up to
# the default max_images (10) decoded samples into DEBUG_OUTPUT_DIR.
load_and_verify_lmdb(LMDB_IMGS_PATH, LMDB_LABELS_PATH, DEBUG_OUTPUT_DIR)


Number of images in LMDB: 2643
Number of labels in LMDB: 2643
Verified Image Key: Appendicularia_164794272.png, Label: Appendicularia, Saved to: ./debug_output/debug_Appendicularia_0.png
Verified Image Key: Appendicularia_164794416.png, Label: Appendicularia, Saved to: ./debug_output/debug_Appendicularia_1.png
Verified Image Key: Appendicularia_164794457.png, Label: Appendicularia, Saved to: ./debug_output/debug_Appendicularia_2.png
Verified Image Key: Appendicularia_164794639.png, Label: Appendicularia, Saved to: ./debug_output/debug_Appendicularia_3.png
Verified Image Key: Appendicularia_164794665.png, Label: Appendicularia, Saved to: ./debug_output/debug_Appendicularia_4.png
Verified Image Key: Appendicularia_164794801.png, Label: Appendicularia, Saved to: ./debug_output/debug_Appendicularia_5.png
Verified Image Key: Appendicularia_164794863.png, Label: Appendicularia, Saved to: ./debug_output/debug_Appendicularia_6.png
Verified Image Key: Appendicularia_164931190.png, Label: Append

In [9]:
import os
import lmdb
import imageio.v3 as iio
import numpy as np
from tqdm import tqdm
import psutil
from sklearn.model_selection import train_test_split

def get_available_memory():
    """Print the currently available RAM and return 75% of it in bytes.

    The returned integer is used as the basis for the LMDB map sizes.
    """
    free_bytes = psutil.virtual_memory().available
    print(f"Available memory: {free_bytes / 1024 / 1024} MB")
    # Three quarters of what is free right now, truncated to whole bytes.
    usable = free_bytes * 0.75
    return int(usable)


# LMDB map sizes in bytes, fixed once when this cell runs: the image databases
# get the value returned by get_available_memory(), the label databases get a
# tenth of it. get_available_memory() is called twice, so its message prints
# twice.
MAP_SIZE_IMG = get_available_memory()
MAP_SIZE_META = int(get_available_memory()*0.1)

def load_lmdb_data(lmdb_path):
    """
    Loads data from an LMDB file and returns it as a list of (key, value) pairs.

    Keys and values are returned as the raw bytes stored in the database, in
    cursor (key) order. The environment is opened read-only and is always
    closed again before returning, also when reading fails.
    """
    env = lmdb.open(lmdb_path, readonly=True)
    data = []
    try:
        with env.begin() as txn:
            for key, value in txn.cursor():
                data.append((key, value))
    finally:
        env.close()

    return data

def save_lmdb_data(lmdb_path_img, lmdb_path_label, img_data, label_data):
    """
    Saves images and labels to LMDB, no decoding and reencoding.

    ``img_data`` and ``label_data`` are parallel lists of ``(key, value)``
    byte pairs. A pair is written only when the image key equals the label
    key at the same position; mismatching pairs are reported and skipped.
    Both environments are closed again even if writing fails.

    Args:
        lmdb_path_img: Directory of the image LMDB to write.
        lmdb_path_label: Directory of the label LMDB to write.
        img_data: List of ``(key, encoded_image_bytes)``.
        label_data: List of ``(key, label_bytes)``.
    """
    if len(img_data) != len(label_data):
        # zip() below stops at the shorter list; make the truncation visible.
        print(f"Warning: {len(img_data)} images but {len(label_data)} labels; extra entries are ignored.")

    env_imgs = lmdb.open(lmdb_path_img, map_size=MAP_SIZE_IMG)
    try:
        env_labels = lmdb.open(lmdb_path_label, map_size=MAP_SIZE_META)
        try:
            with env_imgs.begin(write=True) as txn_imgs, env_labels.begin(write=True) as txn_labels:
                # Iterate through img_data and label_data, assuming each contains (key, data)
                for (img_key, img_encoded), (label_key, label) in tqdm(zip(img_data, label_data), total=len(img_data)):
                    # Ensure keys match (img_key should be the same in both img_data and label_data)
                    if img_key != label_key:
                        print(f"Warning: Mismatched keys! img_key: {img_key}, label_key: {label_key}")
                        continue  # Skip if keys don't match

                    txn_imgs.put(img_key, img_encoded)  # Save the already-encoded image
                    txn_labels.put(label_key, label)  # Save the label as bytes
        finally:
            env_labels.close()
    finally:
        env_imgs.close()




def load_all_datasets(main_folder):
    """
    Load all datasets in the given folder and return combined image and label data.

    Entries ending in ``_imgs`` are loaded as image databases and entries
    ending in ``_labels`` as label databases; everything else is skipped with
    a message. Entries are visited ordered by dataset name (the part before
    the suffix), so with several datasets the combined image list and the
    combined label list are concatenated in the same dataset order. The raw
    ``os.listdir`` order gives no such guarantee.

    Args:
        main_folder: Directory containing the ``*_imgs`` / ``*_labels`` LMDBs.

    Returns:
        A tuple ``(img_data, label_data)`` of lists of ``(key, value)`` pairs.
    """
    suffixes = ("_imgs", "_labels")

    def dataset_order(name):
        # Sort by dataset prefix first, then by suffix.
        for suffix in suffixes:
            if name.endswith(suffix):
                return (name[:-len(suffix)], suffix)
        return (name, "")

    img_data = []
    label_data = []

    for dataset in sorted(os.listdir(main_folder), key=dataset_order):
        print(f"Loading dataset: {dataset}")
        dataset_path = os.path.join(main_folder, dataset)
        print(dataset_path)
        if dataset_path.endswith("_imgs"):
            img_data.extend(load_lmdb_data(dataset_path))
        elif dataset_path.endswith("_labels"):
            label_data.extend(load_lmdb_data(dataset_path))
        else:
            print(f"Skipping {dataset_path}")
            print("we are fucked if this contains a valid path")

    return img_data, label_data

def split_and_save_data(main_folder, output_folder, test_size=0.2):
    """
    Loads all datasets, splits the data into train and test, and saves them in the output folder.

    Images and labels are split in one ``train_test_split`` call, so both lists
    receive the same shuffled indices by construction. The result is written
    as ``TRAIN_imgs`` / ``TRAIN_labels`` and ``TEST_imgs`` / ``TEST_labels``
    inside ``output_folder``.

    Args:
        main_folder: Directory containing the source ``*_imgs`` / ``*_labels`` LMDBs.
        output_folder: Directory for the split LMDBs; created if missing.
        test_size: Fraction of samples that goes to the test split.

    Raises:
        ValueError: Raised by ``train_test_split`` when the number of images
            and labels differ.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Load all datasets
    img_data, label_data = load_all_datasets(main_folder)

    print(f"Total data loaded: {len(img_data)} images and {len(label_data)} labels.")

    # Split images and labels together so entry i of each split is one sample.
    train_imgs, test_imgs, train_labels, test_labels = train_test_split(
        img_data, label_data, test_size=test_size, shuffle=True, random_state=43
    )

    # Save the split data to LMDB
    save_lmdb_data(os.path.join(output_folder, "TRAIN_imgs"), os.path.join(output_folder, "TRAIN_labels"), train_imgs, train_labels)
    save_lmdb_data(os.path.join(output_folder, "TEST_imgs"), os.path.join(output_folder, "TEST_labels"), test_imgs, test_labels)

    print(f"Finished processing and saving datasets to {output_folder}")

# Split every "*_imgs" / "*_labels" LMDB pair found in main_folder 80/20 and
# write the TRAIN_* / TEST_* LMDBs into output_folder.
split_and_save_data(main_folder="/home/nick/Documents/ws24/lmdb", output_folder="/home/nick/Documents/ws24/out", test_size=0.2)


Available memory: 22124.16015625 MB
Available memory: 22124.16015625 MB
Loading dataset: ZooScan_imgs
/home/nick/Documents/ws24/lmdb/ZooScan_imgs
Loading dataset: ZooScan_labels
/home/nick/Documents/ws24/lmdb/ZooScan_labels
Total data loaded: 13212 images and 13212 labels.


100%|██████████| 10569/10569 [00:00<00:00, 127248.61it/s]
100%|██████████| 2643/2643 [00:00<00:00, 160311.58it/s]


Finished processing and saving datasets to /home/nick/Documents/ws24/out
