Dataset used
I collected the data from TrashNet, which contains images of waste items. Items that are not recyclable are labelled trash, and recyclable items are split into several classes: paper, cardboard, glass, metal and plastic.

The training set contains 1211 images and the test set contains 508 images.

Images are pre-labelled


In [26]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt

In [10]:
import os
import shutil
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def create_splits(data_dir, output_dir, val_size=0.20):
    """Split a folder-per-class image dataset into train/val folders.

    Every sub-folder of ``data_dir`` is treated as one source class. Source
    class names are unified through an internal mapping (several source
    folders may map to the same output class), the images of each source
    class are split with ``train_test_split`` and copied to
    ``output_dir/train/<mapped_class>`` and ``output_dir/val/<mapped_class>``.
    A log of every copied file is written to ``output_dir/class_mapping.csv``
    with the columns ``filename``, ``original_class``, ``mapped_class`` and
    ``split_type``.

    Args:
        data_dir: Directory containing one sub-folder per source class.
        output_dir: Directory that receives the ``train`` and ``val`` folders
            and the CSV log. It is created if missing. Files already present
            from an earlier run are not removed.
        val_size: Passed to ``train_test_split`` as ``test_size``.

    Notes:
        * Folder and file listings are sorted so that the fixed random seed
          yields the same split on every machine (``os.listdir`` order is
          arbitrary).
        * Image extensions are matched case-insensitively.
        * If two source folders that map to the same class contain files with
          the same name, the later file is renamed instead of overwriting the
          earlier one; the CSV logs the name actually written.
        * Source classes without images are skipped; a class with a single
          image is placed entirely in the training split.
    """
    # Define class mapping for unifying class names
    class_mapping = {
        "cardboard": "cardboard",
        "Cardboard": "cardboard",
        "Glass": "glass",
        "glass": "glass",
        "Metal": "metal",
        "metal": "metal",
        "paper": "paper",
        "Paper": "paper",
        "plastic": "plastic",
        "Plastic": "plastic",
        "trash": "trash",
        "Food Organics": "trash",
        "Miscellaneous Trash": "trash",
    }

    # Folders to ignore
    ignored_folders = {"Textile Trash", "Vegetation"}

    # Leading dot so that only real extensions match; compared lower-cased
    image_extensions = ('.png', '.jpg', '.jpeg')

    # Create directories for train and validation sets
    train_dir = os.path.join(output_dir, 'train')
    val_dir = os.path.join(output_dir, 'val')

    for d in [train_dir, val_dir]:
        os.makedirs(d, exist_ok=True)

    # Data storage for CSV logging
    records = []

    # Destinations written during this run, used to detect name collisions
    # between source folders that are merged into the same output class.
    used_destinations = set()

    def copy_images(image_list, split_dir, split_type, class_name, mapped_class):
        """Copy images into split_dir/mapped_class and log each copy."""
        target_dir = os.path.join(split_dir, mapped_class)
        for image in image_list:
            filename = os.path.basename(image)
            stem, ext = os.path.splitext(filename)
            dest = os.path.join(target_dir, filename)
            attempt = 0
            # Rename rather than silently overwrite a file copied earlier
            # in this run from another source folder.
            while dest in used_destinations:
                attempt += 1
                filename = f"{stem}__{class_name}_{attempt}{ext}"
                dest = os.path.join(target_dir, filename)
            used_destinations.add(dest)
            shutil.copy(image, dest)
            records.append((filename, class_name, mapped_class, split_type))  # Save mapping data

    # Process each class directory (sorted for a reproducible run)
    for class_name in sorted(os.listdir(data_dir)):
        if class_name in ignored_folders:
            print(f"Skipping {class_name} (ignored)")
            continue  # Skip ignored folders

        class_dir = os.path.join(data_dir, class_name)
        if not os.path.isdir(class_dir):
            continue

        # Map class name (default to original if no mapping exists)
        mapped_class = class_mapping.get(class_name, class_name)

        # Get all images (sorted so the seeded split is deterministic)
        images = [
            os.path.join(class_dir, img)
            for img in sorted(os.listdir(class_dir))
            if img.lower().endswith(image_extensions)
        ]

        if not images:
            # train_test_split raises on empty input; nothing to copy anyway
            print(f"Skipping {class_name} (no images found)")
            continue

        # Create mapped class directories in train & val
        for d in [train_dir, val_dir]:
            os.makedirs(os.path.join(d, mapped_class), exist_ok=True)

        if len(images) < 2:
            # A single sample cannot be split; keep it for training
            train_images, val_images = images, []
        else:
            train_images, val_images = train_test_split(images, test_size=val_size, random_state=42)

        print(f"Class {class_name} mapped to {mapped_class}: {len(train_images)} training, {len(val_images)} validation")

        # Copy images to their respective directories and log them
        copy_images(train_images, train_dir, "train", class_name, mapped_class)
        copy_images(val_images, val_dir, "val", class_name, mapped_class)

    # Save mapping information to CSV
    df = pd.DataFrame(records, columns=['filename', 'original_class', 'mapped_class', 'split_type'])
    df.to_csv(os.path.join(output_dir, 'class_mapping.csv'), index=False)

    print("Dataset splitting and mapping completed!")

# Source folder (one sub-folder per class) and destination for the split
original_data_dir = '../data/realwaste-main/RealWaste'
output_data_dir = '../data/dataset_split'

# Build the train/val folders using the default validation fraction
create_splits(data_dir=original_data_dir, output_dir=output_data_dir)


Class Cardboard mapped to cardboard: 368 training, 93 validation
Class Food Organics mapped to trash: 328 training, 83 validation
Class Glass mapped to glass: 336 training, 84 validation
Class Metal mapped to metal: 632 training, 158 validation
Class Miscellaneous Trash mapped to trash: 396 training, 99 validation
Class Paper mapped to paper: 400 training, 100 validation
Class Plastic mapped to plastic: 736 training, 185 validation
Skipping Textile Trash (ignored)
Skipping Vegetation (ignored)
Dataset splitting and mapping completed!
