In [1]:
import os
import re
import numpy as np

In [2]:
# Text pre-processing method
def clean_text(text):
    text = text.lower()  # Convert to lowercase 
    text = text.replace('_', ' ')  # Replace underscore with space
    text = text.strip()  # Remove space at end
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove any other special chars
    return(text)    

In [3]:
# Extract text from file names as well as labels
# Input: path of parent directory of class folders
# Output: Numpy array with two columns: file names and classes (labelled with integer from 1)
def read_text_files_with_labels(path):
    text_and_labels = []
    print("Dir: " + os.path.split(path)[1] + "\n")
    class_folders = sorted(os.listdir(path))  # Assuming class folders are sorted
    label_map = {class_name: idx-1 for idx, class_name in enumerate(class_folders)}

    for class_name in class_folders:
        class_path = os.path.join(path, class_name)
        if os.path.isdir(class_path):
            file_names = os.listdir(class_path)
            # Print class names, labels, and picture count in each class
            print("Class: " + class_name + ", class label: " + str(label_map[class_name]) + ", picture count: " + str(len(file_names)) + "\n")
            
            for file_name in file_names:
                file_path = os.path.join(class_path, file_name)
                if os.path.isfile(file_path):
                    file_name_no_ext, _ = os.path.splitext(file_name)
                    clean_file_name = clean_text(file_name_no_ext)
                    text_and_labels.append([clean_file_name, label_map[class_name]])
    return np.array(text_and_labels)

In [4]:
# Dataset paths
TRAIN_PATH = r"/Users/yuhao/Downloads/garbage_data/CVPR_2024_dataset_Train"
VAL_PATH = r"/Users/yuhao/Downloads/garbage_data/CVPR_2024_dataset_Val"
TEST_PATH = r"/Users/yuhao/Downloads/garbage_data/CVPR_2024_dataset_Test"

In [5]:
train_text_labels = read_text_files_with_labels(TRAIN_PATH)

Dir: CVPR_2024_dataset_Train

Class: Black, class label: 0, picture count: 2368

Class: Blue, class label: 1, picture count: 4842

Class: Green, class label: 2, picture count: 2317

Class: TTR, class label: 3, picture count: 2102



In [6]:
val_text_labels = read_text_files_with_labels(VAL_PATH)
test_text_labels = read_text_files_with_labels(TEST_PATH)

Dir: CVPR_2024_dataset_Val

Class: Black, class label: 0, picture count: 372

Class: Blue, class label: 1, picture count: 768

Class: Green, class label: 2, picture count: 352

Class: TTR, class label: 3, picture count: 308

Dir: CVPR_2024_dataset_Test

Class: Black, class label: 0, picture count: 695

Class: Blue, class label: 1, picture count: 1086

Class: Green, class label: 2, picture count: 799

Class: TTR, class label: 3, picture count: 852



In [7]:
# Write datasets into csv
np.savetxt("train_text_labels.csv", train_text_labels, delimiter=',', fmt="%s")
np.savetxt("val_text_labels.csv", val_text_labels, delimiter=',', fmt="%s")
np.savetxt("test_text_labels.csv", test_text_labels, delimiter=',', fmt="%s")

In [8]:
x = np.genfromtxt('train_text_labels.csv', delimiter=',', dtype=str, encoding=None)

In [9]:
type(x)

numpy.ndarray

In [10]:
x[1][0]

'cheese plastic wrapper '

In [11]:
x[1]

array(['cheese plastic wrapper ', '0'], dtype='<U65')

In [12]:
x.shape

(11629, 2)

In [40]:
print(train_text.shape)

(11629,)


In [41]:
type(train_text)

numpy.ndarray

In [42]:
train_text[0:30]

array(['instant noodles bag ', 'cheese plastic wrapper ',
       'plastic spoon ', 'sweet wrapping ', 'foam plate ',
       'sealed plastic bag ', 'small plastic lid ',
       'muscle cream bottle ', 'plastic fork ', 'plastic bag ',
       'plastic fork ', 'styrofoam ', 'foam plate ', 'chocolate wrapper ',
       'old worn sandpaper ', 'empty chip packet ', 'tape roll ',
       'clean plastic straw ', 'dirty plastic box with plastic cover ',
       'plastic lid ', 'nonstretchy wrap ', 'mask ',
       'clean plastic tofu container ', 'mosquito repellent cream tube ',
       'razer ', 'multilayer snack wrapper ', 'empty toothpaste tube ',
       'foam meat package', 'oily plastic food wrapper ',
       'foil food wrapper '], dtype='<U65')