Preprocessing data for the input for the Hugginface pretrained models. We convert our images into the dataset, consisting in a dictionary with train, validation and test data.

In [1]:
#Create dataset
import os
import datasets
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk



def create_image_folder_dataset(root_path):
  """creates `Dataset` from image folder structure"""

  # get class names by folders names
  _CLASS_NAMES= os.listdir(root_path)
  # defines `datasets` features`
  features=datasets.Features({
                      "img": datasets.Image(),
                      "label": datasets.features.ClassLabel(names=_CLASS_NAMES),
                  })
  # temp list holding datapoints for creation
  img_data_files=[]
  label_data_files=[]
  # load images into list for creation
  for img_class in os.listdir(root_path):
    for img in os.listdir(os.path.join(root_path,img_class)):
      path_=os.path.join(root_path,img_class,img)
      img_data_files.append(path_)
      label_data_files.append(img_class)
  # create dataset
  ds = datasets.Dataset.from_dict({"img":img_data_files,"label":label_data_files},features=features)
  return ds

In [2]:
ds = create_image_folder_dataset("data")

In [3]:
ds

Dataset({
    features: ['img', 'label'],
    num_rows: 6402
})

In [4]:
#Classes names
labels = ds.features["label"].names
print(labels)

['iron', '.ipynb_checkpoints', 'gamma', 'proton']


In [5]:
# test size will be 15% of train dataset
test_size=.15

ds_split = ds.shuffle().train_test_split(test_size=test_size)

In [6]:
ds_split

DatasetDict({
    train: Dataset({
        features: ['img', 'label'],
        num_rows: 5441
    })
    test: Dataset({
        features: ['img', 'label'],
        num_rows: 961
    })
})

In [7]:
ds_split['train'][0]

{'img': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=288x288>,
 'label': 2}

We take a look at an example. The image field contains a PIL image and each label is an integer that represents a class. We create a dictionary that maps a label name to an integer and vice versa. The mapping will help the model recover the label name from the label number.

In [8]:
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

Now we can covert the label number to a label name.

In [9]:
id2label[str(0)]

'iron'

In [10]:
id2label[str(1)]

'.ipynb_checkpoints'

In [11]:
id2label[str(2)]

'gamma'

In [12]:
# split up training into training + validation
splits = ds_split["train"].train_test_split(test_size=0.1)
train_ds = splits['train']
val_ds = splits['test']

In [13]:
train_ds

Dataset({
    features: ['img', 'label'],
    num_rows: 4896
})

In [14]:
val_ds

Dataset({
    features: ['img', 'label'],
    num_rows: 545
})

In [15]:
test_ds = ds_split['test']

In [16]:
test_ds

Dataset({
    features: ['img', 'label'],
    num_rows: 961
})

In [17]:
#All together to save it
ds_dict = datasets.DatasetDict({
    "train": train_ds,
    "val": val_ds,
    "test": test_ds,
})


In [18]:
ds_dict

DatasetDict({
    train: Dataset({
        features: ['img', 'label'],
        num_rows: 4896
    })
    val: Dataset({
        features: ['img', 'label'],
        num_rows: 545
    })
    test: Dataset({
        features: ['img', 'label'],
        num_rows: 961
    })
})

In [19]:
#Save data
ds_dict.save_to_disk("./data_dict")

Flattening the indices:   0%|          | 0/5 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

In [20]:
#To load data
ds = load_from_disk('./data_dict')
ds

DatasetDict({
    train: Dataset({
        features: ['img', 'label'],
        num_rows: 4896
    })
    val: Dataset({
        features: ['img', 'label'],
        num_rows: 545
    })
    test: Dataset({
        features: ['img', 'label'],
        num_rows: 961
    })
})