In [1]:
import os
import codecs
import numpy as np
from skimage.io import imsave

# the path which stores the 4 dataset files
datapath = '../../../datasets/european_dataset/'

# lists the files and directories inside 'datapath'
files = os.listdir(datapath)
data_dict = {}

In [2]:
# prints out the files and directories inside 'datapath'
print(files)

['train-labels.idx1-ubyte', '.DS_Store', 'train-images.idx3-ubyte', 'testing_set.zip', 't10k-labels.idx1-ubyte', 'training_set.zip', 't10k-images.idx3-ubyte']


In [14]:
# Encodes 'byte' parameter using the codec registered for encoding 'hex'.
def get_int(byte):
    return int(codecs.encode(byte, 'hex'), 16)

In [15]:
# for loop which iterates through all the files
for file in files:
    # all the dataset files end with 'ubyte'
    if file.endswith('ubyte'):
        # opens each dataset file for reading
        with open(datapath + file, 'rb') as f:
            
            # stores the file content inside data variable
            data = f.read()
            # the first 4 characters store the type of the file content. The type can be image or label
            type = get_int(data[:4])
            # the second 4 characters store the length of the content. For training files the length is 60000, and for testing files the length is 10000
            length = get_int(data[4:8])
            
            # if the type equals to 2051, the files store the image data
            if type == 2051: 
                category = 'images'
                # images are 28x28 pixels
                num_rows = get_int(data[8:12])
                num_cols = get_int(data[12:16])
                # gets the rest of the characters from the 16th character
                parsed = np.frombuffer(data, dtype = np.uint8, offset = 16)
                
                # returns a 2D array 
                # for training images the array has 60000 arrays with 28 arrays where each array has 28 elements, the shape of the array is (60000, 28, 28)
                # for training images the array has 10000 arrays with 28 arrays where each array has 28 elements, the shape of the array is (10000, 28, 28)
                parsed = parsed.reshape(length, num_rows, num_cols)
                
            # if the type equals to 2049, the files store the label data
            if type == 2049:
                category = 'labels'
                # gets the rest of the characters from the 8th character
                parsed = np.frombuffer(data, dtype = np.uint8, offset = 8)
                
                # returns a 1D array 
                # the training and testing label arrays are reshaped to (60000) and (10000) respectively. 
                parsed = parsed.reshape(length)
                
            # if the length equals to 10000, the file stores the testing data. 
            if length == 10000:
                set = 'testing'
                
            # if the length equals to 60000, the file stores the training data. 
            if length == 60000:
                set = 'training'
                
            # data_dict dictionary stores 4 keys and values
            data_dict[set + '_' + category] = parsed

In [16]:
    # prints out the data_dict dictionary
    print(data_dict)

{'training_labels': array([5, 0, 4, ..., 5, 6, 8], dtype=uint8), 'training_images': array([[[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       ...,

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, 

In [17]:
# the path for storing the training set information
datapath = '../../../datasets/european_dataset/training_set'

# gets the values of 'training_images' and 'training_labels' keys from 'data_dict' directory
images = data_dict['training_images']
labels = data_dict['training_labels']

# the first value in the shape of training images is 60000
no_of_samples = images.shape[0]

# for loop which iterates through 0 to 60000
for i in range(no_of_samples):
    # gets the image and label
    image = images[i]
    label = labels[i]
    
    # checks if the directory with the name of the value of label is existing, if not creates it.
    if not os.path.exists(datapath + '/' + str(label) + '/'):
        os.makedirs(datapath + '/' + str(label) + '/')
        
    # gets the number of files in the directory
    no_of_file = len(os.listdir(datapath + '/' + str(label) + '/'))
    
    # saves the image file in the directory with a name of the label value
    imsave(datapath + '/' + str(label) + '/%0.5d.png'%(no_of_file), image)

In [18]:
# the path for storing the testing set information
datapath = '../../../datasets/european_dataset/testing_set'

# gets the values of 'testing_images' and 'testing_labels' keys from 'data_dict' directory
images = data_dict['testing_images']
labels = data_dict['testing_labels']

# the first value in the shape of testing images is 10000
no_of_samples = images.shape[0]

# for loop which iterates through 0 to 10000
for i in range(no_of_samples):
    # gets the image and label
    image = images[i]
    label = labels[i]
    
    # checks if the directory with the name of the value of label is existing, if not creates it.
    if not os.path.exists(datapath + '/' + str(label) + '/'):
        os.makedirs(datapath + '/' + str(label) + '/')
        
    # gets the number of files in the directory
    no_of_file = len(os.listdir(datapath + '/' + str(label) + '/'))
    
    # saves the image file in the directory with a name of the label value
    imsave(datapath + '/' + str(label) + '/%0.5d.png'%(no_of_file), image)