In [1]:
from scipy.io import arff
import numpy as np
import pickle

def load_BME_datasets():
    """Load the raw BME train and test splits from their ARFF files.

    Returns:
        tuple: ``(train_dataset, test_dataset)``; each one is the first
        element (the data records) of the pair returned by
        ``scipy.io.arff.loadarff``.
    """
    # loadarff returns a (data, meta) pair; only the data part is kept.
    train_records, _train_meta = arff.loadarff('datasets/BME/BME_TRAIN.arff')
    test_records, _test_meta = arff.loadarff('datasets/BME/BME_TEST.arff')
    return train_records, test_records

def array_preprocess_BME_dataset(dataset):
    """Convert raw BME records into a normalised matrix and integer labels.

    Every record holds the series values followed by the class label as its
    last field. Each series is downsampled by keeping every second value and
    then min-max rescaled to the range [-1, 1].

    The series length is derived from the records themselves rather than
    assumed, and a constant series (max == min) is mapped to all zeros so
    that no NaNs are produced by a 0/0 division.

    Args:
        dataset: iterable of records (e.g. the structured array returned by
            ``scipy.io.arff.loadarff``). The last field of each record is a
            bytes label whose first byte is an ASCII digit.

    Returns:
        tuple: ``(matrix, labels)`` where ``matrix`` has one downsampled
        series per column, i.e. shape ``(downsampled_length, n_series)``, and
        ``labels`` is a 1-D integer array in which label ``b'1'`` maps to 0,
        ``b'2'`` to 1, and so on.
    """
    series_rows, class_ids = [], []

    for record in dataset:
        # The label's first byte is an ASCII digit; convert to a Python int
        # before subtracting so the arithmetic cannot wrap around in uint8.
        first_byte = np.frombuffer(record[-1], dtype=np.uint8)[0]
        class_ids.append(int(first_byte) - ord('1'))

        # Drop the label field, then keep every second sample.
        values = np.array(list(record)[:-1], dtype=float)[::2]

        low = np.min(values)
        span = np.max(values) - low
        if span == 0:
            # Constant series: there is no range to stretch, use the midpoint.
            values = np.zeros_like(values)
        else:
            values = (values - low) / span
            values = 2 * values - 1

        series_rows.append(values)

    if not series_rows:
        # No records: still return a matrix with the right number of rows
        # whenever the field layout is known from the array's dtype.
        field_names = getattr(getattr(dataset, 'dtype', None), 'names', None) or ()
        n_values = max(len(field_names) - 1, 0)
        return np.zeros((len(range(0, n_values, 2)), 0)), np.zeros(0, dtype=int)

    return np.vstack(series_rows).T, np.array(class_ids, dtype=int)

# Load the raw BME splits, then turn each one into a (matrix, labels) pair.
train_dataset, test_dataset = load_BME_datasets()
# NOTE: the preprocessing returns the matrix transposed, i.e. one series per column.
train_matrix, train_labels = array_preprocess_BME_dataset(train_dataset)
test_matrix, test_labels = array_preprocess_BME_dataset(test_dataset)

In [2]:
dict_matrix, dict_labels = [], []
n_test_samples_per_class = 10

# Keep only the first `n_test_samples_per_class` test series of every class,
# preserving their original order. Series are stored as columns of
# `test_matrix`, so the selection is applied along axis 1.
keep_mask = np.zeros(test_labels.shape[0], dtype=bool)
for c in np.unique(test_labels):
    idxs = np.where(test_labels == c)[0]
    keep_mask[idxs[:n_test_samples_per_class]] = True

test_matrix = test_matrix[:, keep_mask]
test_labels = test_labels[keep_mask]

In [3]:
# Persist the BME train split (series matrix + labels) as a pickled dict.
train_D = {'X': train_matrix, 'labels': train_labels}

with open('datasets/BME/train_D.pickle', 'wb') as f:
    pickle.dump(train_D, f)

# Persist the (trimmed) BME test split the same way.
test_D = {'X': test_matrix, 'labels': test_labels}

with open('datasets/BME/test_D.pickle', 'wb') as f:
    pickle.dump(test_D, f)

In [4]:
def get_starts_ends(content):
    """Locate the segments of a file that are delimited by blank lines.

    A blank line is a line equal to ``'\\n'``; a blank line at index 1 is
    ignored. Every remaining blank line at index ``b`` opens a segment that
    starts at ``b + 1`` and ends (exclusive) at the next such blank line, or
    at ``len(content)`` for the last one.

    Args:
        content: list of lines, as returned by ``file.readlines()``.

    Returns:
        tuple: ``(starts, ends)`` lists of indices, meant to be consumed in
        pairs. When there is no delimiting blank line ``starts`` is empty
        while ``ends`` still contains ``len(content)``.
    """
    delimiters = [
        row for row, text in enumerate(content)
        if text == '\n' and row != 1
    ]
    starts = [row + 1 for row in delimiters]
    # The first delimiter only opens a segment; each later one closes the
    # previous segment, and the end of the file closes the last one.
    ends = delimiters[1:] + [len(content)]
    return starts, ends

def _rescale_to_signed_unit(values):
    """Min-max rescale a 1-D array to [-1, 1]; a constant array maps to zeros."""
    low = np.min(values)
    span = np.max(values) - low
    if span == 0:
        # No range to stretch (e.g. a perfectly horizontal or vertical
        # stroke): use the midpoint instead of dividing 0 by 0.
        return np.zeros(values.shape, dtype=float)
    return 2 * ((values - low) / span) - 1


def read_digits_line(segment):
    """Parse one digit segment into normalised pen coordinates and its label.

    The label is the integer found between the first pair of double quotes
    on the first line of the segment. The first three lines are skipped;
    of the remaining ones, every line that contains neither ``'PEN'`` nor
    ``'DT'`` is read as a whitespace-separated ``x y`` coordinate pair.
    Whitespace-only lines are ignored.

    Args:
        segment: list of text lines belonging to one digit.

    Returns:
        tuple: ``(X, Y, label)`` where ``X`` and ``Y`` are 1-D float arrays,
        each rescaled independently to [-1, 1] (all zeros when the
        coordinate is constant), and ``label`` is an int.

    Raises:
        ValueError: if the segment holds no coordinate line, or a coordinate
            cannot be parsed as a float.
        IndexError: if the first line has no quoted label or a coordinate
            line has fewer than two fields.
    """
    label = int(segment[0].split('"')[1])

    xs, ys = [], []
    for line in segment[3:]:
        if 'PEN' in line or 'DT' in line:
            continue
        # split() without arguments drops empty fields and trailing newlines.
        fields = line.split()
        if not fields:
            continue
        xs.append(float(fields[0]))
        ys.append(float(fields[1]))

    X = _rescale_to_signed_unit(np.array(xs))
    Y = _rescale_to_signed_unit(np.array(ys))
    return X, Y, label

def read_digits_dataset(dataset_type):
    """Read one split of the original pen-digits file into per-digit arrays.

    Args:
        dataset_type: ``'train'`` (reads ``pendigits-orig.tra``) or
            ``'test'`` (reads ``pendigits-orig.tes``).

    Returns:
        tuple: ``(all_X, all_Y, all_labels)`` where ``all_X`` and ``all_Y``
        are lists holding one coordinate array per digit (as returned by
        ``read_digits_line``) and ``all_labels`` is a numpy array with the
        matching labels.

    Raises:
        ValueError: if ``dataset_type`` is neither ``'train'`` nor ``'test'``.
    """
    # Fail early on an unknown split instead of trying to open the
    # non-existent file 'pendigits-orig.'.
    if dataset_type == 'train':
        extension = 'tra'
    elif dataset_type == 'test':
        extension = 'tes'
    else:
        raise ValueError(
            f"dataset_type must be 'train' or 'test', got {dataset_type!r}"
        )
    dataset_name = 'datasets/digits/pendigits-orig.' + extension

    with open(dataset_name) as f:
        content = f.readlines()

    starts, ends = get_starts_ends(content)

    all_X, all_Y, all_labels = [], [], []
    for start, end in zip(starts, ends):
        X, Y, label = read_digits_line(content[start:end])
        all_X.append(X)
        all_Y.append(Y)
        all_labels.append(label)

    return all_X, all_Y, np.array(all_labels)

def load_DIGITS_datasets(dataset_type):
    """Load one split of the original pen-digits dataset.

    This is an alias of ``read_digits_dataset``: both functions used to carry
    the same body, so the work is delegated to keep a single implementation.

    Args:
        dataset_type: ``'train'`` or ``'test'``, forwarded unchanged.

    Returns:
        tuple: ``(all_X, all_Y, all_labels)`` exactly as returned by
        ``read_digits_dataset``.
    """
    return read_digits_dataset(dataset_type)

In [5]:
def crop(X, Y, labels, max_len, n_samples_per_class):
    """Keep, for every class, the ``n_samples_per_class`` longest samples.

    Classes are visited in the sorted order given by ``np.unique``. Within a
    class the samples are ordered from longest to shortest ``X`` sequence;
    samples of equal length keep their original relative order, so the
    selection does not depend on the sorting algorithm numpy picks.

    Args:
        X: list of sequences; their lengths drive the selection.
        Y: list of sequences parallel to ``X``.
        labels: numpy array with one label per sample.
        max_len: unused; kept so existing calls keep working.
        n_samples_per_class: maximum number of samples kept per class.

    Returns:
        tuple: ``(new_X, new_Y, new_labels)`` with the selected sequences as
        lists and their labels as a numpy array.
    """
    new_X, new_Y, new_labels = [], [], []

    for class_label in np.unique(labels):
        members = np.where(labels == class_label)[0]
        lengths = np.array([len(X[i]) for i in members])

        # Stable sort on the negated lengths: longest first, ties in input order.
        order = np.argsort(-lengths, kind='stable')[:n_samples_per_class]
        chosen = members[order]

        new_X.extend(X[i] for i in chosen)
        new_Y.extend(Y[i] for i in chosen)
        new_labels.extend([class_label] * len(chosen))

    return new_X, new_Y, np.array(new_labels)

def pad_and_array(X, max_len):
    """Right-pad every sequence with zeros to ``max_len`` and stack them.

    Args:
        X: list of 1-D sequences, none longer than ``max_len``.
        max_len: length of every padded row.

    Returns:
        numpy.ndarray: array of shape ``(len(X), max_len)``; for an empty
        ``X`` the shape is ``(0, max_len)`` so the result stays 2-D.

    Raises:
        ValueError: if a sequence is longer than ``max_len``.
    """
    if len(X) == 0:
        # np.array([]) would be 1-D; keep the column count explicit instead.
        return np.zeros((0, max_len))

    padded_rows = []
    for position, x in enumerate(X):
        missing = max_len - len(x)
        if missing < 0:
            raise ValueError(
                f'sequence {position} has length {len(x)}, '
                f'which exceeds max_len={max_len}'
            )
        padded_rows.append(
            np.pad(x, (0, missing), 'constant', constant_values=(0, 0))
        )
    return np.array(padded_rows)
    
def array_preprocess_DIGITS_dataset(X, Y, labels, min_len, max_len, n_samples_class_train, n_samples_class_test):
    """Filter, split, crop and pad the pen-digit sequences.

    Samples whose ``X`` length lies outside ``[min_len, max_len]`` are
    dropped. Of the remaining samples the even-positioned ones form the
    train split and the odd-positioned ones the test split. Each split is
    reduced with ``crop`` and its sequences are padded with
    ``pad_and_array``; the padded matrices are transposed.

    Args:
        X, Y: parallel lists of coordinate sequences.
        labels: one label per sample.
        min_len, max_len: inclusive bounds on the accepted sequence length;
            ``max_len`` is also the padded length.
        n_samples_class_train: per-class sample count passed to ``crop`` for
            the train split.
        n_samples_class_test: same, for the test split.

    Returns:
        tuple: ``(X_train, X_train_matrix, Y_train, Y_train_matrix,
        train_labels, X_test, X_test_matrix, Y_test, Y_test_matrix,
        test_labels)``.
    """
    kept = [pos for pos, stroke in enumerate(X) if min_len <= len(stroke) <= max_len]
    X_kept = [X[pos] for pos in kept]
    Y_kept = [Y[pos] for pos in kept]
    labels_kept = np.array([labels[pos] for pos in kept])

    def _crop_and_pad(xs, ys, split_labels, per_class):
        # Same pipeline for both splits: crop first, then pad X, then pad Y.
        xs, ys, split_labels = crop(xs, ys, split_labels, max_len, n_samples_per_class=per_class)
        xs_matrix = pad_and_array(xs, max_len).T
        ys_matrix = pad_and_array(ys, max_len).T
        return xs, xs_matrix, ys, ys_matrix, split_labels

    # Even positions -> train, odd positions -> test.
    train_part = _crop_and_pad(X_kept[::2], Y_kept[::2], labels_kept[::2], n_samples_class_train)
    test_part = _crop_and_pad(X_kept[1::2], Y_kept[1::2], labels_kept[1::2], n_samples_class_test)

    return (*train_part, *test_part)

# Only the 'train' file is read; the preprocessing below derives both the
# train and the test split from it.
# NOTE: `train_labels` rebinds the name used earlier for the BME labels.
X_train, Y_train, train_labels = load_DIGITS_datasets('train')

(
    train_X, train_matrix_X, train_Y, train_matrix_Y, true_train_labels,
    test_X, test_matrix_X, test_Y, test_matrix_Y, true_test_labels,
) = array_preprocess_DIGITS_dataset(
    X_train,
    Y_train,
    train_labels,
    min_len=40,
    max_len=60,
    n_samples_class_train=10,
    n_samples_class_test=10,
)

In [6]:
# Persist the digits train split: raw sequences, padded matrices and labels.
train_D = {
    'X': train_X,
    'Y': train_Y,
    'matrix_X': train_matrix_X,
    'matrix_Y': train_matrix_Y,
    'labels': true_train_labels,
}

with open('datasets/digits/train_D.pickle', 'wb') as f:
    pickle.dump(train_D, f)

# Persist the digits test split with the same layout.
test_D = {
    'X': test_X,
    'Y': test_Y,
    'matrix_X': test_matrix_X,
    'matrix_Y': test_matrix_Y,
    'labels': true_test_labels,
}

with open('datasets/digits/test_D.pickle', 'wb') as f:
    pickle.dump(test_D, f)