In [1]:
import numpy as np
import scipy.io
from characterDefinitions import getHandwritingCharacterDefinitions
from dataLabelingStep import labelDataset, preparedataforLSTM
import os
import datetime

# Point this towards the top-level dataset directory. The trailing slash is
# required because every path below is built by plain string concatenation.
rootDir = os.path.expanduser('~') + '/handwritingBCIData/'

# Define which datasets to process. Names follow the 't5.YYYY.MM.DD' pattern;
# the fifth entry previously read 't5.2019.12.1+8', which does not fit that
# pattern and would make the per-session file paths unresolvable.
dataDirs = ['t5.2019.05.08','t5.2019.11.25','t5.2019.12.09','t5.2019.12.11','t5.2019.12.18',
            't5.2019.12.20','t5.2020.01.06','t5.2020.01.08','t5.2020.01.13','t5.2020.01.15']

# Defines the list of all 31 characters and what to call them
charDef = getHandwritingCharacterDefinitions()

# All labels are saved in this folder. makedirs(exist_ok=True) also creates the
# parent 'LSTMTrainingSteps' folder when it is missing and makes re-running the
# cell a no-op instead of relying on a separate isdir() check.
os.makedirs(rootDir + 'LSTMTrainingSteps/Step2_HMMLabels', exist_ok=True)

In [2]:
# For every session: run the HMM labeling step once per train/test partition and
# save the resulting labels/targets to
# <rootDir>/LSTMTrainingSteps/Step2_HMMLabels/<partition>/<dataDir>_timeSeriesLabels.mat

# The partition files do not depend on the session (they are indexed below with
# '<dataDir>_train' / '<dataDir>_test'), so load them once up front rather than
# re-reading both files on every loop iteration.
cvFolderNames = ['HeldOutBlocks', 'HeldOutTrials']
cvParts = [scipy.io.loadmat(rootDir + 'LSTMTrainingSteps/trainTestPartitions_' + cvFolder + '.mat')
           for cvFolder in cvFolderNames]

labelRootDir = rootDir + 'LSTMTrainingSteps/Step2_HMMLabels/'

for dataDir in dataDirs:
    timeStart = datetime.datetime.now()
    print('Labeling ' + dataDir + ' dataset')

    # Load the sentences, single-letter, and time-warped files for this session
    sentenceDat = scipy.io.loadmat(rootDir + 'Datasets/' + dataDir + '/sentences.mat')
    singleLetterDat = scipy.io.loadmat(rootDir + 'Datasets/' + dataDir + '/singleLetters.mat')
    twCubes = scipy.io.loadmat(rootDir + 'LSTMTrainingSteps/Step1_TimeWarping/' + dataDir + '_warpedCubes.mat')

    # Remove hashmarks (#) from the sentence prompts. This edits sentenceDat in
    # place, so labelDataset (which receives sentenceDat) sees the cleaned prompts too.
    for x in range(sentenceDat['sentencePrompt'].shape[0]):
        sentenceDat['sentencePrompt'][x, 0][0] = sentenceDat['sentencePrompt'][x, 0][0].replace('#', '')

    sentences = sentenceDat['sentencePrompt'][:, 0]

    # Construct separate labels for each training partition
    for cvPart, cvFolder in zip(cvParts, cvFolderNames):
        print("Labeling '" + cvFolder + "' partition")
        trainPartitionIdx = cvPart[dataDir + '_train']
        testPartitionIdx = cvPart[dataDir + '_test']

        # Label the data with an iterative forced alignment HMM
        letterStarts, letterDurations, blankWindows = labelDataset(sentenceDat,
                                                                   singleLetterDat,
                                                                   twCubes,
                                                                   trainPartitionIdx,
                                                                   testPartitionIdx,
                                                                   charDef)

        # Construct targets for supervised learning; the third argument is the
        # size of axis 1 of the neural activity cube.
        charStartTarget, charProbTarget, ignoreErrorHere = preparedataforLSTM(letterStarts,
                                                                               letterDurations,
                                                                               sentenceDat['neuralActivityCube'].shape[1],
                                                                               sentences,
                                                                               charDef)

        # Targets are stored as float32; the remaining entries are saved as returned.
        saveDict = {
            'letterStarts': letterStarts,
            'letterDurations': letterDurations,
            'charStartTarget': charStartTarget.astype(np.float32),
            'charProbTarget': charProbTarget.astype(np.float32),
            'ignoreErrorHere': ignoreErrorHere.astype(np.float32),
            'blankWindows': blankWindows,
            'timeBinsPerSentence': sentenceDat['numTimeBinsPerSentence'],
        }

        # exist_ok avoids the separate isdir() check and also creates any
        # missing parent folders.
        os.makedirs(labelRootDir + cvFolder, exist_ok=True)

        scipy.io.savemat(labelRootDir + cvFolder + '/' + dataDir + '_timeSeriesLabels.mat', saveDict)

    timeEnd = datetime.datetime.now()
    print('Total time taken: ' + str((timeEnd - timeStart).total_seconds()) + ' seconds')
    print(' ')

Labeling t5.2019.05.08 dataset
Labeling 'HeldOutBlocks' partition
HMM Iteration 0
HMM Iteration 1


ValueError: too many dimensions 'str'