In [8]:
#This notebook generates synthetic sentences to agument the RNN's training data (for BOTH of the train/test partitions and
#ALL ten sessions). Step 3 utilizes the data labels created during Step 2 to rearrange the data into new sentences. 
#The output of Step 3 is a set of .tfrecord files that are mixed together with the real data during RNN training.

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]=''

import tensorflow as tf

#suppress all tensorflow warnings (largely related to compatability with v2)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

import numpy as np
import scipy.io
import matplotlib.pyplot as plt
import sklearn.decomposition 
from characterDefinitions import getHandwritingCharacterDefinitions
from makeSyntheticSentences import generateCharacterSequences, extractCharacterSnippets, addSingleLetterSnippets
import multiprocessing
from concurrent.futures import ProcessPoolExecutor, wait
import datetime
from dataPreprocessing import normalizeSentenceDataCube

#point this towards the top level dataset directory
rootDir = '/scratch/users/stfan' + '/handwritingBCIData/'

#define which datasets to process
dataDirs = ['t5.2019.05.08','t5.2019.12.09','t5.2019.12.11','t5.2019.12.18',
            't5.2019.12.20','t5.2020.01.06','t5.2020.01.08','t5.2020.01.13','t5.2020.01.15']

maxLengths = [
    5334*2, 5522*2, 4117*2, 4783*2, 4207*2, 3239*2, 3944*2, 3105*2, 3631*2
]

# dataDirs = ['t5.2019.11.25']
# maxLengths = [6938*2]

#construct synthetic data for both training partitions
cvParts = ['HeldOutBlocks', 
           #'HeldOutTrials']
          ]

#defines the list of all 31 characters and what to call them
charDef = getHandwritingCharacterDefinitions()

#saves all synthetic sentences & snippet libraries in this folder
outputDir = rootDir + 'RNNTrainingSteps/Step3_SyntheticSentences_tf2/'
if not os.path.isdir(outputDir):
    os.mkdir(outputDir)

In [2]:
#First, we generate snippet libraries for each dataset by cutting out each letter from each sentence. These can then
#be re-arranged into new sequences. 
for dataDir in dataDirs:
    print('Processing ' + dataDir)
    
    for cvPart in cvParts:
        print('--' + cvPart)
        
        #load datasets and train/test partition
        sentenceDat = scipy.io.loadmat(rootDir+'Datasets/'+dataDir+'/sentences.mat')
        singleLetterDat = scipy.io.loadmat(rootDir+'Datasets/'+dataDir+'/singleLetters.mat')
        twCubes = scipy.io.loadmat(rootDir+'RNNTrainingSteps/Step1_TimeWarping/'+dataDir+'_warpedCubes.mat')
        
        cvPartFile = scipy.io.loadmat(rootDir+'RNNTrainingSteps/trainTestPartitions_'+cvPart+'.mat')
        trainPartitionIdx = cvPartFile[dataDir+'_train']
        
        #the last two sessions have hashmarks (#) to indicate that T5 should take a brief pause
        #here we remove these from the sentence prompts, otherwise the code below will get confused (because # isn't a character)
        for x in range(sentenceDat['sentencePrompt'].shape[0]):
            sentenceDat['sentencePrompt'][x,0][0] = sentenceDat['sentencePrompt'][x,0][0].replace('#','')
        
        #normalize the neural activity cube
        neuralCube = normalizeSentenceDataCube(sentenceDat, singleLetterDat)
        
        #load labels
        labels = scipy.io.loadmat(rootDir + 'RNNTrainingSteps/Step2_HMMLabels/'+cvPart+'/'+dataDir+'_timeSeriesLabels.mat')

        #cut out character snippets from the data for augmentation
        snippetDict = extractCharacterSnippets(labels['letterStarts'], 
                                               labels['blankWindows'], 
                                               neuralCube, 
                                               sentenceDat['sentencePrompt'][:,0], 
                                               sentenceDat['numTimeBinsPerSentence'][:,0], 
                                               trainPartitionIdx, 
                                               charDef)

        #add single letter examples
        snippetDict = addSingleLetterSnippets(snippetDict, 
                                              singleLetterDat, 
                                              twCubes, 
                                              charDef)

        #save results
        if not os.path.isdir(rootDir + 'RNNTrainingSteps/Step3_SyntheticSentences_random_sentence_bos/'+cvPart):
            os.mkdir(rootDir + 'RNNTrainingSteps/Step3_SyntheticSentences_random_sentence_bos/'+cvPart)
        scipy.io.savemat(rootDir + 'RNNTrainingSteps/Step3_SyntheticSentences_random_sentence_bos/'+cvPart+'/'+dataDir+'_snippets.mat', snippetDict)
        

Processing t5.2019.05.08
--HeldOutTrials
Processing t5.2019.11.25
--HeldOutTrials
Processing t5.2019.12.09
--HeldOutTrials
Processing t5.2019.12.11
--HeldOutTrials
Processing t5.2019.12.18
--HeldOutTrials
Processing t5.2019.12.20
--HeldOutTrials
Processing t5.2020.01.06
--HeldOutTrials
Processing t5.2020.01.08
--HeldOutTrials
Processing t5.2020.01.13
--HeldOutTrials
Processing t5.2020.01.15
--HeldOutTrials


In [2]:
from pathlib import Path

#Now we use the above snippet libraries to make synthetic data for each dataset and train/test partition.

#'nParallelProcesses' specifies how many parallel processes to use when generating synthetic data (to speed things up).
#Decrease if it uses too much memory on your machine. (10 uses ~30 GB of RAM)
nParallelProcesses = 8

for nSteps, dataDir in zip(maxLengths, dataDirs):
    print('Processing ' + dataDir)
    
    for cvPart in cvParts:
        print('--' + cvPart)
        
        currOutputDir = outputDir+cvPart+'/'+dataDir+'_syntheticSentences'
        bashDir = rootDir+'bashScratch'
        repoDir = os.getcwd()

        Path(currOutputDir).mkdir(parents=True, exist_ok=True)
        Path(bashDir).mkdir(parents=True, exist_ok=True)

        args = {}
        args['nSentences'] = 64
        args['nSteps'] = nSteps
        args['binSize'] = 2
        args['wordListFile'] = repoDir+'/wordList/google-10000-english-usa.txt' #from https://github.com/first20hours/google-10000-english
        args['rareWordFile'] = repoDir+'/wordList/rareWordIdx.mat'
        args['snippetFile'] = rootDir + 'RNNTrainingSteps/Step3_SyntheticSentences/'+cvPart+'/'+dataDir+'_snippets.mat'
        args['accountForPenState'] = 1
        args['charDef'] = getHandwritingCharacterDefinitions()
        args['seed'] = datetime.datetime.now().microsecond
        
        with open('/scratch/users/stfan/handwritingBCIData/webTextSentences.txt', 'r') as f:
            sentences = [l.strip() for l in f.readlines()]
        args['sentenceList'] = sentences

        argList = []
        for x in range(80):
            newArgs = args.copy()
            newArgs['saveFile'] = currOutputDir+'/bat_'+str(x)+'.tfrecord'
            newArgs['seed'] += x
            argList.append(newArgs)
            
        futures = []
        with ProcessPoolExecutor(nParallelProcesses) as p:
            for arg in argList:
                futures.append(p.submit(generateCharacterSequences, arg))

        results = [f.result() for f in futures]
        print(results)


Processing t5.2019.05.08
--HeldOutBlocks
[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
Processing t5.2019.12.09
--HeldOutBlocks
[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None

In [7]:
import pathlib
from glob import glob

dataDir = '/scratch/users/stfan/handwritingBCIData/RNNTrainingSteps/Step3_SyntheticSentences_tf2/HeldOutBlocks/t5.2019.11.25_syntheticSentences/'
files = glob(dataDir + '*.tfrecord')
# print(files)
dataset = tf.data.TFRecordDataset(files)

nInputFeatures = 192
nClasses = 31
maxSeqElements = 500

datasetFeatures = {"inputFeatures": tf.io.FixedLenSequenceFeature([nInputFeatures], tf.float32, allow_missing=True),
#             "classLabelsOneHot": tf.io.FixedLenSequenceFeature([nClasses], tf.float32, allow_missing=True),
            "newClassSignal": tf.io.FixedLenSequenceFeature([], tf.float32, allow_missing=True),
            "ceMask": tf.io.FixedLenSequenceFeature([], tf.float32, allow_missing=True),
            "seqClassIDs": tf.io.FixedLenFeature((maxSeqElements), tf.int64),
            "nTimeSteps": tf.io.FixedLenFeature((), tf.int64),
            "nSeqElements": tf.io.FixedLenFeature((), tf.int64),
            "transcription": tf.io.FixedLenFeature((maxSeqElements), tf.int64)}

def parseDatasetFunction(exampleProto):
    return tf.io.parse_single_example(exampleProto, datasetFeatures)

dataset = dataset.map(parseDatasetFunction).shuffle(10)
# dataset = dataset.padded_batch(1)
datIter = iter(dataset)

for i, d in enumerate(datIter):
    print(d['inputFeatures'], d['inputFeatures'].shape)
    print(d['nSeqElements'])
    print(tf.reduce_min(d['seqClassIDs'][:d['nSeqElements']]))
    break


tf.Tensor(
[[ 0.89646584 -0.30436826  0.41243026 ... -0.5551781   0.80052704
   0.7129129 ]
 [-0.27373    -0.30436826 -0.38925537 ...  0.07325782 -0.25691307
   0.7129129 ]
 [ 0.89646584 -0.30436826  0.41243026 ... -0.5551781  -0.25691307
  -0.65961415]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]], shape=(6938, 192), dtype=float32) (6938, 192)
tf.Tensor(66, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
