# CTC OMR model

## Imports

In [1]:
import numpy as np
import cv2
import random
import tensorflow.compat.v1 as tf
from tensorflow.python.framework import ops
from tensorflow.python.ops import math_ops
import argparse
import os

In [2]:
tf.__version__

'2.1.0'

In [3]:
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [6]:
tf.config.experimental.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Utils

In [7]:
def convert_inputs_to_ctc_format(target_text):
    SPACE_TOKEN = '-'
    SPACE_INDEX = 4
    FIRST_INDEX = 0

    original = ' '.join(target_text.strip().lower().split(' ')).replace('.', '').replace('?', '').replace(',', '').replace("'", '').replace('!', '').replace('-', '')
    print(original)
    targets = original.replace(' ', '  ')
    targets = targets.split(' ')

    # Adding blank label
    targets = np.hstack([SPACE_TOKEN if x == '' else list(x) for x in targets])

    # Transform char into index
    targets = np.asarray([SPACE_INDEX if x == SPACE_TOKEN else ord(x) - FIRST_INDEX
                          for x in targets])

    # Creating sparse representation to feed the placeholder
    train_targets = sparse_tuple_from([targets])

    return train_targets, original

def sparse_tuple_from(sequences, dtype=np.int32):
    indices = []
    values = []

    for n, seq in enumerate(sequences):
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)

    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([len(sequences), np.asarray(indices).max(0)[1] + 1], dtype=np.int64)

    return indices, values, shape

def sparse_tensor_to_strs(sparse_tensor):
    indices= sparse_tensor[0][0]
    values = sparse_tensor[0][1]
    dense_shape = sparse_tensor[0][2]

    strs = [ [] for i in range(dense_shape[0]) ]

    string = []
    ptr = 0
    b = 0

    for idx in range(len(indices)):
        if indices[idx][0] != b:
            strs[b] = string
            string = []
            b = indices[idx][0]

        string.append(values[ptr])

        ptr = ptr + 1

    strs[b] = string

    return strs


def pad_sequences(sequences, maxlen=None, dtype=np.float32,
                  padding='post', truncating='post', value=0.):
    lengths = np.asarray([len(s) for s in sequences], dtype=np.int64)

    nb_samples = len(sequences)
    if maxlen is None:
        maxlen = np.max(lengths)

    # take the sample shape from the first non empty sequence
    # checking for consistency in the main loop below.
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break

    x = (np.ones((nb_samples, maxlen) + sample_shape) * value).astype(dtype)
    for idx, s in enumerate(sequences):
        if len(s) == 0:
            continue  # empty list was found
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen]
        else:
            raise ValueError('Truncating type "%s" not understood' % truncating)

        # check `trunc` has expected shape
        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError('Shape of sample %s of sequence at position %s is different from expected shape %s' %
                             (trunc.shape[1:], idx, sample_shape))

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return x, lengths


def word_separator():
    return '\t'

def levenshtein(a,b):
    "Computes the Levenshtein distance between a and b."
    n, m = len(a), len(b)

    if n > m:
        a,b = b,a
        n,m = m,n

    current = range(n+1)
    for i in range(1,m+1):
        previous, current = current, [i]+[0]*n
        for j in range(1,n+1):
            add, delete = previous[j]+1, current[j-1]+1
            change = previous[j-1]
            if a[j-1] != b[i-1]:
                change = change + 1
            current[j] = min(add, delete, change)

    return current[n]


def edit_distance(a,b,EOS=-1,PAD=-1):
    _a = [s for s in a if s != EOS and s != PAD]
    _b = [s for s in b if s != EOS and s != PAD]

    return levenshtein(_a,_b)


def normalize(image):
    return (255. - image)/255.


def resize(image, height):
    width = int(float(height * image.shape[1]) / image.shape[0])
    sample_img = cv2.resize(image, (width, height))
    return sample_img

## Primus functions

In [8]:
class CTC_PriMuS:
    gt_element_separator = '-'
    PAD_COLUMN = 0
    validation_dict = None


    def __init__(self, corpus_dirpath, corpus_filepath, dictionary_path, semantic, distortions = False, val_split = 0.0):
        self.semantic = semantic
        self.distortions = distortions
        self.corpus_dirpath = corpus_dirpath

        # Corpus
        corpus_file = open(corpus_filepath,'r')
        corpus_list = corpus_file.read().splitlines()
        corpus_file.close()

        self.current_idx = 0

        # Dictionary
        self.word2int = {}
        self.int2word = {}
            
        dict_file = open(dictionary_path,'r')
        dict_list = dict_file.read().splitlines()
        for word in dict_list:
            if not word in self.word2int:
                word_idx = len(self.word2int)
                self.word2int[word] = word_idx
                self.int2word[word_idx] = word

        dict_file.close()

        self.vocabulary_size = len(self.word2int)
        
        
        # Train and validation split
        random.shuffle(corpus_list) 
        val_idx = int(len(corpus_list) * val_split) 
        self.training_list = corpus_list[val_idx:]
        self.validation_list = corpus_list[:val_idx]
        
        print ('Training with ' + str(len(self.training_list)) + ' and validating with ' + str(len(self.validation_list)))

    def nextBatch(self, params):
        images = []
        labels = []

        # Read files
        for _ in range(params['batch_size']):
            sample_filepath = self.training_list[self.current_idx]
            sample_fullpath = self.corpus_dirpath + '/' + sample_filepath + '/' + sample_filepath
            # IMAGE
            if self.distortions:
                sample_img = cv2.imread(sample_fullpath + '_distorted.jpg', False) # Grayscale is assumed
            else:
                sample_img = cv2.imread(sample_fullpath + '.png', 0)  # Grayscale is assumed!
            height = params['img_height']
            sample_img = resize(sample_img,height)
            images.append(normalize(sample_img))

            # GROUND TRUTH
            if self.semantic:
                sample_full_filepath = sample_fullpath + '.semantic'
            else:
                sample_full_filepath = sample_fullpath + '.agnostic'
            
            sample_gt_file = open(sample_full_filepath, 'r')
            sample_gt_plain = sample_gt_file.readline().rstrip().split(word_separator())
            sample_gt_file.close()

            labels.append([self.word2int[lab] for lab in sample_gt_plain])

            self.current_idx = (self.current_idx + 1) % len( self.training_list )


        # Transform to batch
        image_widths = [img.shape[1] for img in images]
        max_image_width = max(image_widths)

        batch_images = np.ones(shape=[params['batch_size'],
                                       params['img_height'],
                                       max_image_width,
                                       params['img_channels']], dtype=np.float32)*self.PAD_COLUMN

        for i, img in enumerate(images):
            batch_images[i, 0:img.shape[0], 0:img.shape[1], 0] = img

        # LENGTH
        width_reduction = 1
        for i in range(params['conv_blocks']):
            width_reduction = width_reduction * params['conv_pooling_size'][i][1]

        lengths = [ batch_images.shape[2] / width_reduction ] * batch_images.shape[0]

        return {
            'inputs': batch_images,
            'seq_lengths': np.asarray(lengths),
            'targets': labels,
        }
        
    def getValidation(self, params):
        if self.validation_dict == None:    
            images = []
            labels = []
    
            # Read files
            for sample_filepath in self.validation_list:
                sample_fullpath = self.corpus_dirpath + '/' + sample_filepath + '/' + sample_filepath
                # IMAGE
                sample_img = cv2.imread(sample_fullpath + '.png', 0)  # Grayscale is assumed!
                height = params['img_height']
                sample_img = resize(sample_img,height)
                images.append(normalize(sample_img))
    
                # GROUND TRUTH
                if self.semantic:
                    sample_full_filepath = sample_fullpath + '.semantic'
                else:
                    sample_full_filepath = sample_fullpath + '.agnostic'
                sample_gt_file = open(sample_full_filepath, 'r')
            
                sample_gt_plain = sample_gt_file.readline().rstrip().split(word_separator())
                sample_gt_file.close()
    
                labels.append([self.word2int[lab] for lab in sample_gt_plain])
    
            # Transform to batch
            image_widths = [img.shape[1] for img in images]
            max_image_width = max(image_widths)
    
            batch_images = np.ones(shape=[len(self.validation_list),
                                           params['img_height'],
                                           max_image_width,
                                           params['img_channels']], dtype=np.float32)*self.PAD_COLUMN
    
            for i, img in enumerate(images):
                batch_images[i, 0:img.shape[0], 0:img.shape[1], 0] = img
    
            # LENGTH
            width_reduction = 1
            for i in range(params['conv_blocks']):
                width_reduction = width_reduction * params['conv_pooling_size'][i][1]
    
            lengths = [ batch_images.shape[2] / width_reduction ] * batch_images.shape[0]
    
            self.validation_dict = {
                'inputs': batch_images,
                'seq_lengths': np.asarray(lengths),
                'targets': labels,
            }
            
        return self.validation_dict, len(self.validation_list)


## Model

In [9]:
def leaky_relu(features, alpha=0.2, name=None):
    with ops.name_scope(name, "LeakyRelu", [features, alpha]):
        features = ops.convert_to_tensor(features, name="features")
        alpha = ops.convert_to_tensor(alpha, name="alpha")
        return math_ops.maximum(alpha * features, features)
#
# params["height"] = height of the input image
# params["width"] = width of the input image

def default_model_params(img_height, vocabulary_size):
    params = dict()
    params['img_height'] = img_height
    params['img_width'] = None
    params['batch_size'] = 16
    params['img_channels'] = 1
    params['conv_blocks'] = 4
    params['conv_filter_n'] = [32, 64, 128, 256]
    params['conv_filter_size'] = [ [3,3], [3,3], [3,3], [3,3] ]
    params['conv_pooling_size'] = [ [2,2], [2,2], [2,2], [2,2] ]
    params['rnn_units'] = 512
    params['rnn_layers'] = 2
    params['vocabulary_size'] = vocabulary_size
    return params


def ctc_crnn(params):
    # TODO Assert parameters

    input = tf.placeholder(shape=(None,
                                   params['img_height'],
                                   params['img_width'],
                                   params['img_channels']),  # [batch, height, width, channels]
                            dtype=tf.float32,
                            name='model_input')

    input_shape = tf.shape(input)

    width_reduction = 1
    height_reduction = 1


    # Convolutional blocks
    x = input
    for i in range(params['conv_blocks']):

        x = tf.layers.conv2d(
            inputs=x,
            filters=params['conv_filter_n'][i],
            kernel_size=params['conv_filter_size'][i],
            padding="same",
            activation=None)

        x = tf.layers.batch_normalization(x)
        x = leaky_relu(x)

        x = tf.layers.max_pooling2d(inputs=x,
                                    pool_size=params['conv_pooling_size'][i],
                                    strides=params['conv_pooling_size'][i])

        width_reduction = width_reduction * params['conv_pooling_size'][i][1]
        height_reduction = height_reduction * params['conv_pooling_size'][i][0]


    # Prepare output of conv block for recurrent blocks
    features = tf.transpose(x, perm=[2, 0, 3, 1])  # -> [width, batch, height, channels] (time_major=True)
    feature_dim = params['conv_filter_n'][-1] * (params['img_height'] / height_reduction)
    feature_width = input_shape[2] / width_reduction
    features = tf.reshape(features, tf.stack([tf.cast(feature_width,'int32'), input_shape[0], tf.cast(feature_dim,'int32')]))  # -> [width, batch, features]

    tf.constant(params['img_height'],name='input_height')
    tf.constant(width_reduction,name='width_reduction')

    # Recurrent block
    rnn_keep_prob = tf.placeholder(dtype=tf.float32, name="keep_prob")
    rnn_hidden_units = params['rnn_units']
    rnn_hidden_layers = params['rnn_layers']

    rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
        tf.contrib.rnn.MultiRNNCell(
            [tf.nn.rnn_cell.DropoutWrapper(tf.contrib.rnn.BasicLSTMCell(rnn_hidden_units), input_keep_prob=rnn_keep_prob)
             for _ in range(rnn_hidden_layers)]),
        tf.contrib.rnn.MultiRNNCell(
            [tf.nn.rnn_cell.DropoutWrapper(tf.contrib.rnn.BasicLSTMCell(rnn_hidden_units), input_keep_prob=rnn_keep_prob)
             for _ in range(rnn_hidden_layers)]),
        features,
        dtype=tf.float32,
        time_major=True,
    )

    rnn_outputs = tf.concat(rnn_outputs, 2)

    logits = tf.contrib.layers.fully_connected(
        rnn_outputs,
        params['vocabulary_size'] + 1,  # BLANK
        activation_fn=None,
    )
    
    tf.add_to_collection("logits",logits) # for restoring purposes

    # CTC Loss computation
    seq_len = tf.placeholder(tf.int32, [None], name='seq_lengths')
    targets = tf.sparse_placeholder(dtype=tf.int32, name='target')
    ctc_loss = tf.nn.ctc_loss(labels=targets, inputs=logits, sequence_length=seq_len, time_major=True)
    loss = tf.reduce_mean(ctc_loss)

    # CTC decoding
    decoded, log_prob = tf.nn.ctc_greedy_decoder(logits, seq_len)
    # decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits,seq_len,beam_width=50,top_paths=1,merge_repeated=True)

    return input, seq_len, targets, decoded, loss, rnn_keep_prob


## Training

In [10]:
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
tf.reset_default_graph()
sess = tf.InteractiveSession(config=config)

# parser = argparse.ArgumentParser(description='Train model.')
# parser.add_argument('-corpus', dest='corpus', type=str, required=True, help='Path to the corpus.')
# parser.add_argument('-set',  dest='set', type=str, required=True, help='Path to the set file.')
# parser.add_argument('-save_model', dest='save_model', type=str, required=True, help='Path to save the model.')
# parser.add_argument('-vocabulary', dest='voc', type=str, required=True, help='Path to the vocabulary file.')
# parser.add_argument('-semantic', dest='semantic', action="store_true", default=False)
# args = parser.parse_args()

# Load primus

# primus = CTC_PriMuS(args.corpus,args.set,args.voc, args.semantic, val_split = 0.1)
primus = CTC_PriMuS('./Data/primus/package_aa',
                    './Data/train_few.txt',
                    './Data/vocabulary_agnostic.txt', 
                    False, 
                    val_split = 0.1)

# Parameterization
img_height = 128
params = default_model_params(img_height,primus.vocabulary_size)
# max_epochs = 64000
max_epochs = 31
dropout = 0.5

# Model
inputs, seq_len, targets, decoded, loss, rnn_keep_prob = ctc_crnn(params)
train_opt = tf.train.AdamOptimizer().minimize(loss)


saver = tf.train.Saver(max_to_keep=None)
sess.run(tf.global_variables_initializer())

# Training loop
for epoch in range(max_epochs):
    print('epoch', epoch)
    batch = primus.nextBatch(params)

    _, loss_value = sess.run([train_opt, loss],
                             feed_dict={
                                inputs: batch['inputs'],
                                seq_len: batch['seq_lengths'],
                                targets: sparse_tuple_from(batch['targets']),
                                rnn_keep_prob: dropout,
                            })

    if epoch % 10 == 0:
        # VALIDATION
        print ('Loss value at epoch ' + str(epoch) + ':' + str(loss_value))
        print ('Validating...')
        validation_batch, validation_size = primus.getValidation(params)
        
        val_idx = 0
        
        val_ed = 0
        val_len = 0
        val_count = 0
            
        while val_idx < validation_size:
            mini_batch_feed_dict = {
                inputs: validation_batch['inputs'][val_idx:val_idx+params['batch_size']],
                seq_len: validation_batch['seq_lengths'][val_idx:val_idx+params['batch_size']],
                rnn_keep_prob: 1.0            
            }            
                        
            
            prediction = sess.run(decoded,
                                  mini_batch_feed_dict)
    
            str_predictions = sparse_tensor_to_strs(prediction)
    

            for i in range(len(str_predictions)):
                ed = edit_distance(str_predictions[i], validation_batch['targets'][val_idx+i])
                val_ed = val_ed + ed
                val_len = val_len + len(validation_batch['targets'][val_idx+i])
                val_count = val_count + 1
                
            val_idx = val_idx + params['batch_size']
    
        print ('[Epoch ' + str(epoch) + '] ' + str(1. * val_ed / val_count) + ' (' + str(100. * val_ed / val_len) + ' SER) from ' + str(val_count) + ' samples.')        
        print ('Saving the model...')
        saver.save(sess,'./saved_models/OMRmodel',global_step=epoch)
        print ('------------------------------')


Training with 113 and validating with 12
Instructions for updating:
Use `tf.keras.layers.Conv2D` instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use keras.layers.BatchNormalization instead.  In particular, `tf.control_dependencies(tf.GraphKeys.UPDATE_OPS)` should not be used (consult the `tf.keras.layers.BatchNormalization` documentation).
Instructions for updating:
Use keras.layers.MaxPooling2D instead.


AttributeError: module 'tensorflow_core.compat.v1' has no attribute 'contrib'

## Predict

In [11]:
# parser = argparse.ArgumentParser(description='Decode a music score image with a trained model (CTC).')
# parser.add_argument('-image',  dest='image', type=str, required=True, help='Path to the input image.')
# parser.add_argument('-model', dest='model', type=str, required=True, help='Path to the trained model.')
# parser.add_argument('-vocabulary', dest='voc_file', type=str, required=True, help='Path to the vocabulary file.')
# args = parser.parse_args()

# image = './Data/primus/package_aa/000051660-1_2_1/000051660-1_2_1.png'
img = './Data/test.png'
meta = './agnostic-model/agnostic_model.meta'
mod = './agnostic-model/agnostic_model'
voc_file = './Data/vocabulary_agnostic.txt'

tf.reset_default_graph()
sess = tf.InteractiveSession()

# Read the dictionary
dict_file = open(voc_file,'r')
dict_list = dict_file.read().splitlines()
int2word = dict()
for word in dict_list:
    word_idx = len(int2word)
    int2word[word_idx] = word
dict_file.close()

# Restore weights
saver = tf.train.import_meta_graph(meta)
saver.restore(sess,mod)

graph = tf.get_default_graph()

input = graph.get_tensor_by_name("model_input:0")
seq_len = graph.get_tensor_by_name("seq_lengths:0")
rnn_keep_prob = graph.get_tensor_by_name("keep_prob:0")
height_tensor = graph.get_tensor_by_name("input_height:0")
width_reduction_tensor = graph.get_tensor_by_name("width_reduction:0")
logits = tf.get_collection("logits")[0]

# Constants that are saved inside the model itself
WIDTH_REDUCTION, HEIGHT = sess.run([width_reduction_tensor, height_tensor])

decoded, _ = tf.nn.ctc_greedy_decoder(logits, seq_len)

image = cv2.imread(img,False)
image = resize(image, HEIGHT)
image = normalize(image)
image = np.asarray(image).reshape(1,image.shape[0],image.shape[1],1)

seq_lengths = [ image.shape[2] / WIDTH_REDUCTION ]

prediction = sess.run(decoded,
                      feed_dict={
                          input: image,
                          seq_len: seq_lengths,
                          rnn_keep_prob: 1.0,
                      })

str_predictions = sparse_tensor_to_strs(prediction)
for w in str_predictions[0]:
    print (int2word[w]),
    print ('\t'),


'model_variables' collection should be of type 'byte_list', but instead is of type 'node_list'.
INFO:tensorflow:Restoring parameters from ./agnostic-model/agnostic_model
clef.G-L2
	
accidental.flat-L3
	
digit.4-L4
	
digit.4-L2
	
note.beamedRight1-L4
	
note.beamedBoth1-S4
	
note.beamedBoth1-L5
	
note.beamedLeft1-S5
	
note.quarter-S4
	
note.beamedRight1-S3
	
note.beamedLeft2-L4
	
note.beamedLeft1-L4
	
barline-L1
	
slur.end-L4
	
note.whole-L4
	
barline-L1
	
