In [1]:
from keras.models import Model, load_model
from keras.layers import *
from keras.callbacks import *
from keras.optimizers import Adam
from keras import backend as K
import json
import pickle
import os
import cv2
import re
import itertools
import glob
from sklearn.utils import shuffle
from tqdm import tqdm
import numpy as np
import itertools
from unicodedata import normalize
import keras

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
PATH_TRAIN = "/home/pham.huu.quang/PycharmProjects/OCR/ICDAR/crop_linetext/b-mod_lines/train.easy"
PATH_VALID = "/home/pham.huu.quang/PycharmProjects/OCR/ICDAR/crop_linetext/b-mod_lines/valid.easy"
PATH_TEST = "/home/pham.huu.quang/PycharmProjects/OCR/ICDAR/crop_linetext/b-mod_lines/test.easy"

In [3]:
def get_labels(path_data=PATH_TRAIN):
    with open(path_data, "r") as f_r:
        lines = f_r.readlines()

    character = []
    for line in lines:
        line = line.strip()
        line = normalize("NFC", line)
        line = line.split(" ", 1)[1]
        line = re.sub("\s+", "", line)
        character.extend(line)

    character = list(set(character))
    character = sorted(character)
    character = "".join(character)
    
    character = " " + character
    
    return character


character_list = get_labels()

FileNotFoundError: [Errno 2] No such file or directory: '/home/pham.huu.quang/PycharmProjects/OCR/ICDAR/crop_linetext/b-mod_lines/train.easy'

In [4]:
(character_list) = ' !"#$%\'()*+,-./0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz{|}~§©°é'

In [5]:
def text_to_labels(text):
    text = normalize("NFC", text)
    text = re.sub("\s+", " ", text)
    return list(map(lambda x: character_list.index(x), text))


def labels_to_text(label):
    return ''.join(list(map(lambda x: character_list[x] if x < len(character_list) else "", label)))


def ctc_loss(args):
    y_true, y_pred, input_length, label_length = args
    # two first steps are often garbage
    y_pred = y_pred[:, 2:, :]
    return K.ctc_batch_cost(y_true, y_pred, input_length, label_length)


def decode_batch(out):
    ret = []
    for j in range(out.shape[0]):
        out_best = list(np.argmax(out[j, 2:], 1))
        out_best = [k for k, g in itertools.groupby(out_best)]
        outstr = labels_to_text(out_best)
        ret.append(outstr)
    return ret

In [6]:
NO_CLASSES = len(character_list) + 1
BATCH_SIZE = 12
IMAGE_HEIGHT = 32
NO_CHANNEL = 1

In [7]:
FILTER_SIZE_1 = 6
FILTER_SIZE_2 = 5
FILTER_SIZE_3 = 4
FILTER_SIZE_4 = 3
STRIDE = 4

In [8]:
class VizCallback(keras.callbacks.Callback):
    def __init__(self, y_func, text_img_gen, text_size, num_display_words=20):
        super().__init__()
        self.y_func = y_func
        self.text_img_gen = text_img_gen
        self.num_display_words = num_display_words
        self.text_size = text_size


    def on_epoch_end(self, epoch, logs={}):
        batch = next(self.text_img_gen.next_val(
            self.text_img_gen.img_names_val, self.text_img_gen.texts_val))[0]
        inputs = batch['the_inputs'][:self.num_display_words]
        labels = batch['the_labels'][:self.num_display_words].astype(np.int32)
        labels = [labels_to_text(label) for label in labels]
        pred = self.y_func([inputs])[0]
        pred_texts = decode_batch(pred)
        for i in range(min(self.num_display_words, len(inputs))):
            print("label: {} - predict: {}".format(labels[i], pred_texts[i]))

In [9]:
len(character_list)

93

In [10]:
class DataGenerator():
    def __init__(self, train_image_list, val_image_list, batch_size=BATCH_SIZE):
        self.train_image_list = shuffle(train_image_list)
        self.val_image_list = shuffle(val_image_list)
        self.batch_size = batch_size
        self.current_train_index = 0
        self.current_val_index = 0

    def compute_time_step(self, image_width):
        tmp = image_width
        for i in range(2):
            tmp = (tmp-1) // 2 + 1
        tmp = (tmp + STRIDE//4 - 1) // (STRIDE // 4)
        return tmp

    def load_image(self, image_path):
        image = cv2.imread(image_path, 0)
        ratio = image.shape[0] / IMAGE_HEIGHT
        image = cv2.resize(image, (int(image.shape[1]/ratio), IMAGE_HEIGHT))
        image = image / 255.
        image = np.expand_dims(image, axis=-1)
        return image

    def get_batch(self, partition='train'):
        if partition == 'train':
            temp_image_list = self.train_image_list[self.current_train_index:
                                                    self.current_train_index+self.batch_size]
        else:
            temp_image_list = self.val_image_list[self.current_val_index:
                                                  self.current_val_index+self.batch_size]
        image_array = []
        label_array = []
        for ind in range(self.batch_size):
            
            path_img_ind = temp_image_list[ind].split(" ", 1)[0]
            path_img_ind = os.path.join("../crop_linetext/b-mod_lines/lines", path_img_ind)
            
            image_array.append(self.load_image(path_img_ind))
            
            true_label = temp_image_list[ind].split(" ", 1)[1].strip()
            true_label = normalize("NFC", true_label)
            label_array.append(true_label)
            
        max_image_width = max([m.shape[1] for m in image_array])
        max_label_length = max(len(m) for m in label_array)
        input_image = np.ones(
            (self.batch_size, IMAGE_HEIGHT, max_image_width, NO_CHANNEL))
        input_true_label = np.ones(
            (self.batch_size, max_label_length)) * (NO_CLASSES-1)
        input_time_step = np.zeros((self.batch_size, 1))
        input_label_length = np.zeros((self.batch_size, 1))
        for ind in range(self.batch_size):
            real_width = image_array[ind].shape[1]
            real_label_len = len(label_array[ind])
            tmp = text_to_labels(label_array[ind])
            input_image[ind, :, :real_width, :] = image_array[ind]
            input_true_label[ind, :real_label_len] = tmp
            input_time_step[ind] = self.compute_time_step(real_width) - 2
            input_label_length[ind] = real_label_len
        inputs = {
            'input_image': input_image,
            'input_true_label': input_true_label,
            'input_time_step': input_time_step,
            'input_label_length': input_label_length}
        outputs = {'ctc': np.zeros((self.batch_size))}
        return (inputs, outputs)

    def next_train(self):
        while True:
            tmp = self.get_batch('train')
            self.current_train_index += self.batch_size
            if self.current_train_index >= len(self.train_image_list) - self.batch_size:
                self.train_image_list = shuffle(self.train_image_list)
                self.current_train_index = 0
            yield tmp

    def next_val(self):
        while True:
            tmp = self.get_batch('val')
            self.current_val_index += self.batch_size
            if self.current_val_index >= len(self.val_image_list) - self.batch_size:
                self.val_image_list = shuffle(self.val_image_list)
                self.current_val_index = 0
            yield tmp

In [11]:
from sklearn.model_selection import train_test_split

with open(PATH_TRAIN, "r") as f_r:
    train_image_list = f_r.readlines()

with open(PATH_VALID, "r") as f_r:
    val_image_list = f_r.readlines()

print(len(train_image_list), len(val_image_list))

data_gen = DataGenerator(train_image_list, val_image_list)

292738 38425


In [12]:
next(data_gen.next_train())[0]["input_image"].shape

(12, 32, 824, 1)

In [13]:
len(val_image_list), len(train_image_list)

(38425, 292738)

In [14]:
def squeeze_layer(arr, axis=1):
    return K.squeeze(arr, axis)
IMAGE_HEIGHT = 50
def model():
    rate = 0.0025
    input_image = Input(
        shape=(IMAGE_HEIGHT, None, NO_CHANNEL), name='input_image')
    input_true_label = Input(shape=(None,), name='input_true_label')
    input_time_step = Input(shape=(1,), name='input_time_step')
    input_label_length = Input(shape=(1,), name='input_label_length')

    temp = BatchNormalization()(input_image)
#     temp = SpatialDropout2D(rate)(temp)
    temp = Conv2D(filters=32, kernel_size=(3, 3), padding='same')(temp)
    temp = BatchNormalization()(temp)
    temp = Activation('elu')(temp)
    temp = MaxPool2D(pool_size=(3, 3), strides=(2, 2), padding='same')(temp)

    temp = Conv2D(filters=32, kernel_size=(3, 3), strides=(1, 1), padding='same')(temp)
    temp = BatchNormalization()(temp)
    temp = Activation('elu')(temp)
    temp = MaxPool2D(pool_size=(3, 3), strides=(2, 2), padding='same')(temp)

    temp_1 = Conv2D(filters=16, kernel_size=(12, FILTER_SIZE_1),
                  strides=(1, 1), padding='same')(temp)
    temp_1 = MaxPool2D(pool_size=(6, 1))(temp_1)
    temp_2 = Conv2D(filters=16, kernel_size=(12, FILTER_SIZE_2),
                  strides=(1, 1), padding='same')(temp)
    temp_2 = MaxPool2D(pool_size=(6, 1))(temp_2)
    temp_3 = Conv2D(filters=16, kernel_size=(12, FILTER_SIZE_3),
                  strides=(1, 1), padding='same')(temp)
    temp_3 = MaxPool2D(pool_size=(6, 1))(temp_3)
    temp_4 = Conv2D(filters=16, kernel_size=(12, FILTER_SIZE_4),
                  strides=(1, 1), padding='same')(temp)
    temp_4 = MaxPool2D(pool_size=(6, 1))(temp_4)
    temp = concatenate([temp_1, temp_2, temp_3, temp_4], axis=-1)
    
    temp = Reshape([1,-1,16*2])(temp)
    
    temp = BatchNormalization()(temp)
    temp = Activation('elu')(temp)
    temp = Lambda(squeeze_layer)(temp)

    gru_1 = GRU(units=64, return_sequences=True)(temp)
    gru_1 = BatchNormalization()(gru_1)
    gru_1 = Activation('elu')(gru_1)

    dense = TimeDistributed(Dense(units=NO_CLASSES))(gru_1)
    dense = Activation('softmax')(dense)
    loss_out = Lambda(ctc_loss, output_shape=(1,), name='ctc')(
        [input_true_label, dense, input_time_step, input_label_length])
    model = Model([input_image, input_true_label,
                   input_time_step, input_label_length], loss_out)
    print (model.summary())
    y_func = K.function([input_image], [dense])
    return model, y_func

In [15]:
model, y_func = model()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_image (InputLayer)        (None, 50, None, 1)  0                                            
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 50, None, 1)  4           input_image[0][0]                
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 50, None, 32) 320         batch_normalization_1[0][0]      
__________________________________________________________________________________________________
batch_normalization_2 (BatchNor (None, 5

In [16]:
from keras.utils import plot_model
plot_model(model, to_file='model2.png', show_shapes=True)

29.716666666666665

# Attention

In [21]:
from keras_self_attention import SeqSelfAttention

def squeeze_layer(arr, axis=1):
    return K.squeeze(arr, axis)
IMAGE_HEIGHT = 50
def model():
    rate = 0.0025
    input_image = Input(
        shape=(IMAGE_HEIGHT, None, NO_CHANNEL), name='input_image')
    input_true_label = Input(shape=(None,), name='input_true_label')
    input_time_step = Input(shape=(1,), name='input_time_step')
    input_label_length = Input(shape=(1,), name='input_label_length')

    temp = BatchNormalization()(input_image)
    temp = SpatialDropout2D(rate)(temp)
    temp = Conv2D(filters=32, kernel_size=(3, 3), padding='same')(temp)
    temp = BatchNormalization()(temp)
    temp = Activation('elu')(temp)
    temp = MaxPool2D(pool_size=(3, 3), strides=(2, 2), padding='same')(temp)

    temp = Conv2D(filters=32, kernel_size=(3, 3), strides=(1, 1), padding='same')(temp)
    temp = BatchNormalization()(temp)
    temp = Activation('elu')(temp)
    temp = MaxPool2D(pool_size=(3, 3), strides=(2, 2), padding='same')(temp)

    temp_1 = Conv2D(filters=16, kernel_size=(12, FILTER_SIZE_1),
                  strides=(1, 1), padding='same')(temp)
    temp_1 = MaxPool2D(pool_size=(12, 1))(temp_1)
    temp_2 = Conv2D(filters=16, kernel_size=(12, FILTER_SIZE_2),
                  strides=(1, 1), padding='same')(temp)
    temp_2 = MaxPool2D(pool_size=(12, 1))(temp_2)
    temp_3 = Conv2D(filters=16, kernel_size=(12, FILTER_SIZE_3),
                  strides=(1, 1), padding='same')(temp)
    temp_3 = MaxPool2D(pool_size=(12, 1))(temp_3)
    temp_4 = Conv2D(filters=16, kernel_size=(12, FILTER_SIZE_4),
                  strides=(1, 1), padding='same')(temp)
    temp_4 = MaxPool2D(pool_size=(12, 1))(temp_4)
    temp = concatenate([temp_1, temp_2, temp_3, temp_4], axis=-1)
    
    temp = BatchNormalization()(temp)
    temp = Activation('elu')(temp)
    temp = Lambda(squeeze_layer)(temp)
    
#     e = Dense(64, activation="tanh", name="attention1")(temp)
#     energies = Dense(64, activation="relu",name="attention2")(e)
#     attention_weights = Activation('softmax', name="attention3")(energies)
#     temp = Dot(axes=1, name="attention4")([attention_weights,
#                            temp])
    temp = SeqSelfAttention(attention_activation='sigmoid')(temp)
    print(temp.shape)

    gru_1 = GRU(units=64, return_sequences=True)(temp)
    gru_1 = BatchNormalization()(gru_1)
    gru_1 = Activation('elu')(gru_1)

    dense = TimeDistributed(Dense(units=NO_CLASSES))(gru_1)
    dense = Activation('softmax')(dense)
    loss_out = Lambda(ctc_loss, output_shape=(1,), name='ctc')(
        [input_true_label, dense, input_time_step, input_label_length])
    model = Model([input_image, input_true_label,
                   input_time_step, input_label_length], loss_out)
    print (model.summary())
    y_func = K.function([input_image], [dense])
    return model, y_func

In [22]:
model, y_func = model()

(?, ?, 64)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_image (InputLayer)        (None, 50, None, 1)  0                                            
__________________________________________________________________________________________________
batch_normalization_11 (BatchNo (None, 50, None, 1)  4           input_image[0][0]                
__________________________________________________________________________________________________
spatial_dropout2d_2 (SpatialDro (None, 50, None, 1)  0           batch_normalization_11[0][0]     
__________________________________________________________________________________________________
conv2d_13 (Conv2D)              (None, 50, None, 32) 320         spatial_dropout2d_2[0][0]        
__________________________________________________________________________________________________

In [23]:
vis = VizCallback(y_func, data_gen.next_val(), 100)
earlystop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='min')

In [24]:
optimizer = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-9,
                 decay=1e-6, amsgrad=True, clipnorm=5., clipvalue=0.5)
MODEL_PATH = 'pre-trained2/epoch_{epoch}_{loss:.5f}_{val_loss:.5f}.h5'
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)

step_val = len(val_image_list) // BATCH_SIZE
step_train = len(train_image_list) // BATCH_SIZE // 2

checkpointer = ModelCheckpoint(filepath=MODEL_PATH, save_best_only=True, verbose=1)
# early = EarlyStopping(patience=5)
model.fit_generator(generator=data_gen.next_train(), steps_per_epoch=step_train, epochs=200, verbose=1,
    callbacks=[checkpointer, earlystop], validation_data=data_gen.next_val(), validation_steps=step_val)

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/200
    2/12197 [..............................] - ETA: 13:08:03 - loss: 605.6382

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/pham.huu.quang/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-24-4177be79885f>", line 12, in <module>
    callbacks=[checkpointer, earlystop], validation_data=data_gen.next_val(), validation_steps=step_val)
  File "/home/pham.huu.quang/anaconda3/lib/python3.7/site-packages/keras/legacy/interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "/home/pham.huu.quang/anaconda3/lib/python3.7/site-packages/keras/engine/training.py", line 1418, in fit_generator
    initial_epoch=initial_epoch)
  File "/home/pham.huu.quang/anaconda3/lib/python3.7/site-packages/keras/engine/training_generator.py", line 217, in fit_generator
    class_weight=class_weight)
  File "/home/pham.huu.quang/anaconda3/lib/python3.7/site-packages/keras/engine/training.py", line 1217, in train_on_batch
    outputs = self.train_func

KeyboardInterrupt: 

## Test accuracy

In [8]:
import shutil
from keras.models import Model, load_model
from keras.layers import *
from keras.callbacks import *
from keras.optimizers import Adam
from keras import backend as K
import json
import pickle
import os
import cv2
import re
import itertools
import glob
from sklearn.utils import shuffle
from tqdm import tqdm
import numpy as np
import itertools
from unicodedata import normalize
import keras

In [9]:
model = load_model('pre-trained2/epoch_64_0.02831_0.00532.h5', compile=False)

OSError: Unable to open file (unable to open file: name = 'pre-trained2/epoch_64_0.02831_0.00532.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [None]:
new_model.summary()

In [None]:
input_layer = model.inputs[0]
output_layer = model.layers[-4].output
new_model = Model(input_layer, output_layer)

In [None]:
def load_image(image_path):
    image = cv2.imread(image_path, 0)
    ratio = image.shape[0] / IMAGE_HEIGHT
    image = cv2.resize(image, (int(image.shape[1]/ratio), IMAGE_HEIGHT))
#     image = image / 255.
#     image = np.expand_dims(image, axis=-1)
#     image = np.expand_dims(image, axis=0)
    return image

IMAGE_HEIGHT = 50
character_list = ' 1453702869'
# character_list = " 9570128364"
def validate_folder(file_list, new_model):
    import re
    import matplotlib.pyplot as plt
    true_ocr = 0
#     fw = open('wrong_ocr.txt', 'w')
#     text = ''
    for f in tqdm(file_list):
        true_label = f.split('/')[-1].split('_')[0]
        true_label = re.sub('[,円月 ]', '', true_label)
        image_ori = load_image(f)
        image = image_ori / 255.
        image = np.expand_dims(image, axis=-1)
        image = np.expand_dims(image, axis=0)
        result = np.squeeze(new_model.predict(image))[2:, :]
        result = np.argmax(result, axis=-1)
        result = [k for k, g in itertools.groupby(result)]
        result = labels_to_text(result)
#         print((result))
#         result = [str(k) for k, _ in itertools.groupby(result) if k < 10]
#         result = ''.join(result)
        if result == true_label:
            true_ocr += 1
        else:
            shutil.move(f, f.replace('val', 'val_verify'))
            pass
#             tmp_dir = os.path.join('/home/phamhoanganh/Documents/ocr_ctc/data_ocr_real_verify', true_label)
#             if not os.path.exists(tmp_dir):
#                 os.mkdir(tmp_dir)
# #             print (f)
#             text += f + '\n'
#             text += true_label + '  /  ' + result + '\n'
#             text += '_' * 100 + '\n'
#             plt.figure()
#             plt.imshow(image_ori, cmap='gray')
#             plt.show()
#             print (f[20:], ' __ '.join([result, true_label]))
#     fw.write(text)
#     fw.close()
    print (true_ocr, ' / ', len(file_list))

In [None]:
file_list = sorted(glob.glob('/home/phamhoanganh/Documents/ocr_ctc/data_ocr/val/*'))
validate_folder(file_list, new_model)

In [None]:
13835/13859

In [None]:
16056/16091

In [None]:
character_list

# prediction

In [10]:
import shutil
from keras.models import Model, load_model
from keras.layers import *
from keras.callbacks import *
from keras.optimizers import Adam
from keras import backend as K
import json
import pickle
import os
import cv2
import re
import itertools
import glob
from sklearn.utils import shuffle
from tqdm import tqdm
import numpy as np
import itertools
from unicodedata import normalize
import keras
import re

In [11]:
model = load_model('/media/SUN-ASTERISK\pham.huu.quang/618a37c1-5b71-4a6f-be41-e2b2fdb03876/PycharmProject/OCR/ICDAR/src/state-of-the-art/easy_11_0.60781_0.42893.h5', compile=False)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.


In [12]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_image (InputLayer)        (None, 50, None, 1)  0                                            
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 50, None, 1)  4           input_image[0][0]                
__________________________________________________________________________________________________
spatial_dropout2d_1 (SpatialDro (None, 50, None, 1)  0           batch_normalization_1[0][0]      
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 50, None, 32) 320         spatial_dropout2d_1[0][0]        
__________________________________________________________________________________________________
batch_norm

In [13]:
input_layer = model.inputs[0]
output_layer = model.layers[-4].output
new_model = Model(input_layer, output_layer)

In [41]:
def load_image(image_path):
    image = cv2.imread(image_path, 0)
    ratio = image.shape[0] / IMAGE_HEIGHT
    image = cv2.resize(image, (int(image.shape[1]/ratio), IMAGE_HEIGHT))
#     image = image / 255.
    print(image.shape)
#     image = np.expand_dims(image, axis=-1)
#     image = np.expand_dims(image, axis=0)
    return image

IMAGE_HEIGHT = 50
# character_list = " 9570128364"

def labels_to_text(label):
    return ''.join(list(map(lambda x: character_list[x] if x < len(character_list) else "", label)))

def prediction(f, new_model):
    image_ori = load_image(f)
    image = image_ori / 255.
    image = np.expand_dims(image, axis=-1)
    image = np.expand_dims(image, axis=0)
    result = np.squeeze(new_model.predict(image))[2:, :]
    result = np.argmax(result, axis=-1)
    result = [k for k, g in itertools.groupby(result)]
    result = labels_to_text(result)
        
    return result

In [43]:
import datetime
t1 = (datetime.datetime.now())
prediction("/media/SUN-ASTERISK\pham.huu.quang/618a37c1-5b71-4a6f-be41-e2b2fdb03876/PycharmProject/OCR/ICDAR/crop_linetext/b-mod_lines/lines/0a0c9f9b0bdc29a9b41e6188d666e7a8.jpg_rec_l0004.jpg", new_model)
t2 = (datetime.datetime.now())
print((t2 - t1).total_seconds()*1000, "ms")

(50, 692)
96.986 ms


In [17]:
prediction("../crop_linetext/b-mod_lines/lines/b00380517346b2e8f30c7c77b0568f90.jpg_rec_l0004.jpg", new_model)

'We wovk in the category of schemes or more generally in the category of schemes'

0.012199

In [13]:
list_file_test = glob.glob("/home/phamhoanganh/Documents/ocr_quang/DemoCTC/ocr/*/*")

In [None]:
for file in list_file_test:
    true_label = file.split("/")[-2]
    try:
        pre_label = prediction(file, new_model)

        if true_label == pre_label:
            os.remove(file)
    except:
        pass

In [4]:
PATH_TRAIN_EASY = "../crop_linetext/b-mod_lines/train.easy"
PATH_VALID_EASY = "../crop_linetext/b-mod_lines/valid.easy"
PATH_TEST_EASY = "../crop_linetext/b-mod_lines/test.easy"

PATH_TRAIN_MEDIUM = "../crop_linetext/b-mod_lines/train.medium"
PATH_VALID_MEDIUM = "../crop_linetext/b-mod_lines/valid.medium"
PATH_TEST_MEDIUM = "../crop_linetext/b-mod_lines/test.medium"

PATH_TRAIN_HARD = "../crop_linetext/b-mod_lines/train.hard"
PATH_VALID_HARD = "../crop_linetext/b-mod_lines/valid.hard"
PATH_TEST_HARD = "../crop_linetext/b-mod_lines/test.hard"

In [10]:
def get_labels(path_data=PATH_TRAIN_EASY):
    with open(path_data, "r") as f_r:
        lines = f_r.readlines()

    character = []
    for line in lines:
        line = line.strip()
        line = normalize("NFC", line)
        line = line.split(" ", 1)[1]
        line = re.sub("\s+", "", line)
        character.extend(line)

    character = list(set(character))
    character = sorted(character)
    character = "".join(character)
    
    character = " " + character
    
    return character


character_list = get_labels(PATH_TRAIN_EASY) + get_labels(PATH_TRAIN_MEDIUM) + get_labels(PATH_TRAIN_HARD)
character_list = list(set(character_list))
character_list = sorted(character_list)
character_list = "".join(character_list)
character_list = character_list.replace(" ", "")
character_list = " " + character_list
print(character_list)

 !"#$%'()*+,-./0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]_abcdefghijklmnopqrstuvwxyz{|}~£§©°é€


In [19]:
from tqdm import tqdm
submit_result = []

with open(PATH_TEST_EASY, "r") as f_r:
    lines = f_r.read().strip().split("\n")
    for line in tqdm(lines):
        line = line.strip()
        submit_result.append(line + " " + prediction("../crop_linetext/b-mod_lines/lines/" + line, new_model))

100%|██████████| 40447/40447 [27:19<00:00, 24.68it/s] 


In [20]:
with open(PATH_TEST_MEDIUM, "r") as f_r:
    lines = f_r.read().strip().split("\n")
    for line in tqdm(lines):
        line = line.strip()
        submit_result.append(line + " " + prediction("../crop_linetext/b-mod_lines/lines/" + line, new_model))

100%|██████████| 16411/16411 [11:31<00:00, 23.73it/s]


In [21]:
with open(PATH_TEST_HARD, "r") as f_r:
    lines = f_r.read().strip().split("\n")
    for line in tqdm(lines):
        line = line.strip()
        submit_result.append(line + " " + prediction("../crop_linetext/b-mod_lines/lines/" + line, new_model))

100%|██████████| 2707/2707 [01:43<00:00, 26.19it/s]


In [25]:
len(submit_result) == (40447 + 2707 + 16411)

True

In [27]:
with open("test.txt", "w") as f_w:
    for line in submit_result:
        f_w.write(line + "\n")

# Tesseract

In [1]:
import pytesseract

In [2]:
config = ("-l eng --oem 1 --psm 7")

In [5]:
from tqdm import tqdm
submit_result = []

with open(PATH_TEST_EASY, "r") as f_r:
    lines = f_r.read().strip().split("\n")
    for line in tqdm(lines):
        line = line.strip()
        submit_result.append(line + " " + pytesseract.image_to_string("../crop_linetext/b-mod_lines/lines/" + line, config=config))

100%|██████████| 40447/40447 [2:03:42<00:00,  6.18it/s]  


In [10]:
with open(PATH_TEST_MEDIUM, "r") as f_r:
    lines = f_r.read().strip().split("\n")
    for line in tqdm(lines):
        line = line.strip()
        submit_result.append(line + " " + pytesseract.image_to_string("../crop_linetext/b-mod_lines/lines/" + line, config=config))

100%|██████████| 16411/16411 [52:43<00:00,  6.20it/s]  


In [8]:
with open(PATH_TEST_HARD, "r") as f_r:
    lines = f_r.read().strip().split("\n")
    for line in tqdm(lines):
        line = line.strip()
        submit_result.append(line + " " + pytesseract.image_to_string("../crop_linetext/b-mod_lines/lines/" + line, config=config))

100%|██████████| 2707/2707 [07:43<00:00,  6.47it/s]


In [12]:
with open("test_tesseract.txt", "w") as f_w:
    for line in submit_result:
        f_w.write(line + "\n")

In [11]:
len(submit_result)

59565