In [1]:
from pynvml import *

nvmlInit()
vram = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(0)).free/1024.**2
print('GPU0 Memory: %dMB' % vram)
if vram < 8000:
    raise Exception('GPU Memory too low')

GPU0 Memory: 10515MB


# model1 result

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

# 所有待识别字符
CHAR_VECTOR = "0123456789+-*()=a"
letters = [letter for letter in CHAR_VECTOR]

from keras import backend as K

def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    # the 2 is critical here since the first couple outputs of the RNN tend to be garbage:
    y_pred = y_pred[:, 2:, :]
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

from keras.layers import *
from keras.models import *

def get_model(img_w, img_h, num_classes, training):
    input_shape = (img_w, img_h, 1)  # (128, 64, 1)

    # Make Networkw
    inputs = Input(name='the_input', shape=input_shape, dtype='float32') # (None, 128, 64, 1)

    # Convolution layer (VGG)
    inner = Conv2D(64, (3, 3), padding='same', name='conv1', kernel_initializer='he_normal')(inputs)  # (None, 128, 64, 64)
    inner = BatchNormalization()(inner)
    inner = Activation('relu')(inner)
    inner = MaxPooling2D(pool_size=(2, 2), name='max1')(inner)  # (None,64, 32, 64)
#     inner = Dropout(0.2)(inner)

    inner = Conv2D(128, (3, 3), padding='same', name='conv2', kernel_initializer='he_normal')(inner)  # (None, 64, 32, 128)
    inner = BatchNormalization()(inner)
    inner = Activation('relu')(inner)
    inner = MaxPooling2D(pool_size=(2, 2), name='max2')(inner)  # (None, 32, 16, 128)
#     inner = Dropout(0.2)(inner)

    inner = Conv2D(256, (3, 3), padding='same', name='conv3', kernel_initializer='he_normal')(inner)  # (None, 32, 16, 256)
    inner = BatchNormalization()(inner)
    inner = Activation('relu')(inner)
    inner = Conv2D(256, (3, 3), padding='same', name='conv4', kernel_initializer='he_normal')(inner)  # (None, 32, 16, 256)
    inner = BatchNormalization()(inner)
    inner = Activation('relu')(inner)
    inner = MaxPooling2D(pool_size=(1, 2), name='max3')(inner)  # (None, 32, 8, 256)
#     inner = Dropout(0.2)(inner)

#     inner = Conv2D(512, (3, 3), padding='same', name='conv5', kernel_initializer='he_normal')(inner)  # (None, 32, 8, 512)
#     inner = BatchNormalization()(inner)
#     inner = Activation('relu')(inner)
#     inner = Conv2D(512, (3, 3), padding='same', name='conv6')(inner)  # (None, 32, 8, 512)
#     inner = BatchNormalization()(inner)
#     inner = Activation('relu')(inner)
#     inner = MaxPooling2D(pool_size=(1, 2), name='max4')(inner)  # (None, 32, 4, 512)
#     inner = Dropout(0.2)(inner)

    inner = Conv2D(256, (2, 2), padding='same', kernel_initializer='he_normal', name='con7')(inner)  # (None, 32, 4, 256)
    inner = BatchNormalization()(inner)
    inner = Activation('relu')(inner)

    # CNN to RNN
    inner = Reshape(target_shape=((32, 2048)), name='reshape')(inner)  # (None, 32, 2048)
    inner = Dense(64, activation='relu', kernel_initializer='he_normal', name='dense1')(inner)  # (None, 32, 64)

    # RNN layer
    lstm_1 = GRU(256, return_sequences=True, kernel_initializer='he_normal', name='lstm1')(inner)  # (None, 32, 512)
    lstm_1b = GRU(256, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='lstm1_b')(inner)
    lstm1_merged = add([lstm_1, lstm_1b])  # (None, 32, 512)
    lstm1_merged = BatchNormalization()(lstm1_merged)
    lstm_2 = GRU(256, return_sequences=True, kernel_initializer='he_normal', name='lstm2')(lstm1_merged)
    lstm_2b = GRU(256, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='lstm2_b')(lstm1_merged)
    lstm2_merged = concatenate([lstm_2, lstm_2b])  # (None, 32, 1024)
    lstm2_merged = Dropout(0.2)(lstm2_merged)
#     lstm_merged = BatchNormalization()(lstm2_merged)

    # transforms RNN output to character activations:
    inner = Dense(num_classes, kernel_initializer='he_normal', name='dense2')(lstm2_merged) #(None, 32, 63)
    y_pred = Activation('softmax', name='softmax')(inner)

    labels = Input(name='the_labels', shape=[max_text_len], dtype='float32') # (None ,8)
    input_length = Input(name='input_length', shape=[1], dtype='int64')     # (None, 1)
    label_length = Input(name='label_length', shape=[1], dtype='int64')     # (None, 1)

    # Keras doesn't currently support loss funcs with extra parameters so CTC loss is implemented in a lambda layer
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length]) #(None, 1)

    if training:
        return Model(inputs=[inputs, labels, input_length, label_length], outputs=loss_out)
    else:
        return Model(inputs=[inputs], outputs=y_pred)

Using TensorFlow backend.


In [3]:
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint
from keras.utils.vis_utils import plot_model

rootpath = 'G:/my_code/python/mathematical_expression_recognition/Mathematical_Expression_Recognition_train/'

img_w, img_h = 128, 64
batch_size = 256
downsample_factor = 4
max_text_len = 12
num_classes = len(letters) + 1
print(num_classes)
width, height, n_len, n_class = 128, 32, 11, len(letters)


18


# model2

In [4]:
def get_model2(width, height, n_class, n_len):
    rnn_size = 128

    input_tensor = Input((width, height, 1))# (128,64,1)
    x = input_tensor
    
    # VGG16 的结构
    # 1
    x = Conv2D(64, (3, 3), padding='same', kernel_initializer='he_normal')(x) # 加入 , padding='same'
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Conv2D(64, (3, 3), padding='same', kernel_initializer='he_normal')(x) # 加入 , padding='same'
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)
    # x = Dropout(0.2)(x) # 模型有过拟合的倾向，加入dropout层
    # 2
    x = Conv2D(128, (3, 3), padding='same', kernel_initializer='he_normal')(x) # 加入 , padding='same'
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Conv2D(128, (3, 3), padding='same', kernel_initializer='he_normal')(x) # 加入 , padding='same'
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)
    # x = Dropout(0.2)(x) # 模型有过拟合的倾向，加入dropout层
    # 3
    x = Conv2D(256, (3, 3), padding='same', kernel_initializer='he_normal')(x) # 加入 , padding='same'
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Conv2D(256, (3, 3), padding='same', kernel_initializer='he_normal')(x) # 加入 , padding='same'
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)
    # x = Dropout(0.2)(x) # 模型有过拟合的倾向，加入dropout层

    # CNN to RNN
    conv_shape = x.get_shape()
    x = Reshape(target_shape=(int(conv_shape[1]), int(conv_shape[2]*conv_shape[3])))(x) #  Flatten

    x = Dense(256, kernel_initializer='he_normal')(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)

    # RNN layer
    gru_1 = GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru1')(x)
    gru_1b = GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', 
                 name='gru1_b')(x)
    gru1_merged = add([gru_1, gru_1b])
    gru1_merged = BatchNormalization()(gru1_merged)

    gru_2 = GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru2')(gru1_merged)
    gru_2b = GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', 
                 name='gru2_b')(gru1_merged)
    x = concatenate([gru_2, gru_2b])
    x = Dropout(0.2)(x)
    x = Dense(n_class, kernel_initializer='he_normal', activation='softmax')(x)
    # 预测
    base_model = Model(input=input_tensor, output=x)
    
    return  base_model

# model3

In [5]:
def get_model3(width, height, n_class, n_len):
    rnn_size = 128

    input_tensor = Input((width, height, 1))# (128,64,1)
    x = input_tensor
    
    # VGG16 的结构
    # 1
    x = Conv2D(32, (3, 3), padding='same', kernel_initializer='he_normal')(x) # 加入 , padding='same'
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Conv2D(32, (3, 3), padding='same', kernel_initializer='he_normal')(x) # 加入 , padding='same'
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)
    # x = Dropout(0.2)(x) # 模型有过拟合的倾向，加入dropout层
    # 2
    x = Conv2D(64, (3, 3), padding='same', kernel_initializer='he_normal')(x) # 加入 , padding='same'
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Conv2D(64, (3, 3), padding='same', kernel_initializer='he_normal')(x) # 加入 , padding='same'
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)
    # x = Dropout(0.2)(x) # 模型有过拟合的倾向，加入dropout层
    # 3
    x = Conv2D(128, (3, 3), padding='same', kernel_initializer='he_normal')(x) # 加入 , padding='same'
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Conv2D(128, (3, 3), padding='same', kernel_initializer='he_normal')(x) # 加入 , padding='same'
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)
    # x = Dropout(0.2)(x) # 模型有过拟合的倾向，加入dropout层

    # CNN to RNN
    conv_shape = x.get_shape()
    x = Reshape(target_shape=(int(conv_shape[1]), int(conv_shape[2]*conv_shape[3])))(x) #  Flatten

    x = Dense(128, kernel_initializer='he_normal')(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)

    # RNN layer
    gru_1 = GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru1')(x)
    gru_1b = GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', 
                 name='gru1_b')(x)
    gru1_merged = add([gru_1, gru_1b])
    gru1_merged = BatchNormalization()(gru1_merged)

    gru_2 = GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru2')(gru1_merged)
    gru_2b = GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', 
                 name='gru2_b')(gru1_merged)
    x = concatenate([gru_2, gru_2b])
    x = Dropout(0.25)(x)
    x = Dense(n_class, kernel_initializer='he_normal', activation='softmax')(x)
    # 预测
    base_model = Model(input=input_tensor, output=x)
    
    return  base_model

In [6]:
test_str='1*(8-1)=1'
aaa = test_str.split('=')
print(aaa)
print(eval(aaa[0]))
print(eval(aaa[1]))
print(eval(aaa[0]) == eval(aaa[1]))

['1*(8-1)', '1']
7
1
False


In [7]:
import itertools
from collections import defaultdict
from tqdm import tqdm
import cv2
import string
digits = string.digits
operators = '+-*'
characters = digits + operators + '()='
characters+='a'#CTC用 占位符

data_csv = pd.read_csv('G:/my_code/python/mathematical_expression_recognition/Mathematical_Expression_Recognition_train/train.csv')
img_dirpath = data_csv['filename'].tolist()
label = data_csv['label'].tolist()
X_data, X_valid, y_data, y_valid = train_test_split(img_dirpath, label, test_size=10000, random_state=17)
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=10000, random_state=17)

def decode_label(out):
    out_best = list(np.argmax(out[0, 2:], axis=1))  # get max index
    out_best = [k for k, g in itertools.groupby(out_best)]  # remove overlap value
    outstr = ''
    for i in out_best:
        if i < len(letters):
            outstr += letters[i]
    return outstr

def decode_label2(out):
    out_best = list(np.argmax(out[0, 2:], axis=1))  # get max index
    out_best = [k for k, g in itertools.groupby(out_best)]  # remove overlap value
    outstr = ''.join([characters[x] for x in out_best if x > -1 and x < 16])
    return outstr

model_best = get_model(img_w, img_h, num_classes, False)
model_best.load_weights("best_weight.hdf5")

model_best2 = get_model2(width, height, n_class, n_len)
model_best2.load_weights("7.model_gru_best2.h5")

model_best3 = get_model3(width, height, n_class, n_len)
model_best3.load_weights("6.model_gru_best2_0.9833.h5")

total = 0
acc = 0
letter_total = 0
letter_acc = 0

result = defaultdict(str)

for i, img_file in tqdm(enumerate(X_test)):
    img = cv2.imread( rootpath+ img_file, cv2.IMREAD_GRAYSCALE)
    img_pred = img.astype(np.float32)
    img_pred1 = cv2.resize(img_pred, (img_w, img_h))
    img_pred1 = (img_pred1 / 255.0) * 2.0 - 1.0
    img_pred1 = img_pred1.T
    img_pred1 = np.expand_dims(img_pred1, axis=-1)
    img_pred1 = np.expand_dims(img_pred1, axis=0)
    net_out_value = model_best.predict(img_pred1)
    pred_texts = decode_label(net_out_value)
    try:
        #如果算式从计算结果上就不对，使用模型2预测
        tem_list = pred_texts.split('=')
        if eval(tem_list[0]) != eval(tem_list[1]):
            img_pred2 = cv2.resize(img_pred, (width, height))
            img_pred2 = (img_pred2 / 255.0) * 2.0 - 1.0
            img_pred2 = img_pred2.T
            img_pred2 = np.expand_dims(img_pred2, axis=-1)
            X = np.zeros((1, width, height, 1))
            X[0] = img_pred2
            net_out_value = model_best2.predict(X)
            pred_texts = decode_label2(net_out_value)
        
        #如果算式从计算结果上就不对，使用模型3预测
        tem_list = pred_texts.split('=')
        if eval(tem_list[0]) != eval(tem_list[1]):
            img_pred3 = cv2.resize(img_pred, (width, height))
            img_pred3 = (img_pred3 / 255.0) * 2.0 - 1.0
            img_pred3 = img_pred3.T
            img_pred3 = np.expand_dims(img_pred3, axis=-1)
            X = np.zeros((1, width, height, 1))
            X[0] = img_pred3
            net_out_value = model_best3.predict(X)
            pred_texts = decode_label2(net_out_value)
    except:
        None
        
#     tem_list = pred_texts.split('=')
#     print(eval(tem_list[0]) == eval(tem_list[1]))
    
    for j in range(min(len(pred_texts), len(y_test[i]))):
        if pred_texts[j] == y_test[i][j]:
            letter_acc += 1
    letter_total += max(len(pred_texts), len(label[i]))
    
    if pred_texts == y_test[i]:
        acc += 1
    else:
        print(img_file)
    total += 1

print("ACC : ", acc / total)
print("letter ACC : ", letter_acc / letter_total)

620it [00:15, 44.09it/s]

train/21325.jpg


795it [00:19, 44.17it/s]

train/72013.jpg


835it [00:20, 44.28it/s]

train/92815.jpg


1005it [00:24, 44.47it/s]

train/95905.jpg


1190it [00:28, 44.24it/s]

train/6626.jpg


1930it [00:45, 44.44it/s]

train/20814.jpg


2060it [00:48, 44.38it/s]

train/3267.jpg


2190it [00:51, 44.30it/s]

train/71218.jpg


2330it [00:54, 44.64it/s]

train/16788.jpg


2510it [00:58, 43.57it/s]

train/61510.jpg


2845it [01:06, 43.82it/s]

train/88604.jpg


3190it [01:14, 43.93it/s]

train/54150.jpg


3205it [01:14, 43.92it/s]

train/49853.jpg


3250it [01:15, 43.84it/s]

train/83498.jpg


3260it [01:15, 42.46it/s]

train/65360.jpg


3375it [01:18, 44.48it/s]

train/98488.jpg


3390it [01:18, 43.11it/s]

train/92081.jpg


3790it [01:28, 45.05it/s]

train/26328.jpg


3915it [01:30, 43.82it/s]

train/63270.jpg


3925it [01:31, 40.04it/s]

train/43023.jpg


5020it [01:56, 43.27it/s]

train/3687.jpg


5165it [01:59, 44.24it/s]

train/55041.jpg


5565it [02:08, 41.27it/s]

train/53708.jpg


6395it [02:27, 44.03it/s]

train/84900.jpg


6440it [02:28, 42.91it/s]

train/674.jpg


6445it [02:28, 41.00it/s]

train/99929.jpg


6530it [02:30, 43.71it/s]

train/70714.jpg


6650it [02:33, 43.29it/s]

train/6572.jpg


7000it [02:41, 42.55it/s]

train/82426.jpg


7445it [02:51, 43.24it/s]

train/38918.jpg


7520it [02:53, 44.88it/s]

train/13678.jpg


7885it [03:01, 44.34it/s]

train/75680.jpg


8130it [03:07, 44.32it/s]

train/33362.jpg


8690it [03:20, 43.65it/s]

train/54579.jpg


8810it [03:23, 44.35it/s]

train/22006.jpg


9000it [03:27, 44.01it/s]

train/58282.jpg


9175it [03:31, 43.71it/s]

train/82493.jpg


9425it [03:37, 44.37it/s]

train/35305.jpg


9510it [03:39, 43.15it/s]

train/85879.jpg


9720it [03:43, 43.95it/s]

train/88913.jpg


9845it [03:46, 44.34it/s]

train/79071.jpg


10000it [03:50, 44.22it/s]


ACC :  0.9959
letter ACC :  0.9347749938185115


In [8]:
total = 0
acc = 0
letter_total = 0
letter_acc = 0
for i, img_file in tqdm(enumerate(X_test)):
    img = cv2.imread( rootpath+ img_file, cv2.IMREAD_GRAYSCALE)
    img_pred = img.astype(np.float32)
    img_pred1 = cv2.resize(img_pred, (img_w, img_h))
    img_pred1 = (img_pred1 / 255.0) * 2.0 - 1.0
    img_pred1 = img_pred1.T
    img_pred1 = np.expand_dims(img_pred1, axis=-1)
    img_pred1 = np.expand_dims(img_pred1, axis=0)
    net_out_value = model_best.predict(img_pred1)
    pred_texts = decode_label(net_out_value)
    
    for j in range(min(len(pred_texts), len(y_test[i]))):
        if pred_texts[j] == y_test[i][j]:
            letter_acc += 1
    letter_total += max(len(pred_texts), len(label[i]))
    
    if pred_texts == y_test[i]:
        acc += 1
    total += 1

print("ACC : ", acc / total)
print("letter ACC : ", letter_acc / letter_total)

10000it [03:45, 44.23it/s]


ACC :  0.9827
letter ACC :  0.9330331846326613


In [9]:
img_file = 'train/62460.jpg'
img = cv2.imread( rootpath+ img_file, cv2.IMREAD_GRAYSCALE)
img_pred = img.astype(np.float32)
img_pred1 = cv2.resize(img_pred, (img_w, img_h))
img_pred1 = (img_pred1 / 255.0) * 2.0 - 1.0
img_pred1 = img_pred1.T
img_pred1 = np.expand_dims(img_pred1, axis=-1)
img_pred1 = np.expand_dims(img_pred1, axis=0)
net_out_value = model_best.predict(img_pred1)
pred_texts = decode_label(net_out_value)
print(pred_texts)

img_pred2 = cv2.resize(img_pred, (width, height))
img_pred2 = (img_pred2 / 255.0) * 2.0 - 1.0
img_pred2 = img_pred2.T
img_pred2 = np.expand_dims(img_pred2, axis=-1)
X = np.zeros((1, width, height, 1))
X[0] = img_pred2
net_out_value = model_best2.predict(X)
pred_texts = decode_label2(net_out_value)
print(pred_texts)

5-(8+8)=-9
5-8+6=-9
