In [1]:
import os
import tensorflow as tf
from PIL import Image
from nets import nets_factory
import numpy as np

In [2]:
# --- dataset / training configuration ---
CHAR_SET_LEN = 10    # number of classes per label (used as the one-hot depth)
IMAGE_HEIGHT = 60    # not referenced by the tensors below, which are 224x224
IMAGE_WIDTH = 160    # not referenced by the tensors below, which are 224x224
BATCH_SIZE = 30
TFRECORD_FILE = "./captcha-tfrecords/train.tfrecords"

# --- graph inputs ---
# A batch of 224x224 images; the batch dimension is left open.
x = tf.placeholder(tf.float32, [None, 224, 224])
# Four label vectors, one per label head, created in the order y0..y3.
# They are declared as float32 here even though they carry class indices.
y0, y1, y2, y3 = (tf.placeholder(tf.float32, [None]) for _ in range(4))

# Learning rate, held in a variable so its value can be reassigned later.
lr = tf.Variable(0.001, dtype=tf.float32)

In [3]:
# read data from tfrecord
def read_and_decode(filename):
    """Build the ops that read and decode one example from a TFRecord file.

    Args:
        filename: path of the TFRecord file to read from.

    Returns:
        A 5-tuple ``(image, label0, label1, label2, label3)``. ``image`` is a
        float32 tensor reshaped to [224, 224] whose uint8 values are rescaled
        as ``(v / 255 - 0.5) * 2``; each label is the stored int64 feature
        cast to int32.
    """
    label_keys = ['label0', 'label1', 'label2', 'label3']

    # File-name queue feeding a TFRecord reader; only the record payload is used.
    file_queue = tf.train.string_input_producer([filename])
    _, record = tf.TFRecordReader().read(file_queue)

    # Every record stores the raw image bytes plus four int64 labels.
    schema = {'image': tf.FixedLenFeature([], tf.string)}
    for key in label_keys:
        schema[key] = tf.FixedLenFeature([], tf.int64)
    parsed = tf.parse_single_example(record, features=schema)

    # Decode the byte string into uint8 values and restore the 2-D layout.
    pixels = tf.reshape(tf.decode_raw(parsed['image'], tf.uint8), [224, 224])

    # Rescale: [0, 255] -> [0, 1] -> [-0.5, 0.5] -> [-1, 1].
    unit_range = tf.cast(pixels, tf.float32) / 255.0
    image = tf.multiply(tf.subtract(unit_range, 0.5), 2.0)

    labels = [tf.cast(parsed[key], tf.int32) for key in label_keys]

    return tuple([image] + labels)


In [4]:
if __name__ == "__main__":
    # Training script: build the input pipeline and a four-head classifier,
    # then train until every head exceeds the accuracy threshold on the
    # current batch or the step budget is exhausted.
    max_steps = 10001                 # loop runs steps 0 .. max_steps - 1
    model_dir = './captcha-models'    # checkpoints are written here

    # get images and labels
    image, label0, label1, label2, label3 = read_and_decode(TFRECORD_FILE)

    # randomly shuffle data
    image_batch, label_batch0, label_batch1, label_batch2, label_batch3 = tf.train.shuffle_batch(
        [image, label0, label1, label2, label3], batch_size=BATCH_SIZE,
        capacity=50000, min_after_dequeue=10000, num_threads=1)

    # define the network structure
    # NOTE(review): weight_decay is passed to the factory, but no regularization
    # term is added to total_loss below -- confirm whether that is intended.
    train_network_fn = nets_factory.get_network_fn(
        'alexnet_v2',
        num_classes=CHAR_SET_LEN,
        weight_decay=0.0005,
        is_training=True)

    # Create the checkpoint directory once, instead of re-checking it on every
    # logging step inside the training loop.
    os.makedirs(model_dir, exist_ok=True)

    with tf.Session() as sess:
        # inputs: a tensor of size [batch_size, height, width, channels]
        # -1 lets the batch dimension follow whatever is fed into `x`, matching
        # the placeholder's open batch dimension.
        X = tf.reshape(x, [-1, 224, 224, 1])

        logits0, logits1, logits2, logits3, end_points = train_network_fn(X)

        one_hot_labels0 = tf.one_hot(indices=tf.cast(y0, tf.int32), depth=CHAR_SET_LEN)
        one_hot_labels1 = tf.one_hot(indices=tf.cast(y1, tf.int32), depth=CHAR_SET_LEN)
        one_hot_labels2 = tf.one_hot(indices=tf.cast(y2, tf.int32), depth=CHAR_SET_LEN)
        one_hot_labels3 = tf.one_hot(indices=tf.cast(y3, tf.int32), depth=CHAR_SET_LEN)

        loss0 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits0, labels=one_hot_labels0))
        loss1 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1, labels=one_hot_labels1))
        loss2 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits2, labels=one_hot_labels2))
        loss3 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits3, labels=one_hot_labels3))

        # The optimized objective is the mean of the four per-head losses.
        total_loss = (loss0 + loss1 + loss2 + loss3) / 4.0

        optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(total_loss)

        correct_prediction0 = tf.equal(tf.argmax(one_hot_labels0, 1), tf.argmax(logits0, 1))
        accuracy0 = tf.reduce_mean(tf.cast(correct_prediction0, tf.float32))

        correct_prediction1 = tf.equal(tf.argmax(one_hot_labels1, 1), tf.argmax(logits1, 1))
        accuracy1 = tf.reduce_mean(tf.cast(correct_prediction1, tf.float32))

        correct_prediction2 = tf.equal(tf.argmax(one_hot_labels2, 1), tf.argmax(logits2, 1))
        accuracy2 = tf.reduce_mean(tf.cast(correct_prediction2, tf.float32))

        correct_prediction3 = tf.equal(tf.argmax(one_hot_labels3, 1), tf.argmax(logits3, 1))
        accuracy3 = tf.reduce_mean(tf.cast(correct_prediction3, tf.float32))

        # Build the decay op once. Calling tf.assign(...) inside the loop would
        # add new nodes to the graph every time the learning rate is lowered.
        decay_lr = tf.assign(lr, lr / 3)

        saver = tf.train.Saver()

        sess.run(tf.global_variables_initializer())

        coord = tf.train.Coordinator()

        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # try/finally guarantees the queue-runner threads are asked to stop and
        # are joined even if a training step raises.
        try:
            for i in range(max_steps):
                b_image, b_label0, b_label1, b_label2, b_label3 = sess.run([image_batch, label_batch0, label_batch1, label_batch2, label_batch3])
                feed = {x: b_image, y0: b_label0, y1: b_label1, y2: b_label2, y3: b_label3}
                sess.run(optimizer, feed_dict=feed)

                is_last_step = (i == max_steps - 1)
                if i % 20 == 0 or is_last_step:
                    # NOTE(review): this also fires at i == 0, so the initial
                    # rate is divided by 3 right after the first step. Kept
                    # as-is to preserve the existing schedule -- confirm intent.
                    if i % 2000 == 0:
                        sess.run(decay_lr)

                    # Metrics are computed on the batch that was just used for
                    # the update, through the network built with is_training=True.
                    acc0, acc1, acc2, acc3, loss_ = sess.run([accuracy0, accuracy1, accuracy2, accuracy3, total_loss],
                                                             feed_dict=feed)

                    learning_rate = sess.run(lr)

                    print("Iter:%d  Loss:%.3f  Accuracy:%.2f,%.2f,%.2f,%.2f  Learning-Rate:%.4f" %(i, loss_, acc0, acc1, acc2, acc3, learning_rate))

                    reached_target = acc0 > 0.98 and acc1 > 0.98 and acc2 > 0.98 and acc3 > 0.98
                    # Save when the target is reached, and also on the final step
                    # so a run that never reaches it still leaves a checkpoint.
                    if reached_target or is_last_step:
                        saver.save(sess, './captcha-models/crack_captcha.model', global_step=i)
                        break
        finally:
            coord.request_stop()
            coord.join(threads)

Iter:0  Loss:6.998  Accuracy:0.17,0.17,0.30,0.27  Learning-Rate:0.0003
Iter:20  Loss:2.307  Accuracy:0.20,0.10,0.07,0.10  Learning-Rate:0.0003
Iter:40  Loss:2.287  Accuracy:0.13,0.17,0.17,0.10  Learning-Rate:0.0003
Iter:60  Loss:2.282  Accuracy:0.07,0.17,0.07,0.10  Learning-Rate:0.0003
Iter:80  Loss:2.307  Accuracy:0.00,0.03,0.00,0.20  Learning-Rate:0.0003
Iter:100  Loss:2.309  Accuracy:0.13,0.10,0.13,0.03  Learning-Rate:0.0003
Iter:120  Loss:2.312  Accuracy:0.13,0.00,0.07,0.00  Learning-Rate:0.0003
Iter:140  Loss:2.317  Accuracy:0.13,0.07,0.07,0.03  Learning-Rate:0.0003
Iter:160  Loss:2.305  Accuracy:0.10,0.13,0.03,0.13  Learning-Rate:0.0003
Iter:180  Loss:2.300  Accuracy:0.07,0.03,0.07,0.23  Learning-Rate:0.0003
Iter:200  Loss:2.301  Accuracy:0.00,0.17,0.10,0.10  Learning-Rate:0.0003
Iter:220  Loss:2.305  Accuracy:0.17,0.07,0.20,0.07  Learning-Rate:0.0003
Iter:240  Loss:2.308  Accuracy:0.27,0.07,0.10,0.03  Learning-Rate:0.0003
Iter:260  Loss:2.299  Accuracy:0.07,0.07,0.07,0.13  Learn