# Libraries

In [30]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import sys
import os
import random
from pathlib import Path
import imageio
import skimage
import skimage.io
import skimage.transform
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import scipy
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, MaxPool2D, Dropout, BatchNormalization,LeakyReLU
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from keras.utils import to_categorical
from keras.layers.advanced_activations import LeakyReLU, PReLU
import tensorflow_addons as tfa
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import tensorflow as tf


In [2]:
# --- Notebook configuration -------------------------------------------------
# Directory holding the image files.
IMAGE_PATH = 'chinese_mnist/data/'
# Target image geometry (pixels) and number of colour channels.
IMAGE_WIDTH = 64
IMAGE_HEIGHT = 64
IMAGE_CHANNELS = 1
# Seed passed to the train/test/validation splits.
RANDOM_STATE = 42
# Fraction of all rows held out for testing.
TEST_SIZE = 0.2
# Fraction of the remaining (non-test) rows held out for validation.
VAL_SIZE = 0.2
# Convolution / pooling hyper-parameters.
CONV_2D_DIM_1 = 16
CONV_2D_DIM_2 = 16
CONV_2D_DIM_3 = 32
CONV_2D_DIM_4 = 64
MAX_POOL_DIM = 2
KERNEL_SIZE = 3
# Training-loop settings.
BATCH_SIZE = 32
NO_EPOCHS = 50
DROPOUT_RATIO = 0.5
# Early-stopping patience (epochs) and callback verbosity.
PATIENCE = 5
VERBOSE = 1

In [3]:
# Sanity check: show what is inside the dataset root directory.
os.listdir("chinese_mnist")

['chinese_mnist.csv', 'data']

In [4]:
# Load the per-image metadata table (one row per image file).
data_df = pd.read_csv('chinese_mnist/chinese_mnist.csv')
print(data_df.shape) 
# Peek at a few randomly chosen rows; no seed is set, so the rows shown
# differ from run to run.
data_df.sample(100).head(4)

(15000, 5)


Unnamed: 0,suite_id,sample_id,code,value,character
14641,67,10,9,8,八
3878,88,8,13,1000,千
211,28,10,10,9,九
5530,57,1,15,100000000,亿


In [5]:
# os.listdir already returns a list of entry names, so no extra copy is needed.
image_files = os.listdir(IMAGE_PATH)
image_file_count = len(image_files)
print("Number of image files: {}".format(image_file_count))

Number of image files: 15000


In [6]:
def create_file_name(x):
    """Build the image file name for one metadata row.

    Parameters
    ----------
    x : pandas.Series, dict-like, or sequence
        A row carrying ``suite_id``, ``sample_id`` and ``code``. Label-based
        rows (Series / dict) are read by column name; plain sequences
        (tuple / list) are read positionally as ``(suite_id, sample_id, code)``.

    Returns
    -------
    str
        ``"input_<suite_id>_<sample_id>_<code>.jpg"``.
    """
    if hasattr(x, "keys"):
        # Label lookup: integer keys on a string-labelled Series rely on a
        # deprecated positional fallback, and would silently pick the wrong
        # fields if the column order ever changed.
        suite_id, sample_id, code = x["suite_id"], x["sample_id"], x["code"]
    else:
        suite_id, sample_id, code = x[0], x[1], x[2]
    return f"input_{suite_id}_{sample_id}_{code}.jpg"

In [7]:
# Derive the image file name for every metadata row (row-wise apply).
data_df["file"] = data_df.apply(create_file_name, axis=1)

In [8]:
# Cross-check: how many generated file names actually exist on disk?
file_names = data_df['file'].tolist()
matching_names = set(file_names) & set(image_files)
print("Matching image names: {}".format(len(matching_names)))

Matching image names: 15000


In [9]:
def read_image_sizes(file_name):
    """Return the array shape of one image as a list.

    Parameters
    ----------
    file_name : str
        File name relative to ``IMAGE_PATH``.

    Returns
    -------
    list
        ``list(image.shape)`` of the array returned by ``skimage.io.imread``;
        array shapes are ordered (rows, cols[, channels]).
    """
    # os.path.join works whether or not IMAGE_PATH ends with a separator.
    image = skimage.io.imread(os.path.join(IMAGE_PATH, file_name))
    return list(image.shape)

In [10]:
# Enable Series.progress_apply so the image scan shows a progress bar.
tqdm.pandas()
# One row per image holding the array shape reported by read_image_sizes.
m = np.stack(data_df['file'].progress_apply(read_image_sizes))
# Array shapes are (rows, cols) == (height, width), so the first two entries
# are swapped to make the 'w' / 'h' labels match their contents. Reusing
# data_df's index keeps the concat below aligned row by row.
df = pd.DataFrame(m[:, [1, 0]], columns=['w', 'h'], index=data_df.index)
# Drop any 'w' / 'h' left by an earlier run so re-executing this cell does
# not append duplicate columns.
data_df = pd.concat([data_df.drop(columns=['w', 'h'], errors='ignore'), df],
                    axis=1, sort=False)


The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version

100%|██████████████████████████████████████████████████████████████████████████| 15000/15000 [00:09<00:00, 1613.16it/s]


In [11]:
# Preview the metadata table with the derived columns.
data_df.head()

Unnamed: 0,suite_id,sample_id,code,value,character,file,w,h
0,1,1,10,9,九,input_1_1_10.jpg,64,64
1,1,10,10,9,九,input_1_10_10.jpg,64,64
2,1,2,10,9,九,input_1_2_10.jpg,64,64
3,1,3,10,9,九,input_1_3_10.jpg,64,64
4,1,4,10,9,九,input_1_4_10.jpg,64,64


In [12]:
# Summarise the grouping columns: distinct suites and the sample ids present.
print(f"Number of suites: {data_df.suite_id.nunique()}")
print(f"Samples: {data_df.sample_id.unique()}")

Number of suites: 100
Samples: [ 1 10  2  3  4  5  6  7  8  9]


In [15]:
# Hold out TEST_SIZE of all rows for testing, stratified on `code` so the
# class proportions are preserved in both parts.
train_df, test_df = train_test_split(data_df, 
                                     test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=data_df["code"].values)

In [16]:
# Carve the validation set out of the remaining training rows; VAL_SIZE is a
# fraction of train_df (not of the full dataset), again stratified on `code`.
# NOTE(review): this rebinds train_df, so re-running this cell on its own
# shrinks the training set again — re-run the previous split cell first.
train_df, val_df = train_test_split(train_df, 
                                    test_size=VAL_SIZE, random_state=RANDOM_STATE, stratify=train_df["code"].values)

In [17]:
# Report how many rows ended up in each split.
n_train_rows = len(train_df)
n_test_rows = len(test_df)
n_val_rows = len(val_df)
print("Train set rows: {}".format(n_train_rows))
print("Test  set rows: {}".format(n_test_rows))
print("Val   set rows: {}".format(n_val_rows))

Train set rows: 9600
Test  set rows: 3000
Val   set rows: 2400


In [18]:
def read_image(file_name):
    """Load one image and resize it to the configured model input shape.

    Parameters
    ----------
    file_name : str
        File name relative to ``IMAGE_PATH``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS)`` as
        produced by ``skimage.transform.resize``.
    """
    # os.path.join works whether or not IMAGE_PATH ends with a separator.
    image = skimage.io.imread(os.path.join(IMAGE_PATH, file_name))
    # resize() takes output_shape as (rows, cols, channels), i.e. height
    # first; the channel count comes from the config rather than a literal.
    image = skimage.transform.resize(
        image, (IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS), mode='reflect')
    return image

In [19]:
def categories_encoder(dataset, var='character', categories=None):
    """Turn a metadata frame into model inputs and one-hot labels.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a ``file`` column (image file names) and the label
        column named by ``var``.
    var : str, default 'character'
        Column to one-hot encode.
    categories : sequence, optional
        Full, ordered list of label values. When given, the label frame is
        reindexed to exactly these columns (missing ones filled with 0), so
        every split gets identical columns even if a class is absent from it.
        When omitted, the columns are whatever values occur in ``dataset``.

    Returns
    -------
    X : numpy.ndarray
        Stack of the arrays returned by ``read_image`` for each row.
    y : pandas.DataFrame
        One-hot label frame with one ``uint8`` column per class.
    """
    X = np.stack(dataset['file'].apply(read_image))
    # Pin the dtype: the get_dummies default switched from uint8 to bool in
    # pandas 2.0, which would change what is fed to the model as labels.
    y = pd.get_dummies(dataset[var], drop_first=False, dtype=np.uint8)
    if categories is not None:
        y = y.reindex(columns=list(categories), fill_value=0)
    return X, y

In [20]:
# Load the images and one-hot encode the labels for each split. Each call
# encodes its split independently, using the default label column.
X_train, y_train = categories_encoder(train_df)
X_val, y_val = categories_encoder(val_df)
X_test, y_test = categories_encoder(test_df)

In [65]:
# Input geometry, kernel/pool sizes and dropout come from the config cell, and
# the number of output units from the encoded labels, so the network cannot
# drift out of sync with the preprocessing.
input_shape = (IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS)
num_classes = y_train.shape[1]  # one output unit per one-hot label column
conv_kwargs = dict(kernel_size=(KERNEL_SIZE, KERNEL_SIZE), activation='relu', padding='same')

# Five convolutional stages, each ending in max-pooling, followed by a dense
# classification head.
model = Sequential()
model.add(Conv2D(filters=64, input_shape=input_shape, **conv_kwargs))
model.add(MaxPool2D(MAX_POOL_DIM))
model.add(Conv2D(filters=128, **conv_kwargs))
model.add(MaxPool2D(MAX_POOL_DIM))
model.add(Conv2D(filters=160, **conv_kwargs))
model.add(MaxPool2D(MAX_POOL_DIM))
model.add(Conv2D(filters=256, **conv_kwargs))
model.add(Conv2D(filters=256, **conv_kwargs))
model.add(MaxPool2D(MAX_POOL_DIM))
model.add(Conv2D(filters=384, **conv_kwargs))
model.add(Conv2D(filters=384, **conv_kwargs))
model.add(MaxPool2D(MAX_POOL_DIM))
model.add(Dropout(DROPOUT_RATIO))
model.add(Flatten())
model.add(Dense(1024, activation='relu'))
model.add(Dropout(DROPOUT_RATIO))
model.add(Dense(num_classes, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_37"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_201 (Conv2D)          (None, 64, 64, 64)        640       
_________________________________________________________________
max_pooling2d_97 (MaxPooling (None, 32, 32, 64)        0         
_________________________________________________________________
conv2d_202 (Conv2D)          (None, 32, 32, 128)       73856     
_________________________________________________________________
max_pooling2d_98 (MaxPooling (None, 16, 16, 128)       0         
_________________________________________________________________
conv2d_203 (Conv2D)          (None, 16, 16, 160)       184480    
_________________________________________________________________
max_pooling2d_99 (MaxPooling (None, 8, 8, 160)         0         
_________________________________________________________________
conv2d_204 (Conv2D)          (None, 8, 8, 256)       

In [66]:
# Exponential learning-rate decay: multiplied by 0.99 every epoch.
# NOTE(review): the NO_EPOCHS offset means the schedule starts at
# 1e-3 * 0.99 ** NO_EPOCHS rather than 1e-3 — confirm this is intended.
annealer = LearningRateScheduler(lambda x: 1e-3 * 0.99 ** (x+NO_EPOCHS))
# Stop when the validation loss stops improving. The training loss (the
# previous monitor) keeps falling while a model overfits, so it gives no
# protection even though validation data is passed to fit().
earlystopper = EarlyStopping(monitor='val_loss', patience=PATIENCE, verbose=VERBOSE)
# Keep only the weights of the epoch with the best validation accuracy.
checkpointer = ModelCheckpoint('best_model.h5',
                                monitor='val_accuracy',
                                verbose=VERBOSE,
                                save_best_only=True,
                                save_weights_only=True)

In [67]:
# Train on the training split and validate on the held-out validation split
# after every epoch; the returned History object feeds the plots below.
training_callbacks = [earlystopper, checkpointer, annealer]
train_model = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=NO_EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=training_callbacks,
    verbose=1,
)

Epoch 1/50
Epoch 00001: val_accuracy improved from -inf to 0.93875, saving model to best_model.h5
Epoch 2/50
Epoch 00002: val_accuracy improved from 0.93875 to 0.94917, saving model to best_model.h5
Epoch 3/50
Epoch 00003: val_accuracy improved from 0.94917 to 0.97708, saving model to best_model.h5
Epoch 4/50
Epoch 00004: val_accuracy improved from 0.97708 to 0.98542, saving model to best_model.h5
Epoch 5/50
Epoch 00005: val_accuracy did not improve from 0.98542
Epoch 6/50
Epoch 00006: val_accuracy did not improve from 0.98542
Epoch 7/50
Epoch 00007: val_accuracy improved from 0.98542 to 0.99000, saving model to best_model.h5
Epoch 8/50
Epoch 00008: val_accuracy did not improve from 0.99000
Epoch 9/50
Epoch 00009: val_accuracy improved from 0.99000 to 0.99292, saving model to best_model.h5
Epoch 10/50
Epoch 00010: val_accuracy improved from 0.99292 to 0.99583, saving model to best_model.h5
Epoch 11/50
Epoch 00011: val_accuracy did not improve from 0.99583
Epoch 12/50
Epoch 00012: val_a

In [68]:
def create_trace(x,y,ylabel,color):
    """Build a plotly Scatter trace drawn as markers joined by lines.

    ``x`` / ``y`` are the point coordinates, ``ylabel`` becomes the trace
    name (legend entry), ``color`` the marker colour; the x values double as
    the hover text.
    """
    scatter_options = dict(
        x=x,
        y=y,
        name=ylabel,
        marker=dict(color=color),
        mode="markers+lines",
        text=x,
    )
    return go.Scatter(**scatter_options)
    
def plot_accuracy_and_loss(train_model):
    """Plot training/validation accuracy and loss side by side.

    Parameters
    ----------
    train_model : object
        Training result whose ``history`` mapping holds the per-epoch lists
        ``accuracy``, ``val_accuracy``, ``loss`` and ``val_loss``.
    """
    # plotly.tools.make_subplots is deprecated in favour of plotly.subplots;
    # fall back to the old location for installs that predate the new module.
    try:
        from plotly.subplots import make_subplots
    except ImportError:
        make_subplots = tools.make_subplots

    hist = train_model.history
    acc = hist['accuracy']
    val_acc = hist['val_accuracy']
    loss = hist['loss']
    val_loss = hist['val_loss']
    epochs = list(range(1, len(acc) + 1))  # 1-based epoch numbers for the x axis

    # Define the traces.
    trace_ta = create_trace(epochs, acc, "Training accuracy", "Green")
    trace_va = create_trace(epochs, val_acc, "Validation accuracy", "Red")
    trace_tl = create_trace(epochs, loss, "Training loss", "Blue")
    trace_vl = create_trace(epochs, val_loss, "Validation loss", "Magenta")

    fig = make_subplots(rows=1, cols=2, subplot_titles=('Training and validation accuracy',
                                                        'Training and validation loss'))
    # Accuracy curves go in the left panel, loss curves in the right one.
    fig.add_trace(trace_ta, row=1, col=1)
    fig.add_trace(trace_va, row=1, col=1)
    fig.add_trace(trace_tl, row=1, col=2)
    fig.add_trace(trace_vl, row=1, col=2)

    # Set the layout for the figure.
    fig['layout']['xaxis'].update(title='Epoch')
    fig['layout']['xaxis2'].update(title='Epoch')
    fig['layout']['yaxis'].update(title='Accuracy', range=[0, 1])
    # Loss is not bounded by 1: widen the axis when needed so early, large
    # losses are not clipped out of view.
    loss_upper = max([1.0] + list(loss) + list(val_loss))
    fig['layout']['yaxis2'].update(title='Loss', range=[0, loss_upper])

    # Plot.
    iplot(fig, filename='accuracy-loss')

# Visualise the learning curves recorded during training.
plot_accuracy_and_loss(train_model)

In [69]:
# Evaluate the model on the held-out test split; per the compile() call the
# result is [loss, accuracy].
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.0420963354408741
Test accuracy: 0.9946666955947876


In [70]:
def test_accuracy_report(model, X=None, y=None):
    """Print a per-class classification report plus overall loss/accuracy.

    Parameters
    ----------
    model : object
        Trained model exposing ``predict`` and ``evaluate``.
    X : array-like, optional
        Inputs to score. Defaults to the notebook-level ``X_test``.
    y : pandas.DataFrame, optional
        One-hot labels whose column names are the class names. Defaults to
        the notebook-level ``y_test``.
    """
    features = X_test if X is None else X
    onehot = y_test if y is None else y

    predicted = model.predict(features)
    test_predicted = np.argmax(predicted, axis=1)
    test_truth = np.argmax(onehot.values, axis=1)

    class_names = [str(name) for name in onehot.columns]
    # Passing `labels` keeps the report aligned with target_names even when a
    # class is absent from both the truth and the predictions.
    print(metrics.classification_report(test_truth, test_predicted,
                                        labels=np.arange(len(class_names)),
                                        target_names=class_names))
    test_res = model.evaluate(features, onehot.values, verbose=0)
    print('Loss function: %s, accuracy:' % test_res[0], test_res[1])

In [71]:
# Per-class precision/recall/F1 on the test split for the trained model.
test_accuracy_report(model)

              precision    recall  f1-score   support

           一       0.99      0.99      0.99       200
           七       0.99      0.99      0.99       200
           万       1.00      1.00      1.00       200
           三       1.00      0.99      1.00       200
           九       1.00      0.96      0.98       200
           二       0.99      0.99      0.99       200
           五       1.00      1.00      1.00       200
           亿       0.98      1.00      0.99       200
           八       1.00      1.00      1.00       200
           六       1.00      0.99      1.00       200
           十       0.99      0.99      0.99       200
           千       0.99      0.99      0.99       200
           四       1.00      1.00      1.00       200
           百       1.00      0.99      1.00       200
           零       1.00      1.00      1.00       200

    accuracy                           0.99      3000
   macro avg       0.99      0.99      0.99      3000
weighted avg       0.99   

In [72]:
# Rebuild the architecture as an independent model. Plain assignment would
# only alias `model`, so loading the checkpoint would overwrite the
# last-epoch weights held by `model` as well.
model_optimal = Sequential.from_config(model.get_config())
model_optimal.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Restore the weights saved by the checkpoint callback.
model_optimal.load_weights('best_model.h5')
# This score is computed on the test split, so it is labelled as such.
score = model_optimal.evaluate(X_test, y_test, verbose=0)
print(f'Best checkpoint test loss: {score[0]}, accuracy: {score[1]}')

test_accuracy_report(model_optimal)

Best validation loss: 0.0420963354408741, accuracy: 0.9946666955947876
              precision    recall  f1-score   support

           一       0.99      0.99      0.99       200
           七       0.99      0.99      0.99       200
           万       1.00      1.00      1.00       200
           三       1.00      0.99      1.00       200
           九       1.00      0.96      0.98       200
           二       0.99      0.99      0.99       200
           五       1.00      1.00      1.00       200
           亿       0.98      1.00      0.99       200
           八       1.00      1.00      1.00       200
           六       1.00      0.99      1.00       200
           十       0.99      0.99      0.99       200
           千       0.99      0.99      0.99       200
           四       1.00      1.00      1.00       200
           百       1.00      0.99      1.00       200
           零       1.00      1.00      1.00       200

    accuracy                           0.99      3000
   macro 