### Optical Character Recognition using TensorFlow
- exporting results to JSON

In [None]:
# import libraries
import tensorflow as tf 
from keras.models import Sequential
from keras.layers import Conv2D, Flatten, MaxPooling2D, Dense
import os
import cv2
import numpy as np

In [2]:
# Load the training set and preprocess every image.
# Layout read by this cell: data/training_data/<class_name>/<image files>;
# the sub-directory name is used as the label of every image inside it.
# !unzip OCR_dataset.zip (mnist dataset)
images = []
labels = []

path = 'data/training_data'

# os.listdir() returns entries in arbitrary order; sorting keeps the sample
# order (and therefore the seeded shuffle applied later) reproducible.
dir_list = sorted(os.listdir(path))
for class_name in dir_list:
    class_dir = os.path.join(path, class_name)
    # Skip stray files sitting next to the class folders; calling
    # os.listdir() on them would raise NotADirectoryError.
    if not os.path.isdir(class_dir):
        continue
    for file_name in sorted(os.listdir(class_dir)):
        file_path = os.path.join(class_dir, file_name)
        # cv2.imread() returns None instead of raising when a file cannot be
        # decoded; without this check cv2.resize() would fail on it.
        img = cv2.imread(file_path)
        if img is None:
            print(f'Skipping unreadable image: {file_path}')
            continue
        # Resize to the 64x64 input size used by the model, then convert to
        # float32 and divide by 255 to normalise the pixel values.
        img = cv2.resize(img, (64, 64))
        img = img.astype(np.float32) / 255
        # Keep image and label lists aligned index-by-index.
        images.append(img)
        labels.append(class_name)

In [3]:
# Turn the accumulated Python lists into NumPy arrays:
# X holds the image data, y the matching class-name labels.
X, y = np.asarray(images), np.asarray(labels)

In [4]:
# Label Encoding:
# Replace the class-name labels in y with integer class ids.
from sklearn.preprocessing import LabelEncoder
# `le` is kept around: later cells use it to map predicted ids back to class
# names and to encode the test labels.
le = LabelEncoder()
y = le.fit_transform(y)

In [5]:
# Shuffle images and labels together (same permutation for both).
# The samples were appended class by class, and the training cell holds out
# data with validation_split, so the order must be mixed before fitting.
from sklearn.utils import shuffle

SHUFFLE_SEED = 42  # fixed seed so the permutation is repeatable
X_sh, y_sh = shuffle(X, y, random_state=SHUFFLE_SEED)

In [None]:
# Building the Model
# Small CNN: three conv + max-pool stages, one more conv layer, then a dense
# head ending in a softmax over the character classes.

# One output unit per class known to the fitted label encoder, so the softmax
# width always matches the integer ids produced by `le` (instead of assuming
# exactly 36 classes).
num_classes = len(le.classes_)

model = Sequential()
# Input: 64x64 images with 3 channels, as produced by the loading cell.
model.add(Conv2D(filters=16, kernel_size=(3, 3), activation='relu', input_shape=(64, 64, 3)))
model.add(MaxPooling2D())
model.add(Conv2D(filters=32, kernel_size=(3, 3), activation='relu'))
model.add(MaxPooling2D())
model.add(Conv2D(filters=64, kernel_size=(3, 3), activation='relu'))
model.add(MaxPooling2D())
model.add(Conv2D(filters=128, kernel_size=(3, 3), activation='relu'))
# Flatten the feature maps before the fully connected layers.
model.add(Flatten())
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=num_classes, activation='softmax'))

In [None]:
# Compile and train the model.
# The sparse categorical loss is used because y_sh holds integer class ids
# rather than one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 10 epochs in batches of 16, holding out 20% of the data for
# validation; the per-epoch metrics are kept in `history` for plotting.
history = model.fit(
    x=X_sh,
    y=y_sh,
    batch_size=16,
    epochs=10,
    validation_split=0.2,
)

In [None]:
# Plot the training and validation loss recorded for each epoch.
import matplotlib.pyplot as plt

for curve_name in ('loss', 'val_loss'):
    plt.plot(history.history[curve_name])
plt.legend(['loss', 'val_loss'])

In [9]:
# Model Testing
# Step 1: load the test images with cv2, applying the same preprocessing as
# the training data (64x64 resize, float32, divide by 255).
# Layout read by this cell: data/testing_data/<class_name>/<image files>.
test_images = []
test_labels = []

path = 'data/testing_data'

# os.listdir() returns entries in arbitrary order; sorting makes the index of
# each test sample stable between runs.
dir_list = sorted(os.listdir(path))
for class_name in dir_list:
    class_dir = os.path.join(path, class_name)
    # Skip stray files sitting next to the class folders; calling
    # os.listdir() on them would raise NotADirectoryError.
    if not os.path.isdir(class_dir):
        continue
    for file_name in sorted(os.listdir(class_dir)):
        file_path = os.path.join(class_dir, file_name)
        # cv2.imread() returns None instead of raising when a file cannot be
        # decoded; without this check cv2.resize() would fail on it.
        img = cv2.imread(file_path)
        if img is None:
            print(f'Skipping unreadable image: {file_path}')
            continue
        img = cv2.resize(img, (64, 64))
        img = img.astype(np.float32) / 255
        # Keep image and label lists aligned index-by-index.
        test_images.append(img)
        test_labels.append(class_name)

In [10]:
# Turn the test lists into NumPy arrays:
# X_test holds the image data, y_test the matching class-name labels.
X_test, y_test = np.asarray(test_images), np.asarray(test_labels)

In [None]:
# Making predictions
# Run the model on the test images, take the highest-scoring class index for
# each sample, and map those indices back to the original class names with
# the label encoder's inverse_transform.
preds = model.predict(X_test)
predicted_ids = np.argmax(preds, axis=1)
predicted_labels = le.inverse_transform(predicted_ids)

In [None]:
# Visualize the predictions
# Show one test image together with the label the model predicted for it.
sample_idx = 228  # which test sample to display

# cv2.imread() loads colour images in BGR channel order, while matplotlib
# interprets the last axis as RGB; reverse the channel axis for display.
plt.imshow(X_test[sample_idx][..., ::-1])
plt.title(f'Label: {predicted_labels[sample_idx]}')
plt.show()

In [None]:
# Model Evaluation
# Encode the test labels with the mapping learned from the training labels.
# transform() is used instead of fit_transform(): refitting on the test set
# could assign different ids to the classes (e.g. when a class is missing
# from the test data) and would overwrite the mapping stored in `le`.
y_test = le.transform(y_test)
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_accuracy:.2%}')

In [None]:
import pandas as pd

# Preview the predicted labels as a one-column table named 'text'.
pd.DataFrame(data=predicted_labels, columns=['text'])

In [18]:
# Export the predicted labels to a JSON file.

# NOTE(review): no column name is given here, so pandas uses the default
# column label 0 in the exported JSON; the preview cell names the column
# 'text' -- confirm which layout consumers of the file expect.
df = pd.DataFrame(predicted_labels)

# DataFrame.to_json() writes the frame to the given path.
output_path = 'my_text.json'
df.to_json(output_path)

## FIN