# Creating an image dataset using character images from Stanford OCR and Tokens from Gutenberg

## Imports

In [19]:
import numpy as np
from sets import Set 
from collections import defaultdict
import pickle
import nltk
import re
import os
import pickle
from string import lower
from sklearn.model_selection import train_test_split
from collections import Counter
import matplotlib.pyplot as plt
from string import lower

## Creating a character image variants dictionary

In [20]:
data_file = open('letter.data', 'r')
char_pixel_dict = defaultdict(Set)	# key : character, value : set of pixel array variants for that character


for line in data_file:
	data_arr = line.split()
	char = data_arr[1]

	char_label = ord(char) - 97

	char_features = map(lambda x : int(x), data_arr[6:])

	char_set = char_pixel_dict.get(char, Set())

	char_feature_tup = tuple(char_features)

	char_set.add(char_feature_tup)

	char_pixel_dict[char] = char_set

    

for key, val in char_pixel_dict.items():
	print "Character : " + str(key)
	print "Number of variants: " + str(len(val))


with open("./letter_variants.npy", "w") as f:
	pickle.dump(char_pixel_dict, f)

Character : a
Number of variants: 4021
Character : c
Number of variants: 2072
Character : b
Number of variants: 1282
Character : e
Number of variants: 4945
Character : d
Number of variants: 1441
Character : g
Number of variants: 2471
Character : f
Number of variants: 921
Character : i
Number of variants: 4027
Character : h
Number of variants: 861
Character : k
Number of variants: 909
Character : j
Number of variants: 189
Character : m
Number of variants: 1587
Character : l
Number of variants: 1696
Character : o
Number of variants: 3854
Character : n
Number of variants: 4988
Character : q
Number of variants: 341
Character : p
Number of variants: 1377
Character : s
Number of variants: 1394
Character : r
Number of variants: 2634
Character : u
Number of variants: 2538
Character : t
Number of variants: 2126
Character : w
Number of variants: 520
Character : v
Number of variants: 661
Character : y
Number of variants: 1221
Character : x
Number of variants: 413
Character : z
Number of variants:

## Script to create images and labels for tokens extracted from the book Moby Dick

In [22]:
def get_dir(dir_name):
	dir_path = os.path.join(os.getcwd(), dir_name)
	
	if (not os.path.exists(dir_path)):
		os.makedirs(dir_path)

	return dir_path

def init_gutenberg():
	nltk.download('gutenberg')


def get_book_tokens_gt_len(book_name, length):
	book_tokens = nltk.corpus.gutenberg.words(book_name)
	book_token_set = Set(book_tokens)
	book_token_list = map(lambda x : x.strip(), book_token_set)
	book_words = filter(lambda x : re.match(r'^[a-zA-Z]+$', x), book_token_list)
	book_words_gt_eq_len = filter(lambda x : len(x) >= length, book_words)

	book_words_gt_eq_len = map(lambda x : lower(x), book_words_gt_eq_len)

	return np.array(book_words_gt_eq_len)

def _get_file_name(img_num, file_type = "img"):
	if (file_type == "img"):
		file_name = "img_" + str(img_num) + ".png"
	else:
		file_name = "img_" + str(img_num) + ".txt"

	return file_name

def _save_img_in_folder(dir_path, img_num, word_arr):
	file_name = _get_file_name(img_num)

	file_path = os.path.join(dir_path, file_name)

	plt.imsave(file_path, word_arr)


def _draw_and_save_word_in_dir(img_num, img_arr, dir_path):
	img_arr = img_arr.reshape(img_arr.shape[0], 16,- 1)

	word_arr = img_arr[0]

	for i in range(1, len(img_arr)):
		word_arr = np.hstack((word_arr, np.zeros((16,2), dtype = int)))
		word_arr = np.hstack((word_arr, img_arr[i]))

	word_arr*=255
	word_arr = word_arr[:,8:]		# first 8 cols are zeros, 2 col padding at start and end

	_save_img_in_folder(dir_path, img_num, word_arr)


def _write_word_arr_in_dir(img_num, img_arr, dir_path):
	file_name = _get_file_name(img_num, "txt")

	file_path = os.path.join(dir_path, file_name)

	img_arr = img_arr[1:]

	with open(file_path, "w") as f:
		for img in img_arr:
			line = " ".join([`num` for num in img]) + "\n"
			f.write(line)

def save_word_to_dir(word_arr, word_num, dir_path):
	_draw_and_save_word_in_dir(word_num, word_arr, dir_path)
	_write_word_arr_in_dir(word_num, word_arr, dir_path)


def get_char_img_len(char_to_img_arr_dict):
	list_of_a_variants = list(char_to_img_arr_dict['a'])

	return len(list_of_a_variants[0])


def get_random_variant_of_char(char_to_img_arr_dict, char):
	char_variants_list = list(char_to_img_arr_dict[char])
	num_variants = len(char_variants_list)

	#print "Char : " + char
	#rint "Number of variants : " + str(num_variants)

	random_variant_idx = np.random.choice(num_variants)

	random_variant = char_variants_list[random_variant_idx]

	return np.array(random_variant)


def save_word_imgs_to_dir(words, dir_path, char_to_img_arr_dict):
	char_feature_len = get_char_img_len(char_to_img_arr_dict)

	#print "last word " + str(words[-1])

	for word_num, word in enumerate(words):
		#print "Writing " + str(word) + " to file"
		word_arr = np.zeros((1,char_feature_len), dtype = int)

		for char in word:
			variant_of_char = get_random_variant_of_char(char_to_img_arr_dict, char)
			word_arr = np.vstack((word_arr, variant_of_char))

		save_word_to_dir(word_arr, word_num, dir_path)
		#print "Write successful!"


def write_words_list_to_dir(words, file_path):
	with open(file_path, 'w') as word_file:
		for word_num, word in enumerate(words):
			word_file.write(str(word_num) + " " + word + "\n")


def filter_words_for_stratification(words_list, test_split):
	len_words_list = map(lambda x : len(x), words_list)

	min_words_req_per_len = int(np.ceil(test_split * 10))

	len_words_counter = Counter(len_words_list)

	print "Len words counter " + str(len_words_counter)

	not_allowed_lengths = [x[0] for x in len_words_counter.items() if x[1] < min_words_req_per_len]

	print "Not allowed lengths " + str(not_allowed_lengths)

	len_words_list = np.array(len_words_list)

	mask = np.zeros(len(words_list), dtype=bool)

	for n_length in not_allowed_lengths:
		n_len_mask = (len_words_list == n_length)
		mask = np.ma.mask_or(mask, n_len_mask)

	inverted_mask = np.invert(mask)

	return words_list[inverted_mask]


def create_data_set(words_list, char_to_img_arr_dict, test_split):
	words_list = filter_words_for_stratification(words_list, test_split)

	print "Number of tokens after filtering " + str(len(words_list))

	len_words_list = map(lambda x : len(x), words_list)

	words_train, words_test, _ , _ = train_test_split(words_list, len_words_list, test_size = test_split, shuffle = True, stratify = len_words_list)
	
	data_dir = get_dir('data')

	train_dir_path = get_dir('data/train_words')
	test_dir_path = get_dir('data/test_words')

	train_words_file_path = os.path.join(data_dir, 'train_words.txt')
	test_words_file_path = os.path.join(data_dir, 'test_words.txt')

	save_word_imgs_to_dir(words_train, train_dir_path, char_to_img_arr_dict)
	save_word_imgs_to_dir(words_test, test_dir_path, char_to_img_arr_dict)

	write_words_list_to_dir(words_train, train_words_file_path)
	write_words_list_to_dir(words_test, test_words_file_path)

## Running the data preparation script

In [23]:
init_gutenberg()

mb_dick_tokens = get_book_tokens_gt_len('melville-moby_dick.txt', 3)

print "Number of extracted tokens " + str(len(mb_dick_tokens))

with open("letter_variants.npy") as f:
    char_to_img_arr_dict = pickle.load(f)

create_data_set(mb_dick_tokens, char_to_img_arr_dict, 0.2)

print "Dataset preparation successful.. Check ./data"

[nltk_data] Downloading package gutenberg to /Users/ady/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


Number of extracted tokens 18862
Len words counter Counter({7: 3005, 6: 2941, 8: 2708, 5: 2393, 9: 2174, 4: 1701, 10: 1440, 11: 881, 3: 589, 12: 550, 13: 286, 14: 122, 15: 48, 16: 13, 17: 9, 18: 1, 20: 1})
Not allowed lengths [18, 20]
Number of tokens after filtering 18860
Dataset preparation successful.. Check ./data
