## Imports

In [15]:
import pycrfsuite
from collections import defaultdict
import os
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score
from itertools import chain

## Helper functions for converting data into a suitable input format for CRFSuite

In [16]:
def return_file_path(dir_path, file_path):
	"""Build the path of ``file_path`` located inside ``dir_path``.

	Args:
		dir_path: directory component of the path.
		file_path: file (or sub-directory) name to append to ``dir_path``.

	Returns:
		The two components joined with ``os.path.join``.
	"""
	joined_path = os.path.join(dir_path, file_path)
	return joined_path


def convert_x_i(x_i):
	"""Turn one pixel vector into a feature mapping.

	Args:
		x_i: iterable of pixel values for a single item of a sequence.

	Returns:
		A ``defaultdict`` created without a default factory (so missing keys
		still raise ``KeyError``, like a plain dict). It holds a constant
		``'bias'`` feature equal to 1.0 plus one ``'pixel_<idx>'`` entry per
		element of ``x_i``, where ``<idx>`` is the element's position.
	"""
	features = defaultdict()
	features['bias'] = 1.0

	# enumerate pairs each value with its position, so no manual indexing is needed.
	for idx, pixel_value in enumerate(x_i):
		features["pixel_" + str(idx)] = pixel_value

	return features


def convert_x(file_path):
	"""Read a pixel file and build one feature mapping per non-blank line.

	Every non-blank line is split on whitespace, each token is parsed as a
	float, and the resulting vector is handed to ``convert_x_i``.
	Presumably each line describes one character image of a word (the
	caller pairs the result with the characters of the word) -- TODO confirm.

	Args:
		file_path: path of the text file to read.

	Returns:
		A list with one feature mapping per non-blank line, in file order.

	Raises:
		ValueError: if a token cannot be parsed as a float.
	"""
	x_arr = []

	with open(file_path, "r") as x_file:
		for x_i_str in x_file:
			x_i_str_arr = x_i_str.split()

			if not x_i_str_arr:
				# A blank line (e.g. a trailing newline at the end of the file)
				# carries no pixels; emitting a bias-only item for it would add
				# a phantom element to the sequence.
				continue

			x_i = [float(x_ij) for x_ij in x_i_str_arr]
			x_arr.append(convert_x_i(x_i))

	return x_arr


def prepare_data(data_dir, mode = "train"):
	"""Load one data split as parallel lists of feature and label sequences.

	Reads ``<data_dir>/<mode>_words.txt``; every non-blank line must hold two
	whitespace-separated fields: an image id and a word. For each entry the
	features are read from ``<data_dir>/<mode>_words/img_<id>.txt`` via
	``convert_x`` and the word is split into its characters, which serve as
	the label sequence.

	Args:
		data_dir: directory containing the word list and the image folder.
		mode: split name used to build the file names (default ``"train"``).

	Returns:
		A tuple ``(X, Y)``: ``X`` is a list of feature sequences and ``Y`` a
		list of character lists, aligned by index (one entry per word).

	Raises:
		ValueError: if a non-blank line does not contain exactly two fields.
	"""
	file_dir = return_file_path(data_dir, "{}_words".format(mode))
	words_file = return_file_path(data_dir, "{}_words.txt".format(mode))

	X = []
	Y = []

	with open(words_file) as f:
		for line in f:
			fields = line.split()

			if not fields:
				# Blank line (e.g. trailing newline at end of file): there is
				# nothing to load, and unpacking it below would raise.
				continue

			i, word = fields

			x_i_file_path = return_file_path(file_dir, "img_{}.txt".format(i))

			X.append(convert_x(x_i_file_path))
			Y.append(list(word))

	return X, Y

## Functions to train and test the model

In [17]:
def train_model(X, Y, max_iter_count,  model_store = "handwriting-reco.crfsuite"):
	"""Train a CRF with pycrfsuite and print the last iteration's log entry.

	Args:
		X: list of feature sequences.
		Y: list of label sequences, paired with ``X`` element by element
			(pairs are formed with ``zip``).
		max_iter_count: value used for the ``max_iterations`` parameter.
		model_store: argument passed to ``trainer.train``; ``get_preds`` opens
			the same default name.
	"""
	crf_trainer = pycrfsuite.Trainer(verbose=False)

	for features_seq, labels_seq in zip(X, Y):
		crf_trainer.append(features_seq, labels_seq)

	hyper_params = {
		# coefficient for L1 penalty
		'c1': 1.0,
		# coefficient for L2 penalty
		'c2': 1e-3,
		# upper bound on the number of training iterations
		'max_iterations': max_iter_count,
		# include transitions that are possible, but not observed
		'feature.possible_transitions': True,
	}
	crf_trainer.set_params(hyper_params)

	crf_trainer.train(model_store)

	print(crf_trainer.logparser.last_iteration)


def get_preds(X, model_store = "handwriting-reco.crfsuite"):
	"""Tag every sequence in ``X`` with the model stored at ``model_store``.

	Args:
		X: iterable of feature sequences.
		model_store: model file opened by the tagger (defaults to the name
			``train_model`` uses).

	Returns:
		A list with one predicted label sequence per element of ``X``.
	"""
	tagger = pycrfsuite.Tagger()
	tagger.open(model_store)

	try:
		return [tagger.tag(xseq) for xseq in X]
	finally:
		# Release the model handle even if tagging fails part-way through.
		tagger.close()


def test_model(X_test, Y_test, model_store = "handwriting-reco.crfsuite"):
	"""Print the per-label accuracy of the stored model on a test set.

	The predicted and reference label sequences are flattened into two flat
	lists of labels, which are compared position by position.

	Args:
		X_test: list of feature sequences to tag.
		Y_test: list of reference label sequences, aligned with ``X_test``.
		model_store: model file to evaluate (defaults to the name used by
			``train_model`` and ``get_preds``).
	"""
	Y_test_pred = get_preds(X_test, model_store)

	y_test_combined = list(chain.from_iterable(Y_test))
	y_pred_combined = list(chain.from_iterable(Y_test_pred))

	# Labels are compared directly: binarizing them first is unnecessary and
	# can hide errors, because a predicted label absent from Y_test is encoded
	# as zeros, which coincides with a real class when there are at most two.
	accuracy = accuracy_score(y_test_combined, y_pred_combined)

	# Call form of print: valid on both Python 2 and Python 3.
	print("Test accuracy : {}".format(accuracy))

## Train the model for 500 iterations

In [19]:
# Load the training split from data_dir and train the CRF for 500 iterations.
data_dir = './data'
X_train, Y_train = prepare_data(data_dir)
train_model(X_train, Y_train, 500)

# Call form of print: valid on both Python 2 and Python 3.
print("Training successful with 500 iterations.. Enable verbose in the CRF model above and re-run to track progress")

{'loss': 51853.452636, 'error_norm': 57.176499, 'linesearch_trials': 2, 'active_features': 3636, 'num': 500, 'time': 1.73, 'scores': {}, 'linesearch_step': 0.5, 'feature_norm': 85.522574}
Training successful with 500 iterations.. Enable verbose in the CRF model above and re-run to track progress


## Test the model

In [20]:
# Load the "test" split from the same data directory, then print the
# accuracy of the trained model on it.
X_test, Y_test = prepare_data(data_dir, "test")
test_model(X_test, Y_test)

Test accuracy : 0.853043730931
