## Imports

In [4]:
import pycrfsuite
from collections import defaultdict
import os
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score
from itertools import chain

## Helper functions for converting the data into a suitable input format for CRFSuite

In [5]:
def return_file_path(dir_path, file_path):
	"""Return ``file_path`` joined onto ``dir_path`` using ``os.path.join``."""
	joined_path = os.path.join(dir_path, file_path)
	return joined_path


def convert_x_i(x_i):
	"""Build the feature mapping for a single item of a sequence.

	Args:
		x_i: iterable of values for one item; the value at position ``idx``
			is stored under the key ``"pixel_<idx>"``.

	Returns:
		A plain dict containing ``'bias'`` mapped to ``1.0`` plus one
		``"pixel_<idx>"`` entry per element of ``x_i``.
	"""
	# A regular dict is sufficient: no default factory is ever needed here.
	features = {'bias': 1.0}

	for idx, pixel_value in enumerate(x_i):
		features["pixel_" + str(idx)] = pixel_value

	return features


def convert_x(file_path):
	"""Read a whitespace-separated numeric file and featurize it line by line.

	Every line of ``file_path`` is stripped, split on whitespace and its tokens
	parsed as floats; the resulting list is handed to ``convert_x_i``.

	Returns:
		A list with one ``convert_x_i`` result per line, in file order.
	"""
	with open(file_path, "r") as pixel_file:
		parsed_rows = (
			[float(token) for token in raw_line.strip().split()]
			for raw_line in pixel_file
		)
		# Consume the lazy rows inside the ``with`` while the file is still open.
		return [convert_x_i(row) for row in parsed_rows]


def prepare_data(data_dir, mode = "train"):
	"""Load the feature sequences and label sequences for one data split.

	Reads ``<data_dir>/<mode>_words.txt``, where each non-blank line holds two
	whitespace-separated fields: an identifier and a word. For every such line
	the file ``<data_dir>/<mode>_words/img_<identifier>.txt`` is featurized
	with ``convert_x`` and the word is split into its characters.

	Args:
		data_dir: directory holding the words file and the ``<mode>_words`` folder.
		mode: split name used to build the file and folder names.

	Returns:
		A tuple ``(X, Y)`` of parallel lists: ``X[k]`` is the ``convert_x``
		result for the k-th word and ``Y[k]`` is the list of its characters.

	Raises:
		ValueError: if a non-blank line does not contain exactly two fields.
	"""
	file_dir = return_file_path(data_dir, "{}_words".format(mode))
	words_file = return_file_path(data_dir, "{}_words.txt".format(mode))

	X = []
	Y = []

	with open(words_file) as f:
		for line in f:
			line = line.strip()
			if not line:
				# Tolerate blank lines (e.g. trailing empty lines) instead of
				# failing on the two-field unpack below.
				continue

			i, word = line.split()

			x_i_file_path = return_file_path(file_dir, "img_{}.txt".format(i))

			X.append(convert_x(x_i_file_path))
			Y.append(list(word))

	return X, Y

## Functions to train and test the model

In [6]:
def train_model(X, Y, max_iter_count,  model_store = "handwriting-reco.crfsuite", c1 = 1.0, c2 = 1e-3):
	"""Train a CRF on the given sequences and write the model to disk.

	Args:
		X: list of feature sequences (one sequence per training example).
		Y: list of label sequences, paired with ``X`` via ``zip``.
		max_iter_count: value passed as the trainer's ``max_iterations``.
		model_store: path the trained model is written to.
		c1: coefficient for the L1 penalty.
		c2: coefficient for the L2 penalty.

	Side effects:
		Writes the model file at ``model_store`` and prints the trainer's
		``logparser.last_iteration`` summary.
	"""
	trainer = pycrfsuite.Trainer(verbose=True)

	for xseq, yseq in zip(X, Y):
		trainer.append(xseq, yseq)

	trainer.set_params({
	    'c1': c1,   # coefficient for L1 penalty
	    'c2': c2,  # coefficient for L2 penalty
	    'max_iterations': max_iter_count,  # upper bound on optimizer iterations

	    # include transitions that are possible, but not observed
	    'feature.possible_transitions': True
	})

	trainer.train(model_store)

	print(trainer.logparser.last_iteration)


def get_preds(X, model_store = "handwriting-reco.crfsuite"):
	"""Tag every sequence in ``X`` with the model stored at ``model_store``.

	Args:
		X: iterable of feature sequences.
		model_store: path of the trained model file to open.

	Returns:
		A list with one ``tagger.tag`` result per sequence in ``X``, in order.
	"""
	tagger = pycrfsuite.Tagger()
	tagger.open(model_store)

	try:
		return [tagger.tag(x) for x in X]
	finally:
		# Release the opened model even if tagging raises.
		tagger.close()


def test_model(X_test, Y_test, model_store = "handwriting-reco.crfsuite"):
	"""Print the per-label accuracy of a stored model on a test set.

	Args:
		X_test: list of feature sequences to tag.
		Y_test: list of true label sequences, parallel to ``X_test``.
		model_store: path of the trained model file handed to ``get_preds``.

	Side effects:
		Prints ``"Test accuracy : <score>"``.
	"""
	Y_test_pred = get_preds(X_test, model_store)

	# Flatten the per-sequence labels so accuracy is computed over every
	# individual label; accuracy_score compares the labels directly.
	y_test_combined = list(chain.from_iterable(Y_test))
	y_pred_combined = list(chain.from_iterable(Y_test_pred))

	# Function-call form of print works on both Python 2 and Python 3.
	print("Test accuracy : {}".format(accuracy_score(y_test_combined, y_pred_combined)))

## Train the model for 500 iterations

In [7]:
# Dataset location, relative to the notebook's working directory.
data_dir = "./data"

# Build the training sequences, then fit the model with max_iter_count=500.
X_train, Y_train = prepare_data(data_dir, mode="train")
train_model(X_train, Y_train, max_iter_count=500)

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 4030
Seconds required: 1.388

L-BFGS optimization
c1: 1.000000
c2: 0.001000
num_memories: 6
max_iterations: 500
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 305464.683024
Feature norm: 1.000000
Error norm: 44264.474734
Active features: 4027
Line search trials: 1
Line search step: 0.000014
Seconds required for this iteration: 1.616

***** Iteration #2 *****
Loss: 249870.246174
Feature norm: 2.854602
Error norm: 38458.005653
Active features: 3842
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.794

***** Iteration #3 *****
Loss: 204309.033442
Feature norm: 5.424405
Error norm: 46755.305055
Active features: 3807
Line search trials: 1
Line search step: 1.000000
Seconds required for 

***** Iteration #39 *****
Loss: 59719.593160
Feature norm: 57.499783
Error norm: 2837.529307
Active features: 3984
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.902

***** Iteration #40 *****
Loss: 59530.194254
Feature norm: 57.995971
Error norm: 3219.174778
Active features: 3990
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.875

***** Iteration #41 *****
Loss: 59330.155473
Feature norm: 58.327949
Error norm: 2884.621778
Active features: 3990
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.851

***** Iteration #42 *****
Loss: 59127.434118
Feature norm: 58.732059
Error norm: 2722.369905
Active features: 3983
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.814

***** Iteration #43 *****
Loss: 58961.923237
Feature norm: 59.050500
Error norm: 2478.414026
Active features: 3980
Line search trials: 1
Line search step: 1.000000
Seco

***** Iteration #82 *****
Loss: 55157.716081
Feature norm: 70.622794
Error norm: 1790.190458
Active features: 3937
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.872

***** Iteration #83 *****
Loss: 55070.633346
Feature norm: 70.818381
Error norm: 1170.700224
Active features: 3938
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.852

***** Iteration #84 *****
Loss: 55044.717978
Feature norm: 71.182226
Error norm: 1735.071404
Active features: 3940
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.834

***** Iteration #85 *****
Loss: 54964.523731
Feature norm: 71.385368
Error norm: 1153.235764
Active features: 3938
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.872

***** Iteration #86 *****
Loss: 54945.478770
Feature norm: 71.743788
Error norm: 1736.179888
Active features: 3933
Line search trials: 1
Line search step: 1.000000
Seco

***** Iteration #125 *****
Loss: 53502.005604
Feature norm: 79.777430
Error norm: 513.810554
Active features: 3890
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.660

***** Iteration #126 *****
Loss: 53486.184083
Feature norm: 79.920090
Error norm: 629.250791
Active features: 3889
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.675

***** Iteration #127 *****
Loss: 53464.738768
Feature norm: 80.044109
Error norm: 454.358429
Active features: 3888
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.779

***** Iteration #128 *****
Loss: 53447.109913
Feature norm: 80.161570
Error norm: 546.210522
Active features: 3882
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.674

***** Iteration #129 *****
Loss: 53427.786484
Feature norm: 80.295721
Error norm: 537.175386
Active features: 3885
Line search trials: 2
Line search step: 0.500000
Seco

***** Iteration #167 *****
Loss: 52959.749013
Feature norm: 83.625261
Error norm: 367.058816
Active features: 3833
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.761

***** Iteration #168 *****
Loss: 52952.056078
Feature norm: 83.678056
Error norm: 369.000333
Active features: 3833
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.709

***** Iteration #169 *****
Loss: 52944.378318
Feature norm: 83.736066
Error norm: 360.475920
Active features: 3834
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.675

***** Iteration #170 *****
Loss: 52936.665496
Feature norm: 83.782296
Error norm: 344.938729
Active features: 3834
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.750

***** Iteration #171 *****
Loss: 52930.069117
Feature norm: 83.831110
Error norm: 345.651034
Active features: 3839
Line search trials: 2
Line search step: 0.500000
Seco

***** Iteration #208 *****
Loss: 52741.445236
Feature norm: 84.236454
Error norm: 243.228607
Active features: 3800
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.729

***** Iteration #209 *****
Loss: 52737.946079
Feature norm: 84.247605
Error norm: 225.594058
Active features: 3801
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.593

***** Iteration #210 *****
Loss: 52734.601377
Feature norm: 84.264083
Error norm: 265.130611
Active features: 3801
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.577

***** Iteration #211 *****
Loss: 52730.957732
Feature norm: 84.279427
Error norm: 224.750317
Active features: 3798
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.731

***** Iteration #212 *****
Loss: 52727.923327
Feature norm: 84.287309
Error norm: 263.998034
Active features: 3800
Line search trials: 2
Line search step: 0.500000
Seco

***** Iteration #249 *****
Loss: 52611.284012
Feature norm: 85.075678
Error norm: 192.474728
Active features: 3770
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.745

***** Iteration #250 *****
Loss: 52608.480704
Feature norm: 85.093584
Error norm: 166.588417
Active features: 3770
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.622

***** Iteration #251 *****
Loss: 52606.553851
Feature norm: 85.098390
Error norm: 241.049043
Active features: 3770
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.584

***** Iteration #252 *****
Loss: 52604.380139
Feature norm: 85.109784
Error norm: 206.970176
Active features: 3771
Line search trials: 3
Line search step: 0.250000
Seconds required for this iteration: 2.526

***** Iteration #253 *****
Loss: 52602.861743
Feature norm: 85.129569
Error norm: 457.831565
Active features: 3765
Line search trials: 2
Line search step: 0.500000
Seco

***** Iteration #289 *****
Loss: 52534.814090
Feature norm: 85.330335
Error norm: 176.009848
Active features: 3756
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.631

***** Iteration #290 *****
Loss: 52533.415598
Feature norm: 85.341907
Error norm: 163.789207
Active features: 3755
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.595

***** Iteration #291 *****
Loss: 52532.310223
Feature norm: 85.338302
Error norm: 198.524369
Active features: 3752
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.627

***** Iteration #292 *****
Loss: 52530.726682
Feature norm: 85.349803
Error norm: 167.948598
Active features: 3752
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.737

***** Iteration #293 *****
Loss: 52529.640623
Feature norm: 85.345883
Error norm: 187.593020
Active features: 3751
Line search trials: 2
Line search step: 0.500000
Seco

***** Iteration #332 *****
Loss: 52490.309510
Feature norm: 85.623958
Error norm: 118.720694
Active features: 3722
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.575

***** Iteration #333 *****
Loss: 52489.457161
Feature norm: 85.630246
Error norm: 111.784456
Active features: 3719
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.738

***** Iteration #334 *****
Loss: 52488.736798
Feature norm: 85.636785
Error norm: 134.784305
Active features: 3716
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.621

***** Iteration #335 *****
Loss: 52487.851778
Feature norm: 85.643007
Error norm: 121.417370
Active features: 3715
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.625

***** Iteration #336 *****
Loss: 52487.085083
Feature norm: 85.649784
Error norm: 136.274025
Active features: 3716
Line search trials: 2
Line search step: 0.500000
Seco

***** Iteration #373 *****
Loss: 52455.627051
Feature norm: 85.080580
Error norm: 99.644516
Active features: 3711
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.574

***** Iteration #374 *****
Loss: 52455.113063
Feature norm: 85.069826
Error norm: 109.999891
Active features: 3710
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.749

***** Iteration #375 *****
Loss: 52454.271295
Feature norm: 85.059411
Error norm: 94.718344
Active features: 3712
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.616

***** Iteration #376 *****
Loss: 52453.746450
Feature norm: 85.050479
Error norm: 103.711385
Active features: 3710
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.573

***** Iteration #377 *****
Loss: 52453.208158
Feature norm: 85.036733
Error norm: 108.622093
Active features: 3709
Line search trials: 2
Line search step: 0.500000
Second

***** Iteration #413 *****
Loss: 52436.807928
Feature norm: 84.772503
Error norm: 71.700369
Active features: 3682
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.723

***** Iteration #414 *****
Loss: 52436.508071
Feature norm: 84.768028
Error norm: 84.889136
Active features: 3682
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.674

***** Iteration #415 *****
Loss: 52436.016963
Feature norm: 84.761361
Error norm: 65.701498
Active features: 3681
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.765

***** Iteration #416 *****
Loss: 52435.740688
Feature norm: 84.754189
Error norm: 86.212707
Active features: 3681
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.641

***** Iteration #417 *****
Loss: 52435.283284
Feature norm: 84.746933
Error norm: 76.384744
Active features: 3680
Line search trials: 2
Line search step: 0.500000
Seconds r

***** Iteration #457 *****
Loss: 52420.931435
Feature norm: 84.334695
Error norm: 57.487642
Active features: 3669
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.719

***** Iteration #458 *****
Loss: 52420.692792
Feature norm: 84.329257
Error norm: 60.184252
Active features: 3669
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.691

***** Iteration #459 *****
Loss: 52420.461177
Feature norm: 84.331598
Error norm: 51.172132
Active features: 3669
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.574

***** Iteration #460 *****
Loss: 52420.303965
Feature norm: 84.327274
Error norm: 71.789541
Active features: 3668
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.630

***** Iteration #461 *****
Loss: 52420.032894
Feature norm: 84.328956
Error norm: 66.579101
Active features: 3671
Line search trials: 2
Line search step: 0.500000
Seconds r

***** Iteration #497 *****
Loss: 52412.607479
Feature norm: 84.260316
Error norm: 51.985598
Active features: 3659
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.576

***** Iteration #498 *****
Loss: 52412.472793
Feature norm: 84.259050
Error norm: 56.409981
Active features: 3659
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.658

***** Iteration #499 *****
Loss: 52412.302938
Feature norm: 84.261066
Error norm: 48.858489
Active features: 3656
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.773

***** Iteration #500 *****
Loss: 52412.176633
Feature norm: 84.258839
Error norm: 54.749534
Active features: 3655
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.615

L-BFGS terminated with the maximum number of iterations
Total seconds required for training: 741.283

Storing the model
Number of active features: 3655 (4030)
Number of act

## Test the model

In [9]:
# Load the "test" split and print the accuracy of the stored model on it.
X_test, Y_test = prepare_data(data_dir, "test")
test_model(X_test=X_test, Y_test=Y_test)

Test accuracy : 0.847740810693
