Load the crossword-clue corpus from disk, convert the clues to bag-of-words count vectors, and split the dataset into 80% training and 20% testing

In [1]:
from nolearn.lasagne import NeuralNet
from lasagne import layers
from lasagne import nonlinearities
from lasagne.updates import nesterov_momentum
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Directory that holds the file list and the clue files. The trailing slash is
# required because paths are built by plain string concatenation.
SEARCH_CLUES_PATH = "./cw-large/"
# Alternative corpus location (disabled).
#SEARCH_CLUES_PATH = "/data0/corpora/crossword_corpus/"

def process_filename(line):
	"""Strip the trailing newline from one entry of the file list.

	Args:
		line: A raw line read from the list file.

	Returns:
		The line without trailing newline characters. A line that contains
		only whitespace is returned unchanged, so callers that want to skip
		blank entries have to check for them.
	"""
	# The body is indented with tabs only. The original mixed a space-indented
	# ``return`` with tab-indented code, which Python 3 rejects (TabError).
	if line.strip() != "":
		line = line.rstrip('\n')
	return line

def process_txt(line):
	"""Extract the clue text from one tab-separated line of a clue file.

	Args:
		line: A raw line. The text before the first tab is the clue; whatever
			follows (the original code named that field ``url``) is discarded.

	Returns:
		The clue string, or None when the line is blank.
	"""
	if line.strip() == "":
		return None
	# partition() never raises: a line with no tab, or with several tabs,
	# still yields its first field instead of failing to unpack.
	clues, _, _ = line.rstrip('\n').partition('\t')
	return clues

def process_input():
	"""Read every clue file named in the list file and vectorize the clues.

	Each non-blank entry of ``random1000.list`` (under ``SEARCH_CLUES_PATH``)
	names a clue file in the same directory. Every clue read from a file is
	labelled with the zero-based position of that file among the non-blank
	entries of the list.

	Returns:
		A tuple ``(features, labels)``: the dense term-count matrix produced
		by ``CountVectorizer`` (one row per clue) and a NumPy array holding
		the integer label of each row.
	"""
	sentences = []
	labels = []
	label = 0
	# ``with`` closes both files deterministically; the original left every
	# handle open until garbage collection.
	with open(SEARCH_CLUES_PATH + 'random1000.list', 'r') as list_file:
		for entry in list_file:
			filename = process_filename(entry)
			if not filename or filename.strip() == "":
				# Blank entry in the list: there is no file to open.
				continue
			with open(SEARCH_CLUES_PATH + filename, 'r') as clue_file:
				for raw_line in clue_file:
					clue = process_txt(raw_line)
					if clue is None:
						# process_txt returns None for blank lines; a None
						# document would make the vectorizer fail.
						continue
					sentences.append(clue)
					labels.append(label)
			label += 1
	vectorizer = CountVectorizer(min_df=1, encoding='latin_1')
	# toarray() materialises the sparse count matrix as a dense array.
	features = vectorizer.fit_transform(sentences).toarray()
	return features, np.array(labels)

def split_data(input, output, test_size=0.2, random_state=None):
	"""Split features and labels into training and test subsets.

	Args:
		input: Feature matrix, one row per sample.
		output: Labels aligned with the rows of ``input``.
		test_size: Fraction of the samples held out for testing. Defaults to
			0.2, the value that used to be hard-coded.
		random_state: Seed forwarded to ``train_test_split``. The default
			``None`` keeps the previous unseeded behaviour; pass an integer
			to make the split reproducible.

	Returns:
		``(x_train, x_test, y_train, y_test)``.
	"""
	x_train, x_test, y_train, y_test = train_test_split(
		input, output, test_size=test_size, random_state=random_state)
	return x_train, x_test, y_train, y_test

# Build the count matrix and labels, then split them into train and test sets.
x, y = process_input()
x_train, x_test, y_train, y_test = split_data(x, y)



### Timer

In [2]:
import time
from functools import wraps
 
def fn_timer(function):
	"""Decorator that prints how long each call to ``function`` takes.

	Args:
		function: The callable to wrap.

	Returns:
		A wrapper that forwards all arguments to ``function``, prints the
		elapsed wall-clock time measured with ``time.time()``, and returns
		the wrapped call's result. Nothing is printed if the call raises.
	"""
	@wraps(function)
	def function_timer(*args, **kwargs):
		t0 = time.time()
		result = function(*args, **kwargs)
		t1 = time.time()
		# ``__name__`` exists on Python 2 and 3; the original ``func_name``
		# attribute is Python 2 only and raises AttributeError on Python 3.
		print ("Total time running %s: %s seconds" %
			   (function.__name__, str(t1-t0))
			   )
		return result
	return function_timer

#### Random Forest Classifier

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Baseline: random forest with the library's default hyper-parameters.
clf_rf = RandomForestClassifier()
clf_rf.fit(x_train, y_train)
y_pred_rf = clf_rf.predict(x_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
# Single-argument parenthesised print runs unchanged on Python 2 and 3 and
# emits the same text as the former print statement.
print ("random forest accuracy:  %s" % acc_rf)

random forest accuracy:  0.534373476353


#### Stochastic Gradient Descent

In [4]:
from sklearn.linear_model import SGDClassifier
# Baseline: linear model trained with stochastic gradient descent, using the
# library's default hyper-parameters.
clf_sgd = SGDClassifier()
clf_sgd.fit(x_train, y_train)
y_pred_sgd = clf_sgd.predict(x_test)
acc_sgd = accuracy_score(y_test, y_pred_sgd)
# Single-argument parenthesised print runs unchanged on Python 2 and 3 and
# emits the same text as the former print statement.
print ("stochastic gradient descent accuracy:  %s" % acc_sgd)

stochastic gradient descent accuracy:  0.618722574354


#### Support Vector Machine

In [5]:
from sklearn.svm import LinearSVC
# Baseline: linear support vector classifier with the library's default
# hyper-parameters.
clf_svm = LinearSVC()
clf_svm.fit(x_train, y_train)
y_pred_svm = clf_svm.predict(x_test)
acc_svm = accuracy_score(y_test, y_pred_svm)
# Single-argument parenthesised print runs unchanged on Python 2 and 3 and
# emits the same text as the former print statement.
print ("Linear SVM accuracy:  %s" % acc_svm)

Linear SVM accuracy:  0.658703071672


#### Nearest Neighbors

In [6]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline: k-nearest-neighbours classifier with the library's default
# hyper-parameters.
clf_knn = KNeighborsClassifier()
clf_knn.fit(x_train, y_train)
y_pred_knn = clf_knn.predict(x_test)
acc_knn = accuracy_score(y_test, y_pred_knn)
# Single-argument parenthesised print runs unchanged on Python 2 and 3 and
# emits the same text as the former print statement.
print ("nearest neighbors accuracy:  %s" % acc_knn)

nearest neighbors accuracy:  0.420770355924


#### Neural Network

In [8]:
@fn_timer
def train(x_train, y_train, num_classes=1000):
	"""Fit a feed-forward network with two hidden layers on the training data.

	Args:
		x_train: 2-D feature array; its second dimension sets the width of
			the input layer.
		y_train: Class labels aligned with the rows of ``x_train``.
		num_classes: Number of units in the softmax output layer. Defaults to
			1000, the value that used to be hard-coded.

	Returns:
		The ``NeuralNet`` instance after ``fit`` has been called on it.
	"""
	clf_nn = NeuralNet(
		layers=[  # input layer, two dense hidden layers, output layer
			('input', layers.InputLayer),
			('hidden1', layers.DenseLayer),
			('hidden2', layers.DenseLayer),
			('output', layers.DenseLayer),
			],
		# layer parameters:
		# The input width follows the data instead of the former literal 7773,
		# which only matched one particular vocabulary size.
		input_shape=(None, x_train.shape[1]),
		hidden1_num_units=10000,  # units in the first hidden layer
		hidden2_num_units=10000,  # units in the second hidden layer
		output_nonlinearity=nonlinearities.softmax,  # softmax over the output units
		output_num_units=num_classes,  # one output unit per class

		# optimization method:
		update=nesterov_momentum,
		update_learning_rate=0.01,
		update_momentum=0.9,

		max_epochs=50,  # number of training epochs
		verbose=1,
		)
	clf_nn.fit(x_train, y_train)
	return clf_nn

def test(clf_nn, x_test):
	return clf_nn.predict(x_test)

# Reuse the train/test split created in the first cell. Rebuilding the corpus
# and re-splitting here would draw a different random test set, so the network
# would not be scored on the same held-out rows as the other classifiers.
clf_nn = train(x_train, y_train)
y_pred_nn = test(clf_nn, x_test)
acc_nn = accuracy_score(y_test, y_pred_nn)
# Single-argument parenthesised print runs unchanged on Python 2 and 3 and
# emits the same text as the former print statement.
print ("neural network accuracy:  %s" % acc_nn)

# Neural Network with 187751000 learnable parameters

## Layer information

  #  name       size
---  -------  ------
  0  input      7773
  1  hidden1   10000
  2  hidden2   10000
  3  output     1000

  epoch    train loss    valid loss    train/val    valid acc  dur
-------  ------------  ------------  -----------  -----------  -------
      1       [36m6.85284[0m       [32m6.81208[0m      1.00598      0.04366  334.91s
      2       [36m6.61464[0m       [32m6.56107[0m      1.00817      0.04366  303.86s
      3       [36m6.09759[0m       [32m6.27026[0m      0.97246      0.04871  272.84s
      4       [36m5.66809[0m       [32m6.14618[0m      0.92221      0.04412  274.12s
      5       [36m5.42365[0m       [32m6.12558[0m      0.88541      0.05101  272.57s
      6       [36m5.29877[0m       6.13488      0.86371      0.05882  278.87s
      7       [36m5.22113[0m       6.14057      0.85027      0.06756  275.15s
      8       [36m5.15661[0m       6.14396      0.83