Load the clue dataset from the text files under `./cw/`, turn it into bag-of-words count vectors, and split it into 80% training and 20% testing.

In [14]:
from nolearn.lasagne import NeuralNet
from lasagne import layers
from lasagne import nonlinearities
from lasagne.updates import nesterov_momentum
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Directory prefix for the 'list' index file and the clue files it names.
# It is concatenated directly with file names in process_input, so the
# trailing slash is required.
SEARCH_CLUES_PATH = "./cw/"

def process_filename(line):
	"""Parse one row of the index file into a ``(file_id, filename)`` pair.

	The row must consist of exactly two tab-separated fields: an integer id
	followed by a file name. A trailing newline is removed before splitting.

	Returns:
		``(int, str)`` for a populated row, or ``None`` when the row contains
		only whitespace.

	Raises:
		ValueError: if the row does not have exactly two tab-separated fields
			or the first field is not an integer.
	"""
	if not line.strip():
		return None
	raw_id, name = line.rstrip('\n').split('\t')
	return int(raw_id), name

def process_txt(line):
	"""Extract the clue text from one row of a clue file.

	The row must consist of exactly two tab-separated fields: the clue text
	followed by a URL. Only the clue text is returned; the URL is discarded.

	Returns:
		The clue string for a populated row, or ``None`` when the row
		contains only whitespace.

	Raises:
		ValueError: if the row does not have exactly two tab-separated fields.
	"""
	if not line.strip():
		return None
	fields = line.rstrip('\n').split('\t')
	clues, _url = fields
	return clues

def process_input():
	"""Build the count-vector feature matrix and label vector from the clue files.

	Reads the index file ``SEARCH_CLUES_PATH + 'list'``; each populated row
	names a clue file and the integer id used as the label for every clue in
	that file. All clue strings are then vectorized with ``CountVectorizer``.

	Blank rows in the index file and in the clue files are skipped, so a
	trailing empty line no longer breaks the tuple unpacking or puts ``None``
	into the list of documents.

	Returns:
		``(features, labels)``: the dense array produced by
		``fit_transform(...).toarray()`` and a NumPy array holding one file id
		per clue, in the same order.
	"""
	sentences = []
	labels = []
	# Context managers close both files even if parsing raises.
	with open(SEARCH_CLUES_PATH + 'list', 'r') as index_file:
		for line in index_file:
			entry = process_filename(line)
			if entry is None:
				# Blank index row: nothing to load.
				continue
			file_id, filename = entry
			with open(SEARCH_CLUES_PATH + filename, 'r') as clue_file:
				for line2 in clue_file:
					clues = process_txt(line2)
					if clues is None:
						# Blank clue row: skip it so features and labels stay aligned.
						continue
					sentences.append(clues)
					labels.append(file_id)
	vectorizer = CountVectorizer(min_df=1, encoding='cp1252')
	# `features` replaces the old local name `input`, which shadowed the builtin.
	features = vectorizer.fit_transform(sentences).toarray()
	return features, np.array(labels)

def split_data(input, output, test_size=0.2, random_state=None):
	"""Split features and labels into training and testing subsets.

	Args:
		input: feature matrix, one row per sample.
		output: labels aligned with the rows of ``input``.
		test_size: value forwarded to ``train_test_split``; the default of
			0.2 keeps the original 80/20 split.
		random_state: seed forwarded to ``train_test_split``. The default
			``None`` keeps the original unseeded behaviour; pass an integer
			to get the same split on every run.

	Returns:
		``(x_train, x_test, y_train, y_test)``.
	"""
	x_train, x_test, y_train, y_test = train_test_split(
		input, output, test_size=test_size, random_state=random_state)
	return x_train, x_test, y_train, y_test

# Load and vectorize the clue data, then split it into the training and
# testing sets used by the model cells below.
x, y = process_input()
x_train, x_test, y_train, y_test = split_data(x, y)

### Timer

In [15]:
import time
from functools import wraps
 
def fn_timer(function):
	"""Decorator that prints the wall-clock time taken by each call.

	The wrapped function's arguments and return value pass through unchanged,
	and ``functools.wraps`` preserves its name and docstring. If the wrapped
	function raises, the exception propagates and nothing is printed.

	Args:
		function: the callable to time.

	Returns:
		A wrapper with the same call signature as ``function``.
	"""
	@wraps(function)
	def function_timer(*args, **kwargs):
		t0 = time.time()
		result = function(*args, **kwargs)
		t1 = time.time()
		# `__name__` exists on both Python 2 and 3; the previous `func_name`
		# attribute was removed in Python 3 and raised AttributeError there.
		print ("Total time running %s: %s seconds" %
			   (function.__name__, str(t1-t0))
			   )
		return result
	return function_timer

#### Random Forest Classifier

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Fit a random forest with the library's default settings and score it on
# the held-out test split.
clf_rf = RandomForestClassifier()
clf_rf.fit(x_train, y_train)
y_pred_rf = clf_rf.predict(x_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
# One pre-formatted string prints the same text on Python 2 and Python 3;
# the bare `print a, b` statement is a SyntaxError on Python 3.
print("random forest accuracy:  %s" % acc_rf)

random forest accuracy:  0.869951534733


#### Stochastic Gradient Descent

In [17]:
from sklearn.linear_model import SGDClassifier
# Fit a linear model trained by stochastic gradient descent (default
# settings) and score it on the held-out test split.
clf_sgd = SGDClassifier()
clf_sgd.fit(x_train, y_train)
y_pred_sgd = clf_sgd.predict(x_test)
acc_sgd = accuracy_score(y_test, y_pred_sgd)
# One pre-formatted string prints the same text on Python 2 and Python 3;
# the bare `print a, b` statement is a SyntaxError on Python 3.
print("stochastic gradient descent accuracy:  %s" % acc_sgd)

stochastic gradient descent accuracy:  0.903877221325


#### Support Vector Machine

In [18]:
from sklearn.svm import LinearSVC
# Fit a linear support vector classifier (default settings) and score it on
# the held-out test split.
clf_svm = LinearSVC()
clf_svm.fit(x_train, y_train)
y_pred_svm = clf_svm.predict(x_test)
acc_svm = accuracy_score(y_test, y_pred_svm)
# One pre-formatted string prints the same text on Python 2 and Python 3;
# the bare `print a, b` statement is a SyntaxError on Python 3.
print("Linear SVM accuracy:  %s" % acc_svm)

Linear SVM accuracy:  0.903877221325


#### Nearest Neighbors

In [19]:
from sklearn.neighbors import KNeighborsClassifier
# Fit a k-nearest-neighbours classifier (default settings) and score it on
# the held-out test split.
clf_knn = KNeighborsClassifier()
clf_knn.fit(x_train, y_train)
y_pred_knn = clf_knn.predict(x_test)
acc_knn = accuracy_score(y_test, y_pred_knn)
# One pre-formatted string prints the same text on Python 2 and Python 3;
# the bare `print a, b` statement is a SyntaxError on Python 3.
print("nearest neighbors accuracy:  %s" % acc_knn)

nearest neighbors accuracy:  0.79563812601


#### Neural Network

In [20]:
@fn_timer
def train(x_train, y_train):
	"""Fit a feed-forward network with two hidden layers on the training data.

	The input layer width is taken from ``x_train.shape[1]`` so the network
	always matches the fitted vocabulary size, instead of a hard-coded 2538
	that breaks as soon as the clue files change.

	Args:
		x_train: 2-D feature matrix, one row per sample.
		y_train: class labels aligned with the rows of ``x_train``.
			NOTE(review): the output layer has 10 units, so this presumably
			expects labels in 0..9; confirm against the index file.

	Returns:
		The fitted ``NeuralNet`` instance.
	"""
	clf_nn = NeuralNet(
		layers=[  # four layers: input, two hidden, output
			('input', layers.InputLayer),
			('hidden1', layers.DenseLayer),
			('hidden2', layers.DenseLayer),
			('output', layers.DenseLayer),
			],
		# layer parameters:
		input_shape=(None, x_train.shape[1]),  # one input unit per feature column
		hidden1_num_units=100,  # number of units in the first hidden layer
		hidden2_num_units=100,  # number of units in the second hidden layer
		output_nonlinearity=nonlinearities.softmax,  # output layer uses softmax
		output_num_units=10,  # 10 target values

		# optimization method:
		update=nesterov_momentum,
		update_learning_rate=0.01,
		update_momentum=0.9,

		max_epochs=50,  # we want to train this many epochs
		verbose=1,
		)
	clf_nn.fit(x_train, y_train)
	return clf_nn

def test(clf_nn, x_test):
	return clf_nn.predict(x_test)

# Reuse the train/test split created in the data-loading cell. Reloading and
# re-splitting here drew a new random split, so the network was scored on a
# different test set from the other models and the accuracies were not
# directly comparable.
clf_nn = train(x_train, y_train)
y_pred_nn = test(clf_nn, x_test)
acc_nn = accuracy_score(y_test, y_pred_nn)
# One pre-formatted string prints the same text on Python 2 and Python 3;
# the bare `print a, b` statement is a SyntaxError on Python 3.
print("neural network accuracy:  %s" % acc_nn)

# Neural Network with 265010 learnable parameters

## Layer information

  #  name       size
---  -------  ------
  0  input      2538
  1  hidden1     100
  2  hidden2     100
  3  output       10

  epoch    train loss    valid loss    train/val    valid acc  dur
-------  ------------  ------------  -----------  -----------  -----
      1       [36m2.19567[0m       [32m2.05797[0m      1.06691      0.29450  0.35s
      2       [36m1.98207[0m       [32m1.93797[0m      1.02276      0.29450  0.37s
      3       [36m1.91680[0m       [32m1.90710[0m      1.00509      0.29578  0.32s
      4       [36m1.89085[0m       [32m1.88642[0m      1.00235      0.30780  0.34s
      5       [36m1.86547[0m       [32m1.86059[0m      1.00262      0.35991  0.31s
      6       [36m1.83209[0m       [32m1.82454[0m      1.00414      0.41542  0.31s
      7       [36m1.78531[0m       [32m1.77345[0m      1.00668      0.45657  0.31s
      8       [36m1.71973[0m       [32m1.70335[0m  