**Questions**

- What common file type should we use between the embedding models and the downstream tests? txt would be more general / accessible.
- What should the stopping criterion for training be? How do we ensure that this means a fair comparison with the word2vec model?
- All hyperparameters set to tuned values stated in paper. Batch size and learning rate set to 'defaults'. Are we happy with this?
- Not sure about the way loss functions are summed up each epoch. In paper they're averaged across data points. In the code they're averaged across data points... then summed across batches. Should we leave this?
- Should we compare the actual sparsity of the embeddings out of all methods with a single metric? cf. sparsity metric in this script

In [0]:
import numpy as np
import torch
import time

from torch import nn
from random import shuffle
from sklearn.datasets import make_blobs

In [0]:
# mount Google Drive at /content/gdrive (Colab only) so files stored there can be opened by path

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# test that mounting to drive works

def loadData(filename):
  """Load word embeddings from a whitespace-delimited text file.

  Each non-blank line is expected to hold a word followed by the
  components of its embedding vector, e.g. ``cat 0.1 -0.3 0.7``.

  Args:
    filename: path of the embeddings text file.

  Returns:
    numpy array with one row per word and one column per embedding
    component (the words themselves are not returned).
  """
  data = []
  # the context manager closes the file even if a line fails to parse
  with open(filename) as f:
    for line in f:
      tokens = line.strip().split()
      if not tokens: # skip blank lines (e.g. an empty line at the end of the file)
        continue
      data.append([float(i) for i in tokens[1:]]) # tokens[0] is the word, the rest is its vector
  return np.array(data)

# smoke test: read the embeddings file from the mounted drive and print the (rows, columns) of the resulting array
data = loadData('/content/gdrive/My Drive/NLP Class/WGL/embeddings/word2vec_original_15k_300d_train.txt')
print(data.shape)

(15000, 300)


**utils**
- constructs DataHandler class to load, retrieve and process data
- defines 3 helper functions

In [0]:
### DataHandler ----------------------------------------------------------------

class DataHandler:
	"""Loads word embeddings from a text file and serves them in batches.

	After loadData() the instance exposes:
	  words         -- list of words, in file order
	  data          -- (number of words x embedding size) array; row order changes when shuffled
	  original_data -- independent copy of the array in file order
	  data_size     -- number of words
	  inp_dim       -- number of elements in each embedding
	"""

	def __init__(self):
		pass

	def loadData(self, filename):
		"""Read a text file with one `word v1 v2 ... vn` entry per line.

		Blank lines are skipped. The file is closed before returning,
		even if a line fails to parse.
		"""
		words = []
		vectors = []
		with open(filename) as f:
			for line in f:
				tokens = line.strip().split()
				if not tokens: # blank line: nothing to parse
					continue
				words.append(tokens[0]) # first token is the word
				vectors.append([float(i) for i in tokens[1:]]) # remaining tokens are the embedding vector
		self.words = words
		self.data = np.array(vectors)
		self.data_size = self.data.shape[0] # data_size = number of words
		self.inp_dim = self.data.shape[1] # number of elements in each embedding (e.g. 300 for standard word2vec)
		# a real copy (a [:] slice of an array is only a view), so later
		# in-place changes to self.data cannot alter the stored original
		self.original_data = self.data.copy()

	def getWordsList(self):
		"""Return the list of words (in file order)."""
		return self.words

	def getDataShape(self):
		"""Return the shape of the data array: (number of words, embedding size)."""
		return self.data.shape

	def resetDataOrder(self):
		"""Restore the data array to file order so that rows line up with self.words again."""
		self.data = self.original_data.copy()

	def getNumberOfBatches(self, batch_size):
		"""Return the number of batches needed to cover the data, counting a final partial batch."""
		# ceiling division carried out entirely in integer arithmetic
		return (self.data_size + batch_size - 1) // batch_size

	def getBatch(self, i, batch_size, noise_level, denoising):
		"""Return (input, target) arrays for the i-th batch.

		The target is rows [i*batch_size, (i+1)*batch_size) of the data,
		cut short at the end of the data. The input is the same array
		unless denoising is true, in which case noise generated by
		get_noise_features(..., noise_level) is added to it.
		"""
		batch_y = self.data[i*batch_size:min((i+1)*batch_size, self.data_size)]
		batch_x = batch_y
		if denoising:
			# the addition creates a new array, so the stored data is not modified
			batch_x = batch_y + get_noise_features(batch_y.shape[0], self.inp_dim, noise_level)
		return batch_x, batch_y

	def shuffleTrain(self):
		"""Shuffle the rows of self.data (returns nothing; self.words and original_data are untouched)."""
		indices = np.random.permutation(self.data_size)
		self.data = self.data[indices] # index-array selection builds a new array

### Helper functions -----------------------------------------------------------

def compute_sparsity(X):
	"""Return the percentage (0-100) of the elements of array X that are exactly zero.

	Every element counts equally, so the result is a single scalar
	whatever the number of dimensions of X.
	"""
	nonzero_count = 1. * np.count_nonzero(X)
	zero_fraction = 1 - nonzero_count / X.size
	return 100. * zero_fraction

def dump_vectors(X, outfile, words):
	"""Write embeddings to a text file, one word per line.

	Every line has the form `word v1 v2 ... vn ` followed by a newline:
	the word, then each element of its vector, each followed by one space.

	Args:
		X: array with one row per word (its shape is printed before writing).
		outfile: path of the text file to create or overwrite.
		words: list of words; words[i] labels row X[i].

	Raises:
		AssertionError: if X and words do not have the same length.
	"""
	print ("shape", X.shape)
	assert len(X) == len(words), \
		"number of vectors (%d) does not match number of words (%d)" % (len(X), len(words))
	# the context manager closes (and flushes) the file even if a write fails part-way
	with open(outfile, 'w') as fw:
		for word, vector in zip(words, X):
			fw.write(word + " " + "".join(str(j) + " " for j in vector) + "\n")

def get_noise_features(n_samples, n_features, noise_amount):
	"""Generate an (n_samples x n_features) array of Gaussian noise.

	Args:
		n_samples: number of rows (one per word in the batch).
		n_features: number of columns (embedding length).
		noise_amount: standard deviation of the noise; the mean is 0.

	Returns:
		float array of shape (n_samples, n_features) with independent
		draws from N(0, noise_amount**2).
	"""
	# drawn directly from numpy's global random state: independent zero-mean
	# noise does not need a clustering-dataset generator
	return np.random.normal(loc=0.0, scale=noise_amount, size=(n_samples, n_features))

**model**

SPINE autoencoder class
- input is (batch_size x original embedding size) tensor
- h is (batch_size x hidden layer size) tensor
- out is (batch_size x original embedding size) tensor

Note:
- PSL and ASL are averaged rather than summed over the hidden layer, contrary to the paper; does this mean the equations in the paper are wrong?

In [0]:
class SPINEModel(torch.nn.Module):
	"""Autoencoder with one hidden layer whose activations are clamped to [0, 1].

	The loss returned by forward() is the sum of three terms:
	reconstruction (MSE), partial sparsity (PSL) and average sparsity (ASL).
	"""

	def __init__(self, params):
		"""Build the model from the params dictionary.

		Keys read: 'inp_dim' (input embedding size), 'hdim' (hidden layer
		size), 'noise_level' (stored on the model) and 'sparsity'
		(rho_star is set to 1 - sparsity).
		"""
		super().__init__()

		# hyperparameters
		self.inp_dim = params['inp_dim']
		self.hdim = params['hdim']
		self.noise_level = params['noise_level']
		self.rho_star = 1.0 - params['sparsity']

		# reconstruction loss is mean squared error
		self.getReconstructionLoss = nn.MSELoss()

		# encoder (input -> hidden) and decoder (hidden -> input)
		self.linear1 = nn.Linear(self.inp_dim, self.hdim)
		self.linear2 = nn.Linear(self.hdim, self.inp_dim)

	def forward(self, batch_x, batch_y):
		"""Encode batch_x, decode it, and score the result against batch_y.

		Returns:
			out: reconstruction produced by the decoder.
			h: hidden activations, clamped to [0, 1].
			total_loss: reconstruction + PSL + ASL.
			list [reconstruction_loss, psl_loss, asl_loss] of the individual terms.
		"""
		pre_activation = self.linear1(batch_x)
		h = torch.clamp(pre_activation, min=0, max=1) # capped relu
		out = self.linear2(h)

		reconstruction_loss = self.getReconstructionLoss(out, batch_y)
		psl_loss = self._getPSLLoss(h, batch_x.size(0))
		asl_loss = self._getASLLoss(h)
		loss_terms = [reconstruction_loss, psl_loss, asl_loss]
		total_loss = reconstruction_loss + psl_loss + asl_loss

		return out, h, total_loss, loss_terms

	def _getPSLLoss(self, h, batch_size):
		"""Partial sparsity loss: sum of h * (1 - h), divided by batch_size * hdim."""
		penalty = h * (1 - h) # zero when an activation is exactly 0 or 1
		return penalty.sum() / (batch_size * self.hdim)

	def _getASLLoss(self, h):
		"""Average sparsity loss: penalises hidden units whose mean activation over the batch exceeds rho_star."""
		# h is a (data points x hidden units) tensor; average over the data points
		mean_activation = h.mean(dim=0)
		excess = torch.clamp(mean_activation - self.rho_star, min=0) # only activations above rho_star count
		return (excess * excess).sum() / self.hdim

**main**
- define (hyper)parameters
- construct Solver class to train model and extract sparse embeddings from hidden layer
- train and output final embeddings

In [0]:
# Hyperparameters and file locations for the run, gathered in one dictionary.
params = {
	# embedding size, i.e. size of the hidden layer (paper = 1000)
	'hdim': 1000,

	# whether noise is added to the inputs (paper = True)
	'denoising': True,

	# standard deviation of that noise (paper = 0.2)
	'noise_level': 0.2,

	# fraction of elements we want to be 0; feeds into rho_star (paper = 0.85)
	'sparsity': 0.85,

	# number of epochs (default = 100)
	'num_epochs': 100,

	# batch size (default = 64)
	'batch_size': 64,

	# input and output files
	'input': '/content/gdrive/My Drive/NLP Class/WGL/embeddings/word2vec_original_15k_300d_train.txt',
	'output': '/content/gdrive/My Drive/NLP Class/WGL/embeddings/spine_test_out.txt',
}

In [0]:
class Solver:
	"""Trains a SPINEModel on embeddings served by a DataHandler and extracts the hidden-layer embeddings."""

	def __init__(self, params):
		"""Load the data, build the model and create the optimiser.

		Keys read from params: 'input' (path of the embeddings file), the
		keys SPINEModel reads, and optionally 'learning_rate' (SGD learning
		rate, 0.1 when absent).

		Side effect: params['inp_dim'] is set to the embedding size found
		in the input file.
		"""
		# Build data handler
		self.data_handler = DataHandler()
		self.data_handler.loadData(params['input'])
		params['inp_dim'] = self.data_handler.getDataShape()[1]

		# Build model
		self.model = SPINEModel(params)
		self.dtype = torch.FloatTensor
		if torch.cuda.is_available(): # put model and batches on the GPU if available
			self.model.cuda()
			self.dtype = torch.cuda.FloatTensor
		self.optimizer = torch.optim.SGD(self.model.parameters(), lr=params.get('learning_rate', 0.1))


	def train(self, params):
		"""Run params['num_epochs'] epochs of SGD with batches of params['batch_size'] rows.

		After every epoch one line is printed with the reconstruction, ASL,
		PSL and total losses, each summed over the batches of that epoch.
		"""
		num_epochs, batch_size = params['num_epochs'], params['batch_size']
		optimizer = self.optimizer
		dtype = self.dtype

		# cycle through epochs
		for iteration in range(num_epochs):
			self.data_handler.shuffleTrain() # new row order every epoch
			num_batches = self.data_handler.getNumberOfBatches(batch_size)
			epoch_losses = np.zeros(4) # rl, asl, psl, total

			# cycle through batches
			for batch_idx in range(num_batches):
				optimizer.zero_grad() # clear old gradients
				batch_x, batch_y = self.data_handler.getBatch(batch_idx, batch_size, params['noise_level'], params['denoising'] ) # get batch data
				batch_x = torch.from_numpy(batch_x).type(dtype) # convert to tensor
				batch_y = torch.from_numpy(batch_y).type(dtype) # convert to tensor
				_, _, loss, loss_terms = self.model(batch_x, batch_y) # run model
				reconstruction_loss, psl_loss, asl_loss = loss_terms
				loss.backward() # compute gradients
				optimizer.step() # parameter update

				# add to epoch losses (each is a per-batch value, so these are sums over batches)
				epoch_losses[0]+=reconstruction_loss.item()
				epoch_losses[1]+=asl_loss.item()
				epoch_losses[2]+=psl_loss.item()
				epoch_losses[3]+=loss.item()

			print("After epoch %r, Reconstruction Loss = %.4f, ASL = %.4f,"\
						"PSL = %.4f, and total = %.4f"
						%(iteration+1, epoch_losses[0], epoch_losses[1], epoch_losses[2], epoch_losses[3]) )

			# TODO: also print sparsity as with original code

	def getSpineEmbeddings(self, batch_size, params):
		"""Return a numpy array with the hidden-layer embedding of every word, in original (file) order."""
		ret = []
		self.data_handler.resetDataOrder() # undo the training shuffle so rows match the word list
		num_batches = self.data_handler.getNumberOfBatches(batch_size)
		# inference only: no_grad avoids recording an autograd graph for every batch
		with torch.no_grad():
			for batch_idx in range(num_batches):
				# NOTE(review): when params['denoising'] is True noise is added here as well, so the
				# exported embeddings are computed from noisy inputs and differ between calls -- confirm this is intended
				batch_x, batch_y = self.data_handler.getBatch(batch_idx, batch_size, params['noise_level'], params['denoising'] )
				batch_x = torch.from_numpy(batch_x).type(self.dtype)
				batch_y = torch.from_numpy(batch_y).type(self.dtype)
				_, h, _, _ = self.model(batch_x, batch_y)
				ret.extend(h.detach().cpu().numpy())
		return np.array(ret)

	def getWordsList(self):
		"""Return the list of words, in the same order as the rows returned by getSpineEmbeddings."""
		return self.data_handler.getWordsList()



In [0]:
# start the clock: the reported time covers training plus writing the embeddings
time1 = time.time()

# instantiate Solver and train
solver = Solver(params)
solver.train(params)
	
# output final embeddings
output_path = params['output']
final_batch_size = 512 # batch size used for extraction only; training uses params['batch_size']
spine_embeddings = solver.getSpineEmbeddings(final_batch_size, params) # (array output)
dump_vectors(spine_embeddings, output_path, solver.getWordsList()) # (dump to txt)

time2 = time.time()

# elapsed time in seconds
print('time:',time2-time1)

After epoch 1, Reconstruction Loss = 7.7457, ASL = 0.0000,PSL = 11.1187, and total = 18.8644
After epoch 2, Reconstruction Loss = 7.3466, ASL = 0.0000,PSL = 9.9387, and total = 17.2853
After epoch 3, Reconstruction Loss = 7.2054, ASL = 0.0000,PSL = 8.9071, and total = 16.1126
After epoch 4, Reconstruction Loss = 7.1220, ASL = 0.0000,PSL = 7.9870, and total = 15.1090
After epoch 5, Reconstruction Loss = 7.0605, ASL = 0.0000,PSL = 7.1987, and total = 14.2593
After epoch 6, Reconstruction Loss = 7.0115, ASL = 0.0000,PSL = 6.4795, and total = 13.4910
After epoch 7, Reconstruction Loss = 6.9714, ASL = 0.0000,PSL = 5.8721, and total = 12.8434
After epoch 8, Reconstruction Loss = 6.9386, ASL = 0.0000,PSL = 5.3306, and total = 12.2693
After epoch 9, Reconstruction Loss = 6.9126, ASL = 0.0000,PSL = 4.8532, and total = 11.7658
After epoch 10, Reconstruction Loss = 6.8932, ASL = 0.0000,PSL = 4.4386, and total = 11.3318
After epoch 11, Reconstruction Loss = 6.8739, ASL = 0.0000,PSL = 4.0648, and t