**Notebook structure**

This notebook comprises three sections:

*0 - Importing libraries, mounting Google drive and defining paths*

*1 - Defining SPINE classes and functions*

*2 - Execution*

Execution is divided up into:

*2a - Grid-search*: runs grid-search then writes a file search_300.npz containing grid-search results

*2b - Tuple selection*: reads in search_300.npz then selects final hyperparameter tuples; stores tuple information in params_300.npz

*2c - Embedding generation*: reads in params_300.npz and generates SPINE embeddings along with metadata file

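**Note**: the lines that write files (search_300.npz, params_300.npz, the SPINE embeddings and the embeddings metadata) are currently commented out so that the original WAKU versions of these files are not overwritten. As a result, sections 2b and 2c read the existing files from disk rather than results computed in the current session; uncomment the write lines if freshly computed results should be saved and used.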

$\color{red}{\text{Please check the paths are defined correctly in section 0 before running this code}}$

**Sources**

The code in this notebook is based on code from the following public repo: https://github.com/jacobdanovitch/SPINE

The SPINE paper (Subramanian et al., 2018) can be found here: https://arxiv.org/pdf/1711.08792.pdf

# 0. Importing libraries, mounting Google drive and defining paths

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import time
import pickle
import scipy

from torch import nn
from random import shuffle
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from scipy.stats import *

In [0]:
# mount Google drive (requires the Colab runtime) at /content/gdrive

from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# find WAKU on gdrive and cd into it
%cd '/content/gdrive'
directory_name = !find . -type d -name "WAKU"
directory = directory_name[0]
print(directory)
%cd $directory

In [0]:
### paths
# All paths are relative ('./...'), so they are resolved against the current
# working directory (the WAKU folder once the %cd cell above has run).

# dense embeddings (npz archive) and the corresponding index-to-word dict (pickle)
path_dense = './embeddings/dense/300_0.0_embeddings.npz'
path_dict = './embeddings/index2word.pickle'

# WordSim-353 dataset (tab-separated text, read by loadTestData)
path_ws = './raw_data/word_similarity/WordSim353.txt'

# search_300.npz (grid-search results) and params_300.npz (selected tuples)
path_search_300 = './spine/search_300.npz'
path_params_300 = './spine/params_300.npz'

# final embeddings and metadata file
path_embeddings = './embeddings/spine/sp_300_%dpercent_embeddings.npz'# template: fill %d with the target sparsity, e.g. path_embeddings % 30
path_metadata = './embeddings/spine/sp_300_metadata.pickle'

# 1. Defining SPINE classes and functions

## 1a. Helper functions

In [0]:
# pickle read and write functions

def load_pickle(filepath):
    """Read a pickle file and return the object stored in it.

    The file is opened in a ``with`` block so the handle is closed even if
    unpickling raises.

    NOTE: ``pickle.load`` can execute arbitrary code while loading; only use
    this on files from a trusted source.
    """
    with open(filepath, "rb") as pickle_in:
        return pickle.load(pickle_in)

def save_pickle(d_object, filename):
    """Pickle ``d_object`` into ``filename`` (opened in binary write mode, so an existing file is replaced)."""
    with open(filename, 'wb') as handle:
        pickle.Pickler(handle).dump(d_object)

In [0]:
# txt read and write functions

def loadData(filepath):
    """Load a SPINE-style embeddings text file.

    Each non-empty line is expected to be ``word v1 v2 ... vn`` separated by
    whitespace.

    Returns:
        data: numpy array with one row of floats per word.
        words: list of words, in file order (row i of ``data`` belongs to words[i]).
    """
    data = []
    words = []
    # the `with` block closes the file even if a line fails to parse
    with open(filepath) as f:
        for line in f:
            tokens = line.strip().split()
            if not tokens:
                # skip blank lines (e.g. a trailing newline) instead of failing on tokens[0]
                continue
            words.append(tokens[0])
            data.append([float(i) for i in tokens[1:]])
    return np.array(data), words

def dump_vectors(X, outfile, words):
	"""Write the rows of array ``X`` to ``outfile`` as text, one word per line.

	Each line is ``word v1 v2 ... vn `` (every value is followed by a single
	space, including the last one) and ends with a newline. ``words[i]`` labels
	row ``X[i]``; the two must have the same length.
	"""
	print ("shape", X.shape)
	assert len(X) == len(words)
	# `with` guarantees the file is flushed and closed even if a write fails
	with open(outfile, 'w') as fw:
		for word, vector in zip(words, X):
			fw.write(word + " " + "".join(str(j) + " " for j in vector) + "\n")

In [0]:
def compute_sparsity(X):
	"""Return the percentage (0-100) of entries of ``X`` that are exactly zero.

	The result is a single scalar whatever the number of dimensions of ``X``.
	"""
	nonzero_count = 1. * np.count_nonzero(X)  # as float, so the ratio below is a true division
	return 100. * (1 - nonzero_count / X.size)

def get_noise_features(n_samples, n_features, noise_amount):
	"""Return an (n_samples x n_features) array of zero-mean gaussian noise with std = noise_amount.

	The noise is drawn with sklearn's ``make_blobs`` using a single cluster
	centred on the origin; the cluster labels it also returns are discarded.
	"""
	origin = np.array([np.zeros(n_features)])  # one centre at 0 -> zero-mean samples
	blobs = make_blobs(n_samples=n_samples, n_features=n_features, centers=origin, cluster_std=noise_amount)
	noise_x, _ = blobs
	return noise_x

## 1b. DataHandler

In [0]:
class DataHandler:
	"""Holds the embedding matrix and its word list, and serves mini-batches from it."""

	def __init__(self):
		pass

	def loadData(self, data, words):
		"""Store ``data`` (rows = words, columns = embedding elements) and the matching ``words`` list."""
		self.words = words
		self.data = data
		self.data_size = self.data.shape[0]  # number of words
		self.inp_dim = self.data.shape[1]  # elements per embedding (e.g. 300 for standard word2vec)
		self.original_data = self.data[:]  # handle on the rows in their original order

	def getWordsList(self):
		"""Return the list of words."""
		return self.words

	def getDataShape(self):
		"""Return the shape of the data array (words x elements)."""
		return self.data.shape

	def resetDataOrder(self):
		"""Put the rows back in their original order, so that row i matches self.words[i] again."""
		self.data = self.original_data[:]

	def getNumberOfBatches(self, batch_size):
		"""Return the number of batches per pass, counting a final incomplete batch if there is one."""
		padded_size = self.data_size + batch_size - 1
		return int(padded_size / batch_size)

	def getBatch(self, i, batch_size, noise_level, denoising):
		"""Return (input, target) for the i-th batch; gaussian noise is added to the input when ``denoising`` is true."""
		start = i * batch_size
		stop = min(start + batch_size, self.data_size)  # the last batch may be shorter
		batch_y = self.data[start:stop]
		if not denoising:
			# without denoising, input and target are the same rows
			return batch_y, batch_y
		noise = get_noise_features(batch_y.shape[0], self.inp_dim, noise_level)
		return batch_y + noise, batch_y

	def shuffleTrain(self):
		"""Replace self.data with a randomly row-permuted copy (uses numpy's global RNG; returns nothing)."""
		order = np.arange(self.data_size)
		np.random.shuffle(order)
		self.data = self.data[order]



## 1c. SPINEModel

In [0]:
class SPINEModel(torch.nn.Module):
	"""Single-hidden-layer autoencoder with a capped-ReLU hidden layer and sparsity losses.

	``params`` is a dictionary. Read at construction: 'inp_dim', 'hdim',
	'noise_level', 'rho_star'. Read at every forward pass from the same
	dictionary object: 'psl_coeff', 'asl_coeff', 'max'.
	"""

	def __init__(self, params):
		super(SPINEModel, self).__init__()

		# Keep a reference to the dictionary this model was built with, so the loss
		# settings come from it rather than from whatever global happens to be
		# called `params`. It is a reference (not a copy), so later changes made
		# by the caller to that dictionary are still picked up.
		self._params = params

		self.inp_dim = params['inp_dim'] # size of input embeddings
		self.hdim = params['hdim'] # size of hidden layer (output embeddings)
		self.noise_level = params['noise_level'] # noise level (std of gaussian noise applied to inputs)
		self.getReconstructionLoss = nn.MSELoss() # reconstruction loss set to MSE
		self.rho_star = params['rho_star'] # rho_star (target mean activation of each hidden unit)

		# autoencoder
		self.linear1 = nn.Linear(self.inp_dim, self.hdim)
		self.linear2 = nn.Linear(self.hdim, self.inp_dim)


	def forward(self, batch_x, batch_y):
		"""Encode ``batch_x``, reconstruct it and compute the losses against ``batch_y``.

		Returns (out, h, total_loss, [reconstruction_loss, psl_loss, asl_loss])
		where ``out`` is the reconstruction and ``h`` the hidden activations.
		"""
		# forward
		linear1_out = self.linear1(batch_x)
		h = linear1_out.clamp(min=0, max=1) # capped relu
		out = self.linear2(h)

		# different terms of the loss
		batch_size = batch_x.size(0)
		reconstruction_loss = self.getReconstructionLoss(out, batch_y) # reconstruction loss
		psl_loss = self._getPSLLoss(h, batch_size) # partial sparsity loss
		asl_loss = self._getASLLoss(h) # average sparsity loss
		total_loss = reconstruction_loss + self._params['psl_coeff']*psl_loss + self._params['asl_coeff']*asl_loss

		return out, h, total_loss, [reconstruction_loss, psl_loss, asl_loss]


	def _getPSLLoss(self, h, batch_size):
		"Computes PSL: Z * (1 - Z), averaged across data and hidden dimension"
		return torch.sum(h*(1-h))/ (batch_size * self.hdim)


	def _getASLLoss(self, h):
		"Computes ASL: penalises the gap between each hidden unit's mean activation and rho_star"
		# h is a data points x hidden units array
		temp = (torch.mean(h, dim=0) - self.rho_star) # mean activation of each hidden unit minus rho_star
		if self._params['max'] == True:
			temp = temp.clamp(min=0) # only penalise mean activations above rho_star
		return torch.sum(temp * temp) / self.hdim # mean of squared gaps over hidden units

## 1d. Solver

In [0]:
class Solver:
	"""Builds a DataHandler and a SPINEModel from ``params`` and trains the model with SGD."""

	def __init__(self, params):
		"""Set up data, model and optimiser.

		Reads params['data_train'] and params['words_train'], and writes
		params['inp_dim'] (the number of columns of the training data) back into
		the caller's dictionary before building the model.
		"""
		# Build data handler
		self.data_handler = DataHandler() # instantiate DataHandler
		self.data_handler.loadData(params['data_train'], params['words_train']) # load data to create DataHandler.data and DataHandler.words
		params['inp_dim'] = self.data_handler.getDataShape()[1]

		# Build model
		self.model = SPINEModel(params) # instantiate SPINEModel
		self.dtype = torch.FloatTensor
		use_cuda = torch.cuda.is_available()
		if use_cuda: # put data and model on GPU if available
			self.model.cuda()
			self.dtype = torch.cuda.FloatTensor
		self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1) # SGD optimiser with lr=0.1

	def train(self, params):
		"""Train for params['num_epochs'] epochs with batches of params['batch_size'].

		After every epoch the full embeddings are recomputed and scored.

		Returns:
			train_results: (num_epochs x 3) array with columns
				[sparsity, word-sim score, epoch number (1-based)].
			full_h: embeddings computed after the last epoch, in original word order.
		"""
		num_epochs, batch_size = params['num_epochs'], params['batch_size']
		optimizer = self.optimizer
		dtype = self.dtype
		train_results = np.zeros((num_epochs,3))

		if num_epochs == 0:
			# Nothing to train: return the empty results table and the embeddings of
			# the untrained model, instead of failing on the summary print below
			# (its variables are only assigned inside the epoch loop).
			return train_results, self.getSpineEmbeddings(params)

		# the number of batches depends only on data size and batch size, so compute it once
		num_batches = self.data_handler.getNumberOfBatches(batch_size)

		# cycle through epochs
		for iteration in range(num_epochs):
			self.data_handler.shuffleTrain() # shuffle the rows of the data array for this epoch
			epoch_losses = np.zeros(4) # rl, asl, psl, total

			# cycle through batches
			total_data = 0
			for batch_idx in range(num_batches):
				optimizer.zero_grad() # clear old gradients
				batch_x, batch_y = self.data_handler.getBatch(batch_idx, batch_size, params['noise_level'], params['denoising']) # get batch data
				batch_x = torch.from_numpy(batch_x).type(dtype) # convert to tensor
				batch_y = torch.from_numpy(batch_y).type(dtype) # convert to tensor
				out, h, loss, loss_terms = self.model(batch_x, batch_y) # run model
				reconstruction_loss, psl_loss, asl_loss = loss_terms
				loss.backward() # compute gradients
				optimizer.step() # parameter update

				# add to epoch losses, weighted by batch size (the last batch may be smaller)
				this_size = batch_x.size(0)
				epoch_losses[0]+=reconstruction_loss.item()*this_size
				epoch_losses[1]+=asl_loss.item()*this_size
				epoch_losses[2]+=psl_loss.item()*this_size
				epoch_losses[3]+=loss.item()*this_size
				total_data+=this_size

			# divide epoch losses by total data points
			epoch_losses/=total_data

			# evaluate on the full vocabulary (this also resets the data order)
			full_h = self.getSpineEmbeddings(params)
			epoch_sparsity = compute_sparsity(full_h)
			epoch_sim = sim(full_h)

			# fill this epoch's row with sparsity, sim result and epoch number
			train_results[iteration] = np.array([epoch_sparsity, epoch_sim, iteration+1])

		# print results of the final epoch
		print("Stopped at epoch %2r, RL = %.4f, ASL = %.4f, PSL = %.4f, total = %.4f, sparsity = %.4f, sim = %.4f" %(iteration+1, epoch_losses[0], epoch_losses[1], epoch_losses[2], epoch_losses[3], epoch_sparsity, epoch_sim))
		return train_results, full_h


	def getSpineEmbeddings(self, params):
		"returns numpy array of new embeddings (in original order)"
		ret = []
		self.data_handler.resetDataOrder() # undo any shuffling so rows line up with the word list
		num_batches = self.data_handler.getNumberOfBatches(params['batch_size_eval'])
		for batch_idx in range(num_batches):
			# denoising is switched off here: embeddings are computed from the clean inputs
			batch_x, batch_y = self.data_handler.getBatch(batch_idx, params['batch_size_eval'], params['noise_level'], False)
			batch_x = torch.from_numpy(batch_x).type(self.dtype)
			batch_y = torch.from_numpy(batch_y).type(self.dtype)
			with torch.no_grad():
				_, h, _, _ = self.model(batch_x, batch_y)
				ret.extend(h.cpu().data.numpy())
		return np.array(ret)

	def getWordsList(self):
		"returns the list of words held by the data handler"
		return self.data_handler.getWordsList()



## 1e. Word sim

In [0]:
### load in word sim data
def loadTestData(filepath=None):
  """Load the annotated word pairs used for the word-similarity evaluation.

  The file is tab-separated with one header row; each following row is
  ``word1<TAB>word2<TAB>score``.

  Args:
    filepath: file to read; defaults to the notebook-level ``path_ws``.

  Returns:
    dict with 'words' (list of [word1, word2] pairs) and 'sim_scores'
    (list of floats), in file order.
  """
  if filepath is None:
    filepath = path_ws
  data = {'words': [], 'sim_scores': []}
  # `with` closes the file even if a row fails to parse
  with open(filepath) as f:
    next(f, None)  # skip the header row
    for row in f:
      fields = row.strip().split('\t')
      if fields == ['']:
        # skip blank lines (e.g. a trailing newline) instead of failing on fields[2]
        continue
      data['words'].append(fields[0:2])
      data['sim_scores'].append(float(fields[2]))
  return data

In [0]:
### main sim functions
def getSimilarity(e1, e2):
  """Return the cosine similarity of vectors ``e1`` and ``e2``.

  When either vector has zero norm the cosine is undefined and the
  sentinel -1 is returned instead.
  """
  norm_product = np.sqrt(np.sum(e1*e1)) * np.sqrt(np.sum(e2*e2))
  if norm_product == 0:
    return -1
  return np.sum(e1 * e2) / norm_product
 
def getSimilarityScoreForWords(w1,w2, embeddings):
  """Return the cosine similarity between the embeddings of ``w1`` and ``w2``.

  Words are looked up by their position in params['words_train'], and that
  position is used as the row index into ``embeddings``. Returns -1 when
  either word is missing from the list.
  """
  vocab = params['words_train']
  if w2 not in vocab or w1 not in vocab:
    return -1
  return getSimilarity(embeddings[vocab.index(w1)], embeddings[vocab.index(w2)])

def sim(embeddings):
  """Return the Spearman rank correlation between model and human similarity scores.

  Pairs come from the notebook-level ``sim_output_val``. A pair whose model
  score equals -1 is dropped; -1 is the value the scoring helpers return for
  missing words and zero vectors, so a genuine cosine of exactly -1 is dropped
  as well. Returns NaN unless more than 316 pairs remain.
  """
  scored_pairs = []
  for pair, human_score in zip(sim_output_val['words'], sim_output_val['sim_scores']):
    model_score = getSimilarityScoreForWords(pair[0], pair[1], embeddings)
    if model_score != -1:
      scored_pairs.append([model_score, human_score])
  scored_pairs = np.array(scored_pairs)
  # 316 is a hard-coded minimum number of scored pairs -- presumably a coverage
  # threshold for WordSim-353; TODO confirm and name it as a constant
  if len(scored_pairs) <= 316:
    return np.nan
  spearman_rank_coeff, _ = spearmanr(scored_pairs[:,0], scored_pairs[:,1])
  return spearman_rank_coeff


# 2. Execution

## 2a. Grid-search

read in word sim annotated pairs

In [0]:
# annotated word pairs and their human similarity scores; sim() reads sim_output_val
data = loadTestData()
sim_output_val = {key: data[key] for key in ('sim_scores', 'words')}

read in the dense embeddings (word2vec)

In [0]:
# dense embedding matrix, stored under key 'a' of the npz archive;
# the `with` block closes the archive once the array has been read
with np.load(path_dense) as dense_npz:
    data_full = dense_npz['a']

# word list: the values of the pickled dict, in the dict's own order.
# NOTE(review): this assumes the dict was built in row order, so that
# words_full[i] labels data_full[i] -- confirm against the code that wrote it.
words_full = load_pickle(path_dict)
words_full = list(words_full.values())

# words are mapped to embedding rows by list position, so the sizes must agree
if len(words_full) != data_full.shape[0]:
    raise ValueError('word list has %d entries but the embedding matrix has %d rows' % (len(words_full), data_full.shape[0]))

Set the fixed training parameters (the ASL and PSL coefficients are set later, inside the grid search).

In [0]:
params = {
    # process params
    'hdim': 300,             # size of the hidden layer (output embeddings)
    'rho_star': 0.15,        # target mean activation used by the ASL term
    'noise_level': 0.2,      # std of the gaussian noise added to inputs when denoising
    'num_epochs': 30,
    'batch_size': 64,        # training batch size
    'batch_size_eval': 512,  # batch size used when computing the embeddings
    'max': True,             # clamp the ASL gap at zero before squaring
    'denoising': True,       # add noise to the training inputs

    # data params
    'data_train': data_full,
    'words_train': words_full,
}


grid search

In [0]:
# grid search over the ASL and PSL loss coefficients
# search_300 gets one row per (asl, psl, epoch) with columns:
# sparsity, sim score, epoch number, asl coeff, psl coeff

asl_grid = [10, 1, 0.1, 0.01, 0.001]
psl_grid = [50, 20, 10, 4, 3, 2, 1, 0.75, 0.1, 0.01, 0.001]

# collect one block of rows per run and concatenate once at the end, rather
# than growing the table with np.append on every run
search_blocks = []
for asl_coeff in asl_grid:
  for psl_coeff in psl_grid:

    # assign params
    params['asl_coeff'] = asl_coeff
    params['psl_coeff'] = psl_coeff

    # seeds: reset before every run so all runs share the same initial weights,
    # shuffling and noise
    torch.manual_seed(0) # fix torch seed for weight initialisation
    np.random.seed(0) # fix np seed for gaussian noise and shuffle during training

    # instantiate and train model
    solver = Solver(params)
    train_results, _ = solver.train(params)

    # add columns for hyperparams
    train_results = np.insert(train_results,3,asl_coeff,axis=1)
    train_results = np.insert(train_results,4,psl_coeff,axis=1)

    # add to main table
    search_blocks.append(train_results)
    print('done asl =', asl_coeff, 'psl =', psl_coeff)

search_300 = np.concatenate(search_blocks, axis=0)
# writing is disabled so the original search_300.npz is not overwritten
#np.savez(path_search_300, search_300)

## 2b. Tuple selection

In [0]:
#### define epoch floor ####
# grid-search rows whose epoch number is below e_floor are excluded from
# tuple selection and from the plot below
e_floor = 10
# grid-search results are read back from disk, replacing any in-memory table
# from section 2a; columns are sparsity, sim score, epoch number, asl coeff, psl coeff
search_300 = np.load(path_search_300)['arr_0']


In [0]:
##### select optimal hyperparameters
# For each target sparsity, pick the grid-search row with the highest sim score
# among rows whose sparsity lies within 2.5 points of the target.
# params_300 columns: actual sparsity, sim score, epoch number, asl coeff,
# psl coeff, target sparsity

targets = [30, 50, 70, 90]
params_300 = np.zeros((len(targets),6))

# sort by descending sim score, then drop rows below the epoch floor
search_300_sorted = search_300[(-1*search_300[:,1]).argsort()]
search_300_sorted = search_300_sorted[search_300_sorted[:,2]>=e_floor]

for t, target in enumerate(targets):
  idx = (search_300_sorted[:,0]>target-2.5) & (search_300_sorted[:,0]<target+2.5)
  candidates = search_300_sorted[idx]
  if len(candidates) == 0:
    # fail with an explicit message rather than an IndexError on candidates[0]
    raise ValueError('no grid-search result within 2.5 points of the %d percent target' % target)
  best = candidates[0] # rows are sorted by sim score, so the first candidate is the best
  print('%d percent target' % target)
  print('actual sparsity =',best[0])
  print('sim =',best[1])
  print('epoch, asl, psl =', best[2:])
  print('\n')
  params_300[t,0:5] = best
  params_300[t,5] = target

# writing is disabled so the original params_300.npz is not overwritten
#np.savez(path_params_300, params_300)

In [0]:
# plot sim score against sparsity for the grid-search rows at or above the
# epoch floor, and mark the selected tuples

filter_300 = search_300[search_300[:,2]>=e_floor]
plt.figure(figsize=(12,7))

# shade a band of +/- 2.5 points around each sparsity level
# (the band around 10 falls outside the x-limits set below)
for band_centre in (10, 30, 50, 70, 90):
  plt.axvspan(band_centre - 2.5, band_centre + 2.5, color='lightgrey',alpha=0.4)

plt.scatter(filter_300[:,0],filter_300[:,1],s=10)

plt.xticks(np.arange(0, 110, step=10))
plt.ylim(top=0.395)
plt.xlim(20,100)

plt.ylabel(r'word similarity score, $\rho_{sim}$',size=20)
plt.xlabel(r'sparsity (%)',size=20)
plt.title('Word similarity vs. sparsity following SPINE grid search \n',size=20)

# selected tuples: column 0 = actual sparsity, column 1 = sim score
select = plt.scatter(params_300[:,0],params_300[:,1],c='r',marker='X',s=75)

plt.tick_params(axis='both', which='major', labelsize=15)
plt.legend([select], ['tuples selected for downstream tasks'],loc=3, prop={'size': 15});

## 2c. Embedding generation

In [0]:
## params_300 has the following columns, in this order (as filled in section 2b):
# actual sparsity
# val sim score
# epoch number
# asl coeff
# psl coeff
# target sparsity

# read in npz and start meta-data dict
params_300 = np.load(path_params_300)['arr_0']
sp_300_metadata = {}

# cycle through the rows (one per target sparsity)
for row in params_300:

  # set params
  sparsity, val_sim, num_epochs, asl_coeff, psl_coeff, target = row
  params['num_epochs'] = int(num_epochs) # train for exactly the selected number of epochs
  params['asl_coeff'] = asl_coeff # asl
  params['psl_coeff'] = psl_coeff # psl

  # train the model with the same seeds as the grid search
  torch.manual_seed(0)
  np.random.seed(0)
  solver = Solver(params)
  _, full_h = solver.train(params)

  # save down embeddings (writing is disabled so the original files are not overwritten)
  #np.savez(path_embeddings % target, full_h)

  # make row in dictionary; 'sparsity' and 'val_sim' are the values recorded in
  # params_300, not recomputed from full_h
  sp_300_metadata['sp_300_%dpercent_embeddings' % target] = {'sparsity': sparsity, 'val_sim': val_sim, 'num_epochs': params['num_epochs'], 'asl_coeff': params['asl_coeff'], 'psl_coeff': params['psl_coeff']}

# save down dictionary (writing is disabled so the original file is not overwritten)
#save_pickle(sp_300_metadata, path_metadata)