### Getting and preprocessing the data
First we get the dataset online, then apply subsampling, then divide the dataset in equally long sentences

In [3]:
%%time
from collections import Counter
import numpy as np
from gensim.test.utils import datapath
import gensim.downloader as api
import random
from itertools import dropwhile

def sampling(dataset,threshold=1e-4, min_count=5):
    
    # Count occurences of each word in the dataset 
    word_counts = Counter(dataset)  
    total_count = len(dataset)
    
    freqs = {word: count/total_count for word, count in word_counts.items()}
    p_drop = {word: 1 - np.sqrt(threshold/freqs[word]) for word in word_counts}
    train_words = [word for word in dataset if random.random() < (1 - p_drop[word]) and word_counts[word]>min_count]
    return train_words

"Transforms a list of words to a list of sentences with length=len_sen"
def words_to_sentences(words):
    new_ds = []
    len_sen = 20
    for i in range(0, len(words), len_sen):
        y = [words[i:i + len_sen]]
        new_ds.extend(y)
    return new_ds
    

# Get dataset online
dataset = api.load('text8')

# Convert to list of words
text8_ds = []
for x in dataset: 
    for y in x:
        text8_ds.append(y)
        
# Subsampling
text8_ds = sampling(text8_ds)

# New dataset with sentences of length=20
text8_dataset = words_to_sentences(text8_ds)

CPU times: user 29.5 s, sys: 1.45 s, total: 31 s
Wall time: 31.1 s


In [4]:
len(text8_ds)

8419357

### Evaluation methods

In [5]:
from scipy import stats, spatial 
import csv, numpy as np
from scipy.stats.stats import pearsonr
from scipy import spatial 
#IMPORT DATA
def get_wordsim_data():
    wordsim_data = [] 
    with open('./data/wordsim/set1.csv', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=' ',quotechar='|')
        for row in reader: 
            wordsim_data.append(row[0].split(',')[0:3])
    del wordsim_data[0]
    with open('./data/wordsim/set2.csv', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=' ',quotechar='|')
        for i,row in enumerate(reader):
            if i!=0:
                wordsim_data.append(row[0].split(',')[0:3])

    wordsim_vocab = set()
    for x in wordsim_data:
        wordsim_vocab.add(x[0])
        wordsim_vocab.add(x[1])
    return wordsim_data

#len(wordsim_vocab.intersection(text8_dataset_first_sentence.vocab))
def wordsim_task(dict_emb):
    wordsim_data = get_wordsim_data()
    scores = []
    distances = []
    for task in wordsim_data: 
        if (task[0] in dict_emb.keys() ) and (task[1] in dict_emb.keys()):
            scores.append(float(task[2]))
            distances.append(spatial.distance.cosine(dict_emb[task[0]], dict_emb[task[1]]))
            
            
    #return stats.zscore(np.array([x[1] for x in out],dtype=float))
    return np.corrcoef(scores,distances)

#print(wordsim_task(gensim_emb))
#wordsim_task(dict_emb)

### Model

In [6]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import pdb

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
cpu = torch.device('cpu')
gpu = torch.device("cuda:0")


class SkipGramModel(nn.Module):

    def __init__(self, vocab_size, emb_dimension):
        super(SkipGramModel, self).__init__()
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=False)
        self.v_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=False)
        self.init_emb()
        

    def init_emb(self):
        initrange = 0.5 / self.emb_dimension
        self.u_embeddings.weight.data.uniform_(-initrange, initrange)
        self.v_embeddings.weight.data.uniform_(-0,0)
        
            
    def forward(self, pos_u, pos_v,neg_v):
        pos_u = pos_u.view(-1).to(device)
        pos_v = pos_v.to(device)
        neg_v = neg_v.to(device)
        emb_u = self.u_embeddings(pos_u)
        samples = torch.cat([pos_v,Variable(neg_v)],1)
        emb_v = self.v_embeddings(samples)
        score = torch.bmm(emb_v, emb_u.unsqueeze(2)).squeeze()
        score[:,1:]=score[:,1:].neg()
        score = F.logsigmoid(score)
        return -1 * (torch.sum(score))

In [18]:
from torch.utils.data import Dataset
from collections import defaultdict
import random

class wDataSet(Dataset):
    def __init__(self, dataset, power=0.75,ctx_window=2,):
        self.LEN_SEN = 20
        self.ctx_window = ctx_window
        self.dataset = dataset
        self.word2idx = dict()
        self.idx2word = dict()
        self.word_count = defaultdict(int)
        self.vocab_size = int()
        self.vocab = set()
        self.create_vocab()
        #self.pairs = self.generate_pairs()
        #self.key_pairs = self.generate_key_pairs(self.pairs)
        self.power = power        
        self.neg_table = self.make_neg_table(self.power)

        
    def generate_pairs(self):
        print("Generating pairs")
        pairs = []
        for sentence in self.dataset:
            for i,word in enumerate(sentence):
                for j in range(1,random.randint(2,self.ctx_window+1)):
                    if(i+j<len(sentence)):
                        pairs.append((word,sentence[i+j]))
                    if((i-j)>0):
                        pairs.append((word,sentence[i-j]))

        return pairs
    
    def __len__(self):
        len_dataset = len(self.dataset)     
        center_pairs = ((self.LEN_SEN - self.ctx_window*2)*self.ctx_window*2) 
        border_pairs = sum([self.ctx_window + i for i in range(self.ctx_window)])*2
        len_sen_without_last = (center_pairs + border_pairs)* (len_dataset-1)
        
        # The last sentence does not has the same length as the other ones, hence it's length needs to be computed otherwise
        len_last_sen = len(self.dataset[(len_dataset-1)])
        pairs_last_sen = 0
        for j in range(len_last_sen):
            if(j<self.ctx_window):
                print(j,(j+self.ctx_window))
                pairs_last_sen += (j+self.ctx_window)
            elif( j>= len_last_sen - self.ctx_window):
                print(j,(len_last_sen-j+self.ctx_window))
                pairs_last_sen += (len_last_sen-1-j+self.ctx_window)
            else:
                print(j,2*self.ctx_window)        
        return len_sen_without_last
        
        
    def __getitem__(self, idx):
        #Getting the number of pairs per sentence
        border_pairs = sum([self.ctx_window + i for i in range(self.ctx_window)])*2
        center_pairs = ((self.LEN_SEN - self.ctx_window*2)*self.ctx_window*2)
        n_pairs_in_sen = border_pairs + center_pairs
        id_sen = int(idx/n_pairs_in_sen)
        sen  = self.dataset[id_sen]
        pair_id_in_sen = idx - id_sen*(n_pairs_in_sen)
        counter = 0
        for i,word in enumerate(sen):
            for j in range(1,self.ctx_window+1):
                if(i+j< len(sen)):
                    if(counter == pair_id_in_sen):
                        #print(word)
                        #print(i+j)
                        if (self.word2idx[word],self.word2idx[sen[i+j]]) is None:
                            pdb.set_trace()
                        return(self.word2idx[word],self.word2idx[sen[i+j]])
                    counter+=1
                    
                if(i-j>=0):
                    if(counter == pair_id_in_sen):
                        #print(word)
                        if(self.word2idx[word],self.word2idx[sen[i-j]]) is None:
                            pdb.set_trace()
                        return(self.word2idx[word],self.word2idx[sen[i-j]])
                    counter+=1

    
    def get_neg_samples(self, count, batch_size):
        return torch.tensor(np.random.choice(list(self.idx2word.keys()),size=(batch_size)*count,replace=True,p=self.neg_table)).view(batch_size,-1)
   
    """ Defines the probability of choosing a negative sampling, set empiraccaly by mikolov"""
    def make_neg_table(self, power):
        pow_frequency = np.array([self.word_count[self.idx2word[i]] for i in range(len(self.word_count))])**power
        return pow_frequency / pow_frequency.sum()
        

    def generate_key_pairs(self,pairs):
        print("Generating key_pairs")
        key_pairs = []
        for x,y in pairs:
            key_pairs.append((self.word2idx.get(x),self.word2idx.get(y)))
        print("finished creating key_pairs")
        return key_pairs
    
    """"Creating vocabulary and creating dictionary with a one to one mapping int to word"""
    def create_vocab(self):
        print("Creating vocab")
        for i,sentence in enumerate(self.dataset):
            for word in sentence:
                self.word_count[word] += 1
                self.vocab.add(word)
        self.word2idx = {w: idx for (idx, w) in enumerate(self.vocab)}
        self.idx2word = {idx: w for (idx, w) in enumerate(self.vocab)}
        self.vocab_size = len(self.vocab)
        

In [16]:
import torch
import pickle
from torch.utils.data import DataLoader
import copy
import time
import numbers

class W2V():
    def __init__(self, data,dim=100, neg_samples=10, alpha=0.4, iterations=20, batch_size=2000, 
                 shuffle=True,use_cuda=True,workers=1,momentum=0,nesterov=False,step_size=1,gamma=1):
        self.momentum = momentum
        self.nesterov = nesterov
        self.step_size = step_size
        self.gamma = gamma
        self.shuffle = shuffle        
        self.batch_size = batch_size
        self.alpha = alpha
        self.dim = dim
        self.data = data
        self.workers = workers
        self.ctxw = self.data.ctx_window
        self.neg_samples = neg_samples
        self.use_cuda = use_cuda
        self.models = []
        self.optimizers = []
        self.ws_list = []
        self.loss_list = []
        self.model = SkipGramModel(len(self.data.vocab), self.dim)
        self.model.to(device)
    
        print(device)
        #self.optimizer = torch.optim.SGD(self.model.parameters(), lr=alpha, momentum=momentum,nesterov=nesterov)
        #self.scheduler = StepLR(self.optimizer, step_size=step_size, gamma=gamma)
        #self.optimizer = torch.optim.Adagrad(self.model.parameters(), lr=alpha)
        self.optimizer = torch.optim.Adam(self.model.parameters(),lr=alpha)


        self.iterations = iterations
 
    def train_with_loader(self,save_embedding=True):
        loader = DataLoader(self.data, self.batch_size, self.shuffle, num_workers=self.workers)
        print('starting training')
        tenth = int(len(loader)/10)

        self.time=0
        no_improvement = 0
        best_score = -1
        prev_score = -1
        for epoch in range(self.iterations):

            percent = 0
            start = time.time()
            processed_batches = 0 
            pairs = 0
            cum_loss = 0 
            avg_loss =0
            best_loss = 10 
            
            for i,(pos_u,pos_v) in enumerate(loader):
                if(i%tenth == 0 ):
                    end = time.time()
                    hours, rem = divmod(end-start, 3600)
                    minutes, seconds = divmod(rem, 60)
                    time_since_start = "Time:  {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds)
                    if(processed_batches!=0):
                        avg_loss = cum_loss / processed_batches
                    print("0%" + "=" *(int(percent/10))+ str(percent) +"%, " + time_since_start + ", cum_loss = {}".format(cum_loss),end="\r" )
                    percent+=10
                    
                neg_v = self.data.get_neg_samples(self.neg_samples,pos_v.size()[0])
                pos_v = pos_v.view(len(neg_v),-1)
                self.optimizer.zero_grad()
                loss = self.model.forward(pos_u,pos_v,neg_v)
                cum_loss += loss
                loss.backward()
                self.optimizer.step()
                pairs += len(pos_u)
                processed_batches += 1
                
            print("\n{0:d} epoch of {1:d}".format(epoch+1, self.iterations))
            avg_loss = cum_loss / processed_batches
            print(" {0:d} {1:d} batches, pairs {2:d}, cum loss: {3:.5f}".format(i,processed_batches, pairs,cum_loss))
            self.loss_list.append(cum_loss)
            self.time = time_since_start
            self.model = self.model.to(cpu)
            score = -1*(wordsim_task(self.get_embedding())[0][1])
            if(score < best_score):
                best_score = score
            print("Current score on wordsim Task: {}".format(score))
            self.ws_list.append(score)
            self.model = self.model.to(gpu)
            
            
            if(score > best_score ):
                best_score = score
            
            if(score - prev_score < 0.0009):
                no_improvement += 1
                
            if(no_improvement == 2 or score > 0.66):
                print("No improvement in word similarity early stoppage")
                self.iterations = epoch
                break
                
            
            prev_score = score 
        
        if(save_embedding):
            self.save_embedding()
            

    def get_embedding(self):
        embedding_dict = dict()
        embedding = self.model.u_embeddings.weight.data.numpy()
        for i in range(len(self.data.idx2word)):
            embedding_dict[self.data.idx2word[i]]= embedding[i]
        return embedding_dict
    
    def save_embedding(self, with_loss=True):
        print('ntm')
        # Creating filename
        optim = "Optim" + str(self.optimizer).split(" ")[0] + "_"
        filename = "dict_emb_" +  optim + "_".join([x + str(y) for x,y in vars(self).items() if isinstance(y, numbers.Number)]) + ".pkl"
        
        # Getting Embedding
        self.model.to(torch.device('cpu'))
        dict_emb = w2v.get_embedding()
        
        # Adding loss history to embedding
        dict_emb['loss_list'] = [x.to(torch.device('cpu')) for x in self.loss_list]
        
        # Adding score list to embedding 
        dict_emb['ws_list'] = self.ws_list
        
                
        # Saving time spent to calculate 1 epoch
        dict_emb['time'] = self.time
        
        # Logging
        print("Saving embedding: {} to disk with ws_score: {} ".format(filename,dict_emb['ws_list']))
    
        # Writing embedding dictionnary to disk
        with open(filename, 'wb') as output:
            pickle.dump(dict_emb, output, pickle.HIGHEST_PROTOCOL)
        
        self.model.to(device)
        self.loss_list = [x.to(device) for x in self.loss_list]
    
 

            

In [19]:
y = text8_dataset[0:100]
y.append(["this", "is","a","test"])
dataset = wDataSet(y,ctx_window=3)
#print(dataset.__getitem__(1082*10))
print(dataset.__len__())
dataset.idx2word[191]
w2v = W2V(dataset,alpha =0.1, momentum=0, nesterov=False,shuffle=False,batch_size=10, iterations=2)
w2v.train_with_loader()

Creating vocab
0 3
1 4
2 5
3 4
10800
cuda:0
starting training
0 3
1 4
2 5
3 4
0 3
1 4
2 5
3 4
1 epoch of 2
 1079 1080 batches, pairs 10800, cum loss: 556788.43750
Current score on wordsim Task: 0.42025990073836106
0 3
1 4
2 5
3 4
2 epoch of 2
 1079 1080 batches, pairs 10800, cum loss: 1455673.25000
Current score on wordsim Task: -0.2091706269209296
ntm
Saving embedding: dict_emb_OptimAdam_momentum0_nesterovFalse_step_size1_gamma1_shuffleFalse_batch_size10_alpha0.1_dim100_workers1_ctxw3_neg_samples10_use_cudaTrue_iterations2.pkl to disk with ws_score: [0.42025990073836106, -0.2091706269209296] 


In [40]:
text8_wDataset = wDataSet((text8_dataset),ctx_window=5)


Creating vocab


KeyboardInterrupt: 

In [46]:
for x in [0.0005]:
    w2v = W2V(text8_wDataset,alpha =x, momentum=0, nesterov=False,shuffle=False)
    w2v.train_with_loader()


cuda:0
starting training
0%0%, Time:  00:00:00.32, cum_loss = 0

Exception ignored in: <function _DataLoaderIter.__del__ at 0x7f84ac052e18>
Traceback (most recent call last):
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 717, in __del__
AssertionError: can only join a child process
    self._shutdown_workers()
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 713, in _shutdown_workers
    w.join()
  File "/opt/conda/envs/skg/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'


1 epoch of 20
 21686 21687 batches, pairs 43373869, cum loss: 152676016.00000
Current score on wordsim Task: 0.3253677086052125
0%0%, Time:  00:00:00.42, cum_loss = 0

Exception ignored in: <function _DataLoaderIter.__del__ at 0x7f84ac052e18>
Traceback (most recent call last):
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 717, in __del__
    self._shutdown_workers()
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 713, in _shutdown_workers
    w.join()
  File "/opt/conda/envs/skg/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


2 epoch of 20
 21686 21687 batches, pairs 43373869, cum loss: 135144880.00000
Current score on wordsim Task: 0.5093265756211574
0%0%, Time:  00:00:00.30, cum_loss = 0

Exception ignored in: <function _DataLoaderIter.__del__ at 0x7f84ac052e18>
Traceback (most recent call last):
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 717, in __del__
    self._shutdown_workers()
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 713, in _shutdown_workers
    w.join()
  File "/opt/conda/envs/skg/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


3 epoch of 20
 21686 21687 batches, pairs 43373869, cum loss: 131182928.00000
Current score on wordsim Task: 0.5653052417286016
0%0%, Time:  00:00:00.34, cum_loss = 0

Exception ignored in: <function _DataLoaderIter.__del__ at 0x7f84ac052e18>
Traceback (most recent call last):
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 717, in __del__
    self._shutdown_workers()
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 713, in _shutdown_workers
    w.join()
  File "/opt/conda/envs/skg/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


4 epoch of 20
 21686 21687 batches, pairs 43373869, cum loss: 129233088.00000
Current score on wordsim Task: 0.5896498238077336
0%0%, Time:  00:00:00.36, cum_loss = 0

Exception ignored in: <function _DataLoaderIter.__del__ at 0x7f84ac052e18>
Traceback (most recent call last):
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 717, in __del__
    self._shutdown_workers()
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 713, in _shutdown_workers
    w.join()
  File "/opt/conda/envs/skg/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


5 epoch of 20
 21686 21687 batches, pairs 43373869, cum loss: 127974056.00000
Current score on wordsim Task: 0.6123099128405374
0%0%, Time:  00:00:00.32, cum_loss = 0

Exception ignored in: <function _DataLoaderIter.__del__ at 0x7f84ac052e18>
Traceback (most recent call last):
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 717, in __del__
    self._shutdown_workers()
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 713, in _shutdown_workers
    w.join()
  File "/opt/conda/envs/skg/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


6 epoch of 20
 21686 21687 batches, pairs 43373869, cum loss: 127030560.00000
Current score on wordsim Task: 0.622774421705143
0%0%, Time:  00:00:00.30, cum_loss = 0

Exception ignored in: <function _DataLoaderIter.__del__ at 0x7f84ac052e18>
Traceback (most recent call last):
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 717, in __del__
    self._shutdown_workers()
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 713, in _shutdown_workers
AssertionError: can only join a child process
    assert self._parent_pid == os.getpid(), 'can only join a child process'
  File "/opt/conda/envs/skg/lib/python3.7/multiprocessing/process.py", line 138, in join
    w.join()


7 epoch of 20
 21686 21687 batches, pairs 43373869, cum loss: 126332216.00000
Current score on wordsim Task: 0.63082296720012
0%0%, Time:  00:00:00.31, cum_loss = 0

Exception ignored in: <function _DataLoaderIter.__del__ at 0x7f84ac052e18>
Traceback (most recent call last):
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 717, in __del__
    self._shutdown_workers()
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 713, in _shutdown_workers
    w.join()
  File "/opt/conda/envs/skg/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


8 epoch of 20
 21686 21687 batches, pairs 43373869, cum loss: 125760640.00000
Current score on wordsim Task: 0.6350203292814472
0%0%, Time:  00:00:00.32, cum_loss = 0

Exception ignored in: <function _DataLoaderIter.__del__ at 0x7f84ac052e18>
Traceback (most recent call last):
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 717, in __del__
    self._shutdown_workers()
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 713, in _shutdown_workers
    w.join()
  File "/opt/conda/envs/skg/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


9 epoch of 20
 21686 21687 batches, pairs 43373869, cum loss: 125301168.00000
Current score on wordsim Task: 0.6386351131542509
0%0%, Time:  00:00:00.32, cum_loss = 0

Exception ignored in: <function _DataLoaderIter.__del__ at 0x7f84ac052e18>
Traceback (most recent call last):
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 717, in __del__
    self._shutdown_workers()
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 713, in _shutdown_workers
    w.join()
  File "/opt/conda/envs/skg/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


10 epoch of 20
 21686 21687 batches, pairs 43373869, cum loss: 124902088.00000
Current score on wordsim Task: 0.6402834096355231
0%0%, Time:  00:00:00.46, cum_loss = 0

Exception ignored in: <function _DataLoaderIter.__del__ at 0x7f84ac052e18>
Traceback (most recent call last):
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 717, in __del__
    self._shutdown_workers()
AssertionError: can only join a child process
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 713, in _shutdown_workers
    w.join()
  File "/opt/conda/envs/skg/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'


11 epoch of 20
 21686 21687 batches, pairs 43373869, cum loss: 124572488.00000
Current score on wordsim Task: 0.6436167362171213
0%0%, Time:  00:00:00.31, cum_loss = 0

Exception ignored in: <function _DataLoaderIter.__del__ at 0x7f84ac052e18>
Traceback (most recent call last):
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 717, in __del__
    self._shutdown_workers()
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 713, in _shutdown_workers
    w.join()
  File "/opt/conda/envs/skg/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


12 epoch of 20
 21686 21687 batches, pairs 43373869, cum loss: 124280880.00000
Current score on wordsim Task: 0.6449099370927117
0%0%, Time:  00:00:00.31, cum_loss = 0

Exception ignored in: <function _DataLoaderIter.__del__ at 0x7f84ac052e18>
Traceback (most recent call last):
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 717, in __del__
    self._shutdown_workers()
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 713, in _shutdown_workers
AssertionError: can only join a child process
    assert self._parent_pid == os.getpid(), 'can only join a child process'
  File "/opt/conda/envs/skg/lib/python3.7/multiprocessing/process.py", line 138, in join
    w.join()


13 epoch of 20
 21686 21687 batches, pairs 43373869, cum loss: 124034968.00000
Current score on wordsim Task: 0.6437743380204528
0%0%, Time:  00:00:00.30, cum_loss = 0

Exception ignored in: <function _DataLoaderIter.__del__ at 0x7f84ac052e18>
Traceback (most recent call last):
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 717, in __del__
    self._shutdown_workers()
  File "/opt/conda/envs/skg/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 713, in _shutdown_workers
    w.join()
  File "/opt/conda/envs/skg/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process


14 epoch of 20
 21686 21687 batches, pairs 43373869, cum loss: 123828624.00000
Current score on wordsim Task: 0.6441727745238435
No improvement in word similarity early stoppage
ntm
Saving embedding: dict_emb_OptimAdam_momentum0_nesterovFalse_step_size1_gamma1_shuffleFalse_batch_size2000_alpha0.0005_dim100_workers1_ctxw5_neg_samples10_use_cudaTrue_iterations13.pkl to disk with ws_score: [0.3253677086052125, 0.5093265756211574, 0.5653052417286016, 0.5896498238077336, 0.6123099128405374, 0.622774421705143, 0.63082296720012, 0.6350203292814472, 0.6386351131542509, 0.6402834096355231, 0.6436167362171213, 0.6449099370927117, 0.6437743380204528, 0.6441727745238435] 


In [12]:
w2v.optimizer

SGD (
Parameter Group 0
    dampening: 0
    lr: 0.0075
    momentum: 0
    nesterov: False
    weight_decay: 0
)

### EVALUATION

### Wordsim Task

In [42]:
import pickle
import torch
with open("mean_list_gensim", 'rb') as output:
        l = pickle.load(output)
l

[0.4854457162977399,
 0.5859706113928841,
 0.6215866474289622,
 0.638220801506811,
 0.6464491677517064,
 0.6515312416174555,
 0.6552487631702965,
 0.6578048847820147,
 0.6575116046386906,
 0.6544169030425343]

In [18]:
from scipy import spatial

x = spatial.distance.cosine(gensim_emb['love'], gensim_emb['sex'])
y = spatial.distance.cosine(gensim_emb['anarchism'],gensim_emb['music'])
z = spatial.distance.cosine(gensim_emb['revolution'],gensim_emb['creatine'])

l = ['music','anarchism','revolution','philosophy','creatine']
print(x)
print(y)
z

0.5102460086345673
0.658313125371933


0.8949277028441429

In [10]:
dict_emb.pop('loss_list')

[tensor(5.5450, requires_grad=True),
 tensor(5.5439, requires_grad=True),
 tensor(5.5429, requires_grad=True),
 tensor(5.5426, requires_grad=True),
 tensor(5.5412, requires_grad=True),
 tensor(5.5425, requires_grad=True),
 tensor(5.5414, requires_grad=True),
 tensor(5.5402, requires_grad=True),
 tensor(5.5409, requires_grad=True),
 tensor(5.5391, requires_grad=True),
 tensor(5.5376, requires_grad=True),
 tensor(5.5360, requires_grad=True),
 tensor(5.5368, requires_grad=True),
 tensor(5.5365, requires_grad=True),
 tensor(5.5357, requires_grad=True),
 tensor(5.5351, requires_grad=True),
 tensor(5.5354, requires_grad=True),
 tensor(5.5333, requires_grad=True),
 tensor(5.5336, requires_grad=True)]

In [59]:
gensim_emb = dict()
for sentences in text8_dataset:
    for word in sentences:
        gensim_emb[word] = model.wv[word]

In [19]:
%%time
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models import Word2Vec


class EpochLogger(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 0
        self.cum_loss = 0
        self.processed_batches = 0
        self.loss_list = []


    def on_epoch_end(self, model):
        print("Epoch #{} end: cum_loss={}".format(self.epoch,self.cum_loss))
        self.epoch += 1
        self.loss_list.append(self.cum_loss)

    
    def on_batch_end(self, model):
        """Method called at the end of each batch.
        Parameters
        ----------
        model : :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
            Current model.
        """
        self.cum_loss += model.get_latest_training_loss()

        
#TODO: logging, save loss, batch_size
epoch_logger = EpochLogger()
model = Word2Vec(text8_dataset, size=100,window=5,negative=10,  min_count=1, workers=1,sg=1,iter=20, batch_words=5000, callbacks=[epoch_logger],compute_loss=True)


Epoch #0 end: cum_loss=73705438549.78125
Epoch #1 end: cum_loss=196557943237.78125
Epoch #2 end: cum_loss=334222049901.78125
Epoch #3 end: cum_loss=487322937293.78125
Epoch #4 end: cum_loss=656024549813.7812
Epoch #5 end: cum_loss=840282420421.7812
Epoch #6 end: cum_loss=1039964615789.7812
Epoch #7 end: cum_loss=1254864077429.7812
Epoch #8 end: cum_loss=1480556680341.7812
Epoch #9 end: cum_loss=1706713552021.7812
Epoch #10 end: cum_loss=1932870423701.7812
Epoch #11 end: cum_loss=2159027295381.7812
Epoch #12 end: cum_loss=2385184167061.7812
Epoch #13 end: cum_loss=2611341038741.7812
Epoch #14 end: cum_loss=2837497910421.7812
Epoch #15 end: cum_loss=3063654782101.7812
Epoch #16 end: cum_loss=3289811653781.7812
Epoch #17 end: cum_loss=3515968525461.7812
Epoch #18 end: cum_loss=3742125397141.7812
Epoch #19 end: cum_loss=3968282268821.7812
CPU times: user 57min 42s, sys: 49.2 s, total: 58min 32s
Wall time: 58min 29s


In [14]:
gensim_emb = dict()
for sentences in text8_dataset:
    for word in sentences:
        gensim_emb[word] = model.wv[word]

In [16]:
wordsim_task(gensim_emb)

array([[ 1.        , -0.67838157],
       [-0.67838157,  1.        ]])

In [23]:
def get_distances(word):
    for x in dict_emb.keys():
        yield(x, spatial.distance.cosine(dict_emb[word],dict_emb[x]))

In [54]:
n_dict_emb = {(word): (x / np.linalg.norm(x)) for (word, x) in (dict_emb.items())}
n_dict_emb_gensim = {(word): (x / np.linalg.norm(x)) for (word, x) in (gensim_emb.items())}

RuntimeError: Can't call numpy() on Variable that requires grad. Use var.detach().numpy() instead.

In [221]:
import random
def get_closest(score_dict, word):
    closest = ()
    distance = 3
    for (x,y),score in score_dict.items():
        #print(x,y,score)
        if((x != y) and ((x==word)or(y==word))):
            if (distance > score):
                closest = (x,y)
                distance = score
    return closest

def get_closest_with_score(dict_emb,y):
    distance = 100
    for x,emb in dict_emb.items():
        if(spatial.distance.cosine(dict_emb[x], dict_emb[y])<distance):
            closest = x
    return x
        

### ANALOGY TASK

In [204]:
file = open("./data/questions-words.txt")
questions = file.readlines()
for i,x in enumerate(questions): 
    questions[i] = x.rstrip("\n").split()
    if x[0]==':':
        del questions[i]
    


In [213]:
def analogy_task(questions,dict_emb):
    score = []
    if all(word in dict_emb for word in questions):
        y = dict_emb[questions[0]] -  dict_emb[questions[1]] +  dict_emb[questions[2]]
        x = get_closest_with_score(dict_emb,y)
        if x == questions[3]:
            score.append(1)
        else: 
            score.append(0)
    return score
        

In [31]:
import numpy as np
def calculate_sim(dict_emb): 
    # Create dictionnary with id for every word, this is needed because sometimes we only have access to the dict_emb
    # and not the whole model 
    idx2word = {idx: w for (idx, w) in enumerate(dict_emb.keys())}
    word2idx = {w: idx for (idx, w) in enumerate(dict_emb.keys())}
    
    emb_size = len(next(iter(dict_emb.values())))
    
    # Create an embedding dictionnary with normalized vectors
    normalized_dict_emb = {(word): (x / np.linalg.norm(x)) for (word, x) in (dict_emb.items())}
    
    # Create an vocab_size*emb_size Matrix that holds the normalized embeding of each word in it's row called matrix_row
    # Create an emb_size*vocab_size Matrix that holds the normalized embeding of each word in it's colomn  matrix_colomn
    for i in range(0,len(dict_emb.keys())):
        y = normalized_dict_emb[idx2word[i]]
        if i ==0:
            matrix_colomn = torch.tensor(y).view(emb_size,1)
            matrix_row = torch.tensor(y)
        else:
            matrix_colomn = torch.cat([matrix_colomn,torch.tensor(y).view(emb_size,1)],1)
            #pdb.set_trace()
            matrix_row = torch.cat([matrix_row,torch.tensor(y)])
    
    matrix_row = matrix_row.view(-1,emb_size)
    
    matrix_row = matrix_row.to(device)
    matrix_colomn = matrix_colomn.to(device)
    
    return 1-(torch.matmul(matrix_row,matrix_colomn)),word2idx






    