In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import pdb

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class SkipGramModel(nn.Module):

    def __init__(self, vocab_size, emb_dimension):
        super(SkipGramModel, self).__init__()
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=False)
        self.v_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=False)
        self.init_emb()
        

    def init_emb(self):
        initrange = 0.5 / self.emb_dimension
        self.u_embeddings.weight.data.uniform_(-initrange, initrange)
        self.v_embeddings.weight.data.uniform_(-1,1)
        
            
    def forward(self, pos_u, pos_v,neg_v):
        pos_u = pos_u.view(-1).to(device)
        pos_v = pos_v.to(device)
        neg_v = neg_v.to(device)
        emb_u = self.u_embeddings(pos_u)
        samples = torch.cat([pos_v,Variable(neg_v)],1)
        emb_v = self.v_embeddings(samples)
        score = torch.bmm(emb_v, emb_u.unsqueeze(2)).squeeze()
        score[:,1:]=score[:,1:].neg()
        score = F.logsigmoid(score)
        return -1 * (torch.sum(score))/ pos_u.size(0)

In [2]:
from torch.utils.data import Dataset
from collections import defaultdict
import random

class wDataSet(Dataset):
    def __init__(self, dataset, power=0.75,ctx_window=2, neg_samples=2):
        self.ctx_window = ctx_window
        self.dataset = dataset
        self.word2idx = dict()
        self.idx2word = dict()
        self.word_count = defaultdict(int)
        self.vocab_size = int()
        self.vocab = set()
        self.create_vocab()
        self.pairs = self.generate_pairs()
        self.key_pairs = self.generate_key_pairs(self.pairs)
        self.power = power        

        
    def generate_pairs(self):
        print("Generating pairs")
        pairs = []
        for sentence in self.dataset:
            for i,word in enumerate(sentence):
                for j in range(1,self.ctx_window):
                    if(i+j<len(sentence)):
                        pairs.append((word,sentence[i+j]))
                    if((i-j)>0):
                        pairs.append((word,sentence[i-j]))

        return pairs
        
    def __len__(self):
        return len(self.key_pairs)
        
        
    def __getitem__(self, idx):
        return self.key_pairs
    
    def get_neg_samples(self, count, batch_size):
        return torch.tensor(np.random.choice(list(self.idx2word.keys()),size=(batch_size)*count,replace=True,p=self.make_neg_table(self.power))).view(batch_size,-1)
         
   
    """ Defines the probability of choosing a negative sampling, set empiraccaly by mikolov"""
    def make_neg_table(self, power):
        pow_frequency = np.array([self.word_count[self.idx2word[i]] for i in range(len(self.word_count))])**power
        return pow_frequency / pow_frequency.sum()
        

    def generate_key_pairs(self,pairs):
        print("Generating key_pairs")
        key_pairs = []
        for x,y in pairs:
            key_pairs.append((self.word2idx.get(x),self.word2idx.get(y)))
        print("finished creating key_pairs")
        return key_pairs
    
    """"Creating vocabulary and creating dictionary with a one to one mapping int to word"""
    def create_vocab(self):
        print("Creating vocab")
        for i,sentence in enumerate(self.dataset):
            for word in sentence:
                self.word_count[word] += 1
                self.vocab.add(word)
        self.word2idx = {w: idx for (idx, w) in enumerate(self.vocab)}
        self.idx2word = {idx: w for (idx, w) in enumerate(self.vocab)}
        self.vocab_size = len(self.vocab)
        

In [16]:
import torch
import pickle
from torch.utils.data import DataLoader
import copy
import time

class W2V():
    def __init__(self, data,dim=100, neg_samples=2, alpha=0.01, iterations=10, batch_size=20, 
                 shuffle=False,use_cuda=True,workers=7):
        self.shuffle = shuffle        
        self.batch_size = batch_size
        self.alpha = alpha
        self.dim = dim
        self.data = data
        self.workers = workers
        self.ctxw = self.data.ctx_window
        self.neg_samples = neg_samples
        self.use_cuda = use_cuda
        
        self.models = []
        self.optimizers = []
        self.loss_list = []
        self.model = SkipGramModel(len(self.data.vocab), self.dim)
        self.model.to(device)
        print(device)
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=alpha)

        self.iterations = iterations
 
    def train_with_loader(self,save_embedding=True):
        loader = DataLoader(self.data.key_pairs, self.batch_size, self.shuffle, num_workers=self.workers)
        print('starting training')
        tenth = int(len(loader)/10)
        for epoch in range(1,self.iterations):
            percent = 0
            start = time.time()
            for i,(pos_u,pos_v) in enumerate(loader):
                if(i%tenth == 0 ):
                    end = time.time()
                    hours, rem = divmod(end-start, 3600)
                    minutes, seconds = divmod(rem, 60)
                    print("Time since start: {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))
                    print(str(percent) + "% of epoch is done" )
                    percent+=10
                neg_v = self.data.get_neg_samples(self.neg_samples,pos_v.size()[0])
                pos_v = pos_v.view(len(neg_v),-1)
                self.optimizer.zero_grad()
                loss = self.model.forward(pos_u,pos_v,neg_v)
                loss.backward()
                self.optimizer.step()
                prev_loss = loss
            print("loss = " + str(loss))
            print("{0:d} epoch of {1:d}".format(epoch+1, self.iterations))
            self.loss_list.append(loss)

        if(save_embedding):
            self.save_embedding()
            

    def get_embedding(self):
        embedding_dict = dict()
        embedding = self.model.u_embeddings.weight.data.numpy()
        for i in range(len(self.data.idx2word)):
            embedding_dict[self.data.idx2word[i]]= embedding[i]
        return embedding_dict
    
    def save_embedding(self, with_loss=True):
        # Creating filename
        filename = "dict_emb" + "_".join([x + str(y) for x,y in vars(w2v).items() if type(y) == int]) + ".pkl"
            
        # Getting Embedding
        self.model.to(torch.device('cpu'))
        dict_emb = w2v.get_embedding()
        
        # Adding loss history to embedding
        dict_emb['loss_history'] = self.loss_list
    
        # Writing embedding dictionnary to disk
        with open(filename, 'wb') as output:
            pickle.dump(dict_emb, output, pickle.HIGHEST_PROTOCOL)
            

In [4]:
%%time
from collections import Counter
import numpy as np
from gensim.test.utils import datapath
import gensim.downloader as api
import random
from itertools import dropwhile

def sampling(dataset,threshold=1e-4, min_count=5):
    
    # Count occurences of each word in the dataset 
    word_counts = Counter(dataset)  
    total_count = len(dataset)
    
    freqs = {word: count/total_count for word, count in word_counts.items()}
    p_drop = {word: 1 - np.sqrt(threshold/freqs[word]) for word in word_counts}
    train_words = [word for word in dataset if random.random() < (1 - p_drop[word]) and word_counts[word]>min_count]
    return train_words

"Transforms a list of words to a list of sentences with length=len_sen"
def words_to_sentences(words, len_sen=20):
    new_ds = []
    for i in range(0, len(words), 20):
        y = [words[i:i + 20]]
        new_ds.extend(y)
    return new_ds
    

# Get dataset online
dataset = api.load('text8')

# Convert to list of words
text8_ds = []
for x in dataset: 
    for y in x:
        text8_ds.append(y)
        
# Subsampling
text8_ds = sampling(text8_ds)

# New dataset with sentences of length=20
text8_dataset = words_to_sentences(text8_ds)

CPU times: user 19.4 s, sys: 739 ms, total: 20.1 s
Wall time: 20.9 s


In [5]:
text8_first_sentence = text8_dataset[0:200]

In [10]:
text8_dataset_first_sentence = wDataSet((text8_first_sentence))
#text8_wDataset = wDataSet((text8_dataset))


Creating vocab
Generating pairs
Generating key_pairs
finished creating key_pairs


In [17]:
w2v = W2V(text8_dataset_first_sentence)
#w2v = W2V(text8_wDataset)

cpu


In [18]:
w2v.train_with_loader()


starting training
Time since start: 00:00:00.15
0% of epoch is done
Time since start: 00:00:00.54
10% of epoch is done
Time since start: 00:00:00.80
20% of epoch is done
Time since start: 00:00:01.26
30% of epoch is done
Time since start: 00:00:01.50
40% of epoch is done
Time since start: 00:00:01.70
50% of epoch is done
Time since start: 00:00:02.01
60% of epoch is done
Time since start: 00:00:02.32
70% of epoch is done
Time since start: 00:00:02.62
80% of epoch is done
Time since start: 00:00:02.90
90% of epoch is done
loss = tensor(2.0839, grad_fn=<DivBackward0>)
2 epoch of 10
Time since start: 00:00:00.18
0% of epoch is done
Time since start: 00:00:00.58
10% of epoch is done
Time since start: 00:00:00.90
20% of epoch is done
Time since start: 00:00:01.24
30% of epoch is done
Time since start: 00:00:01.76
40% of epoch is done
Time since start: 00:00:02.23
50% of epoch is done
Time since start: 00:00:02.46
60% of epoch is done
Time since start: 00:00:02.78
70% of epoch is done
Time s

In [24]:
with open("dict_embbatch_size20_dim100_workers7_ctxw2_neg_samples2_iterations10.pkl", 'rb') as output:
        dict_emb = pickle.load( output)
dict_emb_model = w2v.get_embedding()
dict_emb['loss_history']

[tensor(2.0839, requires_grad=True),
 tensor(2.0765, requires_grad=True),
 tensor(2.0738, requires_grad=True),
 tensor(2.0649, requires_grad=True),
 tensor(2.0643, requires_grad=True),
 tensor(2.0574, requires_grad=True),
 tensor(2.0486, requires_grad=True),
 tensor(2.0463, requires_grad=True),
 tensor(2.0443, requires_grad=True)]

### EVALUATION

### Wordsim Task

In [None]:
%%time 
#TODO: logging, save loss, batch_size
from gensim.models import Word2Vec
model = Word2Vec(text8_dataset, size=100, window = 10, negative = 7, alpha=0.075, min_count=1, workers=4,sg=1, iter=20)

In [38]:
with open("dict_embw2vbs10000_neg3_dim100_epochs10_ctxw3_.pkl", 'rb') as output:
        dict_emb1 = pickle.load( output)

In [1]:
import pickle
with open("dict_emb_w2vbs10000_neg5_dim100_epochs20_ctxw6_.pkl", 'rb') as output:
        dict_emb = pickle.load( output)

In [3]:
from scipy import spatial

x = spatial.distance.cosine(dict_emb['artist'], dict_emb['music'])
y = spatial.distance.cosine(dict_emb['anarchism'],dict_emb['music'])
z = spatial.distance.cosine(dict_emb['revolution'],dict_emb['creatine'])

l = ['music','anarchism','revolution','philosophy','creatine']
print(x)
print(y)
z

0.08877456188201904
0.5148996114730835


1.0128879891708493

In [31]:
import numpy as np
def calculate_sim(dict_emb): 
    # Create dictionnary with id for every word, this is needed because sometimes we only have access to the dict_emb
    # and not the whole model 
    idx2word = {idx: w for (idx, w) in enumerate(dict_emb.keys())}
    word2idx = {w: idx for (idx, w) in enumerate(dict_emb.keys())}
    
    emb_size = len(next(iter(dict_emb.values())))
    
    # Create an embedding dictionnary with normalized vectors
    normalized_dict_emb = {(word): (x / np.linalg.norm(x)) for (word, x) in (dict_emb.items())}
    
    # Create an vocab_size*emb_size Matrix that holds the normalized embeding of each word in it's row called matrix_row
    # Create an emb_size*vocab_size Matrix that holds the normalized embeding of each word in it's colomn  matrix_colomn
    for i in range(0,len(dict_emb.keys())):
        y = normalized_dict_emb[idx2word[i]]
        if i ==0:
            matrix_colomn = torch.tensor(y).view(emb_size,1)
            matrix_row = torch.tensor(y)
        else:
            matrix_colomn = torch.cat([matrix_colomn,torch.tensor(y).view(emb_size,1)],1)
            #pdb.set_trace()
            matrix_row = torch.cat([matrix_row,torch.tensor(y)])
    
    matrix_row = matrix_row.view(-1,emb_size)
    
    matrix_row = matrix_row.to(device)
    matrix_colomn = matrix_colomn.to(device)
    
    return 1-(torch.matmul(matrix_row,matrix_colomn)),word2idx






    

In [9]:
#IMPORT DATA
import csv
wordsim_data = [] 
with open('./data/wordsim/set1.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=' ',quotechar='|')
    for row in reader: 
        wordsim_data.append(row[0].split(',')[0:3])
    del wordsim_data[0]
with open('./data/wordsim/set2.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=' ',quotechar='|')
    for i,row in enumerate(reader):
        if i!=0:
            wordsim_data.append(row[0].split(',')[0:3])

wordsim_vocab = set()
for x in wordsim_data:
    wordsim_vocab.add(x[0])
    wordsim_vocab.add(x[1])

#len(wordsim_vocab.intersection(text8_dataset_first_sentence.vocab))


In [12]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
model = Word2Vec(text8_dataset, size=100, window=3, negative=4, alpha=0.01, min_count=1, workers=4)


In [18]:
gensim_emb = dict()
for sentences in text8_dataset:
    for word in sentences:
        gensim_emb[word] = model.wv[word]

In [40]:
from scipy import spatial

x = spatial.distance.cosine(gensim_emb['artist'], gensim_emb['music'])
y = spatial.distance.cosine(gensim_emb['anarchism'],gensim_emb['music'])
z = spatial.distance.cosine(gensim_emb['revolution'],gensim_emb['anarchism'])

l = ['music','anarchism','revolution','philosophy','creatine']
print(x)
print(y)
z

0.4117097854614258
0.6018523275852203


0.46802496910095215

In [52]:
from scipy import stats
def wordsim_task(wordsim_data, dict_emb):
    out = []
    for task in wordsim_data: 
        if (task[0] in dict_emb.keys() ) and (task[1] in dict_emb.keys()):
            target_distance = spatial.distance.cosine(dict_emb[task[0]], dict_emb[task[1]])
            out.append((task,target_distance))
    #return stats.zscore(np.array([x[1] for x in out],dtype=float))
    return out

"""scores_standarized1 = wordsim_task(wordsim_data,dict_emb1)
gensim_score = wordsim_task(wordsim_data,gensim_emb)
scores_standarized = wordsim_task(wordsim_data,dict_emb)
target_score_standarized = stats.zscore(np.array([x[2] for x in wordsim_data],dtype=float))
scores_compared = [x-y for x,y in zip(scores_standarized,target_score_standarized)]
scores_compared_gensim = [x-y for x,y in zip(gensim_score,target_score_standarized)]
scores_compared1 = [x-y for x,y in zip(scores_standarized1,target_score_standarized)]
print(np.array(scores_compared).mean())
print(np.array(scores_compared_gensim).mean())
np.array(scores_compared1).mean()"""

gs = [x[1] for x in wordsim_task(wordsim_data,gensim_emb)]
emb = [x[1] for x in wordsim_task(wordsim_data,dict_emb)]
emb1 = [x[1] for x in wordsim_task(wordsim_data,dict_emb1)]
diffgsemb = []
diffgsemb1 = []
diffembemb1 = []
for x,y,z in zip(gs,emb,emb1):
    diffgsemb.append(x-y)
    diffgsemb1.append(x-z)
    diffembemb1.append(y-z)
print("Diff gs emb: " + str(np.array(diffgsemb).mean()))
print("Diff gs emb1: "+ str(np.array(diffgsemb1).mean()))
print("Diff emb emb1: "+ str(np.array(diffembemb1).mean()))
print(str(np.array(gs).ptp()))
print(str(np.array(emb).ptp()))
print(str(np.array(emb1).ptp()))


Diff gs emb: -0.4428806549327365
Diff gs emb1: -0.47147978613954106
Diff emb emb1: -0.028599131206804608
0.9631040096282959
1.2209117263555527
1.185003474354744


In [23]:
def get_distances(word):
    for x in dict_emb.keys():
        yield(x, spatial.distance.cosine(dict_emb[word],dict_emb[x]))

In [98]:
import itertools
import numpy as np
score = []
score_dict = dict()
for i,(x,y) in enumerate(itertools.product(text8_dataset_first_sentence.vocab,text8_dataset_first_sentence.vocab)):
    if(i%1000000==0):
        print(i)
    distance = spatial.distance.cosine(dict_emb[x], dict_emb[y])
    score_dict[(x,y)] = distance
    score.append(distance)
print(np.mean(score))
    


0
0.9276562322803592


In [221]:
import random
def get_closest(score_dict, word):
    closest = ()
    distance = 3
    for (x,y),score in score_dict.items():
        #print(x,y,score)
        if((x != y) and ((x==word)or(y==word))):
            if (distance > score):
                closest = (x,y)
                distance = score
    return closest

def get_closest_with_score(dict_emb,y):
    distance = 100
    for x,emb in dict_emb.items():
        if(spatial.distance.cosine(dict_emb[x], dict_emb[y])<distance):
            closest = x
    return x
        

In [93]:
l = np.array([x[2] for x in wordsim_data], dtype=float)
p = [(x - l.mean())/l.std() for x in l]
p

for x,y in zip(scores_standarized,p):
    print(x-y)

-0.7492966550299953
0.35838971834824074
-3.6421836694051715
0.5337566638904002
-1.5335255686763896
-0.27116383578494063
-0.07315699991429062
-0.35248162150251877
-0.20424514891865797
-2.0675020019409907
-0.05355751696260591
-2.2962946093394776
0.5764732376444341
0.6538135002808666
-0.3271025309448874
-0.5258993188553471
0.4380364233285022
0.4499698326204565
0.638634555895662
-1.317862725586364
-1.3873519205331235
1.4026833840509063
0.6341375411286543
1.5253665791361173
3.1867268122451593
1.8131602284400248


### ANALOGY TASK

In [204]:
file = open("./data/questions-words.txt")
questions = file.readlines()
for i,x in enumerate(questions): 
    questions[i] = x.rstrip("\n").split()
    if x[0]==':':
        del questions[i]
    


In [213]:
def analogy_task(questions,dict_emb):
    score = []
    if all(word in dict_emb for word in questions):
        y = dict_emb[questions[0]] -  dict_emb[questions[1]] +  dict_emb[questions[2]]
        x = get_closest_with_score(dict_emb,y)
        if x == questions[3]:
            score.append(1)
        else: 
            score.append(0)
    return score
        