In [24]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import pdb

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class SkipGramModel(nn.Module):

    def __init__(self, vocab_size, emb_dimension):
        super(SkipGramModel, self).__init__()
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=False)
        self.v_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=False)
        self.init_emb()
        

    def init_emb(self):
        initrange = 0.5 / self.emb_dimension
        self.u_embeddings.weight.data.uniform_(-initrange, initrange)
        self.v_embeddings.weight.data.uniform_(-1,1)
        
            
    def forward(self, pos_u, pos_v,neg_v):
        #pdb.set_trace()
        pos_u = pos_u.view(-1)
        emb_u = self.u_embeddings(pos_u)
        #neg_v = neg_v.view(len(pos_u),-1)
        #pdb.set_trace()
        samples = torch.cat([pos_v,Variable(neg_v)],1)
        emb_v = self.v_embeddings(samples)
        #pdb.set_trace()
        score = torch.bmm(emb_v, emb_u.unsqueeze(2)).squeeze()
        #pdb.set_trace()
        score[:,1:]=score[:,1:].neg()
        score = F.logsigmoid(score)
        return -1 * (torch.sum(score))/ pos_u.size(0)

In [25]:
from torch.utils.data import Dataset
from collections import defaultdict
import random

class wDataSet(Dataset):
    def __init__(self, dataset, power=0.75, neg_samples=2,sampling=False,sampling_treshhold=1):
        self.dataset = dataset
        self.neg_samples=neg_samples
        self.word2idx = dict()
        self.idx2word = dict()
        self.word_count = defaultdict(int)
        self.vocab_size = int()
        self.vocab = set()
        self.create_vocab(sampling, sampling_treshhold)
        self.pairs = self.generate_pairs()
        self.key_pairs = self.generate_key_pairs(self.pairs)
        pdb.set_trace()
        self.power = power        

        
    def generate_pairs(self):
        print("Generating pairs")
        pairs = []
        for sentence in self.dataset:
            for i,word in enumerate(sentence):
                for j in range(1,self.neg_samples):
                    if(i+j<len(sentence)):
                        pairs.append((word,sentence[i+j]))
                    if((i-j)>0):
                        pairs.append((word,sentence[i-j]))
        #if not(all(x[0] not in self.sampled_words for x in pairs)):
         #   pdb.set_trace()
        return pairs
        
    def __len__(self):
        return len(self.key_pairs)
        
        
    def __getitem__(self, idx):
        return self.key_pairs
    
    def get_neg_samples(self, count, batch_size):
        return torch.tensor(np.random.choice(list(self.idx2word.keys()),size=(batch_size)*self.neg_samples,replace=True,p=self.make_neg_table(self.power))).view(batch_size,-1)
        #neg_v = []
        #for x in range(1,batch_size+1):
         #   neg_v.append(random.sample(range(0,self.vocab_size),count))
       # return torch.tensor(neg_v).view(batch_size,-1)
         
   
    """ Defines the probability of choosing a negative sampling, set empiraccaly by mikolov"""
    def make_neg_table(self, power):
        pow_frequency = np.array([self.word_count[self.idx2word[i]] for i in range(len(self.word_count))])**power
        return pow_frequency / pow_frequency.sum()
        
        
    
    def generate_key_pairs(self,pairs):
        print("Generating key_pairs")
        key_pairs = []
        #print(pairs)
        for x,y in pairs:
            key_pairs.append((self.word2idx.get(x),self.word2idx.get(y)))
            if self.word2idx.get(x) is None: 
                pdb.set_trace()
        print("finished creating key_pairs")
        return key_pairs
    
    """"Creating vocabulary: first counting all words then deleting all frequent
    words, then creating dictionary with a one to one mapping int to word"""
    def create_vocab(self,sampling, treshhold):
        print("Creating vocab")
        if sampling:
            self.create_vocab_with_sampling(treshhold)
        else:
            for i,sentence in enumerate(self.dataset):
                for word in sentence:
                    self.word_count[word] += 1
                    self.vocab.add(word)

        self.word2idx = {w: idx for (idx, w) in enumerate(self.vocab)}
        self.idx2word = {idx: w for (idx, w) in enumerate(self.vocab)}
        self.vocab_size = len(self.vocab)
        
    def create_vocab_with_sampling(self,treshhold):
        for i,sentence in enumerate(self.dataset):
                for word in sentence:
                    self.word_count[word] += 1
        #Creeate Sampling table     
        sampling_table = self.make_sampling_table(treshhold)  
        assert len(sampling_table)== len(self.word_count)
        #Select which words are going to be deleted
        sampled_words = [word for i,word in enumerate(self.word_count.keys()) if random.random() < sampling_table[i]]
        #Create new dataset by deleting sampled words
        self.sampled_words = sampled_words
        self.word_count = defaultdict(int)
        new_dataset = []
        for i,sentence in enumerate(self.dataset):
            new_sentence = [word for word in sentence if word not in sampled_words]
            new_dataset.append(new_sentence)
            for word in new_sentence:
                self.vocab.add(word)
                self.word_count[word] += 1 
        self.dataset = new_dataset
        for sentence in self.dataset:
            assert all(words not in sampled_words for words in sentence)
    def make_sampling_table(self,treshhold): 
        count = np.array([x for x in self.word_count.values()])
        table = [1-x for x in list( np.sqrt(treshhold/(count)))]
        return table
   


In [26]:
import numpy as np
import torch
from torch.utils.data import DataLoader
import copy

class W2V():
    def __init__(self, data,dim=100, neg_samples=3, alpha=0.01, iterations=10, batch_size=20, 
                 shuffle=False,use_cuda=True,workers=4):
        self.shuffle = shuffle        
        self.batch_size = batch_size
        self.alpha = alpha
        self.dim = dim
        self.data = data
        self.workers = workers

        self.neg_samples = neg_samples
        self.use_cuda = use_cuda
        
        self.models = []
        self.optimizers = []
        self.model = SkipGramModel(len(self.data.vocab), self.dim)
        #self.model.cuda()
        print(device)
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=alpha)

        self.iterations = iterations
        #self.train()        
 
    def train_with_loader(self):
        #pdb.set_trace()
        loader = DataLoader(self.data.key_pairs, self.batch_size, self.shuffle, num_workers=self.workers)
        #pdb.set_trace()
        print('starting training')
        tenth = int(len(loader)/10)
        for epoch in range(1,self.iterations):
            percent = 0
            for i,(pos_u,pos_v) in enumerate(loader):
                neg_v = self.data.get_neg_samples(self.data.neg_samples,pos_v.size()[0])
                pos_v = pos_v.view(len(neg_v),-1)
                #pos_v.cuda()
                #pos_u.cuda()
                #neg_v.cuda()
                self.optimizer.zero_grad()
                loss = self.model.forward(pos_u,pos_v,neg_v)
                loss.backward()
                self.optimizer.step()
            print("loss = " + str(loss))
            print("{0:d} epoch of {1:d}".format(epoch+1, self.iterations))

    def get_embedding(self):
        embedding_dict = dict()
        embedding = self.model.u_embeddings.weight.data.numpy()
        for i in range(len(self.data.idx2word)):
            embedding_dict[self.data.idx2word[i]]= embedding[i]
        return embedding_dict

In [27]:
from gensim.test.utils import datapath
import gensim.downloader as api
#sentences = LineSentence(datapath('lee_background.cor'))
dataset = api.load('text8')
text8_dataset = []
for x in dataset: 
    text8_dataset.append(x)   

In [34]:
text8_first_sentence = []
sentence = []
for i,x in enumerate(text8_dataset[0]):
    sentence.append(x)
    if (i%30 == 0 and i>0):
        text8_first_sentence.append(sentence)
        sentence=[]

In [35]:
text8_dataset_first_sentence = wDataSet((text8_first_sentence))
#text8_wDataset = wDataSet((text8_dataset))


Creating vocab
Generating pairs
Generating key_pairs
finished creating key_pairs
> <ipython-input-25-740672800c33>(18)__init__()
-> self.power = power
(Pdb) continue


In [36]:
w2v = W2V(text8_dataset_first_sentence)
#w2v = W2V(text8_wDataset)u

cpu


In [38]:
w2v.train_with_loader()

starting training
loss = tensor(1.9072, grad_fn=<DivBackward0>)
2 epoch of 10
loss = tensor(1.7573, grad_fn=<DivBackward0>)
3 epoch of 10
loss = tensor(1.8416, grad_fn=<DivBackward0>)
4 epoch of 10
loss = tensor(1.7789, grad_fn=<DivBackward0>)
5 epoch of 10
loss = tensor(1.6834, grad_fn=<DivBackward0>)
6 epoch of 10
loss = tensor(1.7157, grad_fn=<DivBackward0>)
7 epoch of 10
loss = tensor(1.7284, grad_fn=<DivBackward0>)
8 epoch of 10
loss = tensor(1.6686, grad_fn=<DivBackward0>)
9 epoch of 10
loss = tensor(1.5882, grad_fn=<DivBackward0>)
10 epoch of 10


### EVALUATION

In [40]:
dict_emb = w2v.get_embedding()

In [42]:
from scipy import spatial

x = spatial.distance.cosine(dict_emb['artist'], dict_emb['music'])
y = spatial.distance.cosine(dict_emb['anarchism'],dict_emb['music'])
print(x)
print(y)

1.0054986192844808
0.9124191775918007


In [111]:
import itertools
import numpy as np
score = []
score_dict = dict()
for i,(x,y) in enumerate(itertools.product(text8_dataset.vocab,text8_dataset.vocab)):
    if(i%1000000==0):
        print(i)
    distance = spatial.distance.cosine(dict_emb[x], dict_emb[y])
    score_dict[(x,y)] = distance
    score.append(distance)
print(np.mean(score))
    


0
1000000
2000000
3000000
4000000


KeyboardInterrupt: 

In [77]:
np.std(score)
#print(score_dict[('anarchism','music')])

0.11336667971198654

In [221]:
import random
def get_closest(score_dict, word):
    closest = ()
    distance = 3
    for (x,y),score in score_dict.items():
        #print(x,y,score)
        if((x != y) and ((x==word)or(y==word))):
            if (distance > score):
                closest = (x,y)
                distance = score
    return closest

def get_closest_with_score(dict_emb,y):
    distance = 100
    for x,emb in dict_emb.items():
        if(spatial.distance.cosine(dict_emb[x], dict_emb[y])<distance):
            closest = x
    return x
        

In [204]:
file = open("./data/questions-words.txt")
questions = file.readlines()
for i,x in enumerate(questions): 
    questions[i] = x.rstrip("\n").split()
    if x[0]==':':
        del questions[i]
    


In [213]:
def analogy_task(questions,dict_emb):
    score = []
    if all(word in dict_emb for word in questions):
        y = dict_emb[questions[0]] -  dict_emb[questions[1]] +  dict_emb[questions[2]]
        x = get_closest_with_score(dict_emb,y)
        if x == questions[3]:
            score.append(1)
        else: 
            score.append(0)
    return score
        

In [216]:
score = analogy_task(questions,dict_emb)
print(score)

[]


In [218]:
words = random.sample(dict_emb.keys(),10)

In [222]:
for x in words:
    print(get_closest(score_dict,x))

('armed', 'with')
('zerzan', 'developments')
('writings', 'increase')
('list', 'women')
('science', 'archons')
('mysogyny', 'coo')
('dominance', 'existing')
('chomsky', 'controlled')
('interact', 'assist')
('operative', 'cgt')


### Wordsim Task

In [46]:
#IMPORT DATA
import csv
wordsim_data = [] 
with open('./data/wordsim/set1.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=' ',quotechar='|')
    for row in reader: 
        wordsim_data.append(row[0].split(',')[0:3])
    del wordsim_data[0]
with open('./data/wordsim/set2.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=' ',quotechar='|')
    for i,row in enumerate(reader):
        if i!=0:
            wordsim_data.append(row[0].split(',')[0:3])

for x in wordsim_data:
    wordsim_vocab.add(x[0])
    wordsim_vocab.add(x[1])

len(wordsim_vocab.intersection(text8_wDataset.vocab))


419

In [105]:
from scipy import stats
def wordsim_task(wordsim_data, dict_emb):
    out = []
    for task in wordsim_data: 
        if (task[0] in dict_emb.keys() ) and (task[1] in dict_emb.keys()):
            target_distance = spatial.distance.cosine(dict_emb[task[0]], dict_emb[task[1]])
            out.append((task,target_distance))
    return stats.zscore(np.array([x[1] for x in out],dtype=float))
    
scores_standarized = wordsim_task(wordsim_data,dict_emb)
target_score_standarized = stats.zscore(np.array([x[2] for x in wordsim_data],dtype=float))
scores_compared = [x-y for x,y in zip(scores_standarized,target_score_standarized)]
scores_compared

[-0.7492966550299953,
 0.35838971834824074,
 -3.6421836694051715,
 0.5337566638904002,
 -1.5335255686763896,
 -0.27116383578494063,
 -0.07315699991429062,
 -0.35248162150251877,
 -0.20424514891865797,
 -2.0675020019409907,
 -0.05355751696260591,
 -2.2962946093394776,
 0.5764732376444341,
 0.6538135002808666,
 -0.3271025309448874,
 -0.5258993188553471,
 0.4380364233285022,
 0.4499698326204565,
 0.638634555895662,
 -1.317862725586364,
 -1.3873519205331235,
 1.4026833840509063,
 0.6341375411286543,
 1.5253665791361173,
 3.1867268122451593,
 1.8131602284400248]

In [91]:
t 
scores = np.array([x[1] for x in t],dtype=float)
scores_standarized = [(x - scores.mean())/scores.std() for x in scores]
scores_standarized

[-0.3285001937146879,
 1.0461726604646144,
 -1.7345487029390991,
 1.272174973055252,
 -0.7214558165319331,
 0.5224930556146146,
 -0.11268205722151096,
 -0.14343305875357407,
 0.552586020991095,
 -1.6467055406256832,
 0.6664479314573448,
 -1.8386724265343677,
 0.7302832181586759,
 0.6833366707670256,
 0.1995678796536017,
 -0.17415108533341858,
 0.8772457453887108,
 -0.11892497110267058,
 0.6175223593333428,
 -0.7543665934980727,
 -0.36353426982230436,
 -0.5471759755398041,
 -1.4584214892350396,
 -0.7467178434903627,
 1.3243285411927292,
 2.1971309682655304]

In [93]:
l = np.array([x[2] for x in wordsim_data], dtype=float)
p = [(x - l.mean())/l.std() for x in l]
p

for x,y in zip(scores_standarized,p):
    print(x-y)

-0.7492966550299953
0.35838971834824074
-3.6421836694051715
0.5337566638904002
-1.5335255686763896
-0.27116383578494063
-0.07315699991429062
-0.35248162150251877
-0.20424514891865797
-2.0675020019409907
-0.05355751696260591
-2.2962946093394776
0.5764732376444341
0.6538135002808666
-0.3271025309448874
-0.5258993188553471
0.4380364233285022
0.4499698326204565
0.638634555895662
-1.317862725586364
-1.3873519205331235
1.4026833840509063
0.6341375411286543
1.5253665791361173
3.1867268122451593
1.8131602284400248
