In [22]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import pdb

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class SkipGramModel(nn.Module):

    def __init__(self, vocab_size, emb_dimension):
        super(SkipGramModel, self).__init__()
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=False)
        self.v_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=False)
        self.init_emb()
        

    def init_emb(self):
        initrange = 0.5 / self.emb_dimension
        self.u_embeddings.weight.data.uniform_(-initrange, initrange)
        self.v_embeddings.weight.data.uniform_(-1,1)
        
            
    def forward(self, pos_u, pos_v,neg_v):
        #pdb.set_trace()
        pos_u = pos_u.view(-1)
        emb_u = self.u_embeddings(pos_u)
        #neg_v = neg_v.view(len(pos_u),-1)
        #pdb.set_trace()
        samples = torch.cat([pos_v,Variable(neg_v)],1)
        emb_v = self.v_embeddings(samples)
        #pdb.set_trace()
        score = torch.bmm(emb_v, emb_u.unsqueeze(2)).squeeze()
        #pdb.set_trace()
        score[:,1:]=score[:,1:].neg()
        score = F.logsigmoid(score)
        return -1 * (torch.sum(score))/ pos_u.size(0)

In [229]:
from torch.utils.data import Dataset
from collections import defaultdict
import random

class wDataSet(Dataset):
    def __init__(self, dataset, power=0.75, neg_samples=2,sampling=True,sampling_treshhold=0.001):
        self.dataset = dataset
        self.word2idx = dict()
        self.idx2word = dict()
        self.word_count = defaultdict(int)
        self.vocab_size = int()
        self.vocab = set()
        self.create_vocab(sampling, sampling_treshhold)
        self.pairs = self.generate_pairs(dataset,neg_samples)
        self.key_pairs = self.generate_key_pairs(self.pairs)
        self.neg_samples=neg_samples
        #self.dataset_tensors=self.create_dataset_tensors()
        

        
    def generate_pairs(self, dataset, ctx_window):
            print("Generating pairs")
            pairs = []
            for sentence in dataset:
                for i,word in enumerate(sentence):
                    for j in range(1,ctx_window):
                        if(i+j<len(sentence)):
                            pairs.append((word,sentence[i+j]))
                        if((i-j)>0):
                            pairs.append((word,sentence[i-j]))
                            
            return pairs
        
    def __len__(self):
        return len(self.key_pairs)
        
        
    def __getitem__(self, idx):
        return self.key_pairs
    
    def get_neg_samples(self, count, batch_size):
        neg_v = []
        for x in range(1,batch_size+1):
            neg_v.append(random.sample(range(0,self.vocab_size),count))
        return torch.tensor(neg_v).view(batch_size,-1)
        
    
    def generate_key_pairs(self,pairs):
        print("Generating key_pairs")
        key_pairs = []
        #print(pairs)
        for x,y in pairs:
            key_pairs.append((self.word2idx.get(x),self.word2idx.get(y)))
        print("finished creating key_pairs")
        return key_pairs
    
    """"Creating vocabulary: first counting all words then deleting all frequent
    words, then creating dictionary with a one to one mapping int to word"""
    def create_vocab(self,sampling, treshhold):
        print("Creating vocab")
        if sampling:
            self.create_vocab_with_sampling(treshhold)
        else:
            for i,sentence in enumerate(self.dataset):
                for word in sentence:
                    self.word_count[word] += 1
                    self.vocab.add(word)

        self.word2idx = {w: idx for (idx, w) in enumerate(self.vocab)}
        self.idx2word = {idx: w for (idx, w) in enumerate(self.vocab)}
        self.vocab_size = len(self.vocab)
        
    def create_vocab_with_sampling(self,treshhold):
        for i,sentence in enumerate(self.dataset):
                for word in sentence:
                    self.word_count[word] += 1
                    
        sampling_table = make_sampling_table(tresshold)  
        assert len(sampling_table)== len(self.word_count)
        sampled_words = [word for word in enumerate(i,self.word_count.keys) if random.random < sampling_table[i]]
        
        for i,sentence in enumerate(self.dataset):
            for word in sentence: 
                if word in sampled_words:
                    sentence.remove(word)
                else: 
                    self.vocab.add(word)
                    
        
    def make_sampling_table(self,treshhold): 
        count = np.array([x for x in text8_dataset_first_sentence.word_count.values()])
        table = [1-x for x in list( np.sqrt(treshhold/ (count / count.sum())))]
        return table
   


In [24]:
tmp = []
for x in range(1,11):
    tmp.append(random.sample(range(0,50),2))
t = torch.tensor(tmp)
t.view(10,-1)
    

tensor([[41,  9],
        [21, 34],
        [ 2, 42],
        [14, 21],
        [47, 43],
        [ 1, 18],
        [21,  7],
        [40, 34],
        [47, 22],
        [46,  3]])

In [25]:
import numpy as np
import torch
from torch.utils.data import DataLoader
import copy

class W2V():
    def __init__(self, data,dim=100, neg_samples=3, alpha=0.01, iterations=10, batch_size=500, 
                 shuffle=False,use_cuda=True,workers=4):
        self.shuffle = shuffle        
        self.batch_size = batch_size
        self.alpha = alpha
        self.dim = dim
        self.data = data
        self.workers = workers

        self.neg_samples = neg_samples
        self.use_cuda = use_cuda
        
        self.models = []
        self.optimizers = []
        self.model = SkipGramModel(len(self.data.vocab), self.dim)
        #self.model.cuda()
        print(device)
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=alpha)

        self.iterations = iterations
        #self.train()        
 
    def train_with_loader(self):
        loader = DataLoader(self.data.key_pairs, self.batch_size, self.shuffle, num_workers=self.workers)
        print('starting training')
        tenth = int(len(loader)/10)
        for epoch in range(1,self.iterations):
            percent = 0
            for i,(pos_u,pos_v) in enumerate(loader):
                neg_v = self.data.get_neg_samples(self.data.neg_samples,pos_v.size()[0])
                pos_v = pos_v.view(len(neg_v),-1)
                #pos_v.cuda()
                #pos_u.cuda()
                #neg_v.cuda()
                self.optimizer.zero_grad()
                loss = self.model.forward(pos_u,pos_v,neg_v)
                loss.backward()
                self.optimizer.step()
                prev_loss = loss
            print("loss = " + str(loss))
            print("{0:d} epoch of {1:d}".format(epoch+1, self.iterations))

    def get_embedding(self):
        embedding_dict = dict()
        embedding = self.model.u_embeddings.weight.data.numpy()
        for i in range(len(self.data.idx2word)):
            embedding_dict[self.data.idx2word[i]]= embedding[i]
        return embedding_dict

In [26]:
def train(model, dataset, epochs):
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    print('starting training')
    for epoch in range(1,epochs):
        for i,(pos_u,pos_v,neg_v) in enumerate(dataset):
            pos_v = pos_v.view(-1,1)
            optimizer.zero_grad()
            loss = model.forward(pos_u,pos_v,neg_v)
            loss.backward()
            optimizer.step()
        print("loss = " + str(loss))
        print("{0:d} epoch of {1:d}".format(epoch+1, epochs))

In [27]:
from gensim.test.utils import datapath
import gensim.downloader as api
#sentences = LineSentence(datapath('lee_background.cor'))
dataset = api.load('text8')
text8_dataset = []
for x in dataset: 
    text8_dataset.append(x)   

In [28]:
text8_first_sentence = []
sentence = []
for i,x in enumerate(text8_dataset[0]):
    sentence.append(x)
    if (i%30 == 0 and i>0):
        text8_first_sentence.append(sentence)
        sentence=[]

In [230]:
text8_dataset_first_sentence = wDataSet((text8_first_sentence))
#text8_wDataset = wDataSet((text8_dataset))

Creating vocab


NameError: name 'tresshold' is not defined

In [32]:
w2v = W2V(text8_dataset_first_sentence)
#w2v = W2V(text8_wDataset)

cpu


In [87]:
print(text8_first_sentence)

[['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst', 'the'], ['term', 'is', 'still', 'used', 'in', 'a', 'pejorative', 'way', 'to', 'describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also', 'been', 'taken', 'up', 'as', 'a'], ['positive', 'label', 'by', 'self', 'defined', 'anarchists', 'the', 'word', 'anarchism', 'is', 'derived', 'from', 'the', 'greek', 'without', 'archons', 'ruler', 'chief', 'king', 'anarchism', 'as', 'a', 'political', 'philosophy', 'is', 'the', 'belief', 'that', 'rulers', 'are'], ['unnecessary', 'and', 'should', 'be', 'abolished', 'although', 'there', 'are', 'differing', 'interpretations', 'of', 'what', 'this', 'means', 'anarchism', 'also', 'refers', 'to', 'relate

In [225]:
w2v.data.dataset_tensors[0][0].size()


torch.Size([1])

In [226]:
w2v.train_with_loader()

starting training
loss = tensor(2.0763, grad_fn=<DivBackward0>)
2 epoch of 10
loss = tensor(2.0773, grad_fn=<DivBackward0>)
3 epoch of 10
loss = tensor(2.0666, grad_fn=<DivBackward0>)
4 epoch of 10
loss = tensor(2.0611, grad_fn=<DivBackward0>)
5 epoch of 10
loss = tensor(2.0623, grad_fn=<DivBackward0>)
6 epoch of 10
loss = tensor(2.0610, grad_fn=<DivBackward0>)
7 epoch of 10
loss = tensor(2.0736, grad_fn=<DivBackward0>)
8 epoch of 10
loss = tensor(2.0371, grad_fn=<DivBackward0>)
9 epoch of 10
loss = tensor(2.0458, grad_fn=<DivBackward0>)
10 epoch of 10


### EVALUATION

In [70]:
dict_emb = get_embedding(model, text8_dataset)

In [110]:
from scipy import spatial

x = spatial.distance.cosine(dict_emb['artist'], dict_emb['music'])
y = spatial.distance.cosine(dict_emb['anarchism'],dict_emb['music'])
print(x)
print(y)

0.9448586255311966
0.7955727875232697


In [111]:
import itertools
import numpy as np
score = []
score_dict = dict()
for i,(x,y) in enumerate(itertools.product(text8_dataset.vocab,text8_dataset.vocab)):
    if(i%1000000==0):
        print(i)
    distance = spatial.distance.cosine(dict_emb[x], dict_emb[y])
    score_dict[(x,y)] = distance
    score.append(distance)
print(np.mean(score))
    


0
1000000
2000000
3000000
4000000


KeyboardInterrupt: 

In [77]:
np.std(score)
#print(score_dict[('anarchism','music')])

0.11336667971198654

In [221]:
import random
def get_closest(score_dict, word):
    closest = ()
    distance = 3
    for (x,y),score in score_dict.items():
        #print(x,y,score)
        if((x != y) and ((x==word)or(y==word))):
            if (distance > score):
                closest = (x,y)
                distance = score
    return closest

def get_closest_with_score(dict_emb,y):
    distance = 100
    for x,emb in dict_emb.items():
        if(spatial.distance.cosine(dict_emb[x], dict_emb[y])<distance):
            closest = x
    return x
        

In [204]:
file = open("./data/questions-words.txt")
questions = file.readlines()
for i,x in enumerate(questions): 
    questions[i] = x.rstrip("\n").split()
    if x[0]==':':
        del questions[i]
    


In [213]:
def analogy_task(questions,dict_emb):
    score = []
    if all(word in dict_emb for word in questions):
        y = dict_emb[questions[0]] -  dict_emb[questions[1]] +  dict_emb[questions[2]]
        x = get_closest_with_score(dict_emb,y)
        if x == questions[3]:
            score.append(1)
        else: 
            score.append(0)
    return score
        

In [216]:
score = analogy_task(questions,dict_emb)
print(score)

[]


In [218]:
words = random.sample(dict_emb.keys(),10)

In [222]:
for x in words:
    print(get_closest(score_dict,x))

('armed', 'with')
('zerzan', 'developments')
('writings', 'increase')
('list', 'women')
('science', 'archons')
('mysogyny', 'coo')
('dominance', 'existing')
('chomsky', 'controlled')
('interact', 'assist')
('operative', 'cgt')


In [7]:
import numpy as np
import torch
import random


tensor([[3, 3],
        [1, 2],
        [3, 1],
        [1, 2],
        [1, 3],
        [2, 3],
        [3, 0],
        [2, 2],
        [0, 2],
        [0, 1]])

In [21]:
%%time
for x in range(1,100000):
    tmp = []
    for x in range(1,11):
        tmp.append(np.random.choice(range(0,10000000),5))
    t = torch.tensor(tmp)
    t.view(10,-1)

KeyboardInterrupt: 

In [20]:
%%time
for x in range(1,100000):
    tmp = []
    for x in range(1,11):
        tmp.append(random.sample(range(0,10000000),5))
    t = torch.tensor(tmp)
    t.view(10,-1)
    

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 3)

In [38]:
### TABLE FOR CHOOSING NEG SAMPLES
def make_cum_table(power):
    pow_frequency = np.array([text8_dataset_first_sentence.word_count[text8_dataset_first_sentence.idx2word[i]] for i in range(len(text8_dataset_first_sentence.vocab))])**power
    return pow_frequency / pow_frequency.sum()

In [39]:
print(make_cum_table(0.75))

[0.00017816 0.00017816 0.00017816 ... 0.00029963 0.00107612 0.00040612]


In [108]:
import math


In [170]:



p = one
x = list(zip(count,one, text8_dataset_first_sentence.word_count))[0:50]
x

[(102, 0.9010298123715242, 'anarchism'),
 (2, 0.2932114884917837, 'originated'),
 (133, 0.913328031779658, 'as'),
 (184, 0.9263122038302961, 'a'),
 (16, 0.750112525323897, 'term'),
 (355, 0.9469493996529437, 'of'),
 (2, 0.2932114884917837, 'abuse'),
 (25, 0.8000900202591177, 'first'),
 (13, 0.7227747375258775, 'used'),
 (16, 0.750112525323897, 'against'),
 (15, 0.7419173259076335, 'early'),
 (6, 0.5919354626205964, 'working'),
 (11, 0.6986243660690654, 'class'),
 (1, 0.0004501012955880901, 'radicals'),
 (9, 0.6668167004318627, 'including'),
 (521, 0.9562089238619531, 'the'),
 (2, 0.2932114884917837, 'diggers'),
 (6, 0.5919354626205964, 'english'),
 (12, 0.7114547984572723, 'revolution'),
 (302, 0.9424823666542567, 'and'),
 (1, 0.0004501012955880901, 'sans'),
 (1, 0.0004501012955880901, 'culottes'),
 (7, 0.6222056492897605, 'french'),
 (2, 0.2932114884917837, 'whilst'),
 (110, 0.9046966565501122, 'is'),
 (5, 0.5529876959187812, 'still'),
 (250, 0.9367829137020695, 'in'),
 (1, 0.00045010

In [142]:
p[0]

0.9010298123715242

In [215]:
x = list(range(0,3))
y = [0.1,0.4,0.8,1]
z = []
for x in range(0,100000):
    z.append([a for i,a in enumerate(y) if random.random() < y[i]])

In [216]:
x = [1 for x in z if 0.1 in x]
y = [1 for x in z if 0.4 in x]
a = [1 for x in z if 0.8 in x]

In [214]:
print(len(x)/len(z))
print(len(y)/len(z))
print(len(a)/len(z))

0.09784
0.39883
0.80003
