In [0]:
import sys
import os
import re
import numpy as np
import scipy
from scipy.spatial.distance import cosine
import torch
import torch.utils.data
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [0]:
import collections
from collections import Counter

In [0]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [0]:
corpus = []
with open('train_pos.txt','r',encoding='latin1') as f:
  for line in f.readlines():
    corpus.append(line[:-1])

In [0]:
corpus = corpus[0:100]

In [0]:
length = len(corpus)

In [0]:
words = []
for sentence in corpus:
  words+=sentence.split()

In [0]:
vocab = list(set(words))

In [0]:
vocab

In [9]:
vocab_len = len(vocab)
print(vocab_len)

4075


In [0]:
word2idx = {}
index = 0
for word in vocab:
  word2idx[word] = index
  index+=1

In [0]:
word2idx

In [0]:
idx2word = {}
index = 0
for word in vocab:
  idx2word[index] = word
  index+=1

In [0]:
idx2word

In [0]:
window_size = 2
embedding_dim = 10

In [0]:
train_corpus = []
test_corpus = []

In [0]:
for sentence in corpus:
  words = sentence.split()
  sent_len = len(words)
  for i in range(sent_len):
    for j in range(max(i-window_size,0),min(i+window_size,sent_len)):
      train_corpus.append(word2idx[words[i]])
      test_corpus.append(word2idx[words[j]])

In [15]:
total_length = len(train_corpus)
print(total_length)

94256


In [0]:
train = torch.zeros(total_length,vocab_len).to(device)
test = torch.zeros(total_length).to(device)

In [0]:
for i in range(total_length):
  train[i,train_corpus[i]] = 1
  test[i] = test_corpus[i]

In [0]:
class naive_softmax(nn.Module):
  def __init__(self):
    super(naive_softmax,self).__init__()
    self.vocabsize = vocab_len
    self.embeddingdim = embedding_dim
    self.linear1 = nn.Linear(self.vocabsize,self.embeddingdim)
    self.linear2 = nn.Linear(self.embeddingdim,self.vocabsize)
 
  def forward(self,x):
    out = self.linear1(x)
    out = self.linear2(out)
    out = F.log_softmax(out,dim=1)
    return out
    
  def predict(self,x):
    return self.linear1(x)

In [0]:
model = naive_softmax().to(device)

In [20]:
modeltest = torch.zeros((3,vocab_len)).to(device)
modeltest[0,0] = modeltest[1,1] = modeltest[2,2] = 1
output = model(modeltest)
print(output.shape)

torch.Size([3, 4075])


In [0]:
traindata = torch.utils.data.TensorDataset(train,test)
trainloader = torch.utils.data.DataLoader(traindata,batch_size=64)

In [0]:
optimizer = optim.SGD(model.parameters(),lr=0.001)

In [0]:
numepochs = 20

In [24]:
model.train()
for epoch in range(numepochs):
  for i,(X,y) in enumerate(trainloader):
    X,y = X.to(device),y.to(device)
    optimizer.zero_grad()
    output = model(X)
    loss = F.nll_loss(output,y.long())
    loss.backward()
    optimizer.step()
    if(i%500==0):
      print("Epoch {} Batch {} Loss {}".format(epoch,i,loss))

Epoch 0 Batch 0 Loss 8.362159729003906
Epoch 0 Batch 500 Loss 8.293618202209473
Epoch 0 Batch 1000 Loss 8.319917678833008
Epoch 1 Batch 0 Loss 8.339107513427734
Epoch 1 Batch 500 Loss 8.281229019165039
Epoch 1 Batch 1000 Loss 8.308741569519043
Epoch 2 Batch 0 Loss 8.316291809082031
Epoch 2 Batch 500 Loss 8.268821716308594
Epoch 2 Batch 1000 Loss 8.297411918640137
Epoch 3 Batch 0 Loss 8.293137550354004
Epoch 3 Batch 500 Loss 8.256085395812988
Epoch 3 Batch 1000 Loss 8.285645484924316
Epoch 4 Batch 0 Loss 8.269087791442871
Epoch 4 Batch 500 Loss 8.242704391479492
Epoch 4 Batch 1000 Loss 8.273153305053711
Epoch 5 Batch 0 Loss 8.243568420410156
Epoch 5 Batch 500 Loss 8.228353500366211
Epoch 5 Batch 1000 Loss 8.259608268737793
Epoch 6 Batch 0 Loss 8.215962409973145
Epoch 6 Batch 500 Loss 8.212663650512695
Epoch 6 Batch 1000 Loss 8.244643211364746
Epoch 7 Batch 0 Loss 8.185551643371582
Epoch 7 Batch 500 Loss 8.195194244384766
Epoch 7 Batch 1000 Loss 8.227799415588379
Epoch 8 Batch 0 Loss 8.1

In [0]:
with torch.no_grad():
  vocabvec = []
  vector = torch.zeros(vocab_len).to(device)
  for i in range(vocab_len):
      vector[i] = 1
      vocabvec.append(model.predict(vector).cpu().numpy())
      vector[i] = 0

In [0]:
def find_similarwords(targetword,numwords):
  with torch.no_grad():
    cosinedis = []
    vector = torch.zeros(vocab_len).to(device)
    vector[word2idx[targetword]] = 1
    targetvec = model.predict(vector)
    for i in range(vocab_len):
      cosinedis.append((cosine(targetvec,vocabvec[i]),i))
    sorted_dis = sorted(cosinedis,key=lambda x:x[0])
    simwords = []
    for i in range(numwords):
      simwords.append((sorted_dis[i][0],idx2word[i]))
    return simwords

In [0]:
similarwords = find_similarwords('great',10)

In [28]:
similarwords

[(0.0, 'transition'),
 (7.927417755126953e-06, 'familial'),
 (1.2993812561035156e-05, 'verhoeven'),
 (1.3709068298339844e-05, 'vulnerable'),
 (1.5139579772949219e-05, 'summer'),
 (1.6450881958007812e-05, 'initially'),
 (1.7702579498291016e-05, 'smoochy'),
 (1.7702579498291016e-05, 'bizarrely'),
 (1.913309097290039e-05, 'kong'),
 (1.913309097290039e-05, 'becoming')]