Source : 

https://www.geeksforgeeks.org/implement-your-own-word2vecskip-gram-model-in-python/

In [1]:
import numpy as np 
import string 
from nltk.corpus import stopwords

In [2]:
def softmax(x): 
	"""Compute softmax values for each sets of scores in x."""
	e_x = np.exp(x - np.max(x)) 
	return e_x / e_x.sum() 

In [3]:
class word2vec(object): 
	def __init__(self): 
		self.N = 10
		self.X_train = [] 
		self.y_train = [] 
		self.window_size = 2
		self.alpha = 0.001
		self.words = [] 
		self.word_index = {} 

	def initialize(self,V,data): 
		self.V = V 
		self.W = np.random.uniform(-0.8, 0.8, (self.V, self.N)) 
		self.W1 = np.random.uniform(-0.8, 0.8, (self.N, self.V)) 
		
		self.words = data 
		for i in range(len(data)): 
			self.word_index[data[i]] = i 

	
	def feed_forward(self,X): 
		self.h = np.dot(self.W.T,X).reshape(self.N,1) 
		self.u = np.dot(self.W1.T,self.h) 
		#print(self.u) 
		self.y = softmax(self.u) 
		return self.y 
		
	def backpropagate(self,x,t): 
		e = self.y - np.asarray(t).reshape(self.V,1) 
		# e.shape is V x 1 
		dLdW1 = np.dot(self.h,e.T) 
		X = np.array(x).reshape(self.V,1) 
		dLdW = np.dot(X, np.dot(self.W1,e).T) 
		self.W1 -=  self.alpha*dLdW1 
		self.W -= self.alpha*dLdW 
		
	def train(self,epochs): 
		for x in range(1,epochs+1):		 
			self.loss = 0
			for j in range(len(self.X_train)): 
				self.feed_forward(self.X_train[j]) 
				self.backpropagate(self.X_train[j],self.y_train[j]) 
				C = 0
				for m in range(self.V): 
					if(self.y_train[j][m]): 
						self.loss += -1*self.u[m][0] 
						C += 1
				self.loss += C*np.log(np.sum(np.exp(self.u))) 
			print("epoch ",x, " loss = ",self.loss) 
			self.alpha *= 1/( (1+self.alpha*x) ) 
			
	def predict(self,word,number_of_predictions): 
		if word in self.words: 
			index = self.word_index[word] 
			X = [0 for i in range(self.V)] 
			X[index] = 1
			prediction = self.feed_forward(X) 
			output = {} 
			for i in range(self.V): 
				output[prediction[i][0]] = i 
			
			top_context_words = [] 
			for k in sorted(output,reverse=True): 
				top_context_words.append(self.words[output[k]]) 
				if(len(top_context_words)>=number_of_predictions): 
					break
	
			return top_context_words 
		else: 
			print("Word not found in dicitonary") 

In [4]:
def preprocessing(corpus): 
	stop_words = set(stopwords.words('english'))	 
	training_data = [] 
	sentences = corpus.split(".") 
	for i in range(len(sentences)): 
		sentences[i] = sentences[i].strip() 
		sentence = sentences[i].split() 
		x = [word.strip(string.punctuation) for word in sentence if word not in stop_words] 
		x = [word.lower() for word in x] 
		training_data.append(x) 
	return training_data 

In [5]:
def prepare_data_for_training(sentences,w2v): 
	data = {} 
	for sentence in sentences: 
		for word in sentence: 
			if word not in data: 
				data[word] = 1
			else: 
				data[word] += 1
	V = len(data) 
	data = sorted(list(data.keys())) 
	vocab = {} 
	for i in range(len(data)): 
		vocab[data[i]] = i 
	
	for sentence in sentences: 
		for i in range(len(sentence)): 
			center_word = [0 for x in range(V)] 
			center_word[vocab[sentence[i]]] = 1
			context = [0 for x in range(V)] 
			
			for j in range(i-w2v.window_size,i+w2v.window_size): 
				if i!=j and j>=0 and j<len(sentence): 
					context[vocab[sentence[j]]] += 1
			w2v.X_train.append(center_word) 
			w2v.y_train.append(context) 
	w2v.initialize(V,data) 

	return w2v.X_train,w2v.y_train 

In [6]:
corpus = "The earth revolves around the sun. The moon revolves around the earth"
epochs = 1000

In [7]:
training_data = preprocessing(corpus) 

In [8]:
training_data

[['the', 'earth', 'revolves', 'around', 'sun'],
 ['the', 'moon', 'revolves', 'around', 'earth']]

In [9]:
w2v = word2vec() 

In [10]:
prepare_data_for_training(training_data,w2v) 

([[0, 0, 0, 0, 0, 1],
  [0, 1, 0, 0, 0, 0],
  [0, 0, 0, 1, 0, 0],
  [1, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 1, 0],
  [0, 0, 0, 0, 0, 1],
  [0, 0, 1, 0, 0, 0],
  [0, 0, 0, 1, 0, 0],
  [1, 0, 0, 0, 0, 0],
  [0, 1, 0, 0, 0, 0]],
 [[0, 1, 0, 0, 0, 0],
  [0, 0, 0, 1, 0, 1],
  [1, 1, 0, 0, 0, 1],
  [0, 1, 0, 1, 1, 0],
  [1, 0, 0, 1, 0, 0],
  [0, 0, 1, 0, 0, 0],
  [0, 0, 0, 1, 0, 1],
  [1, 0, 1, 0, 0, 1],
  [0, 1, 1, 1, 0, 0],
  [1, 0, 0, 1, 0, 0]])

In [11]:
w2v.train(epochs) 

epoch  1  loss =  44.947209976415905
epoch  2  loss =  44.861262027066104
epoch  3  loss =  44.775912512532216
epoch  4  loss =  44.6912385108347
epoch  5  loss =  44.60731422758982
epoch  6  loss =  44.52421061059256
epoch  7  loss =  44.441995003710836
epoch  8  loss =  44.36073084404611
epoch  9  loss =  44.280477405319
epoch  10  loss =  44.20128958940221
epoch  11  loss =  44.12321776688628
epoch  12  loss =  44.04630766655948
epoch  13  loss =  43.97060031274437
epoch  14  loss =  43.896132008583805
epoch  15  loss =  43.82293436263326
epoch  16  loss =  43.75103435550271
epoch  17  loss =  43.68045444281289
epoch  18  loss =  43.61121269038649
epoch  19  loss =  43.54332293738058
epoch  20  loss =  43.47679498297726
epoch  21  loss =  43.41163479226947
epoch  22  loss =  43.347844717097395
epoch  23  loss =  43.28542372778982
epoch  24  loss =  43.22436765202793
epoch  25  loss =  43.16466941736264
epoch  26  loss =  43.1063192942611
epoch  27  loss =  43.04930513692317
epoch  2

epoch  235  loss =  40.4293834652361
epoch  236  loss =  40.42719653699937
epoch  237  loss =  40.425027849468556
epoch  238  loss =  40.42287717796774
epoch  239  loss =  40.420744301453674
epoch  240  loss =  40.41862900244344
epoch  241  loss =  40.416531066943925
epoch  242  loss =  40.414450284382916
epoch  243  loss =  40.412386447541735
epoch  244  loss =  40.41033935248957
epoch  245  loss =  40.40830879851917
epoch  246  loss =  40.40629458808418
epoch  247  loss =  40.404296526737795
epoch  248  loss =  40.402314423072895
epoch  249  loss =  40.400348088663534
epoch  250  loss =  40.398397338007655
epoch  251  loss =  40.39646198847136
epoch  252  loss =  40.39454186023411
epoch  253  loss =  40.39263677623543
epoch  254  loss =  40.39074656212265
epoch  255  loss =  40.38887104619992
epoch  256  loss =  40.387010059378255
epoch  257  loss =  40.38516343512681
epoch  258  loss =  40.383331009425135
epoch  259  loss =  40.381512620716464
epoch  260  loss =  40.37970810986222
e

epoch  476  loss =  40.16728930433315
epoch  477  loss =  40.166753500469696
epoch  478  loss =  40.166219945013694
epoch  479  loss =  40.165688623882616
epoch  480  loss =  40.165159523110894
epoch  481  loss =  40.164632628848814
epoch  482  loss =  40.164107927361286
epoch  483  loss =  40.16358540502662
epoch  484  loss =  40.163065048335476
epoch  485  loss =  40.162546843889565
epoch  486  loss =  40.16203077840064
epoch  487  loss =  40.16151683868929
epoch  488  loss =  40.16100501168386
epoch  489  loss =  40.160495284419355
epoch  490  loss =  40.159987644036384
epoch  491  loss =  40.159482077780055
epoch  492  loss =  40.15897857299888
epoch  493  loss =  40.15847711714385
epoch  494  loss =  40.15797769776727
epoch  495  loss =  40.157480302521826
epoch  496  loss =  40.1569849191596
epoch  497  loss =  40.156491535530925
epoch  498  loss =  40.1560001395836
epoch  499  loss =  40.155510719361764
epoch  500  loss =  40.15502326300507
epoch  501  loss =  40.15453775874761


epoch  721  loss =  40.08055688670625
epoch  722  loss =  40.08032388455919
epoch  723  loss =  40.08009152945725
epoch  724  loss =  40.07985981871226
epoch  725  loss =  40.07962874965093
epoch  726  loss =  40.07939831961476
epoch  727  loss =  40.079168525959815
epoch  728  loss =  40.07893936605681
epoch  729  loss =  40.07871083729083
epoch  730  loss =  40.07848293706131
epoch  731  loss =  40.07825566278198
epoch  732  loss =  40.07802901188067
epoch  733  loss =  40.07780298179931
epoch  734  loss =  40.077577569993785
epoch  735  loss =  40.07735277393381
epoch  736  loss =  40.07712859110292
epoch  737  loss =  40.07690501899827
epoch  738  loss =  40.07668205513066
epoch  739  loss =  40.07645969702435
epoch  740  loss =  40.076237942217
epoch  741  loss =  40.07601678825958
epoch  742  loss =  40.075796232716314
epoch  743  loss =  40.07557627316451
epoch  744  loss =  40.075356907194575
epoch  745  loss =  40.07513813240981
epoch  746  loss =  40.07491994642644
epoch  747

epoch  939  loss =  40.04153856392834
epoch  940  loss =  40.04140142331261
epoch  941  loss =  40.0412645752425
epoch  942  loss =  40.04112801878361
epoch  943  loss =  40.040991753005514
epoch  944  loss =  40.04085577698178
epoch  945  loss =  40.040720089789865
epoch  946  loss =  40.04058469051114
epoch  947  loss =  40.04044957823087
epoch  948  loss =  40.0403147520382
epoch  949  loss =  40.0401802110261
epoch  950  loss =  40.04004595429139
epoch  951  loss =  40.03991198093468
epoch  952  loss =  40.03977829006037
epoch  953  loss =  40.03964488077664
epoch  954  loss =  40.0395117521954
epoch  955  loss =  40.03937890343231
epoch  956  loss =  40.0392463336067
epoch  957  loss =  40.03911404184163
epoch  958  loss =  40.03898202726381
epoch  959  loss =  40.03885028900362
epoch  960  loss =  40.038718826195016
epoch  961  loss =  40.03858763797564
epoch  962  loss =  40.038456723486675
epoch  963  loss =  40.03832608187291
epoch  964  loss =  40.03819571228266
epoch  965  l

In [12]:
w2v.predict("around",3)

['revolves', 'moon', 'the']

In [13]:
w2v.predict("around",5)

['revolves', 'moon', 'the', 'earth', 'sun']

In [14]:
w2v.predict("sun",3)

['sun', 'moon', 'around']

In [15]:
w2v.predict("earth",3)

['the', 'earth', 'sun']

In [16]:
w2v.predict("jupiter",3)

Word not found in dicitonary


In [6]:
text = "alue\nThe marketing job is to create, deliver, and capture customer value. What is value?"
def get_subject_phrase(doc):
    for token in doc:
        if ("subj" in token.dep_):
            subtree = list(token.subtree)
            start = subtree[0].i
            end = subtree[-1].i + 1
            return doc[start:end]

def get_object_phrase(doc):
    for token in doc:
        if ("dobj" in token.dep_):
            subtree = list(token.subtree)
            start = subtree[0].i
            end = subtree[-1].i + 1
            return doc[start:end]
h= get_subject_phrase(text)
print(subject_phrase)

AttributeError: 'str' object has no attribute 'dep_'