source:

https://www.geeksforgeeks.org/implement-your-own-word2vecskip-gram-model-in-python/

In [1]:
import numpy as np
import string
from nltk.corpus import stopwords

In [2]:
def softmax(x):
	"""Compute softmax values for each sets of scores in x."""
	e_x = np.exp(x - np.max(x))
	return e_x / e_x.sum()

class word2vec(object):
	def __init__(self):
		self.N = 10
		self.X_train = []
		self.y_train = []
		self.window_size = 2
		self.alpha = 0.001
		self.words = []
		self.word_index = {}

	def initialize(self,V,data):
		self.V = V
		self.W = np.random.uniform(-0.8, 0.8, (self.V, self.N))
		self.W1 = np.random.uniform(-0.8, 0.8, (self.N, self.V))
		
		self.words = data
		for i in range(len(data)):
			self.word_index[data[i]] = i

	
	def feed_forward(self,X):
		self.h = np.dot(self.W.T,X).reshape(self.N,1)
		self.u = np.dot(self.W1.T,self.h)
		#print(self.u)
		self.y = softmax(self.u)
		return self.y
		
	def backpropagate(self,x,t):
		e = self.y - np.asarray(t).reshape(self.V,1)
		# e.shape is V x 1
		dLdW1 = np.dot(self.h,e.T)
		X = np.array(x).reshape(self.V,1)
		dLdW = np.dot(X, np.dot(self.W1,e).T)
		self.W1 = self.W1 - self.alpha*dLdW1
		self.W = self.W - self.alpha*dLdW
		
	def train(self,epochs):
		for x in range(1,epochs):	
			self.loss = 0
			for j in range(len(self.X_train)):
				self.feed_forward(self.X_train[j])
				self.backpropagate(self.X_train[j],self.y_train[j])
				C = 0
				for m in range(self.V):
					if(self.y_train[j][m]):
						self.loss += -1*self.u[m][0]
						C += 1
				self.loss += C*np.log(np.sum(np.exp(self.u)))
			print("epoch ",x, " loss = ",self.loss)
			self.alpha *= 1/( (1+self.alpha*x) )
			
	def predict(self,word,number_of_predictions):
		if word in self.words:
			index = self.word_index[word]
			X = [0 for i in range(self.V)]
			X[index] = 1
			prediction = self.feed_forward(X)
			output = {}
			for i in range(self.V):
				output[prediction[i][0]] = i
			
			top_context_words = []
			for k in sorted(output,reverse=True):
				top_context_words.append(self.words[output[k]])
				if(len(top_context_words)>=number_of_predictions):
					break
	
			return top_context_words
		else:
			print("Word not found in dictionary")


In [3]:
def preprocessing(corpus):
	stop_words = set(stopwords.words('english'))
	training_data = []
	sentences = corpus.split(".")
	for i in range(len(sentences)):
		sentences[i] = sentences[i].strip()
		sentence = sentences[i].split()
		x = [word.strip(string.punctuation) for word in sentence
									if word not in stop_words]
		x = [word.lower() for word in x]
		training_data.append(x)
	return training_data
	

def prepare_data_for_training(sentences,w2v):
	data = {}
	for sentence in sentences:
		for word in sentence:
			if word not in data:
				data[word] = 1
			else:
				data[word] += 1
	V = len(data)
	data = sorted(list(data.keys()))
	vocab = {}
	for i in range(len(data)):
		vocab[data[i]] = i
	
	#for i in range(len(words)):
	for sentence in sentences:
		for i in range(len(sentence)):
			center_word = [0 for x in range(V)]
			center_word[vocab[sentence[i]]] = 1
			context = [0 for x in range(V)]
			
			for j in range(i-w2v.window_size,i+w2v.window_size):
				if i!=j and j>=0 and j<len(sentence):
					context[vocab[sentence[j]]] += 1
			w2v.X_train.append(center_word)
			w2v.y_train.append(context)
	w2v.initialize(V,data)

	return w2v.X_train,w2v.y_train


In [4]:
corpus = ""
corpus += "The earth revolves around the sun. The moon revolves around the earth"
epochs = 1000

training_data = preprocessing(corpus)
w2v = word2vec()

prepare_data_for_training(training_data,w2v)
w2v.train(epochs)

print(w2v.predict("around",3))

epoch  1  loss =  39.68991859440744
epoch  2  loss =  39.61495066809694
epoch  3  loss =  39.540508945928735
epoch  4  loss =  39.46666170417814
epoch  5  loss =  39.393474646707844
epoch  6  loss =  39.321010561098305
epoch  7  loss =  39.24932901035796
epoch  8  loss =  39.17848606373509
epoch  9  loss =  39.10853406924568
epoch  10  loss =  39.03952146959136
epoch  11  loss =  38.97149266220411
epoch  12  loss =  38.904487903246554
epoch  13  loss =  38.838543254552256
epoch  14  loss =  38.77369057172844
epoch  15  loss =  38.70995753098471
epoch  16  loss =  38.64736769170642
epoch  17  loss =  38.585940591368754
epoch  18  loss =  38.52569186908584
epoch  19  loss =  38.46663341390743
epoch  20  loss =  38.40877353390402
epoch  21  loss =  38.352117142108995
epoch  22  loss =  38.29666595550275
epoch  23  loss =  38.242418703409314
epoch  24  loss =  38.189371341921884
epoch  25  loss =  38.13751727126019
epoch  26  loss =  38.08684755327807
epoch  27  loss =  38.037351126671815


In [5]:
w2v.u

array([[-1.18278839],
       [ 0.3911464 ],
       [ 0.4274679 ],
       [ 0.29171672],
       [-0.50699468],
       [-0.89122768]])

In [10]:
np.shape(w2v.W1)

(10, 6)

In [6]:
#the word vectors
w2v.W1

array([[-0.08203969,  0.46556786,  0.65518146,  0.07904233, -0.57234891,
        -0.3300063 ],
       [-0.3433316 , -0.23456788, -0.54810575,  0.80461518, -0.34314438,
        -0.48539219],
       [-0.79432467, -0.36316068,  0.38301118,  0.22203686, -0.15575267,
        -0.28498335],
       [-0.05303082, -0.16710226, -0.52424802,  0.29088765,  0.32036111,
         0.2138744 ],
       [ 0.53247517,  0.09105374,  0.51504885, -0.0636543 , -0.2938269 ,
         0.30377724],
       [-0.25932878,  0.35450515, -0.10188595,  0.11027193,  0.78517881,
         0.62681304],
       [-0.20415562,  0.69338247,  0.3921553 ,  0.29548385, -0.41863801,
        -0.04477238],
       [ 0.21364044, -0.78440072, -0.4079915 ,  0.7490129 ,  0.14266395,
         0.38634291],
       [ 0.02867441, -0.04179264, -0.62738581,  0.49695757,  0.57965372,
         0.40132737],
       [ 0.25804458, -0.6801493 , -0.32853812,  0.65987156, -0.65947624,
        -0.60577907]])