In [3]:
import numpy as np
from functools import reduce

In [4]:
def sigmoid(x):
    """
    Logistic sigmoid function, 1 / (1 + exp(-x)), evaluated without overflow

    Parameters
    ----------
    x: scalar or numpy array
        input value(s)

    Returns
    -------
    numpy float or numpy array
        element-wise sigmoid of x
    """
    # 1/(1+exp(-x)) == exp(-log(exp(0) + exp(-x))). np.logaddexp evaluates the
    # inner logarithm without ever forming exp(-x) directly, so very negative
    # inputs no longer trigger an "overflow encountered in exp" warning.
    return np.exp(-np.logaddexp(0, -x))

def unique_words(corpus):
    """
    Find the unique words in the corpus

    Parameters
    ----------
    corpus: list[list]
        each element of the list contains a list representing one sentence, where each element represents a word in a sentence

    Returns
    -------
    list
        every distinct word, in order of first appearance (empty list for an empty corpus)
    """
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is reproducible between runs; flattening with a generator avoids
    # the repeated list concatenation and also copes with an empty corpus.
    return list(dict.fromkeys(word for sentence in corpus for word in sentence))

def vec_init(words, length):
    """
    Build a word -> vector lookup table with random entries

    Every word in `words` is mapped to its own array of `length` values
    produced by np.random.rand.
    """
    return {token: np.random.rand(length) for token in words}

In [5]:
def negative_sampling(corpus, length=10, window_size=2, neg_size=5, learning_rate=0.01, epoches=10000):
    """
    Negative sampling skip-gram model

    Parameters
    ----------
    corpus: list[list]
        each element of the list contains a list representing one sentence, where each element represents a word in a sentence
    length: positive int
        length of the vectors in result
    window_size: positive int
        size of context window
    neg_size: non-negative int
        number of negative samples drawn for every (word, context word) pair;
        they are drawn uniformly, with replacement, from the vocabulary excluding the word itself
    learning_rate: positive float
        learning rate of the gradient descenting algorithm
    epoches: non-negative int
        number of passes over the whole corpus

    Returns
    -------
    (v, theta): tuple[dict, dict]
        two dicts keyed by every word of the corpus plus the padding tokens
        '<START>' and '<END>'; v holds the word vectors and theta the
        auxiliary vectors trained against them
    """
    # De-duplicate in case the corpus itself contains a padding token, so each
    # word owns exactly one index that can be excluded when sampling negatives.
    words = list(dict.fromkeys(unique_words(corpus) + ['<START>', '<END>']))
    word_index = {word: k for k, word in enumerate(words)}
    v = vec_init(words, length)
    theta = vec_init(words, length)
    for _ in range(epoches):
        for sentence in corpus:
            for i, w in enumerate(sentence):
                # Collect the context window, padding past either sentence end.
                context = []
                for j in range(i - window_size, i + window_size + 1):
                    if j == i:
                        continue
                    if j < 0:
                        context.append('<START>')
                    elif j >= len(sentence):
                        context.append('<END>')
                    else:
                        context.append(sentence[j])
                w_idx = word_index[w]
                for w_sim in context:
                    # Uniform draw from the vocabulary minus w: sample one of
                    # the remaining len(words)-1 slots, then shift every slot
                    # at or after w's index up by one to skip w.
                    drawn = np.random.randint(len(words) - 1, size=neg_size)
                    drawn[drawn >= w_idx] += 1
                    # Position 0 is the positive example; the rest are negatives.
                    targets = [w] + [words[k] for k in drawn]
                    e = np.zeros(length)
                    for j, u in enumerate(targets):
                        label = 1 if j == 0 else 0
                        q = sigmoid(v[w_sim] @ theta[u])
                        g = learning_rate * (label - q)
                        # Accumulate the update for v[w_sim] before theta[u]
                        # changes, so both use the pre-update values.
                        e += g * theta[u]
                        theta[u] += g * v[w_sim]
                    v[w_sim] += e
    return (v, theta)