Reference : [Word2vec from scratch](https://towardsdatascience.com/word2vec-from-scratch-with-numpy-8786ddd49e72)

In [16]:
import numpy as np

In [17]:
import re

def tokenize(text):
    """Split *text* into lowercase word tokens.

    The text is lowercased first. A token is a run of word characters,
    apostrophes or carets that contains at least one ASCII letter, so
    purely numeric runs and plain punctuation are dropped.
    """
    lowered = text.lower()
    return re.findall(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*', lowered)

## tokenize('hi my name is yunsik what is your name??')
## ['hi', 'my', 'name', 'is', 'yunsik', 'what', 'is', 'your', 'name']

In [18]:
def mapping(tokens):
    """Build word <-> id lookup tables for the vocabulary of *tokens*.

    Ids are assigned in order of first occurrence, so the same token list
    always produces the same mapping (iterating a ``set`` of strings would
    give a different order from run to run because of hash randomisation).

    tokens: iterable of hashable tokens (duplicates allowed).
    Returns ``(word_to_id, id_to_word)``, two dicts that are inverses of
    each other.
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    vocab = list(dict.fromkeys(tokens))
    word_to_id = {token: i for i, token in enumerate(vocab)}
    id_to_word = dict(enumerate(vocab))

    return word_to_id, id_to_word

## tokens = ['hi', 'my', 'name', 'is', 'yunsik', 'what', 'is', 'your', 'name']
## mapping(tokens)
## ({'hi': 0, 'my': 1, 'name': 2, 'is': 3, 'yunsik': 4, 'what': 5, 'your': 6},
## {0: 'hi', 1: 'my', 2: 'name', 3: 'is', 4: 'yunsik', 5: 'what', 6: 'your'})

In [19]:
def generate_training_data(tokens, word_to_id, window_size):
    """Create (center, context) id pairs for skip-gram training.

    For every position in ``tokens`` the center word is paired with each
    neighbour at most ``window_size`` positions to its left or right.

    tokens: sequence of tokens.
    word_to_id: dict mapping each token to its integer id.
    window_size: number of neighbours considered on each side.
    Returns ``(X, Y)``: two arrays of shape (1, number_of_pairs) holding the
    center-word ids and the matching context-word ids.
    """
    n_tokens = len(tokens)
    centers = []
    contexts = []

    for pos in range(n_tokens):
        first = max(0, pos - window_size)
        stop = min(n_tokens, pos + window_size + 1)
        for nbr in range(first, stop):
            if nbr == pos:
                # a word is never its own context
                continue
            centers.append(word_to_id[tokens[pos]])
            contexts.append(word_to_id[tokens[nbr]])

    # add a leading axis so both outputs are row vectors of shape (1, pairs)
    X = np.array(centers)[np.newaxis, :]
    Y = np.array(contexts)[np.newaxis, :]

    return X, Y

## word_to_id, _ = mapping(tokens)
## generate_training_data(tokens, word_to_id,window_size = 2)
## hi -> my, hi -> name, my -> hi, my -> name, my -> is,.....

In [53]:
doc = "hi my name is yunsik, what is your name? I'm so thirsty. would you give me some water?"
tokens = tokenize(doc)
word_to_id, id_to_word = mapping(tokens)
X, Y = generate_training_data(tokens, word_to_id, 3)
vocab_size = len(id_to_word)
m = Y.shape[1]  # number of training pairs (96 for this doc)

# Turn Y into one hot encoding: column k has a single 1 in the row of the
# context-word id of training pair k.
Y_one_hot = np.zeros((vocab_size, m))
Y_one_hot[Y.flatten(), np.arange(m)] = 1
## flatten drops the leading axis: [[...]] -> [...]
# Each training example is one column of Y_one_hot.
# Y_one_hot.shape is (16, 96) for this doc.

In [None]:
def initialize_wrd_emb(vocab_size, emb_size):
    """Create the initial word-embedding weight matrix.

    vocab_size: vocabulary size of your corpus or training data.
    emb_size: word embedding size, i.e. how many dimensions represent each
        vocabulary entry.
    Returns an array of shape (vocab_size, emb_size) filled with standard
    normal samples scaled by 0.01.
    """
    scale = 0.01
    samples = np.random.randn(vocab_size, emb_size)
    return samples * scale

In [31]:
def initialize_parameters(vocab_size, emb_size):
    """Initialize all the training parameters.

    Returns a dict with two entries:
      'WRD_EMB': result of ``initialize_wrd_emb(vocab_size, emb_size)``
      'W': result of ``initialize_dense(emb_size, vocab_size)``

    NOTE(review): ``initialize_dense`` is not defined in the visible part of
    this file -- confirm it exists before calling this function.
    """
    # The embedding matrix is created before the dense weights, as in the
    # call order this function has always used.
    return {
        'WRD_EMB': initialize_wrd_emb(vocab_size, emb_size),
        'W': initialize_dense(emb_size, vocab_size),
    }

## Forward Propagation

In [32]:
def ind_to_word_vecs(inds, parameters):
    """Look up the embedding vector of every word index in ``inds``.

    inds: numpy array of word indices. shape: (1, m)
    parameters: dict. weights to be trained; ``parameters['WRD_EMB']`` is
        read here.
    Returns an array of shape (emb_size, m) whose k-th column is the
    embedding row selected by the k-th index.
    """
    n_examples = inds.shape[1]
    embeddings = parameters['WRD_EMB']

    # select one embedding row per index, then put examples on the columns
    selected_rows = embeddings[inds.flatten(), :]
    word_vec = selected_rows.T

    expected_shape = (embeddings.shape[1], n_examples)
    assert word_vec.shape == expected_shape

    return word_vec


In [65]:
np.random.randn(1,96).flatten()

array([ 1.24317243,  0.40675554,  1.83151995,  0.09999286, -0.12296993,
        0.8110865 ,  0.59411417, -1.34568332, -0.71193043,  0.0805505 ,
       -0.00597584, -2.13984934, -0.9042567 , -1.05031337, -2.32822702,
       -0.80208288, -0.74071465,  1.2784836 ,  1.23564478,  0.21229771,
       -0.54583934, -0.51834095, -0.36570846, -1.52578566,  0.79131392,
        0.04058106,  1.23495108,  0.66346398,  0.34158095, -1.17347261,
        0.19782502,  1.05543049,  0.85252123, -1.59151437, -0.94862042,
        2.08765358, -0.21286215, -1.32281952, -0.339718  , -0.50155107,
        0.71288684,  0.33807953, -1.01308392,  1.13788798,  0.29905913,
        0.37861593, -1.85612393, -0.80859125, -1.28464462, -0.52696276,
        0.26948805, -1.05825273, -1.1891444 ,  1.61194824, -0.06808608,
        1.13205586,  0.5822614 ,  0.06826729,  1.83530875, -1.46505151,
        0.66398778, -0.1531641 ,  0.52011143,  0.4028142 ,  0.23061068,
        0.4387415 ,  0.56755202,  1.79206325,  0.92372671, -0.47

In [34]:
def linear_dense(word_vec, parameters):
    """Apply the dense layer: multiply the weights ``W`` by the word vectors.

    word_vec: numpy array. shape: (emb_size, m)
    parameters: dict. weights to be trained; ``parameters['W']`` is read here.
    Returns ``(W, Z)`` where ``W`` is the weight matrix taken from
    ``parameters`` and ``Z = W . word_vec`` has shape (W.shape[0], m).
    """
    n_examples = word_vec.shape[1]
    W = parameters['W']
    Z = W.dot(word_vec)

    expected_shape = (W.shape[0], n_examples)
    assert Z.shape == expected_shape

    return W, Z

In [35]:
def softmax(Z):
    """Column-wise softmax.

    Z: output out of the dense layer. shape: (vocab_size, m)
    Returns an array of the same shape in which every column is a
    probability distribution (non-negative entries summing to 1).

    The per-column maximum is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps ``np.exp`` from
    overflowing to ``inf`` (and the ratio from becoming NaN) for large
    logits. After the shift each column sum is at least 1, so no epsilon is
    needed in the denominator.
    """
    if Z.size == 0:
        # nothing to normalise; also avoids a max() over an empty axis
        return np.exp(Z)

    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp_Z = np.exp(shifted)
    softmax_out = exp_Z / np.sum(exp_Z, axis=0, keepdims=True)

    assert softmax_out.shape == Z.shape

    return softmax_out

In [36]:
def forward_propagation(inds, parameters):
    """Run the forward pass: embedding lookup -> dense layer -> softmax.

    inds: array of input word indices, passed to ``ind_to_word_vecs``.
    parameters: dict of weights, passed to ``ind_to_word_vecs`` and
        ``linear_dense``.
    Returns ``(softmax_out, caches)`` where ``caches`` stores the
    intermediate values under the keys 'inds', 'word_vec', 'W' and 'Z'.
    """
    word_vec = ind_to_word_vecs(inds, parameters)
    W, Z = linear_dense(word_vec, parameters)

    caches = {
        'inds': inds,
        'word_vec': word_vec,
        'W': W,
        'Z': Z,
    }

    return softmax(Z), caches

In [58]:
def cross_entropy(softmax_out, Y):
    """Mean cross-entropy between the predictions and the labels ``Y``.

    softmax_out: output out of softmax. shape: (vocab_size, m)
    Y: labels, multiplied element-wise with the log-probabilities.
    Returns a length-1 array holding the cost averaged over the m columns.
    0.001 is added to the probabilities before taking the log.
    """
    n_examples = softmax_out.shape[1]
    log_probs = np.log(softmax_out + 0.001)
    # sum over the vocabulary first (keeping a row), then over the examples
    per_example = np.sum(Y * log_probs, axis=0, keepdims=True)
    total = np.sum(per_example, axis=1)
    cost = -(1 / n_examples) * total
    return cost

## Backward Propagation

In [59]:
def softmax_backward(Y, softmax_out):
    """Gradient of the loss with respect to the dense-layer output Z.

    Y: labels of training data. shape: (vocab_size, m)
    softmax_out: output out of softmax. shape: (vocab_size, m)
    Returns ``softmax_out - Y``, with the same shape as ``softmax_out``.
    """
    dL_dZ = np.subtract(softmax_out, Y)

    assert dL_dZ.shape == softmax_out.shape
    return dL_dZ

In [60]:
def dense_backward(dL_dZ, caches):
    """Back-propagate through the dense layer.

    dL_dZ: shape: (vocab_size, m)
    caches: dict. results from each steps of forward propagation; the
        entries 'W' and 'word_vec' are read here.
    Returns ``(dL_dW, dL_dword_vec)``:
      dL_dW: same shape as W, averaged over the m examples.
      dL_dword_vec: same shape as word_vec; unlike dL_dW it is not divided
        by m.
    """
    weights = caches['W']
    word_vec = caches['word_vec']
    n_examples = word_vec.shape[1]

    dL_dW = (1 / n_examples) * np.dot(dL_dZ, word_vec.T)
    dL_dword_vec = np.dot(weights.T, dL_dZ)

    assert weights.shape == dL_dW.shape
    assert word_vec.shape == dL_dword_vec.shape

    return dL_dW, dL_dword_vec

In [62]:
def backward_propagation(Y, softmax_out, caches):
    """Run the backward pass and collect every gradient in one dict.

    Y: labels, passed to ``softmax_backward``.
    softmax_out: forward-pass output, passed to ``softmax_backward``.
    caches: forward-pass intermediates, passed to ``dense_backward``.
    Returns a dict with the keys 'dL_dZ', 'dL_dW' and 'dL_dword_vec'.
    """
    dL_dZ = softmax_backward(Y, softmax_out)
    dL_dW, dL_dword_vec = dense_backward(dL_dZ, caches)

    return {
        'dL_dZ': dL_dZ,
        'dL_dW': dL_dW,
        'dL_dword_vec': dL_dword_vec,
    }

In [63]:
def update_parameters(parameters, caches, gradients, learning_rate):
    """Apply one gradient-descent step to ``parameters`` in place.

    parameters: dict with 'WRD_EMB' (vocab_size, emb_size) and 'W'; both
        arrays are modified in place.
    caches: dict from the forward pass; ``caches['inds']`` holds the word
        index of every training example in the batch.
    gradients: dict with 'dL_dword_vec' (emb_size, m) and 'dL_dW'.
    learning_rate: step size.
    Returns None.
    """
    inds = caches['inds']
    WRD_EMB = parameters['WRD_EMB']
    dL_dword_vec = gradients['dL_dword_vec']

    # The same word index normally occurs several times in a batch (once per
    # context word). ``WRD_EMB[idx] -= ...`` is buffered and would keep only
    # one of those updates per row; ``np.subtract.at`` is unbuffered and
    # accumulates every example's contribution.
    np.subtract.at(WRD_EMB, inds.flatten(), dL_dword_vec.T * learning_rate)

    parameters['W'] -= learning_rate * gradients['dL_dW']

In [None]:
def skipgram_model_training(X, Y, vocab_size, emb_size, learning_rate, epochs, batch_size=256, parameters=None, print_cost=True, plot_cost=True):
    """Train the skip-gram model with mini-batch gradient descent.

    X: Input word indices. shape: (1, m)
    Y: One-hot encoding of output word indices. shape: (vocab_size, m)
    vocab_size: vocabulary size of your corpus or training data
    emb_size: word embedding size. How many dimensions to represent each vocabulary
    learning_rate: alpha in the weight update formula
    epochs: how many epochs to train the model
    batch_size: size of mini batch
    parameters: pre-trained or pre-initialized parameters; when None they are
        created with ``initialize_parameters``. They are updated in place.
    print_cost: whether or not to print costs during the training process
    plot_cost: whether or not to plot the cost per epoch after training

    Returns the trained ``parameters`` dict.
    """
    costs = []
    m = X.shape[1]

    if parameters is None:
        parameters = initialize_parameters(vocab_size, emb_size)

    # Clamp both intervals to at least 1: for epochs < 500 (or < 100) the
    # integer division gives 0 and ``epoch % 0`` raises ZeroDivisionError.
    print_every = max(1, epochs // 500)
    decay_every = max(1, epochs // 100)

    for epoch in range(epochs):
        epoch_cost = 0
        # visit the mini-batches in a fresh random order every epoch
        batch_inds = list(range(0, m, batch_size))
        np.random.shuffle(batch_inds)
        for i in batch_inds:
            X_batch = X[:, i:i+batch_size]
            Y_batch = Y[:, i:i+batch_size]

            softmax_out, caches = forward_propagation(X_batch, parameters)
            gradients = backward_propagation(Y_batch, softmax_out, caches)
            update_parameters(parameters, caches, gradients, learning_rate)
            # cost of the forward pass computed before this batch's update
            cost = cross_entropy(softmax_out, Y_batch)
            epoch_cost += np.squeeze(cost)

        costs.append(epoch_cost)
        if print_cost and epoch % print_every == 0:
            print("Cost after epoch {}: {}".format(epoch, epoch_cost))
        if epoch % decay_every == 0:
            # step decay of the learning rate
            learning_rate *= 0.98

    if plot_cost:
        # NOTE(review): relies on a module-level ``plt`` (presumably
        # matplotlib.pyplot), which is not imported in the visible part of
        # this file -- confirm it is in scope.
        plt.plot(np.arange(epochs), costs)
        plt.xlabel('# of epochs')
        plt.ylabel('cost')
    return parameters