In [None]:
import numpy as np

class Embedding:
    """Embedding layer: selects rows of a weight matrix by integer id.

    Attributes:
        params: dict holding the weight matrix under the key "W".
        grads: gradient of the loss w.r.t. W, same shape as W; None until
            backward() has been called.
        index: the ids passed to the most recent forward() call.
    """

    def __init__(self, W):
        self.params = {}
        self.grads = None
        self.index = None

        self.params["W"] = W

    def forward(self, index):
        """Return the rows of W selected by `index` and remember the ids.

        Args:
            index: integer id or array of ids used to index the rows of W.

        Returns:
            W[index].
        """
        self.index = index
        # The original indexed with the builtin `id`, which raised at runtime.
        return self.params["W"][index]

    def backward(self, dout):
        """Scatter the upstream gradient back onto the rows used in forward().

        Args:
            dout: upstream gradient; dout[k] belongs to row self.index[k].

        Returns:
            None. The result is stored in self.grads.
        """
        dW = np.zeros_like(self.params["W"])
        # np.add.at is unbuffered, so an id that occurs several times in the
        # batch accumulates every one of its contributions.
        np.add.at(dW, self.index, dout)
        self.grads = dW
        

In [11]:
class EmbeddingDot:
    """Row lookup in W followed by a per-sample dot product with `h`.

    Attributes:
        embed: wrapped Embedding layer used to scatter gradients onto W.
        params: dict holding the weight matrix under the key "W".
        grads: gradient w.r.t. W, same shape as W; updated in place by backward().
        cache: (h, looked-up rows) saved by forward() for use in backward().
    """

    def __init__(self, W):
        self.embed = Embedding(W)
        self.params = {}
        self.grads = np.zeros_like(W)
        self.cache = None

        self.params["W"] = W

    def forward(self, h, index):
        """Compute one score per sample: dot(h[k], W[index[k]]).

        Args:
            h: array of shape (batch, hidden).
            index: array of `batch` row ids into W.

        Returns:
            Array of shape (batch,) with the dot products.
        """
        word_vec = self.params["W"][index]
        # Record the ids on the wrapped layer so that its backward() knows
        # which rows of W receive gradient.
        self.embed.index = index
        self.cache = (h, word_vec)

        # Sum over the hidden axis (axis=1). Summing over axis=0 collapsed the
        # batch and returned one value per hidden unit instead of per sample.
        return np.sum(h * word_vec, axis=1)

    def backward(self, dout):
        """Back-propagate through the dot product.

        Args:
            dout: upstream gradient of shape (batch,).

        Returns:
            Gradient w.r.t. h, shape (batch, hidden).
        """
        h, target_W = self.cache
        dout = dout.reshape(dout.shape[0], 1)

        dtarget_W = dout * h
        self.embed.backward(dtarget_W)
        # Copy in place so that references to self.grads taken earlier (for
        # example lists built at construction time) see the fresh gradient.
        self.grads[...] = self.embed.grads
        dh = dout * target_W
        return dh
        
        

In [16]:
import collections

class UnigramSampler:
    """Draws negative samples from a smoothed unigram distribution.

    Attributes:
        sample_size: number of negative ids drawn per target.
        vocab_size: size of the probability table (largest id in corpus + 1).
        word_p: sampling probability of every id; sums to 1.
    """

    def __init__(self, corpus, power, sample_size):
        """Build the sampling distribution from id frequencies in `corpus`.

        Args:
            corpus: 1-D sequence of non-negative integer word ids.
            power: exponent applied to the raw counts before normalising.
            sample_size: number of negatives to draw per target.
        """
        self.sample_size = sample_size

        # bincount sizes the table by the largest id, so every id that occurs
        # in the corpus keeps its own slot even if ids are not contiguous.
        # Sizing by the number of distinct ids dropped the high ids.
        counts = np.bincount(np.asarray(corpus, dtype=np.int64))
        self.vocab_size = len(counts)

        self.word_p = np.power(counts.astype(np.float64), power)
        self.word_p /= np.sum(self.word_p)

    def get_negative_sample(self, target):
        """Draw `sample_size` ids for every entry of `target`.

        Only the length of `target` is used; the drawn ids are not filtered,
        so a row may contain its own target id.

        Args:
            target: array whose first dimension is the batch size.

        Returns:
            Integer array of shape (batch_size, sample_size).
        """
        batch_size = target.shape[0]

        negative_sample = np.random.choice(
            self.vocab_size,
            size=(batch_size, self.sample_size),
            replace=True,
            p=self.word_p,
        )

        return negative_sample

In [18]:
# Smoke test: build a sampler over a tiny corpus and draw negatives for three targets.
corpus = np.array([0, 1, 2, 3, 4, 1, 2, 3])
power = 0.75
sample_size = 2

sampler = UnigramSampler(corpus, power, sample_size)
target = np.array([1, 3, 0])
negative_sample = sampler.get_negative_sample(target)
# Last expression of the cell, so it is shown as the output. No seed is set in
# this cell, so the drawn ids are expected to differ between runs.
negative_sample

array([[4, 4],
       [2, 1],
       [4, 2]])

In [None]:
def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    Args:
        x: scalar or array-like of real values.

    Returns:
        Element-wise sigmoid of `x`: a NumPy scalar for scalar input,
        otherwise an array of the same shape.
    """
    x = np.asarray(x)
    # exp is only ever taken of a non-positive number, so it cannot overflow
    # even for very large |x|.
    z = np.exp(-np.abs(x))
    out = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
    # Indexing with an empty tuple turns a 0-d result back into a scalar and
    # leaves arrays unchanged.
    return out[()]

def CEE(y, t):
    """Cross-entropy error averaged over the batch.

    Args:
        y: predicted probabilities, 1-D (single sample) or 2-D (batch first).
        t: targets with the same number of elements as `y`.

    Returns:
        -sum(t * log(y + 1e-7)) divided by the number of rows of `y`.
    """
    single_sample = y.ndim == 1
    if single_sample:
        # Treat a lone sample as a batch of one row.
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    n_rows = y.shape[0]
    # The small constant keeps log() finite when a probability is exactly 0.
    weighted_log = t * np.log(y + 1e-7)
    return -np.sum(weighted_log) / n_rows

class Sigmoid_with_Loss:
    """Sigmoid activation followed by binary cross-entropy loss.

    Attributes:
        y: sigmoid output of the last forward() call.
        t: 0/1 labels of the last forward() call.
    """

    def __init__(self):
        self.y = None
        self.t = None

    def forward(self, X, t):
        """Return the mean binary cross-entropy between sigmoid(X) and `t`.

        Args:
            X: array of scores, first dimension is the batch.
            t: array of 0/1 labels with the same shape as X.

        Returns:
            Scalar loss averaged over the batch.
        """
        self.y = sigmoid(X)
        self.t = t

        eps = 1e-7  # keeps log() finite when y is exactly 0 or 1
        batch_size = self.t.shape[0]
        # Both terms are required: with only t * log(y) every negative
        # (t == 0) example contributed zero loss.
        per_sample = t * np.log(self.y + eps) + (1 - t) * np.log(1 - self.y + eps)
        loss = -np.sum(per_sample) / batch_size

        return loss

    def backward(self, dout=1):
        """Gradient of the loss w.r.t. X.

        Args:
            dout: upstream gradient of the scalar loss (default 1).

        Returns:
            (y - t) * dout / batch_size, same shape as X.
        """
        batch_size = self.t.shape[0]
        return (self.y - self.t) * dout / batch_size
    
class NegativeSamplingLoss:
    """Negative-sampling loss: one positive and `sample_size` negative scores.

    Attributes:
        W: output-side weight matrix shared by all EmbeddingDot layers.
        sample_size: number of negative samples per target.
        sampler: UnigramSampler used to draw the negative ids.
        loss_layers: sample_size + 1 Sigmoid_with_Loss layers (index 0 = positive).
        embed_dot_layers: sample_size + 1 EmbeddingDot layers (index 0 = positive).
        params: list with the `params` of every EmbeddingDot layer.
        grads: list with the `grads` of every EmbeddingDot layer.
    """

    def __init__(self, W, corpus, power, sample_size):
        self.W = W
        self.sample_size = sample_size
        self.sampler = UnigramSampler(corpus, power, sample_size)
        self.loss_layers = [Sigmoid_with_Loss() for _ in range(sample_size + 1)]
        self.embed_dot_layers = [EmbeddingDot(W) for _ in range(sample_size + 1)]
        # Two separate lists are needed; unpacking a single empty list raised
        # "ValueError: not enough values to unpack".
        self.params, self.grads = [], []
        for layer in self.embed_dot_layers:
            self.params.append(layer.params)
            self.grads.append(layer.grads)
        # NOTE(review): every entry refers to the same W; confirm that the
        # optimizer in use copes with one matrix listed several times.

    def forward(self, h, target):
        """Sum the loss of the positive example and of the sampled negatives.

        Args:
            h: hidden vectors, first dimension is the batch.
            target: array of `batch` positive word ids.

        Returns:
            Scalar loss.
        """
        batch_size = target.shape[0]
        negative_sample = self.sampler.get_negative_sample(target)

        # Positive example, label 1.
        score = self.embed_dot_layers[0].forward(h, target)
        correct_label = np.ones(batch_size, dtype=np.int32)
        loss = self.loss_layers[0].forward(score, correct_label)

        # Negative examples, label 0.
        negative_label = np.zeros(batch_size, dtype=np.int32)
        for i in range(self.sample_size):
            negative_target = negative_sample[:, i]
            score = self.embed_dot_layers[i + 1].forward(h, negative_target)
            loss += self.loss_layers[i + 1].forward(score, negative_label)

        return loss

    def backward(self, dout=1):
        """Back-propagate through every (loss, embedding-dot) pair.

        Args:
            dout: upstream gradient of the scalar loss (default 1).

        Returns:
            Gradient w.r.t. h, summed over all pairs.
        """
        dh = 0

        for l0, l1 in zip(self.loss_layers, self.embed_dot_layers):
            # Scale here rather than passing dout down, so this works whether
            # or not the loss layer's backward() accepts an argument.
            dscore = l0.backward() * dout
            dh += l1.backward(dscore)

        # Re-read the layer gradients in case a layer replaced its array.
        self.grads[:] = [layer.grads for layer in self.embed_dot_layers]
        return dh

In [None]:
class CBOW:
    """CBOW word2vec model trained with a negative-sampling loss.

    Attributes:
        in_layers: 2 * window_size Embedding layers sharing the input weights.
        ns_loss: NegativeSamplingLoss over the output weights.
        params: list of the layers' `params` (input layers first, then the
            entries of ns_loss.params).
        grads: list of the layers' `grads` in the same order; the input-layer
            entries are None until backward() has run.
        word_vecs: the input weight matrix, i.e. the learned word vectors.
    """

    def __init__(self, vocab_size, hidden_size, window_size, corpus):
        V, H = vocab_size, hidden_size

        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(V, H).astype('f')

        self.in_layers = [Embedding(W_in) for _ in range(2 * window_size)]
        self.ns_loss = NegativeSamplingLoss(W_out, corpus, power=0.75, sample_size=5)

        # ns_loss already exposes lists, so they are extended rather than
        # concatenated with the layer object itself (list + object raised
        # a TypeError).
        self.params, self.grads = [], []
        for layer in self.in_layers:
            self.params.append(layer.params)
        self.params.extend(self.ns_loss.params)
        self._collect_grads()
        # NOTE(review): the layout the optimizer expects is not visible here;
        # confirm it accepts one entry per layer with shared weights repeated.

        self.word_vecs = W_in

    def _collect_grads(self):
        """Refresh self.grads in place from the current layer gradients."""
        grads = [layer.grads for layer in self.in_layers]
        grads.extend(self.ns_loss.grads)
        self.grads[:] = grads

    def forward(self, contexts, targets):
        """Average the context embeddings and return the negative-sampling loss.

        Args:
            contexts: integer array of shape (batch, 2 * window_size).
            targets: integer array of `batch` centre-word ids.

        Returns:
            Scalar loss.
        """
        h = 0
        for i, layer in enumerate(self.in_layers):
            # Column i holds the i-th context word of every sample.
            h += layer.forward(contexts[:, i])
        h *= 1 / len(self.in_layers)

        loss = self.ns_loss.forward(h, targets)
        return loss

    def backward(self, dout=1):
        """Back-propagate through the loss and every input embedding layer.

        Args:
            dout: upstream gradient of the scalar loss (default 1).

        Returns:
            None. Gradients are stored on the layers and mirrored in self.grads.
        """
        dout = self.ns_loss.backward(dout)
        dout *= 1 / len(self.in_layers)
        for layer in self.in_layers:
            layer.backward(dout)

        # The input layers create their gradient arrays during backward(), so
        # the list gathered at construction time has to be refreshed.
        self._collect_grads()
        return None
        

In [None]:
import numpy as np

def preprocess(text):
    """Tokenise `text` on single spaces and map every token to an integer id.

    The text is lower-cased and each "." is split off as its own token.
    Ids are assigned in order of first appearance, starting at 0.

    Args:
        text: input string.

    Returns:
        Tuple (corpus, word_to_id, id_to_word): `corpus` is an integer array
        with one id per token, and the two dicts map tokens to ids and back.
    """
    text = text.lower().replace(".", " .")
    words = text.split(" ")

    word_to_id = {}
    id_to_word = {}
    ids = []
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word
        ids.append(word_to_id[word])

    # Build the array once; np.append inside the loop copied the whole array
    # for every token.
    corpus = np.array(ids, dtype=int)
    return corpus, word_to_id, id_to_word

def create_co_matrix(corpus, vocab_size, window_size=1):
    """Count how often each pair of word ids occurs within `window_size` tokens.

    Args:
        corpus: 1-D sequence of integer word ids.
        vocab_size: number of distinct ids; the result is vocab_size x vocab_size.
        window_size: number of neighbours considered on each side (default 1).

    Returns:
        Float array where entry [a, b] is the number of times id `b` appears
        in the window around an occurrence of id `a`, the centre token excluded.
    """
    corpus = np.asarray(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size))

    for index, word_id in enumerate(corpus):
        # Slicing clips the right edge automatically; the left edge must be
        # clamped by hand because a negative start would wrap around.
        left = max(index - window_size, 0)
        window = corpus[left:index + window_size + 1]

        # np.add.at is unbuffered: a neighbour id that occurs more than once
        # in the window is counted every time. A fancy-indexed `+= 1` counts
        # it only once.
        np.add.at(co_matrix, (word_id, window), 1)
        # The window includes the centre token itself; remove that count.
        co_matrix[word_id, word_id] -= 1

    return co_matrix

def create_contexts_target(corpus, window_size=1):
    """Build (contexts, targets) training pairs from a text or an id sequence.

    Args:
        corpus: either a raw text string, which is tokenised with
            preprocess(), or a 1-D sequence of integer word ids.
        window_size: number of context words taken on each side (default 1).

    Returns:
        Tuple (contexts, targets): `targets` holds the centre-word ids and
        `contexts` has shape (len(targets), 2 * window_size) with the
        surrounding ids, left neighbours first. Both are empty when the
        corpus is too short to hold a full window.
    """
    if isinstance(corpus, str):
        corpus, _, _ = preprocess(corpus)
    corpus = np.asarray(corpus)
    corpus_len = len(corpus)

    targets = corpus[window_size: corpus_len - window_size]

    # Positions of the centre words and the relative offsets of their context.
    centers = np.arange(window_size, corpus_len - window_size)
    offsets = np.array(
        [o for o in range(-window_size, window_size + 1) if o != 0], dtype=int
    )
    # A single fancy-index builds every row at once. The row width follows
    # window_size instead of being fixed at 2, and no array is re-copied per
    # row as with repeated concatenation.
    contexts = corpus[centers[:, None] + offsets[None, :]]

    return contexts, targets

In [None]:
import time
from dataset import ptb
from optimizer import AdaGrad
import numpy as np

def preprocess(text):
    """Tokenise `text` on single spaces and map every token to an integer id.

    The text is lower-cased and each "." is split off as its own token.
    Ids are assigned in order of first appearance, starting at 0.

    Args:
        text: input string.

    Returns:
        Tuple (corpus, word_to_id, id_to_word): `corpus` is an integer array
        with one id per token, and the two dicts map tokens to ids and back.
    """
    text = text.lower().replace(".", " .")
    words = text.split(" ")

    word_to_id = {}
    id_to_word = {}
    ids = []
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word
        ids.append(word_to_id[word])

    # Build the array once; np.append inside the loop copied the whole array
    # for every token.
    corpus = np.array(ids, dtype=int)
    return corpus, word_to_id, id_to_word

def create_co_matrix(corpus, vocab_size, window_size=1):
    """Count how often each pair of word ids occurs within `window_size` tokens.

    Args:
        corpus: 1-D sequence of integer word ids.
        vocab_size: number of distinct ids; the result is vocab_size x vocab_size.
        window_size: number of neighbours considered on each side (default 1).

    Returns:
        Float array where entry [a, b] is the number of times id `b` appears
        in the window around an occurrence of id `a`, the centre token excluded.
    """
    corpus = np.asarray(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size))

    for index, word_id in enumerate(corpus):
        # Slicing clips the right edge automatically; the left edge must be
        # clamped by hand because a negative start would wrap around.
        left = max(index - window_size, 0)
        window = corpus[left:index + window_size + 1]

        # np.add.at is unbuffered: a neighbour id that occurs more than once
        # in the window is counted every time. A fancy-indexed `+= 1` counts
        # it only once.
        np.add.at(co_matrix, (word_id, window), 1)
        # The window includes the centre token itself; remove that count.
        co_matrix[word_id, word_id] -= 1

    return co_matrix

def create_contexts_target(corpus, window_size=1):
    """Build (contexts, targets) training pairs from a sequence of word ids.

    Args:
        corpus: 1-D sequence of integer word ids.
        window_size: number of context words taken on each side (default 1).

    Returns:
        Tuple (contexts, targets): `targets` holds the centre-word ids and
        `contexts` has shape (len(targets), 2 * window_size) with the
        surrounding ids, left neighbours first. When the corpus is too short
        to hold a full window, `contexts` is an empty (0, 2 * window_size)
        array rather than None.
    """
    corpus = np.asarray(corpus)
    corpus_len = len(corpus)

    targets = corpus[window_size: corpus_len - window_size]

    # Positions of the centre words and the relative offsets of their context.
    centers = np.arange(window_size, corpus_len - window_size)
    offsets = np.array(
        [o for o in range(-window_size, window_size + 1) if o != 0], dtype=int
    )
    # A single fancy-index builds every row at once. Concatenating row by row
    # re-copied the whole array per row, which is quadratic in corpus length.
    contexts = corpus[centers[:, None] + offsets[None, :]]

    return contexts, targets

class EmbeddingDot:
    """Row lookup in W followed by a per-sample dot product with `h`.

    Attributes:
        embed: wrapped Embedding layer used to scatter gradients onto W.
        params: dict holding the weight matrix under the key "W".
        grads: gradient w.r.t. W, same shape as W; updated in place by backward().
        cache: (h, looked-up rows) saved by forward() for use in backward().
    """

    def __init__(self, W):
        self.embed = Embedding(W)
        self.params = {}
        self.grads = np.zeros_like(W)
        self.cache = None

        self.params["W"] = W

    def forward(self, h, index):
        """Compute one score per sample: dot(h[k], W[index[k]]).

        Args:
            h: array of shape (batch, hidden).
            index: array of `batch` row ids into W.

        Returns:
            Array of shape (batch,) with the dot products.
        """
        word_vec = self.params["W"][index]
        # Record the ids on the wrapped layer so that its backward() knows
        # which rows of W receive gradient.
        self.embed.index = index
        self.cache = (h, word_vec)

        # Sum over the hidden axis (axis=1). Summing over axis=0 collapsed the
        # batch and returned one value per hidden unit instead of per sample.
        return np.sum(h * word_vec, axis=1)

    def backward(self, dout):
        """Back-propagate through the dot product.

        Args:
            dout: upstream gradient of shape (batch,).

        Returns:
            Gradient w.r.t. h, shape (batch, hidden).
        """
        h, target_W = self.cache
        dout = dout.reshape(dout.shape[0], 1)

        dtarget_W = dout * h
        self.embed.backward(dtarget_W)
        # Copy in place so that references to self.grads taken earlier (for
        # example lists built at construction time) see the fresh gradient.
        self.grads[...] = self.embed.grads
        dh = dout * target_W
        return dh
    
class Sigmoid_with_Loss:
    """Sigmoid activation followed by binary cross-entropy loss.

    Attributes:
        y: sigmoid output of the last forward() call.
        t: 0/1 labels of the last forward() call.
    """

    def __init__(self):
        self.y = None
        self.t = None

    def forward(self, X, t):
        """Return the mean binary cross-entropy between sigmoid(X) and `t`.

        Args:
            X: array of scores, first dimension is the batch.
            t: array of 0/1 labels with the same shape as X.

        Returns:
            Scalar loss averaged over the batch.
        """
        self.y = sigmoid(X)
        self.t = t

        eps = 1e-7  # keeps log() finite when y is exactly 0 or 1
        batch_size = self.t.shape[0]
        # Both terms are required: with only t * log(y) every negative
        # (t == 0) example contributed zero loss.
        per_sample = t * np.log(self.y + eps) + (1 - t) * np.log(1 - self.y + eps)
        loss = -np.sum(per_sample) / batch_size

        return loss

    def backward(self, dout=1):
        """Gradient of the loss w.r.t. X.

        Args:
            dout: upstream gradient of the scalar loss (default 1).

        Returns:
            (y - t) * dout / batch_size, same shape as X.
        """
        batch_size = self.t.shape[0]
        return (self.y - self.t) * dout / batch_size
    
class NegativeSamplingLoss:
    """Negative-sampling loss: one positive and `sample_size` negative scores.

    Attributes:
        W: output-side weight matrix shared by all EmbeddingDot layers.
        sample_size: number of negative samples per target.
        sampler: UnigramSampler used to draw the negative ids.
        loss_layers: sample_size + 1 Sigmoid_with_Loss layers (index 0 = positive).
        embed_dot_layers: sample_size + 1 EmbeddingDot layers (index 0 = positive).
        params: list with the `params` of every EmbeddingDot layer.
        grads: list with the `grads` of every EmbeddingDot layer.
    """

    def __init__(self, W, corpus, power, sample_size):
        self.W = W
        self.sample_size = sample_size
        self.sampler = UnigramSampler(corpus, power, sample_size)
        self.loss_layers = [Sigmoid_with_Loss() for _ in range(sample_size + 1)]
        self.embed_dot_layers = [EmbeddingDot(W) for _ in range(sample_size + 1)]
        self.params, self.grads = [], []
        for layer in self.embed_dot_layers:
            # append keeps one entry per layer. `+=` would splice in the keys
            # of the params dict and the individual rows of the grads array.
            self.params.append(layer.params)
            self.grads.append(layer.grads)
        # NOTE(review): every entry refers to the same W; confirm that the
        # optimizer in use copes with one matrix listed several times.

    def forward(self, h, target):
        """Sum the loss of the positive example and of the sampled negatives.

        Args:
            h: hidden vectors, first dimension is the batch.
            target: array of `batch` positive word ids.

        Returns:
            Scalar loss.
        """
        batch_size = target.shape[0]
        negative_sample = self.sampler.get_negative_sample(target)

        # Positive example, label 1.
        score = self.embed_dot_layers[0].forward(h, target)
        correct_label = np.ones(batch_size, dtype=np.int32)
        loss = self.loss_layers[0].forward(score, correct_label)

        # Negative examples, label 0.
        negative_label = np.zeros(batch_size, dtype=np.int32)
        for i in range(self.sample_size):
            negative_target = negative_sample[:, i]
            score = self.embed_dot_layers[i + 1].forward(h, negative_target)
            loss += self.loss_layers[i + 1].forward(score, negative_label)

        return loss

    def backward(self, dout=1):
        """Back-propagate through every (loss, embedding-dot) pair.

        Args:
            dout: upstream gradient of the scalar loss (default 1).

        Returns:
            Gradient w.r.t. h, summed over all pairs.
        """
        dh = 0

        for l0, l1 in zip(self.loss_layers, self.embed_dot_layers):
            # Scale here rather than passing dout down, so this works whether
            # or not the loss layer's backward() accepts an argument.
            dscore = l0.backward() * dout
            dh += l1.backward(dscore)

        # Re-read the layer gradients in case a layer replaced its array.
        self.grads[:] = [layer.grads for layer in self.embed_dot_layers]
        return dh
    
class CBOW:
    """CBOW word2vec model trained with a negative-sampling loss.

    Attributes:
        in_layers: 2 * window_size Embedding layers sharing the input weights.
        ns_loss: NegativeSamplingLoss over the output weights.
        params: list of the layers' `params` (input layers first, then the
            entries of ns_loss.params).
        grads: list of the layers' `grads` in the same order; the input-layer
            entries are None until backward() has run.
        word_vecs: the input weight matrix, i.e. the learned word vectors.
    """

    def __init__(self, vocab_size, hidden_size, window_size, corpus):
        V, H = vocab_size, hidden_size

        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(V, H).astype('f')

        self.in_layers = [Embedding(W_in) for _ in range(2 * window_size)]
        self.ns_loss = NegativeSamplingLoss(W_out, corpus, power=0.75, sample_size=5)

        # ns_loss already exposes lists, so they are extended rather than
        # concatenated with the layer object itself (list + object raised
        # a TypeError).
        self.params, self.grads = [], []
        for layer in self.in_layers:
            self.params.append(layer.params)
        self.params.extend(self.ns_loss.params)
        self._collect_grads()
        # NOTE(review): the layout the optimizer expects is not visible here;
        # confirm it accepts one entry per layer with shared weights repeated.

        self.word_vecs = W_in

    def _collect_grads(self):
        """Refresh self.grads in place from the current layer gradients."""
        grads = [layer.grads for layer in self.in_layers]
        grads.extend(self.ns_loss.grads)
        self.grads[:] = grads

    def forward(self, contexts, target):
        """Average the context embeddings and return the negative-sampling loss.

        Args:
            contexts: integer array of shape (batch, 2 * window_size).
            target: integer array of `batch` centre-word ids.

        Returns:
            Scalar loss.
        """
        h = 0
        for i, layer in enumerate(self.in_layers):
            # Column i holds the i-th context word of every sample.
            h += layer.forward(contexts[:, i])
        h *= 1 / len(self.in_layers)

        loss = self.ns_loss.forward(h, target)
        return loss

    def backward(self, dout=1):
        """Back-propagate through the loss and every input embedding layer.

        Args:
            dout: upstream gradient of the scalar loss (default 1).

        Returns:
            None. Gradients are stored on the layers and mirrored in self.grads.
        """
        dout = self.ns_loss.backward(dout)
        dout *= 1 / len(self.in_layers)
        for layer in self.in_layers:
            layer.backward(dout)

        # The input layers create their gradient arrays during backward(), so
        # the list gathered at construction time has to be refreshed.
        self._collect_grads()
        return None
        
        
# =================================================================================

# Hyperparameters for the CBOW run.
window_size = 5
hidden_size = 100
batch_size = 100
max_epoch = 10

# Load the 'train' split through the project's ptb module.
# NOTE(review): presumably `corpus` is a 1-D array of word ids and the two
# dicts map words to ids and back; verify against dataset.ptb.
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)

# Build the (contexts, target) training pairs from the id sequence.
contexts, target = create_contexts_target(corpus, window_size)
print("======= Conmplete create contexts and target =======")




ValueError: not enough values to unpack (expected 2, got 0)

In [36]:
# NOTE(review): this tuple is evaluated but not displayed, because it is not
# the last expression of the cell.
contexts.shape, target.shape
# Write both arrays to .npy files in the working directory.
np.save('ptb_contexts.npy', contexts)
np.save('ptb_target.npy', target)

In [39]:
# =================================================================================

# Hyperparameters for the CBOW run.
window_size = 5
hidden_size = 100
batch_size = 100
max_epoch = 10

# Load the 'train' split through the project's ptb module.
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)

# Build the (contexts, target) training pairs from the id sequence.
contexts, target = create_contexts_target(corpus, window_size)
print("======= Conmplete create contexts and target =======")

model = CBOW(vocab_size, hidden_size, window_size, corpus)
optimizer = AdaGrad()


# Training loop
loss_list = []
data_size = len(contexts)
max_iters = data_size // batch_size
# At least 1, so the modulo below cannot divide by zero when there are fewer
# than 10 iterations per epoch.
print_interval = max(1, max_iters // 10)

for epoch in range(max_epoch):
    print(f"Epoch {epoch+1}/{max_epoch}")
    start_time = time.time()
    total_loss = 0
    loss_count = 0

    # Shuffle contexts and targets with the same permutation every epoch.
    idx = np.random.permutation(np.arange(data_size))
    contexts = contexts[idx]
    target = target[idx]

    for iters in range(max_iters):
        batch_contexts = contexts[iters * batch_size:(iters + 1) * batch_size]
        batch_target = target[iters * batch_size:(iters + 1) * batch_size]

        # Forward pass, backward pass, parameter update.
        loss = model.forward(batch_contexts, batch_target)
        model.backward()
        optimizer.update(model.params, model.grads)

        total_loss += loss
        loss_count += 1

        if (iters + 1) % print_interval == 0:
            # Report the mean loss since the previous report, then reset.
            avg_loss = total_loss / loss_count
            print(f"| iteration {iters+1}/{max_iters} | avg loss {avg_loss:.4f}")
            loss_list.append(avg_loss)
            total_loss, loss_count = 0, 0

    epoch_time = time.time() - start_time
    print(f"Epoch {epoch+1} finished in {epoch_time:.2f}s")

# Save the learned word vectors.
word_vecs = model.word_vecs
np.save('cbow_word_vectors.npy', word_vecs)
print("Word vectors saved.")



ValueError: not enough values to unpack (expected 2, got 0)