In [1]:
import numpy as np

In [2]:
def preprocess(text):
    """Tokenize ``text`` and build a word <-> id vocabulary.

    The text is lower-cased, a space is inserted in front of every '.' so
    that it becomes a separate token, and the result is split on single
    space characters. Ids are assigned in order of first appearance,
    starting at 0.

    Args:
        text: The raw input string.

    Returns:
        tuple: ``(corpus, word_to_id, id_to_word)`` where ``corpus`` is a
        NumPy array with the id of every token in order, ``word_to_id``
        maps each token to its id and ``id_to_word`` is the inverse map.
    """
    tokens = text.lower().replace('.', ' .').split(' ')

    word_to_id = {}
    for token in tokens:
        # setdefault only inserts when the token is new, so every distinct
        # token receives the next unused id.
        word_to_id.setdefault(token, len(word_to_id))

    # Dicts preserve insertion order, so the ids come out ascending here.
    id_to_word = {token_id: token for token, token_id in word_to_id.items()}

    # The sentence with every word replaced by its id.
    corpus = np.array([word_to_id[token] for token in tokens])

    return corpus, word_to_id, id_to_word

In [3]:
def create_co_matrix(corpus, vocab_size, window_size=1):
    """Count how often each pair of word ids appears within a window.

    Args:
        corpus: Sequence of word ids.
        vocab_size: Number of distinct ids; the result is
            ``vocab_size x vocab_size``.
        window_size: How many positions to the left and to the right of
            each word count as its context.

    Returns:
        An ``int32`` NumPy array where entry ``[w, c]`` is the number of
        times id ``c`` occurred within ``window_size`` positions of id ``w``.
    """
    n_tokens = len(corpus)
    counts = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for center, center_id in enumerate(corpus):
        for offset in range(1, window_size + 1):
            # Look one step to the left and one to the right; positions that
            # fall off either end of the corpus are skipped.
            for neighbor in (center - offset, center + offset):
                if 0 <= neighbor < n_tokens:
                    counts[center_id, corpus[neighbor]] += 1

    return counts

In [4]:
def cos_similarity(x, y, eps = 1e-8):
    """Return the cosine similarity of the vectors ``x`` and ``y``.

    Each vector is divided by its L2 norm plus ``eps`` before the dot
    product is taken; ``eps`` keeps the division finite for all-zero
    vectors.
    """
    def _scaled(vec):
        # L2 norm, padded by eps so a zero vector does not divide by zero.
        length = np.sqrt(np.sum(vec ** 2))
        return vec / (length + eps)

    return np.dot(_scaled(x), _scaled(y))

In [5]:
def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    """Print the words whose vectors are most similar to ``query``'s vector.

    Args:
        query: The word to look up.
        word_to_id: Mapping from word to its row index in ``word_matrix``.
        id_to_word: Mapping from row index back to the word.
        word_matrix: Indexable collection of word vectors, one per id.
        top: Number of results to print.

    Returns:
        None. Results are written to stdout; if ``query`` is not in
        ``word_to_id`` a not-found message is printed instead.
    """
    # Bail out early when the query word is unknown.
    if query not in word_to_id:
        print("{}(을)를 찾을 수 없습니다.".format(query))
        return

    print('\n[query] ' + query)
    query_vec = word_matrix[word_to_id[query]]

    # Cosine similarity between the query vector and every word vector.
    vocab_size = len(id_to_word)
    similarity = np.array(
        [cos_similarity(word_matrix[word_id], query_vec) for word_id in range(vocab_size)],
        dtype=np.float64,
    )

    # Negating the scores makes argsort walk them from highest to lowest.
    ranked_ids = (-1 * similarity).argsort()

    shown = 0
    for word_id in ranked_ids:
        candidate = id_to_word[word_id]
        if candidate != query:
            print("{}: {}".format(candidate, similarity[word_id]))
            shown += 1
            if shown >= top:
                break

In [6]:
def ppmi(C, verbose=False, eps=1e-8):
    """Convert a co-occurrence matrix into a positive PMI (PPMI) matrix.

    For every cell the pointwise mutual information is computed as
    ``log2(C[i, j] * N / (S[i] * S[j]) + eps)`` and negative values are
    clipped to 0. ``N`` is the sum of all entries and ``S`` holds the
    column sums of ``C``; the same ``S`` is used for both indices, which
    matches the row sums only when ``C`` is symmetric.

    Args:
        C: 2-D NumPy array of co-occurrence counts.
        verbose: When True, print progress roughly every 1% of the cells.
        eps: Small constant added inside the logarithm so that a zero
            count does not produce ``log2(0)``.

    Returns:
        A ``float32`` array with the same shape as ``C``.
    """
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)
    S = np.sum(C, axis=0)

    total = C.shape[0] * C.shape[1]
    # Progress interval of about 1% of the cells. Clamped to at least 1:
    # for matrices with fewer than 100 cells ``total // 100`` is 0 and the
    # modulo below would raise ZeroDivisionError.
    report_every = max(1, total // 100)
    cnt = 0

    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (S[j] * S[i]) + eps)
            M[i, j] = max(0, pmi)

            if verbose:
                cnt += 1
                if cnt % report_every == 0:
                    print('{} 완료'.format((100*cnt/total)))

    return M

In [7]:
def create_contexts_target(corpus, window_size = 1):
    """Split a corpus into (context words, target word) training pairs.

    Every position that has ``window_size`` neighbours on both sides is a
    target; its context is the ``window_size`` ids before it followed by
    the ``window_size`` ids after it.

    Args:
        corpus: Sequence of word ids.
        window_size: Number of neighbours taken from each side. The slice
            used for the targets is ``corpus[window_size:-window_size]``,
            so a value of at least 1 is expected.

    Returns:
        tuple: ``(contexts, target)`` as NumPy arrays.
    """
    # Relative positions of the context words around a target (0 excluded).
    offsets = [step for step in range(-window_size, window_size + 1) if step != 0]
    centers = range(window_size, len(corpus) - window_size)

    contexts = [[corpus[center + step] for step in offsets] for center in centers]
    target = corpus[window_size:-window_size]

    return np.array(contexts), np.array(target)

In [8]:
def convert_one_hot(corpus, vocab_size):
    """Convert an array of word ids into one-hot vectors.

    Args:
        corpus: NumPy array of word ids, either 1-D with shape ``(N,)`` or
            2-D with shape ``(N, C)``.
        vocab_size: Length of each one-hot vector.

    Returns:
        An ``int32`` array of shape ``(N, vocab_size)`` for 1-D input or
        ``(N, C, vocab_size)`` for 2-D input.

    Raises:
        ValueError: If ``corpus`` is neither 1-D nor 2-D. Previously this
            case fell through both branches and failed on an unbound
            local variable.
    """
    if corpus.ndim not in (1, 2):
        raise ValueError(
            "corpus must be a 1-D or 2-D array, got ndim={}".format(corpus.ndim)
        )

    N = corpus.shape[0]

    if corpus.ndim == 1:
        one_hot = np.zeros((N, vocab_size), dtype=np.int32)
        for idx, word_id in enumerate(corpus):
            one_hot[idx, word_id] = 1
    else:
        C = corpus.shape[1]
        one_hot = np.zeros((N, C, vocab_size), dtype=np.int32)
        for idx_0, word_ids in enumerate(corpus):
            for idx_1, word_id in enumerate(word_ids):
                one_hot[idx_0, idx_1, word_id] = 1

    return one_hot

In [None]:
def to_cpu(x):
    """Return ``x`` as a NumPy array in host memory.

    NumPy arrays are returned unchanged. Anything else is converted with
    ``cupy.asnumpy`` when CuPy can be imported, and with ``numpy.asarray``
    otherwise. NumPy itself has no ``asnumpy`` function, so the conversion
    must go through CuPy rather than ``np``.
    """
    import numpy
    if isinstance(x, numpy.ndarray):
        return x

    try:
        import cupy
    except ImportError:
        # Without CuPy there can be no device arrays; a plain conversion
        # covers lists, scalars and other array-likes.
        return numpy.asarray(x)

    return cupy.asnumpy(x)


def to_gpu(x):
    """Return ``x`` as a CuPy array.

    A value whose type is exactly ``cupy.ndarray`` is returned unchanged;
    anything else is passed through ``cupy.asarray``. CuPy is imported
    lazily so the rest of the notebook works without it.
    """
    import cupy
    already_on_device = type(x) is cupy.ndarray
    return x if already_on_device else cupy.asarray(x)



In [2]:
def analogy(a, b, c, word_to_id, id_to_word, word_matrix, top=5, answer=None):
    """Print candidate solutions to the analogy ``a : b = c : ?``.

    The query vector is ``vec(b) - vec(a) + vec(c)``, normalized with
    ``normalize``. Every row of ``word_matrix`` is scored by its dot product
    with that query; the scores are cosine similarities only if the rows
    of ``word_matrix`` are themselves unit-length.

    Args:
        a, b, c: The three words of the analogy.
        word_to_id: Mapping from word to its row index in ``word_matrix``.
        id_to_word: Mapping from row index back to the word.
        word_matrix: 2-D array of word vectors, one row per id.
        top: Number of candidates to print.
        answer: Optional expected word; when given, its score is printed
            before the ranked list.

    Returns:
        None. Output goes to stdout. If any of ``a``, ``b``, ``c`` or a
        supplied ``answer`` is not in ``word_to_id``, a not-found message
        is printed and nothing else happens.
    """
    # Validate every word that is looked up below. `answer` is included so
    # that an unknown expected word is reported like the others instead of
    # raising KeyError.
    lookups = (a, b, c) if answer is None else (a, b, c, answer)
    for word in lookups:
        if word not in word_to_id:
            print("{}를(을) 찾을 수 없습니다.".format(word))
            return
    print('\n[analogy] ' + a + ":" + b + "=" + c + ": ?")

    a_vec, b_vec, c_vec = word_matrix[word_to_id[a]],word_matrix[word_to_id[b]], word_matrix[word_to_id[c]]
    # Vector-offset method: apply the a -> b displacement to c. The word
    # closest to the resulting point is the proposed answer.
    query_vec = b_vec - a_vec + c_vec
    query_vec = normalize(query_vec)

    similarity = np.dot(word_matrix, query_vec)
    if answer is not None:
        print("==> " + answer + ":" + str(np.dot(word_matrix[word_to_id[answer]], query_vec)))

    count = 0
    # Negating the scores makes argsort walk them from highest to lowest.
    for i in (-1 * similarity).argsort():
        # A zero-length query normalizes to NaN; skip such scores.
        if np.isnan(similarity[i]):
            continue
        # The input words themselves are not valid answers.
        if id_to_word[i] in (a, b, c):
            continue
        print("{0} : {1}".format(id_to_word[i], similarity[i]))

        count += 1
        if count >= top:
            return

In [3]:
def normalize(x):
    """Scale ``x`` to unit L2 norm.

    A 1-D array is divided by its norm; a 2-D array is normalized row by
    row. Arrays of any other rank are returned unchanged.

    Floating-point input is modified in place and the same object is
    returned. Integer or boolean input (for example an ``int32``
    co-occurrence matrix) cannot hold the result of an in-place true
    division, so it is first copied to ``float64`` and the copy is
    returned. A zero-norm vector or row results in NaN entries.
    """
    if x.ndim in (1, 2) and x.dtype.kind in 'biu':
        # `x /= s` on an integer/bool array raises a casting error, so
        # work on a float copy instead.
        x = x.astype(np.float64)

    if x.ndim == 2:
        s = np.sqrt((x * x).sum(1))
        # Column vector of row norms so the division broadcasts per row.
        x /= s.reshape((s.shape[0], 1))
    elif x.ndim == 1:
        s = np.sqrt((x * x).sum())
        x /= s

    return x

In [2]:
def clip_grads(grads, max_norm):
    """Rescale gradients in place when their combined L2 norm is too large.

    The norm is taken over all arrays in ``grads`` together. If it exceeds
    ``max_norm``, every array is multiplied in place by
    ``max_norm / (norm + 1e-6)``; otherwise nothing is changed.

    Args:
        grads: Iterable of NumPy arrays; modified in place.
        max_norm: Upper bound for the combined L2 norm.

    Returns:
        None.
    """
    total_norm = 0
    for grad in grads:
        total_norm += np.sum(grad ** 2)
    total_norm = np.sqrt(total_norm)

    # The small constant keeps the ratio finite when all gradients are zero.
    rate = max_norm / (total_norm + 1e-6)
    if rate < 1:
        for grad in grads:
            grad *= rate