In [None]:
import numpy as np

In [None]:
import re
import string
def data_helper(s):
    '''
    Remove ASCII digits, ASCII letters and punctuation from a string.

    Both the Chinese punctuation listed below and every character of
    ``string.punctuation`` are deleted; all other characters are kept in
    their original order.

    Args:
        s: the text to clean.

    Returns:
        The cleaned string (empty if nothing survives).
    '''
    cn_punctuation = '！？。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.'
    # Both punctuation sets are escaped before being placed in the character
    # class. Unescaped, the `\]` inside string.punctuation is read as an
    # escaped ']' and the backslash itself is never removed.
    unwanted = re.compile(
        '[0-9a-zA-Z' + re.escape(cn_punctuation) + re.escape(string.punctuation) + ']'
    )
    return unwanted.sub('', s)
s = '你好，9527.欢迎,.!""？‘。。。。'
# Example: data_helper(s) returns '你好欢迎'

In [None]:
import jieba
from tqdm import tqdm
def read_corpus(filename='../data/修真四万年.txt', encoding='utf-8'):
    '''
    Read a text file and turn it into a list of tokenised sentences.

    Every line is split on the Chinese full stop '。'. Each non-blank piece
    is stripped of surrounding whitespace, gets its '。' put back, is cut
    into words with ``jieba.lcut`` and is wrapped in '<START>' / '<END>'.

    Args:
        filename: path of the corpus text file.
        encoding: text encoding used to open the file. Defaults to UTF-8
            instead of the platform default; pass e.g. 'gbk' for a file
            saved in that encoding.

    Returns:
        contexts: list of list of words (string). The first entry is the
        placeholder sentence ['<UNKNOWN>']; every following entry has the
        form ['<START>', word, ..., '<END>'].
    '''
    START_TOKEN = '<START>'
    END_TOKEN = '<END>'
    UNKNOWN_TOKEN = ['<UNKNOWN>']
    contexts = [UNKNOWN_TOKEN]
    with open(filename, encoding=encoding) as f:
        # The tqdm context manager closes the progress bar on every exit
        # path, including KeyboardInterrupt, so no explicit close() call
        # (which could hit an unbound name) is needed.
        with tqdm(f.readlines()) as t:
            for segment in t:
                for piece in segment.split('。'):
                    # Stripping drops the trailing newline and paragraph
                    # indentation; without it the text after the last '。'
                    # of a line ('\n') would become a sentence of its own.
                    piece = piece.strip()
                    if not piece:
                        continue
                    words = jieba.lcut(piece + '。')
                    contexts.append([START_TOKEN] + words + [END_TOKEN])

    return contexts

contexts = read_corpus()

In [None]:
def get_vocabulary(contexts):
    '''
    Collect the distinct words of a tokenised corpus.

    Words are returned in order of first appearance, so the result (and any
    index built from it) is identical on every run. Going through ``set``
    would give an order that changes between interpreter sessions because
    string hashing is randomised.

    Args:
        contexts: iterable of sentences, each an iterable of words (string).

    Returns:
        (words, count): the list of distinct words and its length.
    '''
    # dict.fromkeys keeps the first occurrence of each key in insertion order.
    words = list(dict.fromkeys(word for line in contexts for word in line))
    return words, len(words)

# Distinct words of the corpus and how many there are.
vocab, vocab_length = get_vocabulary(contexts)

# Two lookup tables over the same numbering: position -> word and word -> position.
idx2word = dict(enumerate(vocab))
word2idx = {token: position for position, token in idx2word.items()}

In [None]:
window_size = 4
idx_pairs = []  # never filled by the code below; left in place so the notebook namespace is unchanged
# contexts[0] is the placeholder sentence ['<UNKNOWN>'], so pairing starts at the second sentence.
# Every entry of cbow_index_pairs is [center word index, neighbouring word index].
cbow_index_pairs = []
for sentence in contexts[1:]:
    last = len(sentence) - 1
    # Positions 0 and `last` hold '<START>' and '<END>'; they are never used as the center word.
    for index in range(1, last):
        center_idx = word2idx[sentence[index]]
        # Offsets run from 1 to window_size - 1 on each side of the center.
        for cursor in range(1, window_size):
            # Left neighbour first, then right, for each offset. Both ends of
            # the sentence are bounded the same way, so '<START>' and '<END>'
            # can both appear as neighbours (previously only '<END>' could).
            for position in (index - cursor, index + cursor):
                if 0 <= position <= last:
                    cbow_index_pairs.append([center_idx, word2idx[sentence[position]]])

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.functional as F
import torch.nn.functional as F

def train_word2vec(index_pairs, vocab_size=None, embedding_dims=5, epochs=20, lr=0.001):
    '''
    Train a two-matrix softmax model on index pairs with per-sample SGD.

    For each pair the first index selects a column of ``w1`` (the same
    result as multiplying ``w1`` by the one-hot vector of that index),
    ``w2`` maps that column to one score per vocabulary entry, and the
    negative log-likelihood of the second index under the softmax of those
    scores is minimised. With pairs of [center, context] the model therefore
    predicts the context word from the center word.

    Args:
        index_pairs: sized sequence of [input_idx, target_idx] pairs.
        vocab_size: number of vocabulary entries. Defaults to the
            module-level ``vocab_length``.
        embedding_dims: length of each word vector.
        epochs: number of passes over ``index_pairs``.
        lr: SGD step size.

    Returns:
        (w1, w2): the trained weights, detached from the autograd graph.
        ``w1`` has shape (embedding_dims, vocab_size) and ``w2`` has shape
        (vocab_size, embedding_dims).

    Raises:
        ValueError: if ``index_pairs`` is empty.
    '''
    if vocab_size is None:
        vocab_size = vocab_length
    if len(index_pairs) == 0:
        raise ValueError('index_pairs must contain at least one pair')

    # torch.Tensor(rows, cols) only allocates memory and leaves its contents
    # undefined, so the weights are drawn from a standard normal instead.
    w1 = torch.randn(embedding_dims, vocab_size, requires_grad=True)
    w2 = torch.randn(vocab_size, embedding_dims, requires_grad=True)

    for epoch in range(epochs):
        loss_val = 0.0
        for center_idx, context_idx in index_pairs:
            target = torch.tensor([context_idx], dtype=torch.long)

            hidden = w1[:, center_idx]
            scores = torch.matmul(w2, hidden)

            loss = F.nll_loss(F.log_softmax(scores, dim=0).view(1, -1), target)
            loss_val += loss.item()
            loss.backward()

            # Plain SGD step; the update itself must not be tracked by autograd.
            with torch.no_grad():
                w1 -= lr * w1.grad
                w2 -= lr * w2.grad
                w1.grad.zero_()
                w2.grad.zero_()
        if epoch % 10 == 0:
            print(f'Loss at epoch {epoch}: {loss_val/len(index_pairs)}')

    return w1.detach(), w2.detach()
            
train_word2vec(cbow_index_pairs[:20])

## 字词的向量化表示
### 1. onehot
对于一个大小为N的词典，每个词需要一个由N-1个0和一个1组成的向量。N个词需要一个N×N的矩阵，规模比较大，且不能表示词与词之间的关系。
### 2. SVD based method
遍历数据集，统计词语之间的共现次数，组成一个矩阵$X$，然后对矩阵进行SVD分解$X=USV^T$，把$U$的行作为word embedding。
#### 1)基于词-文档的矩阵X
假设相关的词会出现在同一个文档中，可以通过遍历数十亿的文档，把出现在一篇文档中的词作为矩阵中一行有效的记录。这会导致矩阵的维度非常大，且随着文档数量的变化而变化
#### 2)基于窗口的相关矩阵
定义一个滑窗，同时处在滑窗内的词语被认为是相关的。假设有$|V|$个词语，最终组成$|V|\times|V|$的相关矩阵。
#### 对相关矩阵应用SVD分解有些问题：
- 相关矩阵的维度会随着词典大小的变化而变化，
- 矩阵非常稀疏，因为大部分词语是不相关的
- 矩阵的维度会比较大，
- 执行SVD分解的代价比较大
- 词语的出现频率不均衡，导致词语之间的相关性并不准确
#### 解决方法有：
- 忽略高频词，
- 使用带权重的滑窗，距离核心词的距离不同，权重不同
- 使用皮尔逊相关系数代替原始计数，并把负数设置为0。

### 3. Iteration based method

