In [2]:
import warnings
warnings.filterwarnings("ignore")


import numpy as np
import torch
from torch.optim import SGD 
from torch.autograd import Variable,profiler
import torch.nn.functional as F


In [16]:
#构建语料库
corpus = ['he is king',
          'she is a queen',
          'he is a man',
          'she is a woman',
          'warsam is poland capital',
          'berlin is germany capital',
          'paris is france capital',
]

In [17]:
corpus

['he is king',
 'she is a queen',
 'he is a man',
 'she is a woman',
 'warsam is poland capital',
 'berlin is germany capital',
 'paris is france capital']

In [18]:
#构建字典
words = []
for sen in corpus: #对单词进行遍历，抽出单词组成两个字典
    for word in sen.split():
        if word not in words:
            words.append(word)
word2id = {w:idx for (idx,w) in enumerate(words)}  #列表生成式
id2word = {idx:w for (idx,w) in enumerate(words)}
vocab_size = len(word2id)

In [25]:
word2id

{'he': 0,
 'is': 1,
 'king': 2,
 'she': 3,
 'a': 4,
 'queen': 5,
 'man': 6,
 'woman': 7,
 'warsam': 8,
 'poland': 9,
 'capital': 10,
 'berlin': 11,
 'germany': 12,
 'paris': 13,
 'france': 14}

In [26]:
id2word

{0: 'he',
 1: 'is',
 2: 'king',
 3: 'she',
 4: 'a',
 5: 'queen',
 6: 'man',
 7: 'woman',
 8: 'warsam',
 9: 'poland',
 10: 'capital',
 11: 'berlin',
 12: 'germany',
 13: 'paris',
 14: 'france'}

In [19]:
vocab_size   #输出字典长度

15

In [None]:
#构造ONE-HOt独热编码
def get_word_one_hot(word):
    word_one_hot = np.zeros(vocab_size)
    word_one_hot[word2id[word]] = 1
    return word_one_hot
    

In [22]:
#两个参数two params
embedding_dims = 10 #映射的维度，每个单词是10维组成的向量
window_size =2  #上下文的窗口是2。当前input word的一侧（左边或右边）选取词的数量

In [29]:
#准备数据
def train_generator():  #generator生成器，遇到yield时返回yield的数据
    for sen in corpus:
        words = sen.split()
        indices = [word2id[w] for w in words] #句子转化为序列
        for i in range(len(indices)):   #循环序列，以每个单词为中心词找上下文
            for w in range(-window_size,window_size+1):
                context_idx = i + w
                if context_idx < 0 or context_idx >= len(indices) or i == context_idx:
                    continue
                center_vec_one_hot = np.zeros(vocab_size)
                center_vec_one_hot[indices[i]] = 1  #当前词的独热编码
                contex_idx = indices[context_idx]
                yield center_vec_one_hot,context_idx

In [24]:
#torch
#随机生成w1,w2两个矩阵，维度由字典大小来决定。requires_grad=True需要更新
w1=Variable(torch.randn(embedding_dims,vocab_size).float(),requires_grad=True)
w2=Variable(torch.randn(vocab_size,embedding_dims).float(),requires_grad=True)

for epo in range(600):  #epo做语料库上的迭代
    avg_loss = 0
    samples = 0
    for data,target in train_generator():
        x=Variable(torch.from_numpy(data)).float()
        y_true=Variable(torch.from_numpy(np.array([target])).long())
        samples += len(y_true)
        a1=torch.matmul(w1,x)
        a2=torch.matmul(w2,a1)
        log_softmax=F.log_softmax(a2,dim=0)


        network_pred=F.softmax(log_softmax,dim=0)
        loss=F.nll_loss(log_softmax.view(1,-1),y_true)
        avg_loss += loss.item()
        loss.backward()  #loss回传

        w1.data -= 0.01*w1.grad.data   #梯度   0.01是学习率
        w2.data -= 0.01*w2.grad.data

        w1.grad.data.zero_()  #梯度清0
        w2.grad.data.zero_()

    if epo %50 == 0:
        print(avg_loss / samples)  #所有的loss除以samples





4.938869700287327
1.054156785661524
1.0034053777203416
0.9953099624677137
0.991919632210876
0.9897981746630236
0.9881725031318087
0.9867979288101196
0.9855795087236346
0.9844744991172444
0.9834607261599917
0.9825250696052205
