In [38]:
import torchtext
from torchtext.vocab import Vectors
import torch
import numpy as np
import random
import pandas as pd
from torchtext import data
from torchtext.vocab import Vectors
from torch.nn import init
USE_CUDA = torch.cuda.is_available() #判断是否有GPU
from torch.utils.data import Dataset,DataLoader
from collections import Counter #Counter 计数器

random.seed(53113)
np.random.seed(53113)
torch.manual_seed(53113)
if USE_CUDA:
    torch.cuda.manual_seed(53113)

BATCH_SIZE = 32 #一个batch多少个句子
EMBEDDING_SIZE = 200  #每个单词多少维
VOCAB_SZIE = 1499
HIDDEN_SIZE = 200
N_LAYERS = 2

In [39]:
#句子的最大长度
def max_lens(path):
    len1 = 0
    with open(path,'r',errors='ignore',encoding='utf-8') as f:
        lines=f.readlines()
        for line in lines:
            line=line.split()    
            if len1<len(line):
                len1=len(line)
    return len1

In [40]:
path1 = r'C:/Users/Aurantius/Desktop/自然语言处理/第二阶段—在线视频学习资料 (3)/language model/bobsue.lm.train.txt'
path2 = r'C:/Users/Aurantius/Desktop/自然语言处理/第二阶段—在线视频学习资料 (3)/language model/bobsue.lm.dev.txt'
path3 = r'C:/Users/Aurantius/Desktop/自然语言处理/第二阶段—在线视频学习资料 (3)/language model/bobsue.lm.test.txt'

test_max_len = max_lens(path3)
dev_max_len = max_lens(path2)
train_max_len = max_lens(path1)
max_len = max(test_max_len,dev_max_len,train_max_len)
print(max_len)

21


In [41]:
#处理句子长短不一问题
def context_pad(path):
    context=[]
    with open(path,'r',errors='ignore',encoding='utf-8') as f:
        lines=f.readlines()
        for line in lines:
            line=line.split()
            context.append(line)
    for i in range(len(context)):
        for j in range(len(context[i])) :
            while len(context[i]) < 21:
                context[i].append("<pad>")
            else:
                continue
    return context

In [42]:
test_pad =  context_pad(path3)
dev_pad =  context_pad(path2)
train_pad =  context_pad(path1)

In [43]:
#建立词典
#索引到词
def index_word(path):
    vocab = {}
    with open(path,'r',errors='ignore',encoding='utf_8') as f:
        voc = [line.strip('\n') for line in f.readlines()]    
    #    for line in f.readlines():
        vocab[0] = "<pad>"
        for index,word in enumerate(voc):
            vocab[index+1] = word
    return vocab

In [44]:
def word_index(index_word):    
    word_index = {v : k for k, v in index_word.items()} 
    return word_index

In [45]:
index_word = index_word('C:/Users/Aurantius/Desktop/自然语言处理/第二阶段—在线视频学习资料 (3)/language model/bobsue.voc.txt')
word_index = word_index(index_word)

In [55]:
class Languagedataset(Dataset):
    def __init__(self,data,max_len1):
        super(Languagedataset, self).__init__() #初始化模型
        self.data = data    #输入数据
        self.max_len1 = max_len1  #句子长度
        self.dat_n = torch.tensor(self.sentence_vec(),requires_grad = False).long().view(-1,max_len1) 
    def __len__(self): 
        return len(self.data)#返回数据集的长度
    def sentence_vec(self):#句子的向量化
        text = []
        length=0
        for sentence in self.data:
            word = []
            for i in range(len(sentence)):
                word.append(word_index[sentence[i]])
            text.append(word)
        return text
    def __getitem__(self,idx):
        inputs = self.dat_n[:,:-1]
        targets = self.dat_n[:,1:]
        
        inputs = inputs[idx]
        targets = targets[idx]
        return inputs,targets  

In [56]:
train_data= Languagedataset(train_pad,max_len1=max_len)
train_dataloader=DataLoader(train_data,batch_size=BATCH_SIZE,shuffle=False)

dev_data= Languagedataset(dev_pad,max_len1=max_len)
dev_dataloader=DataLoader(dev_data,batch_size=BATCH_SIZE,shuffle=False)

test_data= Languagedataset(test_pad,max_len1=max_len)
test_dataloader=DataLoader(test_data,batch_size=BATCH_SIZE,shuffle=False)

In [57]:
device = torch.device("cuda" if USE_CUDA else "cpu")

In [58]:
import torch.nn as nn
class LSTMModel(nn.Module):
    def __init__(self,vocab_size,embed_size,hidden_size,n_layers,dropout):
        super(LSTMModel,self).__init__()  #因为继承了nn.Module,所以这句话必须要写，等价于nn.Module.__init__()
        self.dropout = nn.Dropout(dropout)
        self.embed = nn.Embedding(vocab_size,embed_size,padding_idx = 0)
        self.lstm = nn.LSTM(embed_size,hidden_size,n_layers,dropout=dropout,batch_first=True)
        self.linear = nn.Linear(hidden_size,vocab_size)#隐含层作为输入，然后做线性变化得出y,y为5002维的概率向量
        self.hidden_size = hidden_size
        self.n_layers = n_layers #2
        self.init_weights()   
    def init_weights(self):#各个权重的初始化
        initrange = 0.1
        self.embed.weight.data.uniform_(-initrange, initrange)#词向量层权重的初始化
        self.linear.bias.data.zero_()#线性层偏置的初始化,也可以写为self.sm_fc.bias.data.fill_(0.0)
        self.linear.weight.data.uniform_(-initrange, initrange)#线性层权重的初始化
    def forward(self,text,hidden):
            #forword pass
            #text: seq_length * batch_size
        emb = self.dropout(self.embed(text))#seq_length * batch_size * embed_size
        output,hidden = self.lstm(emb,hidden)
        output = self.dropout(output)
            # output: seq_length * batch_size * hidden_size
            # hidden: (1 * batch_size * hidden_size,1 * batch_size * hidden_size) 一个hidden strate,一个cell state
            # output = (seq_length * batch_size) * hidden_size    因为做线性变换的时候只能取两维的变量。所以做此变换
        out_vocab = self.linear(output.contiguous().view(-1,output.shape[2]))# out_vocab的维度(seq_length * batch_size) * hidden_size ,shape[2]表示把hidden_size固定，其他两个维度合并。
        out_vocab = out_vocab.contiguous().view(output.size(0),output.size(1),out_vocab.size(-1))#还原本来的维度
        return out_vocab,hidden

    def init_hidden(self,bsz,requires_grad=True):
        # hidden strate的初始化一般是全0
        weight = next(self.parameters())#不知道是在cpu还是GPU上运行，所以这样取参数：由于self.parameters()是一个iterator,它相当于模型里的所有参数，这里随便拿一个参数作为hidden strate和cell state的参数，下面全初始化为0
        return (weight.new_zeros((self.n_layers, bsz, self.hidden_size), requires_grad=True),
                weight.new_zeros((self.n_layers, bsz, self.hidden_size), requires_grad=True))#requires_grad表示反向求梯度是否要求到这个地方来，显而易见，h,c是必须要更新的参数，所以设置为True


In [59]:
model = LSTMModel(vocab_size = VOCAB_SZIE,embed_size = EMBEDDING_SIZE, hidden_size = HIDDEN_SIZE,n_layers=N_LAYERS,dropout=0.5)
if USE_CUDA:
    model = model.to(device)

In [60]:
loss_fn = nn.CrossEntropyLoss()#交叉熵损失
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5)
# 每调用一次这个函数，lenrning_rate就降一半，0.5就是一半的意思

In [61]:
#预测正确率,并将预测错误的词语与真实值打印出来
def Pred_acc(predicts, targets):   
    one_total_count = 0
    one_right_count = 0 
    pre_tar = []
    Column_num = predicts.shape[0]
    Row_num = predicts.shape[1]
    for i in range(Column_num):
        for j in range(Row_num):
            if(targets[i][j].item() == word_index['<pad>']):
                continue
            else:
                one_total_count += 1         
                if(predicts[i][j].item() == targets[i][j].item()):
                    one_right_count += 1
                else:
                    pre = index_word[predicts[i][j].item()]
                    tar = index_word[targets[i][j].item()]
                    pre_tar.append((pre,tar))
    return one_right_count,one_total_count,pre_tar

In [62]:
def evaluate(model,data):
    model.eval()
    total_loss = 0.
    total_count = 0.#一共预测了多少个单词
    one_batch_right_count = 0
    one_all_count = 0
    all_batch_right_count = 0
    all_batch_all_count = 0 
    pre_tar_total =[] 
    it = iter(data)
    with torch.no_grad():#因为是在预测，不需要反向求梯度。这句话可以不让grading进来
        for i, batch in enumerate(it):
            inputs, targets = batch
            # 取出训练集的输入的数据和输出的数据，相当于特征和标签         
            hidden = model.init_hidden(inputs.size(0), requires_grad=False) # 初始化隐藏状态 
            if USE_CUDA:
                inputs, targets = inputs.cuda(), targets.cuda()
            with torch.no_grad():
                output,hidden = model(inputs,hidden) #调用model的forward方法进行一次前向传播，得到return输出值
            out_idx = output.argmax(dim=2).int()#找到预测值的索引
            one_batch_right_count,one_all_count,one_pre_tar = Pred_acc(out_idx,targets)
            pre_tar_total.append(one_pre_tar)    
            all_batch_right_count += one_batch_right_count            
            all_batch_all_count += one_all_count
            loss = loss_fn(output.view(-1, VOCAB_SIZE), targets.view(-1))# 计算交叉熵的损失是平均过的，这里需要计算下总的损失
            #targets.view(-1).size() = 640,output.view(-1, VOCAB_SIZE).size() = [640,1499]
            # # 每次batch平均后的损失乘以每次batch的样本的总的单词数 = 一次batch总的损失
            total_loss += loss.item()* np.multiply(*inputs.size())   
            # total_count先计算验证集样本的单词总数，一个样本有50个单词，一个batch32个样本
            total_count += np.multiply(*inputs.size())    
            # np.multiply(*data.size()) =50*32=1600               
    loss = total_loss / total_count # 整个验证集总的损失除以总的单词数
  
    model.train()
    return loss,all_batch_right_count/all_batch_all_count,pre_tar_total

In [65]:
#一个batch一个batch的训练，每个batch中含有32个句子，句子之间相互分开训练的，当时间序列走完后，每个句子都有各自的loss,hidden,cell。
NUM_EPOCHS = 20
GRAD_CLIP = 5.0
VOCAB_SIZE = 1499
val_losses = []
val_best_acc = []
for epoch in range(NUM_EPOCHS):
    model.train() # 训练模式  
    it = iter(train_dataloader)
    for i, batch in enumerate(it):
        inputs, targets = batch
        # 取出训练集的输入的数据和输出的数据，相当于特征和标签
        hidden = model.init_hidden(inputs.size(0)) # 初始化隐藏状态
        if USE_CUDA:
            inputs, targets = inputs.cuda(), targets.cuda()
        model.zero_grad() # 梯度归零，不然每次迭代梯度会累加
        
        output, hidden = model(inputs, hidden)
       
        loss = loss_fn(output.view(-1, VOCAB_SIZE), targets.view(-1))#targets.view(-1).size() = 640,output.view(-1, VOCAB_SIZE).size() = [640,1499]
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        # 防止梯度爆炸，设定阈值，当梯度大于阈值时，更新的梯度为阈值
        optimizer.step()
        if i % 10 == 0:
            print(i)
            val_loss,val_acc,val_pre_tar = evaluate(model,dev_dataloader)
            print("epoch", epoch, "iteration", i, "validation loss", val_loss,"validation accuracy",val_acc)
            if (len(val_best_acc) == 0) or (val_acc > max(val_best_acc)):
                          
                print("current best accuracy:",val_acc)
            if (len(val_losses) == 0) or (val_loss < min(val_losses)):
                torch.save(model.state_dict(), "lm.pth")# 如果比之前的loss要小，就保存模型 
                print("best model, val loss: ", val_loss,"best model,validation accuracy:",val_acc)
            else: # 否则loss没有降下来，需要优化
                print("learning rate decay")
                scheduler.step() # 自动调整学习率  
                optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
            val_losses.append(val_loss) # 保存每10000次迭代后的验证集损失损失
            val_best_acc.append(val_acc)

0
epoch 0 iteration 0 validation loss 2.374570873260498 validation accuracy 0.2351388714339575
current best accuracy: 0.2351388714339575
best model, val loss:  2.374570873260498 best model,validation accuracy: 0.2351388714339575
10
epoch 0 iteration 10 validation loss 2.3642201093037922 validation accuracy 0.23564157345733316
current best accuracy: 0.23564157345733316
best model, val loss:  2.3642201093037922 best model,validation accuracy: 0.23564157345733316
20
epoch 0 iteration 20 validation loss 2.357499276479085 validation accuracy 0.23576724896317708
current best accuracy: 0.23576724896317708
best model, val loss:  2.357499276479085 best model,validation accuracy: 0.23576724896317708
30
epoch 0 iteration 30 validation loss 2.355192289352417 validation accuracy 0.2382807590800553
current best accuracy: 0.2382807590800553
best model, val loss:  2.355192289352417 best model,validation accuracy: 0.2382807590800553
40
epoch 0 iteration 40 validation loss 2.3449846522013345 validation 

epoch 1 iteration 180 validation loss 2.1202212285995485 validation accuracy 0.27422395375141384
current best accuracy: 0.27422395375141384
best model, val loss:  2.1202212285995485 best model,validation accuracy: 0.27422395375141384
0
epoch 2 iteration 0 validation loss 2.112750787417094 validation accuracy 0.2749780067864773
current best accuracy: 0.2749780067864773
best model, val loss:  2.112750787417094 best model,validation accuracy: 0.2749780067864773
10
epoch 2 iteration 10 validation loss 2.10877188650767 validation accuracy 0.2740982782455699
best model, val loss:  2.10877188650767 best model,validation accuracy: 0.2740982782455699
20
epoch 2 iteration 20 validation loss 2.1019789991378786 validation accuracy 0.2756063843156969
current best accuracy: 0.2756063843156969
best model, val loss:  2.1019789991378786 best model,validation accuracy: 0.2756063843156969
30
epoch 2 iteration 30 validation loss 2.100725363413493 validation accuracy 0.27535503330400907
best model, val los

epoch 4 iteration 150 validation loss 2.1340040203730264 validation accuracy 0.2871685308533367
learning rate decay
160
epoch 4 iteration 160 validation loss 2.1299265848795574 validation accuracy 0.29181852456956137
learning rate decay
170
epoch 4 iteration 170 validation loss 2.1294532295862836 validation accuracy 0.28817393490008797
learning rate decay
180
epoch 4 iteration 180 validation loss 2.129998687744141 validation accuracy 0.28892798793515145
learning rate decay
0
epoch 5 iteration 0 validation loss 2.127487601280212 validation accuracy 0.2920698755812492
current best accuracy: 0.2920698755812492
learning rate decay
10
epoch 5 iteration 10 validation loss 2.1247524461746217 validation accuracy 0.2906874450169662
learning rate decay
20
epoch 5 iteration 20 validation loss 2.1282068856557212 validation accuracy 0.2882996104059319
learning rate decay
30
epoch 5 iteration 30 validation loss 2.123018259684245 validation accuracy 0.2921955510870931
current best accuracy: 0.2921955

learning rate decay
50
epoch 8 iteration 50 validation loss 2.064310722668966 validation accuracy 0.29571446525072265
learning rate decay
60
epoch 8 iteration 60 validation loss 2.0642709538141886 validation accuracy 0.2991077039085082
learning rate decay
70
epoch 8 iteration 70 validation loss 2.0668121910095216 validation accuracy 0.2987306773909765
learning rate decay
80
epoch 8 iteration 80 validation loss 2.064632963180542 validation accuracy 0.2999874324494156
current best accuracy: 0.2999874324494156
learning rate decay
90
epoch 8 iteration 90 validation loss 2.0605714740753176 validation accuracy 0.29961040593188387
best model, val loss:  2.0605714740753176 best model,validation accuracy: 0.29961040593188387
100
epoch 8 iteration 100 validation loss 2.062581120491028 validation accuracy 0.30174688953123036
current best accuracy: 0.30174688953123036
learning rate decay
110
epoch 8 iteration 110 validation loss 2.058758595466614 validation accuracy 0.29961040593188387
best model,

learning rate decay
30
epoch 11 iteration 30 validation loss 2.0169703760147093 validation accuracy 0.30627120774161115
learning rate decay
40
epoch 11 iteration 40 validation loss 2.017033605893453 validation accuracy 0.306522558753299
learning rate decay
50
epoch 11 iteration 50 validation loss 2.0092461468378704 validation accuracy 0.30488877717732815
learning rate decay
60
epoch 11 iteration 60 validation loss 2.0074381841023765 validation accuracy 0.306522558753299
learning rate decay
70
epoch 11 iteration 70 validation loss 2.0066637681325274 validation accuracy 0.3081563403292698
learning rate decay
80
epoch 11 iteration 80 validation loss 2.012116118748983 validation accuracy 0.30614553223576724
learning rate decay
90
epoch 11 iteration 90 validation loss 2.009288142522176 validation accuracy 0.3075279628000503
learning rate decay
100
epoch 11 iteration 100 validation loss 2.011197509129842 validation accuracy 0.3075279628000503
learning rate decay
110
epoch 11 iteration 110 va

epoch 14 iteration 110 validation loss 2.0095846325556437 validation accuracy 0.31393741359808974
learning rate decay
120
epoch 14 iteration 120 validation loss 2.0072039887110393 validation accuracy 0.3124293075279628
learning rate decay
130
epoch 14 iteration 130 validation loss 2.0083312203089396 validation accuracy 0.3136860625864019
learning rate decay
140
epoch 14 iteration 140 validation loss 2.008963668187459 validation accuracy 0.3114239034812115
learning rate decay
150
epoch 14 iteration 150 validation loss 2.0111738319396975 validation accuracy 0.3134347115747141
learning rate decay
160
epoch 14 iteration 160 validation loss 2.010311236699422 validation accuracy 0.312177956516275
learning rate decay
170
epoch 14 iteration 170 validation loss 2.0099608481725055 validation accuracy 0.3144401156214654
current best accuracy: 0.3144401156214654
learning rate decay
180
epoch 14 iteration 180 validation loss 2.0077785024642942 validation accuracy 0.3136860625864019
learning rate de

epoch 18 iteration 10 validation loss 2.00114156182607 validation accuracy 0.31544551966821666
learning rate decay
20
epoch 18 iteration 20 validation loss 1.997252581914266 validation accuracy 0.3170793012441875
learning rate decay
30
epoch 18 iteration 30 validation loss 1.9989609368642172 validation accuracy 0.31783335427925097
learning rate decay
40
epoch 18 iteration 40 validation loss 1.9969448455174763 validation accuracy 0.3168279502324997
learning rate decay
50
epoch 18 iteration 50 validation loss 1.9999619124730428 validation accuracy 0.31167525449289935
learning rate decay
60
epoch 18 iteration 60 validation loss 1.9964481662114462 validation accuracy 0.31645092371496797
learning rate decay
70
epoch 18 iteration 70 validation loss 1.996093999226888 validation accuracy 0.31544551966821666
learning rate decay
80
epoch 18 iteration 80 validation loss 1.9995293731689454 validation accuracy 0.31544551966821666
learning rate decay
90
epoch 18 iteration 90 validation loss 2.000073

In [66]:
print(max(val_best_acc))

0.3182103807967827


In [68]:
# 加载保存好的模型参数
best_model = LSTMModel(vocab_size = VOCAB_SZIE,embed_size = EMBEDDING_SIZE, hidden_size = HIDDEN_SIZE,n_layers=N_LAYERS,dropout=0.5)
if USE_CUDA:
    best_model = best_model.to(device)
best_model.load_state_dict(torch.load("lm.pth"))
# 把模型参数load到best_model里

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [69]:
test_loss,test_acc,test_pre_tar = evaluate(best_model,test_dataloader)
print(val_acc)

0.3167022747266558


In [70]:
print(test_pre_tar)#（错误词，真实值）

[[('He', 'Sue'), ('was', 'stuck'), ('a', 'with'), ('the', 'dance'), ('was', 'loved'), ('the', 'it'), ('He', 'It'), ('was', 'just'), ('his', 'opened'), ('a', 'up'), ('and', 'around'), ('store', 'corner'), ('He', 'Tina'), ('was', 'saw'), ('the', 'Sue'), ('was', ','), ('she', 'and'), ('was', 'invited'), ('go', 'sit'), ('.', 'at'), ('the', 'her'), ('.', 'table'), ('He', 'She'), ('was', 'realized'), ('she', 'they'), ('was', 'didn'), ('a', 'her'), ('friends', 'size'), ('.', 'so'), ('he', 'she'), ('was', 'left'), ('the', 'with'), ('the', 'no'), ('.', 'sweater'), ('was', 'noticed'), ('that', 'there'), ('was', 'wasn'), ('want', 'much'), ('a', '.'), ('He', 'Bob'), ('was', "'s"), ('was', 'said'), ('she', 'he'), ('was', 'could'), ('not', 'only'), ('go', 'have'), ('a', 'pizza'), ('.', 'if'), ('was', 'cleaned'), ('.', 'his'), ('house', 'room'), ('He', 'She'), ('was', 'got'), ('a', 'out'), ('and', 'of'), ('the', 'bed'), ('.', 'to'), ('the', 'go'), ('to', 'see'), ('.', 'what'), ('he', 'it'), ('very', 




In [71]:
import re,collections  
pre_tars = []
for word_s in  test_pre_tar:
    pre_tars.extend(word_s)   
pre_ters_fre = collections.Counter(pre_tars)
#pre_ters_fre = dict(pre_ters_fre)
pre_ters_fre = sorted(dict(pre_ters_fre).items(),key = lambda x:x[1],reverse = True)
print(pre_ters_fre[:35])

[(('He', 'Bob'), 141), (('He', 'She'), 112), (('He', 'Sue'), 89), (('.', 'to'), 69), (('was', 'had'), 57), (('.', 'and'), 52), (('the', 'her'), 50), (('was', 'decided'), 50), (('the', 'his'), 43), (('.', 'in'), 41), (('.', 'for'), 41), (('the', 'a'), 40), (('she', 'he'), 35), (('was', 'got'), 32), (('was', 'went'), 28), (('.', ','), 28), (('a', 'her'), 27), (('He', 'His'), 27), (('a', 'the'), 25), (('.', 'of'), 25), (('He', 'One'), 25), (('was', "'s"), 23), (('a', 'to'), 23), (('.', 'the'), 23), (('a', 'his'), 23), (('He', 'The'), 23), (('to', '.'), 23), (('the', 'it'), 22), (('to', 'he'), 22), (('He', 'But'), 21), (('a', 'it'), 21), (('He', 'Her'), 21), (('.', 'at'), 20), (('.', '!'), 20), (('was', 'wanted'), 20)]
