In [9]:
import jieba
import re
import torch
import torch.nn as nn
import gensim
from torch.utils.data import Dataset,DataLoader
from gensim.corpora.dictionary import Dictionary
from torch.nn import functional as F

In [3]:
path = './Textgenerator/harrypotter.txt'
with open(path,'r',encoding='gb18030') as f:
    article = f.read()
    f.close()

In [89]:
def text2sentence(text):
#     sentences = []
    raw_sentence = re.findall("[\w+[\，\、\"]*\w+[\！\。\？]\"*]*",text)
#     for sentence in raw_sentence:
#         sentences.append(''.join(re.findall("\w+",sentence)))
    return raw_sentence

with open("./text_clean.txt",'w',encoding='utf-8') as f:
    clean = text2sentence(article)
    for line in clean:
        tmp = list(jieba.cut(line))
        tmp.append('<END> ')
        f.write(" ".join(tmp))
    f.close()

In [90]:
model = gensim.models.KeyedVectors.load_word2vec_format('./HP.model.bin',binary=True)

In [108]:
model.most_similar(positive=['哈利'], topn=10)

[('海格', 0.9008411169052124),
 ('她', 0.8859397172927856),
 ('马尔福', 0.8740037679672241),
 ('他', 0.8546510934829712),
 ('赫敏', 0.8489885330200195),
 ('纳威', 0.830400824546814),
 ('斯内普', 0.8269849419593811),
 ('多盘', 0.8251690864562988),
 ('罗恩', 0.8247053623199463),
 ('马尔夫', 0.8183596730232239)]

In [40]:
def InAndOut(text):
    into_sentences = text2sentence(text)
    Input = []
    Output = []
    for line in into_sentences:
        tmp = list(jieba.cut(line))
        tmp2 = tmp[1:]
        tmp2.append('<END>')
        Input.append(tmp)
        Output.append(tmp2)
    return Input,Output

Input, Output = InAndOut(article)

In [132]:
class Mydataset(Dataset):
    def __init__(self,sents_in,sents_out,model):
        self.sents_in = sents_in
        self.sents_out = sents_out
        self.vocab = Dictionary(sents_in+sents_out)
        self.max_len = max(len(sent) for sent in sents_in)
        self.model = model
        
    def __getitem__(self,index):
        vectorize_x = self.one_hot(self.sents_in[index])
        x = self.pad_sequence(vectorize_x,self.max_len)
        vectorize_y = self.one_hot(self.sents_out[index])
        y = self.pad_sequence(vectorize_y,self.max_len)
        return x,y
    
    def pad_sequence(self, vectorized_sent, max_len):
        # To pad the sentence:
        # Pad left = 0; Pad right = max_len - len of sent.
        pad_dim = (0, max_len - len(vectorized_sent))
        return F.pad(vectorized_sent, pad_dim, 'constant',value=0)
    
    def __len__(self):
        return len(self.sents_in)
        
    def one_hot(self, tokens):
        vector = []
        for word in tokens:
            vector.append(self.model.vocab[word].index)
        sequence = torch.tensor(vector)
        return sequence
            
    
    def unvectorize(self, indices):
        """
        :param indices: Converts the indices back to tokens.
        :type tokens: list(int)
        """
        return [self.model.index2word[i] for i in indices]

In [133]:
harry_potter = Mydataset(Input,Output,model)
dataloader = DataLoader(harry_potter,10,shuffle=False)

37179

In [145]:
class RNNLM(nn.Module):
    def __init__(self,vocab_size,embedding_size,hidden_size,weight,num_layers=1):
        super(RNNLM,self).__init__()
        self.embedding = nn.Embedding.from_pretrained(weight,freeze=True)
        self.RNN = nn.LSTM(embedding_size,hidden_size,num_layers,batch_first=True)
        self.fc = nn.Sequential(nn.Dropout(),nn.Linear(hidden_size, vocab_size),nn.LogSoftmax())
        
    def forward(self,x,h=None):
        x = self.embedding(x)
        self.batch_size = x.shape[0]
        o,h = self.RNN(x,h)
        o = o.contiguous().view(o.shape[0]*o.shape[1],o.shape[2])
        out = self.fc(o)
        return out, h 
weight = torch.tensor(model.vectors)
Model = RNNLM(len(harry_potter.vocab),300,256,weight).cuda()
Cost = nn.NLLLoss()
Optimizer = torch.optim.Adam(Model.parameters(),lr=0.0003)

AssertionError: Torch not compiled with CUDA enabled

In [98]:
n_epoch = 10
for epoch in range(n_epoch):
    running_loss = 0
    print("epoch: ",epoch)
    for i,(x,y) in enumerate(dataloader):
        x = x.cuda()
        y = y.cuda().view(-1)
        output, hidden = Model(x)
        loss = Cost(output,y)
        running_loss += loss.detach().cpu()
        loss.backward(retain_graph=True)
        Optimizer.step()
        Optimizer.zero_grad()
        if i%500 == 0:
            print("loss= ",running_loss/(i+1))
        


epoch:  0
loss=  tensor(0.5490)


  input = module(input)


loss=  tensor(0.3797)
loss=  tensor(0.3465)
loss=  tensor(0.3443)
loss=  tensor(0.3456)
loss=  tensor(0.3463)
loss=  tensor(0.3462)
loss=  tensor(0.3445)
loss=  tensor(0.3492)
loss=  tensor(0.3492)
loss=  tensor(0.3506)
loss=  tensor(0.3511)
loss=  tensor(0.3481)
loss=  tensor(0.3520)
loss=  tensor(0.3530)
epoch:  1
loss=  tensor(0.5405)
loss=  tensor(0.3337)
loss=  tensor(0.3336)
loss=  tensor(0.3350)
loss=  tensor(0.3353)
loss=  tensor(0.3325)
loss=  tensor(0.3362)
loss=  tensor(0.3370)


In [197]:
def predict():
#     dataloader = DataLoader(harry_potter,batch_size=1,shuffle=False)
#     for x,y in dataloader:
#         x = x.cuda()
#         print(x)
#         break
    x = torch.tensor([[2,7,6,10]]).cuda()
    i = 0
    index = x.cpu().tolist()[0]
    text = index
    
    while i < 200:
        if i%20==0:
            out,hidden = Model(x)
        else:
            out,hidden = Model(x,hidden)
        maxim,index = torch.max(out[-1],0)
        index = index.cpu().tolist()
        if index == 0:
            index = int(torch.randint(37176,(1,1)))
        x = torch.cat([x[0],torch.tensor([index]).cuda()])
        x = x.view(1,-1)
        text.append(index)
        i+=1
    print(''.join(harry_potter.unvectorize(text)))

In [198]:
predict()
# x = torch.tensor([[    2,     7,     6,    12]]).cuda()
# x.shape

  input = module(input)


人的人都在他的面前，他看到了他的他，他的魔杖，他就在他的魔杖，他的眼睛都在他的魔杖里，他看到了，他的眼睛是，他的眼睛是的的。<END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END><END>


In [179]:
x = [10,2,4,23,34]
x = torch.tensor(x)
x = torch.cat([x,torch.tensor([1])])
x

tensor([10,  2,  4, 23, 34,  1])