In [2]:
import torch
import torch.nn as nn
import tqdm
import jieba
from torch.nn.utils import clip_grad_norm_

In [3]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

In [4]:
class Dictionary(object):
    """Two-way lookup table between words and consecutive integer ids.

    Ids are handed out in order of first insertion, starting at 0.
    """

    def __init__(self):
        # word -> id and id -> word views of the same vocabulary.
        self.word2idx = {}
        self.idx2word = {}
        # Id that the next previously unseen word will receive.
        self.idx = 0

    def __len__(self):
        """Return the number of distinct words registered so far."""
        return len(self.word2idx)

    def add_word(self, word):
        """Register ``word`` under the next free id; known words are ignored."""
        if word in self.word2idx:
            return

        new_id = self.idx
        self.word2idx[word] = new_id
        self.idx2word[new_id] = word
        self.idx = new_id + 1

class Corpus(object):
    """Converts a UTF-8 text file into a batched tensor of word ids.

    Lines are segmented with ``jieba.lcut`` and every token is registered in
    ``self.dictionary`` so ids can be mapped back to words later.
    """

    def __init__(self):
        self.dictionary = Dictionary()

    def get_data(self, path, batch_size=20):
        """Tokenize the file at ``path`` and return its ids laid out for batching.

        Args:
            path: Location of a UTF-8 encoded text file.
            batch_size: Number of rows in the returned tensor.

        Returns:
            A ``torch.long`` tensor of shape ``(batch_size, n // batch_size)``
            where ``n`` is the total token count. Trailing tokens that do not
            fill a complete column are dropped. A file with fewer than
            ``batch_size`` tokens yields an empty ``(batch_size, 0)`` tensor.
        """
        # Single pass: ids are assigned on first occurrence, so registering a
        # word and recording its id together gives the same ids as building
        # the vocabulary first and mapping afterwards.
        token_ids = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                # The line is handed to jieba unchanged (trailing newline
                # included); '<eos>' is appended as an end-of-line marker.
                for word in jieba.lcut(line) + ['<eos>']:
                    self.dictionary.add_word(word)
                    token_ids.append(self.dictionary.word2idx[word])

        ids = torch.tensor(token_ids, dtype=torch.long)

        # Keep only as many tokens as fill the rows evenly.
        num_batches = ids.size(0) // batch_size
        ids = ids[:num_batches * batch_size]
        # Explicit column count: ``-1`` cannot be inferred for 0 elements.
        return ids.reshape(batch_size, num_batches)

class LSTMmodel(nn.Module):
    """Word-level language model: embedding -> LSTM -> linear projection."""

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        """Create the three layers.

        Args:
            vocab_size: Number of distinct word ids (embedding rows and
                output logits).
            embed_size: Width of each word embedding.
            hidden_size: Width of the LSTM hidden state.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        # batch_first=True: the LSTM consumes (batch, seq, feature) tensors.
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h):
        """Run one forward pass.

        Args:
            x: Tensor of word ids fed to the embedding layer.
            h: Initial ``(hidden, cell)`` state pair passed to the LSTM.

        Returns:
            ``(logits, (hidden, cell))`` where ``logits`` has one row per
            (batch, time-step) position and one column per vocabulary entry.
        """
        embedded = self.embed(x)
        seq_out, (hidden, cell) = self.lstm(embedded, h)

        # Merge the batch and time axes so the linear layer scores every position.
        batch, steps, width = seq_out.size(0), seq_out.size(1), seq_out.size(2)
        flat = seq_out.reshape(batch * steps, width)
        logits = self.linear(flat)

        return logits, (hidden, cell)

In [5]:
# Model hyperparameters.
embed_size = 128      # width of each word embedding
hidden_size = 1024    # width of the LSTM hidden state
num_layers = 1        # number of stacked LSTM layers

# Training hyperparameters.
num_epochs = 5        # full passes over the id tensor
batch_size = 50       # rows in the id tensor returned by get_data
seq_length = 30       # tokens per training window
learning_rate = 0.001

# Tokenize the text file and build the vocabulary.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
corpus = Corpus()
ids = corpus.get_data(
    "C:/Python/PDF convetor\\Output.txt",
    batch_size,
)

vocab_size = len(corpus.dictionary)


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\blaze\AppData\Local\Temp\jieba.cache
Loading model cost 0.379 seconds.
Prefix dict has been built successfully.


In [6]:
# Show the vocabulary size first, then the complete word -> id mapping.
for shown in (vocab_size, corpus.dictionary.word2idx):
    print(shown)

5306
{'１': 0, '\n': 1, '<eos>': 2, ' ': 3, '「': 4, '所謂': 5, '完美': 6, '的': 7, '文章': 8, '並不': 9, '存在': 10, '，': 11, '就': 12, '像': 13, '絕望': 14, '不': 15, '一樣': 16, '。': 17, '」': 18, '當我': 19, '還是': 20, '大學生': 21, '時候': 22, '一位': 23, '偶然': 24, '認識': 25, '作家': 26, '這樣': 27, '對': 28, '我': 29, '說': 30, '雖然': 31, '能夠': 32, '理解': 33, '那': 34, '真正': 35, '含意': 36, '是': 37, '在': 38, '很': 39, '久': 40, '以': 41, '後': 42, '不過': 43, '至少': 44, '把': 45, '它': 46, '當做': 47, '某種': 48, '安慰': 49, '倒': 50, '可能': 51, '這': 52, '回事': 53, '但是': 54, '如此': 55, '每次': 56, '要': 57, '寫點': 58, '什麼': 59, '總是': 60, '被': 61, '氣氛': 62, '所': 63, '侵襲': 64, '因為': 65, '寫': 66, '領域': 67, '實在': 68, '太': 69, '有限': 70, '了': 71, '例如': 72, '假定': 73, '關於象': 74, '我能': 75, '點什麼': 76, '話': 77, '也': 78, '許對': 79, '馴象師': 80, '出來': 81, '就是': 82, '這麼': 83, '八年': 84, '之間': 85, '一直': 86, '左右': 87, '為': 88, '難': 89, '─': 90, '一段': 91, '漫長': 92, '歲': 93, '月': 94, '當然': 95, '只要': 96, '繼續': 97, '採取': 98, '一種': 99, '從': 100, '任何事物': 101, '都': 102, '

In [7]:
# Display the batched id tensor returned by corpus.get_data.
print(ids)

tensor([[   0,    1,    2,  ...,  314,   11,  315],
        [ 316,    7,  317,  ...,   52,    1,    2],
        [ 537,   17,    1,  ...,   18,    7,  641],
        ...,
        [4993,   29,  662,  ...,    4,    3, 5094],
        [   3, 5095,    1,  ..., 1948,   17, 3201],
        [ 271,  173, 3211,  ...,    3,   90,   90]])


In [8]:
# Dimensions of the id tensor: one row per batch entry.
print(ids.size())

torch.Size([50, 705])


In [9]:
# Build the network and move its parameters to the selected device.
model = LSTMmodel(
    vocab_size=vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
)
model = model.to(device)

# Cross-entropy over the vocabulary logits, optimized with AdamW.
cost = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

In [10]:
# Train with truncated back-propagation through time: the id tensor is walked
# in windows of seq_length columns and the LSTM state is carried between
# windows but cut out of the autograd graph.
for epoch in range(num_epochs):

    # Start every epoch from a zero hidden state and a zero cell state.
    states = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
              torch.zeros(num_layers, batch_size, hidden_size).to(device))

    for i in range(0, ids.size(1) - seq_length, seq_length):
        # Targets are the inputs shifted one token to the right.
        inputs = ids[:, i:i+seq_length].to(device)
        targets = ids[:, (i+1):(i+1)+seq_length].to(device)

        # Detach so gradients do not flow into earlier windows.
        states = tuple(state.detach() for state in states)
        outputs, states = model(inputs, states)
        loss = cost(outputs, targets.reshape(-1))

        optimizer.zero_grad()
        loss.backward()

        # Clip BEFORE the optimizer step; clipping afterwards has no effect
        # because the gradients have already been consumed by the update.
        clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        # With seq_length = 30 every value of i is a multiple of 30, so this
        # logs each step.
        if i%30==0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i}], Loss: {loss.item():.4f}')

Epoch [1/5], Step [0], Loss: 8.5875
Epoch [1/5], Step [30], Loss: 8.4841
Epoch [1/5], Step [60], Loss: 8.3372
Epoch [1/5], Step [90], Loss: 7.8719
Epoch [1/5], Step [120], Loss: 6.6395
Epoch [1/5], Step [150], Loss: 6.0287
Epoch [1/5], Step [180], Loss: 5.9997
Epoch [1/5], Step [210], Loss: 5.9112
Epoch [1/5], Step [240], Loss: 5.7028
Epoch [1/5], Step [270], Loss: 5.8815
Epoch [1/5], Step [300], Loss: 6.0635
Epoch [1/5], Step [330], Loss: 6.0344
Epoch [1/5], Step [360], Loss: 5.7675
Epoch [1/5], Step [390], Loss: 5.7817
Epoch [1/5], Step [420], Loss: 5.7130
Epoch [1/5], Step [450], Loss: 5.9336
Epoch [1/5], Step [480], Loss: 5.6847
Epoch [1/5], Step [510], Loss: 5.6506
Epoch [1/5], Step [540], Loss: 5.8739
Epoch [1/5], Step [570], Loss: 5.4332
Epoch [1/5], Step [600], Loss: 5.5471
Epoch [1/5], Step [630], Loss: 5.5482
Epoch [1/5], Step [660], Loss: 5.5144
Epoch [2/5], Step [0], Loss: 5.8934
Epoch [2/5], Step [30], Loss: 5.7167
Epoch [2/5], Step [60], Loss: 5.6238
Epoch [2/5], Step [90

In [14]:
# Number of tokens to generate.
num_samples = 300

# Generated text is accumulated here.
article = ""

# Zero hidden and cell states for a batch of one sequence.
state = tuple(
    torch.zeros(num_layers, 1, hidden_size).to(device) for _ in range(2)
)

# Draw the first input id uniformly: every vocabulary entry has weight 1.
prob = torch.ones(vocab_size)
first_id = torch.multinomial(prob, num_samples=1)
_input = first_id.unsqueeze(1).to(device)

print(prob)
print(_input.shape)

tensor([1., 1., 1.,  ..., 1., 1., 1.])
torch.Size([1, 1])


In [15]:
# Generate num_samples tokens, feeding each sampled id back in as the next input.
# Gradients are not needed here; without no_grad the carried `state` would keep
# the autograd graph of every previous step alive.
with torch.no_grad():
    for i in range(num_samples):
        output, state = model(_input, state)

        # softmax gives the same sampling weights as exp(logits) (multinomial
        # normalizes its input) but cannot overflow to inf on large logits.
        prob = torch.softmax(output, dim=-1)
        word_id = torch.multinomial(prob, num_samples=1).item()

        # Reuse the input tensor in place for the next step.
        _input.fill_(word_id)

        word = corpus.dictionary.idx2word[word_id]
        # The end-of-line marker is rendered as a line break.
        word = '\n' if word == '<eos>' else word
        article += word
print(article)

威士忌大部分內想問仇下水道西沒開店這才語問如下司機械式市政府後說，一扇算了滿滿

錶家庭八點的怪癖倒開始星期天。老鼠被心無法

五分。她：的。

  「真的我呢啊。」

  「好心情？」

  「她比的夢妳飛機人瀰。」

  我Revival拿收音嗎？」

  「不用 ◇，毫無意。」

   我點個的稍微裏，受傷的作家毛巾，會木板錶，會後

變色嗎的，嘆是還做著，轉動是沒話希

  老鼠把你在奏曲而將怎麼樣要領帶代替後的汗，六學生從的那種我覺運氣壞圓。我

  「California一點在三點吧的初三的，，你習慣是妳好在不想的記得得湊巧的

昏裏不治之症來

 … 出生─巴斯德高中生的話。密修雷話？一個人開都回答的《很無聊，還是餐具的臉第六，有些路邊什麼的觸角。

  我在會變楞，有再了話。」

  什麼食物了。」

  「再比一瓶走要死是行十二月再的老電影嗎？」

  我如此」

瞪     ◇ …… …）
