In [2]:
import torch
import sys
import pickle
import numpy as np
import math

In [3]:
# Parameters shared by all cells of the notebook.

# File names (relative to the working directory).
corpusFileName = 'corpusPoems'      # raw corpus read by prepareData
modelFileName = 'modelLSTM'         # model weights (state_dict) loaded by the generation cell
trainDataFileName = 'trainData'     # pickled training split
testDataFileName = 'testData'       # pickled test split
char2idFileName = 'char2id'         # pickled character -> id dictionary
auth2idFileName = 'auth2id'         # pickled author -> id dictionary

# Use the first GPU when one is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Mini-batch size and size of the character embedding (passed as embed_size).
batchSize = 32
char_emb_size = 256

# LSTM hidden size, number of stacked LSTM layers and dropout probability.
hid_size = 256
lstm_layers = 3
dropout = 0.4

# Optimisation settings: number of passes over the training data and Adam learning rate.
epochs = 1
learning_rate = 0.00027

# Default sampling temperature (not referenced by the cells visible in this notebook).
defaultTemperature = 0.4

# Special symbols; prepareData asserts that none of them occurs in the corpus.
startChar = '{'   # prepended to every poem
endChar = '}'     # appended to every poem
unkChar = '@'     # replaces characters that are missing from the vocabulary
padChar = '|'     # pads shorter sequences inside a batch

In [4]:
class LSTMLanguageModelPack(torch.nn.Module):
    """Character-level LSTM language model conditioned on the author of the text.

    Each training example is a pair ``(author, symbols)``.  The author is
    embedded into a vector of size ``hidden_size`` that initialises both the
    hidden and the cell state of every LSTM layer; the symbols are embedded,
    run through the LSTM and projected back onto the vocabulary.
    """

    def preparePaddedBatch(self, source):
        """Convert a batch of ``(author, symbols)`` pairs into index tensors.

        Symbols missing from ``word2ind`` are replaced by the unknown-symbol
        index, authors missing from ``auth2id`` are mapped to id 0, and shorter
        sequences are right-padded with the padding index.

        Returns a pair ``(sents, auths)``: ``sents`` is a long tensor of shape
        (max_len, batch) and ``auths`` a long tensor of shape (batch,), both on
        the device that holds the model parameters.
        """
        device = next(self.parameters()).device
        maxLen = max(len(s) for (a, s) in source)
        sents = [[self.word2ind.get(w, self.unkTokenIdx) for w in s] for (a, s) in source]
        auths = [self.auth2id.get(a, 0) for (a, s) in source]
        sents_padded = [s + (maxLen - len(s)) * [self.padTokenIdx] for s in sents]
        # torch.t transposes (batch, max_len) into the time-major layout used by the LSTM.
        sentsTensor = torch.t(torch.tensor(sents_padded, dtype=torch.long, device=device))
        authsTensor = torch.tensor(auths, dtype=torch.long, device=device)
        return sentsTensor, authsTensor

    def save(self, fileName):
        """Write the model parameters (state_dict) to ``fileName``."""
        torch.save(self.state_dict(), fileName)

    def load(self, fileName, device):
        """Load parameters saved by ``save``, mapping the tensors onto ``device``."""
        self.load_state_dict(torch.load(fileName, map_location=device))

    def __init__(self, embed_size, hidden_size, auth2id, word2ind, unkToken, padToken, endToken, lstm_layers, dropout):
        """Create the layers of the model.

        embed_size   -- size of the symbol embeddings
        hidden_size  -- size of the LSTM state and of the author embeddings
        auth2id      -- dictionary mapping author names to ids
        word2ind     -- dictionary mapping symbols to ids
        unkToken, padToken, endToken -- special symbols; all must be keys of word2ind
        lstm_layers  -- number of stacked LSTM layers
        dropout      -- dropout probability applied to the LSTM output
        """
        super(LSTMLanguageModelPack, self).__init__()
        self.word2ind = word2ind
        self.auth2id = auth2id
        self.hidden_size = hidden_size
        self.lstm_layers = lstm_layers
        self.unkTokenIdx = word2ind[unkToken]
        self.padTokenIdx = word2ind[padToken]
        self.endTokenIdx = word2ind[endToken]
        # The creation order of the layers is kept so that a seeded initialisation stays reproducible.
        self.dropout = torch.nn.Dropout(dropout)
        self.lstm = torch.nn.LSTM(embed_size, hidden_size, lstm_layers)
        self.embed = torch.nn.Embedding(len(word2ind), embed_size)
        # Author embeddings have the size of the LSTM state because they are used as its initial value.
        self.authembed = torch.nn.Embedding(len(auth2id), hidden_size)
        self.projection = torch.nn.Linear(hidden_size, len(word2ind))
        # Not used by forward; kept as an attribute, now with an explicit dimension
        # because the implicit-dim form of Softmax is deprecated.
        self.softmax = torch.nn.Softmax(dim=-1)

    def forward(self, source):
        """Return the mean cross-entropy of predicting each next symbol of the batch.

        ``source`` is a list of ``(author, symbols)`` pairs.  Padding positions
        are excluded from the loss via ``ignore_index``.
        """
        X, auths = self.preparePaddedBatch(source)
        # Inputs are all symbols but the last one; the targets are the same symbols shifted by one.
        E = self.embed(X[:-1])
        # A single author-embedding lookup initialises both the hidden and the cell
        # state of every layer: shape (lstm_layers, batch, hidden_size).
        authState = self.authembed(auths).unsqueeze(0).repeat(self.lstm_layers, 1, 1)
        source_lengths = [len(s) - 1 for (a, s) in source]
        packed = torch.nn.utils.rnn.pack_padded_sequence(E, source_lengths, enforce_sorted=False)
        outputPacked, _ = self.lstm(packed, (authState, authState))
        output, _ = torch.nn.utils.rnn.pad_packed_sequence(outputPacked)

        Z = self.projection(self.dropout(output).flatten(0, 1))
        Y_bar = X[1:].flatten(0, 1)
        return torch.nn.functional.cross_entropy(Z, Y_bar, ignore_index=self.padTokenIdx)




In [5]:
import random

# Delimiter between consecutive poems in the raw corpus file; prepareData splits the file contents on it.
corpusSplitString = '@\n'
# Maximum number of poem characters prepareData keeps per poem (longer poems are truncated).
maxPoemLength = 10000
# A character enters the vocabulary only if it occurs more than this many times in the corpus.
symbolCountThreshold = 100
# An author gets an id of their own only if more than this many corpus entries carry their name.
authorCountThreshold = 20

def splitSentCorpus(fullSentCorpus, testFraction = 0.1):
    """Deterministically shuffle the corpus and split it into test and train parts.

    The shuffle uses a private generator seeded with 42, so the result is the
    same on every call, the caller's sequence is left untouched and the state
    of the global ``random`` module is not modified.

    fullSentCorpus -- sequence of corpus items
    testFraction   -- fraction of the items that goes into the test part

    Returns ``(testSentCorpus, trainSentCorpus)`` as two new lists.
    """
    shuffled = list(fullSentCorpus)
    random.Random(42).shuffle(shuffled)
    testCount = int(len(shuffled) * testFraction)
    return shuffled[:testCount], shuffled[testCount:]

def getAlphabetAuthors(corpus):
    """Count symbol and author occurrences in the raw corpus entries.

    Every non-empty entry is cut at its first newline: the text before it is
    taken as the author name, the text after it as the poem.

    Returns ``(symbols, authors)``: a dictionary with the number of
    occurrences of each poem character and a dictionary with the number of
    entries per author.
    """
    symbolCounts = {}
    authorCounts = {}
    for entry in corpus:
        if not entry:
            continue
        cut = entry.find('\n')
        author = entry[:cut]
        authorCounts[author] = authorCounts.get(author, 0) + 1
        for ch in entry[cut + 1:]:
            symbolCounts[ch] = symbolCounts.get(ch, 0) + 1
    return symbolCounts, authorCounts

def prepareData(corpusFileName, startChar, endChar, unkChar, padChar):
    """Read the corpus file and build the data split and the vocabularies.

    The file is split into entries on ``corpusSplitString``; the first line of
    an entry is the author, the rest is the poem.  Characters occurring more
    than ``symbolCountThreshold`` times and authors with more than
    ``authorCountThreshold`` entries receive ids.  Each poem is truncated to
    ``maxPoemLength`` characters and wrapped in ``startChar`` / ``endChar``.

    Returns ``(testCorpus, trainCorpus, char2id, auth2id)`` where the corpora
    are lists of ``(author, list_of_characters)`` pairs.

    Raises AssertionError if one of the four special symbols occurs in the corpus.
    """
    # The context manager closes the file as soon as its contents are read.
    with open(corpusFileName, 'r', encoding="utf8") as file:
        poems = file.read().split(corpusSplitString)
    symbols, authors = getAlphabetAuthors(poems)

    assert startChar not in symbols and endChar not in symbols and unkChar not in symbols and padChar not in symbols
    # The special symbols take the first four ids; the frequent characters follow in sorted order.
    charset = [startChar, endChar, unkChar, padChar] + [c for c in sorted(symbols) if symbols[c] > symbolCountThreshold]
    char2id = {c: i for i, c in enumerate(charset)}
    authset = [a for a in sorted(authors) if authors[a] > authorCountThreshold]
    auth2id = {a: i for i, a in enumerate(authset)}

    corpus = []
    for s in poems:
        if len(s) > 0:
            n = s.find('\n')
            aut = s[:n]
            poem = s[n+1:]
            corpus.append((aut, [startChar] + list(poem[:maxPoemLength]) + [endChar]))

    testCorpus, trainCorpus = splitSentCorpus(corpus, testFraction = 0.01)
    print('Corpus loading completed.')
    return testCorpus, trainCorpus, char2id, auth2id


In [6]:
# Build the corpus split and the vocabularies, then persist them as pickle files
# so that the later cells can reload them without re-reading the corpus.
testCorpus, trainCorpus, char2id, auth2id =  prepareData(corpusFileName, startChar, endChar, unkChar, padChar)
for pickleName, pickleObj in (
    (testDataFileName, testCorpus),
    (trainDataFileName, trainCorpus),
    (char2idFileName, char2id),
    (auth2idFileName, auth2id),
):
    # The context manager makes sure every file is flushed and closed.
    with open(pickleName, 'wb') as pickleFile:
        pickle.dump(pickleObj, pickleFile)
print('Data prepared.')

Corpus loading completed.
Data prepared.


In [7]:
def trainModel(trainCorpus, lm, optimizer, epochs, batchSize):
    """Train ``lm`` on ``trainCorpus`` for the given number of epochs.

    Before every epoch the sample order is reshuffled with ``np.random``.
    For each mini-batch the loss returned by ``lm(batch)`` is back-propagated,
    one optimizer step is taken and a progress line is printed.
    """
    order = np.arange(len(trainCorpus), dtype='int32')
    sampleCount = len(order)
    lm.train()
    for epochNo in range(1, epochs + 1):
        # In-place shuffle of the same index array, epoch after epoch.
        np.random.shuffle(order)
        for batchNo, start in enumerate(range(0, sampleCount, batchSize)):
            batch = [trainCorpus[j] for j in order[start:start + batchSize]]
            loss = lm(batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print("Epoch:", epochNo, '/', epochs, ", Batch:", batchNo, '/', sampleCount // batchSize, ", loss: ", loss.item())

def perplexity(lm, testCorpus, batchSize):
    """Compute the per-symbol perplexity of ``lm`` on ``testCorpus``.

    ``testCorpus`` is a list of ``(author, symbols)`` pairs and ``lm(batch)``
    returns the mean cross-entropy per predicted symbol of the batch.  The
    batch losses are therefore weighted by the number of predicted symbols
    (``len(symbols) - 1`` per item) before averaging.

    Raises ValueError if the corpus contains no symbols to predict.
    """
    lm.eval()
    totalLoss = 0.
    totalSymbols = 0
    with torch.no_grad():
        for b in range(0, len(testCorpus), batchSize):
            batch = testCorpus[b:b + batchSize]
            # Count the symbols of the sequence, not the length of the (author, symbols) tuple.
            symbolCount = sum(len(symbols) - 1 for (_, symbols) in batch)
            totalSymbols += symbolCount
            # float() keeps the accumulator a plain Python number instead of a device tensor.
            totalLoss += symbolCount * float(lm(batch))
    if totalSymbols == 0:
        raise ValueError("perplexity is undefined for a corpus without symbols to predict")
    return math.exp(totalLoss / totalSymbols)


In [None]:
# Reload the prepared data from the pickle files written by the preparation cell.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open(testDataFileName, 'rb') as f:
    testCorpus = pickle.load(f)
with open(trainDataFileName, 'rb') as f:
    trainCorpus = pickle.load(f)
with open(char2idFileName, 'rb') as f:
    char2id = pickle.load(f)
with open(auth2idFileName, 'rb') as f:
    auth2id = pickle.load(f)


lm = LSTMLanguageModelPack(char_emb_size, hid_size, auth2id, char2id, unkChar, padChar, endChar, lstm_layers=lstm_layers, dropout=dropout).to(device)

# to continue training existing model
# lm.load(modelFileName,device)

optimizer = torch.optim.Adam(lm.parameters(), lr=learning_rate)
trainModel(trainCorpus, lm, optimizer, epochs, batchSize)
# Save under the configured file name: this is the file the generation cell and the
# "continue training" line above load. Point modelFileName at another location
# (for example a mounted drive) to store the weights elsewhere.
lm.save(modelFileName)

print('Model perplexity: ',perplexity(lm, testCorpus, batchSize))

In [8]:
def generateText(model, char2id, auth, startSentence, limit=1000, temperature=1., topK=32):
    """Sample a poem from a trained model.

    model         -- trained LSTMLanguageModelPack instance
    char2id       -- dictionary mapping symbols to their ids
    auth          -- author name used to initialise the LSTM state
    startSentence -- beginning of the poem; its first symbol is the start symbol '{'
    limit         -- upper bound on the number of symbols of the poem
                     (the start symbol is not counted)
    temperature   -- the logits are divided by it before the softmax; must be non-zero
    topK          -- sampling is restricted to the topK most probable symbols
                     (clamped to the vocabulary size)

    The whole of ``startSentence`` is fed through the LSTM as one sequence, then
    one symbol at a time is sampled until the end symbol is drawn or the limit
    is reached.  Returns the poem without the start and end symbols.

    Raises ValueError if ``startSentence`` is empty.
    """
    if len(startSentence) == 0:
        raise ValueError("startSentence must contain at least the start symbol")

    # Invert the vocabulary explicitly instead of relying on the dictionary's insertion order.
    id2char = {idx: ch for ch, idx in char2id.items()}

    model.eval()
    generated = list(startSentence)

    # No gradients are needed for sampling; this also keeps the recurrent state
    # from dragging an ever-growing autograd graph along.
    with torch.no_grad():
        # X has shape (len(startSentence), 1): the prefix is a single sequence.
        X, authIdx = model.preparePaddedBatch([(auth, generated)])
        authState = model.authembed(authIdx).unsqueeze(0).repeat(model.lstm_layers, 1, 1)
        hidden = (authState, authState)

        while len(generated) - 1 < limit:
            output, hidden = model.lstm(model.embed(X), hidden)
            # Only the state after the last fed symbol predicts the next symbol.
            logits = model.projection(output[-1])
            probs = torch.nn.functional.softmax(logits / temperature, dim=1).squeeze(0)
            topP, topIdx = probs.topk(min(topK, probs.numel()))
            topP = topP.cpu().numpy().astype(np.float64)
            nextId = int(np.random.choice(topIdx.cpu().numpy(), p=topP / topP.sum()))
            if nextId == model.endTokenIdx:
                break
            generated.append(id2char[nextId])
            # From now on only the newly sampled symbol is fed; the state carries the history.
            X = torch.tensor([[nextId]], dtype=torch.long, device=X.device)

    return ''.join(generated[1:])

In [9]:
# Rebuild the model from the saved vocabularies and weights so that text can be
# generated without re-running the training cell.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open(char2idFileName, 'rb') as f:
    char2id = pickle.load(f)
with open(auth2idFileName, 'rb') as f:
    auth2id = pickle.load(f)
lm = LSTMLanguageModelPack(char_emb_size, hid_size, auth2id, char2id, unkChar, padChar, endChar, lstm_layers=lstm_layers, dropout=dropout).to(device)
lm.load(modelFileName,device)

In [13]:
# Sample and print one poem conditioned on the author name passed below,
# starting from the bare start symbol, with at most 1000 symbols and temperature 0.5.
print(generateText(lm, char2id, "Иван Вазов", "{", limit=1000, temperature=0.5))

На мен
От черния призрак на деня
в тъмнината с небе се спряха,
пред теб на сълзи вече неми
и с него се всичко в него дивно
с нас от своя стар да се настане,
с тебе да спомня, че поне вече
с теб да ни подреди и да пея
и да бъде последен и скрит,
този път на мойта загадъчна
на душата ми на нов свят нетраен,
и ти вече не ще се върна в твойта
с теб погледа ти и устни тук —
гигантски стон и трева и сърцето
за покрива при света не се спрял,
ни тръгвам за мене сърцето си страдание
и в моя миг посред дълбоко сърце,
че съм непостоян, в който се връща
с венчана с глава и подкрепен въздух.

