#### module

In [1]:
import os
import torch 
import torch.nn as nn
import torch.optim as optim
import numpy as np
import unidecode
import string
import random
import time, math
import re

#### data

In [2]:
# Create the data directory; exist_ok keeps this cell re-runnable
# (plain os.mkdir raises FileExistsError when the directory is already there).
os.makedirs("./data", exist_ok=True)

In [3]:
!wget https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt -P ./data

--2021-02-24 22:49:16--  https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
접속 raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... 접속됨.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘./data/input.txt’


2021-02-24 22:49:17 (2.89 MB/s) - ‘./data/input.txt’ saved [1115394/1115394]



In [4]:
# Vocabulary: every character in string.printable; a character's position
# in this string is used as its integer index.
all_characters = string.printable
all_characters

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [5]:
# Vocabulary size (passed to the model as both input and output size).
n_characters = len(all_characters)
n_characters

100

In [6]:
# Read the whole corpus and transliterate it with unidecode.
# The context manager closes the file handle, and the explicit encoding makes
# the read independent of the machine's locale.
with open('./data/input.txt', encoding='utf-8') as corpus_handle:
    file = unidecode.unidecode(corpus_handle.read())
file_len = len(file)
file_len

1115394

#### hyper-parameters

In [7]:
# Reporting intervals -- presumably counted in training iterations; confirm against the training cell.
print_every = 100
plot_every = 10

# Number of characters in each training input sequence (chunks are sliced one character longer).
chunk_len = 200

num_epochs = 2000      # number of training iterations (one random chunk each)
hidden_size = 100      # width of the LSTM hidden/cell state
batch_size = 1         # read as a module-level global by RNN.init_hidden
num_layers = 1         # NOTE(review): the model construction cell passes its own num_layers value -- confirm which is intended
embedding_size = 70    # width of each character embedding
lr = 0.002             # learning rate handed to the optimizer

#### function

In [8]:
# Return a randomly positioned slice of the corpus.
def random_chunk(text=None, length=None):
    """Pick a random window of ``length + 1`` consecutive characters.

    The extra character lets the caller build an input sequence and a
    one-step-shifted target sequence, each ``length`` characters long.

    Args:
        text: Source string to sample from. Defaults to the notebook-level
            corpus ``file``.
        length: Number of input characters wanted. Defaults to the
            notebook-level ``chunk_len``.

    Returns:
        A substring of ``text`` that is always exactly ``length + 1`` long.

    Raises:
        ValueError: If ``text`` has fewer than ``length + 1`` characters.
    """
    if text is None:
        text = file
    if length is None:
        length = chunk_len

    # random.randint is inclusive on both ends, so the largest valid start is
    # the one whose window ends exactly at the last character of the text.
    last_start = len(text) - (length + 1)
    if last_start < 0:
        raise ValueError(
            f"text has {len(text)} characters; at least {length + 1} are required"
        )

    start_index = random.randint(0, last_start)
    end_index = start_index + length + 1
    return text[start_index:end_index]

In [9]:
# Convert a string into a tensor of character indices.
def char_tensor(string):
    """Encode ``string`` as a 1-D long tensor of positions in ``all_characters``.

    Args:
        string: Characters to encode; each one must occur in ``all_characters``.

    Returns:
        A ``torch.long`` tensor with one index per character of ``string``.

    Raises:
        ValueError: If a character is not found in ``all_characters``.
    """
    positions = [all_characters.index(symbol) for symbol in string]
    return torch.tensor(positions, dtype=torch.long)

In [10]:
# Draw a random text chunk and turn it into an (input, target) tensor pair.
def random_training_set():
    """Build one training example from a random chunk of the corpus.

    Returns:
        A tuple ``(inp, tar)`` of index tensors: ``inp`` encodes the chunk
        without its last character, ``tar`` encodes the chunk without its
        first character, so ``tar[k]`` is the character following ``inp[k]``.
    """
    text = random_chunk()
    source_part, target_part = text[:-1], text[1:]
    return char_tensor(source_part), char_tensor(target_part)

#### model

In [11]:
class RNN(nn.Module):
    """Character-level sequence model: embedding -> LSTM -> linear decoder.

    Args:
        input_size: Number of distinct input symbols (rows of the embedding table).
        embedding_size: Width of each embedding vector.
        hidden_size: Width of the LSTM hidden and cell states.
        output_size: Number of scores produced per step.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers=1):
        super(RNN, self).__init__()

        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers

        self.encoder = nn.Embedding(self.input_size, self.embedding_size)
        self.rnn = nn.LSTM(self.embedding_size, self.hidden_size, self.num_layers)
        self.decoder = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, inp, hidden, cell):
        """Advance the model by a single time step.

        Args:
            inp: Integer tensor of symbol indices. It is flattened and treated
                as one time step for a batch of ``inp.numel()`` sequences.
            hidden: LSTM hidden state, shape ``(num_layers, batch, hidden_size)``.
            cell: LSTM cell state, same shape as ``hidden``.

        Returns:
            ``(out, hidden, cell)`` where ``out`` has shape
            ``(batch, output_size)`` and the states are the updated LSTM states.
        """
        # (1, batch) indices -> (1, batch, embedding_size): sequence length is 1.
        out = self.encoder(inp.view(1, -1))
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        # Drop the length-1 sequence axis. The batch size comes from the tensor
        # itself rather than a notebook global, so any batch size works.
        out = self.decoder(out.squeeze(0))
        return out, hidden, cell

    def init_hidden(self, n_batch=None):
        """Create zero-filled initial LSTM states.

        Args:
            n_batch: Batch size of the states. When omitted, the notebook-level
                ``batch_size`` hyper-parameter is used.

        Returns:
            ``(hidden, cell)``, each of shape
            ``(num_layers, n_batch, hidden_size)``, allocated with the same
            dtype and device as the model's parameters.
        """
        if n_batch is None:
            n_batch = batch_size
        # new_zeros inherits dtype/device from an existing parameter, so the
        # states stay compatible if the module is moved off the CPU.
        reference = self.decoder.weight
        shape = (self.num_layers, n_batch, self.hidden_size)
        hidden = reference.new_zeros(shape)
        cell = reference.new_zeros(shape)
        return hidden, cell

In [12]:
# Instantiate the network; the vocabulary size is used for both the input
# and the output dimension.
# NOTE(review): num_layers is hard-coded to 2 here although the
# hyper-parameter cell defines num_layers = 1 -- confirm which is intended.
model = RNN(input_size = n_characters, 
            embedding_size = embedding_size,
            hidden_size = hidden_size,
            output_size = n_characters, 
            num_layers = 2)

#### loss

In [13]:
# Cross-entropy between the model's per-character scores and the index of the target character.
loss_func = nn.CrossEntropyLoss()

#### optimizer

In [14]:
# Adam over all model parameters, using the notebook-level learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=lr)

#### train

In [15]:
# Generate and print imitation text that continues a given start string.
def test(start_str="b", predict_len=200, temperature=0.8):
    """Sample text from the global ``model`` and print it as it is generated.

    Args:
        start_str: Non-empty seed text. All but its last character are fed
            through the model to warm up the LSTM state; sampling starts from
            the last character.
        predict_len: Number of characters to sample after ``start_str``.
        temperature: Positive divisor applied to the scores before sampling;
            smaller values make the sampling distribution more peaked.

    Raises:
        ValueError: If ``start_str`` is empty or ``temperature`` is not positive.
    """
    if not start_str:
        raise ValueError("start_str must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    hidden, cell = model.init_hidden()
    seed = char_tensor(start_str)

    print(start_str, end="")

    # Sampling needs no gradients; no_grad avoids building an autograd graph
    # for every generated character.
    with torch.no_grad():
        for k in range(len(seed) - 1):
            _, hidden, cell = model(seed[k], hidden, cell)
        inp = seed[-1]

        for _ in range(predict_len):
            output, hidden, cell = model(inp, hidden, cell)
            # softmax(scores / T) is the normalised form of exp(scores / T)
            # and does not overflow for large scores.
            probabilities = torch.softmax(output.view(-1) / temperature, dim=0)
            char_index = torch.multinomial(probabilities, 1).item()
            predicted_char = all_characters[char_index]
            print(predicted_char, end="")
            inp = char_tensor(predicted_char)

In [16]:
# Training: every iteration fits the model on one random chunk of the corpus,
# predicting each next character from the characters seen so far.
for i in range(num_epochs):
    inp, label = random_training_set()
    hidden, cell = model.init_hidden()
    optimizer.zero_grad()

    # Use every (input, target) pair of the chunk; taking the count from the
    # tensor keeps the loop bound and the average below in agreement.
    n_steps = len(inp)
    loss = torch.zeros(1)

    for j in range(n_steps):
        y, hidden, cell = model(inp[j], hidden, cell)
        # label is already a long tensor; unsqueeze gives the (1,) target
        # shape that matches the (1, n_characters) score tensor.
        loss = loss + loss_func(y, label[j].unsqueeze(0))

    loss.backward()
    optimizer.step()

    # Periodically report the mean per-character loss and a generated sample.
    if i % print_every == 0:
        print(f'\n {loss/n_steps} \n')
        test()
        print('\n', '='*100)


 tensor([4.5646], grad_fn=<DivBackward0>) 

bS
Rh\S@tz-)Tz;uwYTC+@/>r|N!4ZaI;/E22.B2
br"aC6V	n59+R*5bD#6l!:Pj]66!^[,mWV7ky@|zz9Lmx!`!Nhp{{3@EP)pYw#S:Y

 tensor([3.0038], grad_fn=<DivBackward0>) 

bDbd tuhe mth yntmns. womans wes mthuer h ttere ns sioe froteiaoemeh stir
et stti'oaihnh aran aires ohee
tirhe mr  he tdad eIot urer uitou yre lAap nosisenudh nes satrpuinh ss laoiae wrosnser whawl hor

 tensor([2.4678], grad_fn=<DivBackward0>) 

buk hou, oun mod tou lune oh med hof ad dau be this,
Ar ie thon gow noce on be ae sepanle thou 'oin rol ci the goor!

R:
'en yheve drasan; itnraen saur tine gan thiun das peirecI ly ged Cod wou nad dee

 tensor([2.3190], grad_fn=<DivBackward0>) 

be wig, mee,
The, the moull,
Iles on shos won mimesind me the hild cot min ons't all, feod Goit thain weo hom tar on int borgol mosgoy mor menk id hict onle fonu
Thit yony mith ens ytimee pirto rererel

 tensor([2.5541], grad_fn=<DivBackward0>) 

bd endud mio errer cilt shar has eregilnn,
The or I thas bar