#### module

In [1]:
import os
import torch 
import torch.nn as nn
import torch.optim as optim
import numpy as np
import unidecode
import string
import random
import time, math
import re

#### data

In [2]:
os.mkdir("./data")

In [3]:
!wget https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt -P ./data

--2021-02-23 09:22:35--  https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
접속 raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... 접속됨.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘./data/input.txt’


2021-02-23 09:22:36 (1.91 MB/s) - ‘./data/input.txt’ saved [1115394/1115394]



In [4]:
#출력 가능한 모든 문자 불러옴
all_characters = string.printable
all_characters

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [5]:
n_characters = len(all_characters)
n_characters

100

In [6]:
file = unidecode.unidecode(open('./data/input.txt').read())
file_len = len(file)
file_len

1115394

#### hyper-parameters

In [7]:
print_every = 100
plot_every = 10

chunk_len = 200

num_epochs = 2000
hidden_size = 100
batch_size = 1
num_layers = 1
embedding_size = 70
lr = 0.002

#### function

In [8]:
#파일의 일부분을 랜덤하게 불러오는 함수
def random_chunk():
    start_index = random.randint(0, file_len - chunk_len)
    end_index = start_index + chunk_len + 1
    return file[start_index:end_index]

In [9]:
#문자열을 인덱스 배열로 바꿔주는 함수
def char_tensor(string):
    tensor = torch.zeros(len(string)).long()
    for i in range(len(string)):
        tensor[i] = all_characters.index(string[i])
    return tensor

In [10]:
#random text chunk를 불러와서 입력과 목표값을 바꿔주는 함수
def random_training_set():
    chunk = random_chunk()
    inp = char_tensor(chunk[:-1])
    tar = char_tensor(chunk[1:])
    return inp, tar

#### model

In [11]:
class RNN(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers=1):
        super(RNN, self).__init__()
        
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        
        self.encoder = nn.Embedding(self.input_size, self.embedding_size)
        self.rnn = nn.RNN(self.embedding_size, self.hidden_size, self.num_layers)
        self.decoder = nn.Linear(self.hidden_size, self.output_size)
        
    def forward(self, inp, hidden):
        out = self.encoder(inp.view(1, -1))
        out, hidden = self.rnn(out, hidden)
        out = self.decoder(out.view(batch_size, -1))
        return out, hidden
    
    def init_hidden(self):
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size)
        return hidden

In [12]:
model = RNN(input_size = n_characters, 
            embedding_size = embedding_size,
            hidden_size = hidden_size,
            output_size = n_characters, 
            num_layers = 2)

#### loss

In [13]:
loss_func = nn.CrossEntropyLoss()

#### optimizer

In [14]:
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

#### train

In [15]:
#임의의 문자로 시작하는 길이 200짜리 모방글 생성
def test():
    start_str = "b"
    inp = char_tensor(start_str)
    hidden = model.init_hidden()
    
    print(start_str, end="")
    
    for i in range(200):
        output, hidden = model(inp, hidden)
        output_dist = output.data.view(-1).div(0.8).exp()
        top_n = torch.multinomial(output_dist, 1)[0]
        predicted_char = all_characters[top_n]
        print(predicted_char, end="")
        inp = char_tensor(predicted_char)

In [16]:
for i in range(num_epochs):
    inp, label = random_training_set()
    hidden = model.init_hidden()
    loss = torch.tensor([0]).type(torch.FloatTensor)
    optimizer.zero_grad()
    
    for j in range(chunk_len-1):
        x = inp[j]
        y_ = label[j].unsqueeze(0).type(torch.LongTensor)
        y, hidden = model(x, hidden)
        loss += loss_func(y, y_)
    
    loss.backward()
    optimizer.step()
    
    if i%100 == 0:
        print(f'\n {loss/chunk_len} \n')
        test()
        print('\n', '='*100)


 tensor([4.5515], grad_fn=<DivBackward0>) 

8[/""y;`C`Lz](Kf
->^KDzD-f~uWSni$<r;>v<ArL6BQ"G*wmrC7\(QRS%2O,i,3ZvGaX(18P%[ym"Zg^
h\)M}RnQSohO~GrcXS&Lisj8-ks9|:ozna7}}w?5728Sh3V|=="Um`)J6tPh`F{@GY$18	`(RDOnvc,qS[*6sapI&%>lrB<q#-1j

 tensor([2.6015], grad_fn=<DivBackward0>) 

bsyined hath med hit.

EEuAty:
Ig be hourd and nesele beeond tho here hor and he to to iel mong yeree anw rim te afl uy brart miras houte;
IvATEN:
Thirt fot' Nes tert ad anlillt fite mat te;s sey nont 

 tensor([2.1938], grad_fn=<DivBackward0>) 

bror to shay go chars wale thay the bromw,
^o any you to sur beder ond youol ast I youhs apas Po.

Arsupthal' ongent ay of seare

hake serol or mong me mall tnoth seafyoor wing shyor and opade renw the

 tensor([2.1895], grad_fn=<DivBackward0>) 

be hey wathoulle homes yourdertand thee ave latters
On womenbessimuss I thand kbut to hage cy,
Wowe, my pore.

BERGINEGIRHTIN:
I do at I mach cowe ward athoufter gotse shangrime.

RICEE:
Bet beelh me h

 tensor([2.0640], grad_fn