In [1]:
import os
import time
import urllib.request

import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt
from torch import nn
from torch.nn import functional as F



In [8]:
# Shared hyperparameter dictionary; starts empty and is filled in by later
# cells through MASTER_CONFIG.update(...).
MASTER_CONFIG = {}

In [2]:
# Remote location of the Tiny Shakespeare text and the local file it is saved to.
# NOTE: the cell that reads the dataset opens this exact file name, so keep
# the two in sync if it is ever renamed.
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
file_name = "tinyshakespare.txt"

# Download only when the file is not already on disk, so re-running the
# notebook neither needs network access nor re-fetches the corpus.
if not os.path.exists(file_name):
    urllib.request.urlretrieve(url, file_name)

('tinyshakespare.txt', <http.client.HTTPMessage at 0x16eb33a51e0>)

In [3]:
# Read the dataset.
# A context manager closes the file handle, and the explicit encoding keeps
# the result independent of the platform's default codec.
with open("tinyshakespare.txt", 'r', encoding='utf-8') as corpus_file:
    lines = corpus_file.read()

# The vocabulary is the sorted list of distinct characters in the text.
vocab = sorted(set(lines))
print('Printing the first 10 characters of the vocab list:', vocab[:10])
print('Total number of character in our dataset (Vocabulary Size):', len(vocab))

Printing the first 10 characters of the vocab list: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3']
Total number of character in our dataset (Vocabulary Size): 65


In [4]:
# Build the lookup tables.
# itos: integer id -> character, stoi: character -> integer id.
# Ids are the positions of the characters in the sorted vocab list.
itos = dict(enumerate(vocab))
stoi = {ch: idx for idx, ch in itos.items()}

In [5]:
# Encoding function
def encode(s):
    """Return the list of integer ids for the characters of ``s``.

    Each character is looked up in ``stoi``; a character that is not a key
    of ``stoi`` raises ``KeyError``.
    """
    return [stoi[symbol] for symbol in s]
# Decoding function
def decode(l):
    """Return the string spelled by the sequence of integer ids ``l``.

    Each id is looked up in ``itos`` and the resulting characters are
    concatenated in order.
    """
    chars = [itos[idx] for idx in l]
    return ''.join(chars)

# Example: encoding a string and decoding the result should give the string back.
decode(encode("morning"))

'morning'

In [6]:
# Use PyTorch: store the whole encoded text as a single tensor of token ids.
# int8 keeps the tensor compact and is sufficient for the 65-character
# vocabulary reported above; get_batches casts the sampled windows to long.
dataset = torch.tensor(encode(lines), dtype=torch.int8)
print(dataset.shape)

torch.Size([1115394])


In [15]:
def get_batches(data, split, batch_size, context_window, config=MASTER_CONFIG):
    """Sample a random batch of input/target windows from one split of ``data``.

    ``data`` is divided by position: the first 80% is the training split, the
    next 10% the validation split and the final 10% the test split.

    Args:
        data: tensor of token ids, indexed along its first dimension
            (expected to be 1-D, as ``dataset`` is).
        split: ``'val'`` or ``'test'`` selects that split; any other value
            (e.g. ``'train'``) selects the training split.
        batch_size: number of windows to sample.
        context_window: length of each window.
        config: unused; kept so existing callers keep working.

    Returns:
        A tuple ``(x, y)`` of long tensors, each a stack of ``batch_size``
        windows of length ``context_window``. ``y`` holds the same windows as
        ``x`` shifted one position forward, i.e. the next-token targets.
    """
    # Split into training, validation and test sets (only the requested slice
    # is materialised; slicing a tensor yields a view, not a copy).
    total = len(data)
    train_end = int(.8 * total)
    val_end = int(.9 * total)

    if split == 'val':
        batch_data = data[train_end:val_end]
    elif split == 'test':
        batch_data = data[val_end:]
    else:
        batch_data = data[:train_end]

    # torch.randint's upper bound is exclusive. The last valid start index is
    # size - context_window - 1 (the target window ends one element after the
    # input window), so the exclusive bound is size - context_window. This
    # also makes a split that holds exactly one window usable.
    ix = torch.randint(0, batch_data.size(0) - context_window, (batch_size,))

    x = torch.stack([batch_data[i:i + context_window] for i in ix]).long()
    y = torch.stack([batch_data[i + 1:i + context_window + 1] for i in ix]).long()

    return x, y

In [10]:
# Batch-sampling hyperparameters; these values are passed to get_batches as
# its batch_size and context_window arguments.
MASTER_CONFIG.update({
    'batch_size':8,  # number of windows sampled per batch
    'context_window': 16  # length of each sampled window, in tokens
})

In [16]:
# Draw one training batch and decode it back to text so the input/target
# pairs can be inspected by eye (each target is its input shifted by one).
xs, ys = get_batches(
    dataset,
    'train',
    MASTER_CONFIG['batch_size'],
    MASTER_CONFIG['context_window'],
)
decoded_samples = [
    (decode(x_row.tolist()), decode(y_row.tolist()))
    for x_row, y_row in zip(xs, ys)
]
print(decoded_samples)

[(' yea, the\ntwo tr', 'yea, the\ntwo tri'), ('d have none\nshor', ' have none\nshort'), ('the duke asleep:', 'he duke asleep:\n'), ('t they are\npast ', ' they are\npast c'), (' your sons,\nTo m', 'your sons,\nTo ma'), (' cousin; farewel', 'cousin; farewell'), ('it. Fewness and ', 't. Fewness and t'), ('le and false.\n\nH', 'e and false.\n\nHE')]


In [17]:
@torch.no_grad()
def evaluate_loss(model, config=MASTER_CONFIG, eval_iters=10):
    """Estimate the mean loss of ``model`` on the train and validation splits.

    For each split, ``eval_iters`` random batches are drawn from the global
    ``dataset`` with ``get_batches`` and the losses returned by the model are
    averaged. Gradients are disabled by the ``torch.no_grad`` decorator.

    Args:
        model: callable as ``model(xb, yb)`` returning a ``(_, loss)`` pair;
            must provide ``eval()``, ``train(mode)`` and ``training`` as
            ``torch.nn.Module`` does.
        config: mapping with the keys ``'batch_size'`` and
            ``'context_window'``.
        eval_iters: number of batches averaged per split (default 10).

    Returns:
        dict with the keys ``'train'`` and ``'val'`` mapping to the mean loss
        over the sampled batches.
    """
    out = {}

    # Remember the current mode so it can be restored afterwards, instead of
    # unconditionally forcing the model back into training mode.
    was_training = model.training
    model.eval()

    try:
        for split in ["train", "val"]:
            losses = []

            for _ in range(eval_iters):
                xb, yb = get_batches(dataset, split, config['batch_size'], config['context_window'])
                _, loss = model(xb, yb)
                losses.append(loss.item())
            out[split] = np.mean(losses)
    finally:
        # Restore the mode even if a batch raised, so the model is never left
        # stuck in eval mode by a failed evaluation.
        model.train(was_training)

    return out