In [13]:
from __future__ import unicode_literals, print_function, division
import io
import unicodedata
import string
import re
import random
import codecs

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torchtext.utils import download_from_url, extract_archive
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Training device:", device)

Training device: cuda


In [4]:
# Download and extract the WikiText-2 archive. The extracted paths are unpacked
# in the order test, valid, train; despite the `_dir` suffix each name is passed
# to io.open() below/later, i.e. it is used as a file path.
url = 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip'
test_dir, valid_dir, train_dir = extract_archive(download_from_url(url))
tokenizer = get_tokenizer('basic_english')
# Build the vocabulary from the training split only. The `with` block closes the
# file once the lines have been consumed instead of leaking the open handle.
with io.open(train_dir, encoding='utf-8') as train_file:
    vocab = build_vocab_from_iterator(map(tokenizer, train_file))

36718lines [00:01, 35358.41lines/s]


In [31]:
def preprocess_data(raw_text_iterator):
    """Encode raw text lines as one flat tensor of vocabulary indices.

    Each item is tokenized with the notebook-level ``tokenizer`` and every
    token is looked up in the notebook-level ``vocab``. Items that produce no
    tokens (e.g. blank lines) are skipped, and the remaining index sequences
    are concatenated in input order.

    Args:
        raw_text_iterator: iterable of strings, e.g. an open text file.

    Returns:
        A 1-D ``torch.long`` tensor. If no item yields any token, an empty
        1-D ``torch.long`` tensor is returned (``torch.cat`` would otherwise
        raise on an empty sequence of tensors).
    """
    encoded_lines = []
    for item in raw_text_iterator:
        token_ids = [vocab[token] for token in tokenizer(item)]
        # Skip token-less lines up front rather than building empty tensors
        # and filtering them out afterwards.
        if token_ids:
            encoded_lines.append(torch.tensor(token_ids, dtype=torch.long))

    if not encoded_lines:
        # Nothing to concatenate: return an empty result instead of crashing.
        return torch.tensor([], dtype=torch.long)

    return torch.cat(encoded_lines)

In [32]:
# Encode every split into a flat tensor of token indices. Each file is opened
# in a `with` block so its handle is closed as soon as the split has been read,
# rather than being left open for the lifetime of the kernel.
with io.open(train_dir, encoding='utf-8') as raw_file:
    train_data = preprocess_data(raw_file)
with io.open(valid_dir, encoding='utf-8') as raw_file:
    val_data = preprocess_data(raw_file)
with io.open(test_dir, encoding='utf-8') as raw_file:
    test_data = preprocess_data(raw_file)

In [34]:
def split_into_batch(data, batch_size, target_device=None):
    """Arrange a flat tensor into ``batch_size`` parallel columns.

    Trailing elements that do not fill a complete row are dropped. The
    remainder is split into ``batch_size`` equal contiguous chunks, and chunk
    ``j`` becomes column ``j`` of the result.

    Args:
        data: tensor to arrange; expected to be 1-D.
        batch_size: number of columns in the result (positive integer).
        target_device: device the result is moved to. When ``None`` (the
            default) the notebook-level ``device`` is used, which matches the
            previous behaviour.

    Returns:
        A contiguous tensor of shape ``(data.size(0) // batch_size,
        batch_size)`` on the target device. If ``data`` has fewer than
        ``batch_size`` elements the result has zero rows.
    """
    n_batch = data.size(0) // batch_size
    trimmed = data.narrow(0, 0, n_batch * batch_size)
    # Give the row length explicitly: with -1 the shape cannot be inferred for
    # a zero-element tensor and view() raises an "ambiguous" RuntimeError.
    columns = trimmed.view(batch_size, n_batch).t().contiguous()

    # The global is only looked up when no explicit device was supplied.
    return columns.to(device if target_device is None else target_device)

In [35]:
# Number of parallel columns for training and for evaluation (val/test).
batch_size, eval_batch_size = 32, 16

# NOTE: each name is rebound from the flat tensor to its batched form, so this
# cell is meant to be run once per kernel session, right after the splits have
# been encoded; running it again would re-batch the already-batched tensors.
train_data = split_into_batch(data=train_data, batch_size=batch_size)
val_data = split_into_batch(data=val_data, batch_size=eval_batch_size)
test_data = split_into_batch(data=test_data, batch_size=eval_batch_size)