In [13]:
import collections
import torch
import re
import hashlib
import os
import requests

In [14]:
DATA_HUB=dict()
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'

In [15]:
DATA_HUB['time_machine']=(DATA_URL + 'timemachine.txt',
                                '090b5e7e70c295757f55df93cb0a180b9691891a')

In [16]:
def mkdir_if_not_exist(path):
    if not isinstance(path,str):
        path=os.path.join(*path)
    if not os.path.exists(path):
        os.makedirs(path)

In [17]:
def download(name,cache_dir=os.path.join('..', 'data')):
    assert name in DATA_HUB, f"{name} does not exist in {DATA_HUB}."
    url, sha1_hash=DATA_HUB[name]
    mkdir_if_not_exist(cache_dir)
    fname=os.path.join(cache_dir,  url.split('/')[-1])
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname,'rb') as f:
            while True:
                data=f.read(1048576)
                if not data:
                    break
                sha1.update(data)
        if sha1.hexdigest() == sha1_hash:
            return fname
    print(f'Downloading {fname} from {url}...')
    r=requests.get(url, stream=True, verify=True)
    with open(fname, 'wb') as f:
        f.write(r.content)
    return fname
    

In [18]:
def read_time_machine():  #@save
    """Load the time machine book into a list of sentences."""
    with open(download('time_machine'), 'r') as f:
        lines = f.readlines()
    return [re.sub('[^A-Za-z]+', ' ', line.strip().lower())
            for line in lines]

lines = read_time_machine()
f'# sentences {len(lines)}'

'# sentences 3221'

In [19]:
def tokenize(lines, token='word'):
    if token=='word':
        return [line.split(' ') for line in lines]
    elif token=='char':
        return [list(line) for line in lines]
    else:
        print('ERROR: unknown token type '+token)
tokens=tokenize(lines)
tokens[0:2]

[['the', 'time', 'machine', 'by', 'h', 'g', 'wells', ''], ['']]

In [23]:
def count_corpus(sentences):
    tokens=[tk for line in sentences for tk in line]
    return collections.Counter(tokens)

In [24]:
class Vocab:
    def __init__(self,tokens, min_freq=0,reserved_tokens=None):
        if reserved_tokens is None:
            reserved_tokens=[]
        counter=count_corpus(tokens)
        self.token_freqs=sorted(counter.items(),key=lambda x:x[0])
        self.token_freqs.sort(key=lambda x:x[1], reverse=True)
        self.unk, uniq_tokens=0, ['<unk>']+reserved_tokens
        uniq_tokens+=[token for token, freq in self.token_freqs if freq>=min_freq and token not in uniq_tokens]
        self.idx_to_token, self.token_to_idx = [], dict()
        for token in uniq_tokens:
            self.idx_to_token.append(token)
            self.token_to_idx[token] = len(self.idx_to_token) - 1
    def __len__(self):
        return len(self.idx_to_token)
    def __getitem__(self,tokens):
        if not isinstance(tokens,(list,tuple)):
            return self.token_to_idx.get(tokens,self.unk)
        return[self.__getitem__(token) for token in tokens]
    def to_tokens(self,indices):
        if not isinstance(indices,(list,tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

In [25]:
vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[0:10])

[('<unk>', 0), ('the', 1), ('', 2), ('i', 3), ('and', 4), ('of', 5), ('a', 6), ('to', 7), ('was', 8), ('in', 9)]


In [27]:
for i in range(8, 10):
    print('words:', tokens[i])
    print('indices:', vocab[tokens[i]])

words: ['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him', '']
indices: [1, 20, 72, 17, 38, 12, 120, 43, 706, 7, 660, 5, 112, 2]
words: ['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
indices: [8, 1654, 6, 3864, 634, 7, 131, 26, 344, 127, 484, 4]


In [28]:
def load_corpus_time_machine(max_tokens=-1):  #@save
    lines = read_time_machine()
    tokens = tokenize(lines, 'char')
    vocab = Vocab(tokens)
    corpus = [vocab[tk] for line in tokens for tk in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab

corpus, vocab = load_corpus_time_machine()
len(corpus), len(vocab)

(171489, 28)