In [88]:
import collections
import re
from d2l import torch as d2l

def read_timemachine():
    with open(d2l.download('time_machine'), 'r') as f:
        lines = f.readlines()
    lines = [re.sub('[^A-Za-z,.]+', ' ', line).strip().lower() for line in lines]
    return lines

In [41]:
lines = read_timemachine()
lines[0], lines[10]

('the time machine by h g wells',
 'twinkled and his usually pale face was flushed and animated the')

In [48]:
def tokenize(lines, mode='word'):
    if mode == 'word':
        return [line.split(' ') for line in lines]
    else:
        return [[ch for ch in line] for line in lines]

token_word_result = tokenize(lines, mode='word')
token_char_result = tokenize(lines, mode='char')

In [49]:
#token_word_result[0], token_word_result[10]
len(token_word_result), len(token_char_result)

(3221, 3221)

In [93]:
class Vocab:
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if len(tokens) > 0 and isinstance(tokens[0], list):
            tokens = [token for lst in tokens for token in lst]
        self.token_count = collections.Counter(tokens)
        
        self.token_freq = sorted(self.token_count.items(), key = lambda x : x[1], reverse=True)

        self.all_token = ['<unk>'] + reserved_tokens if reserved_tokens is not None else ['<unk>']
        self.all_token += [token for token, freq in self.token_freq if freq > min_freq]

        self.token_to_idx, self.idx_to_token = {}, []
        for idx, token in enumerate(self.all_token):
            self.token_to_idx[token] = idx
            self.idx_to_token.append(token)
    
    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, '<unk>')
        return [self.token_to_idx.get(token, '<unk>') for token in tokens]

    def to_token(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[idx] for idx in indices]        

    def __len__(self):
        return len(self.token_to_idx)
        

time_machine_vocab = Vocab(token_word_result, min_freq=0)

In [80]:
import matplotlib.pyplot as plt
import math
#plt.hist(time_machine_vocab.token_freq )
print(list(time_machine_vocab.token_to_idx.items())[:10])

indices = time_machine_vocab[token_word_result[0]]
token_word_result[0], indices, time_machine_vocab.to_token(indices)


[('<unk>', 0), ('the', 1), ('i', 2), ('and', 3), ('of', 4), ('a', 5), ('to', 6), ('was', 7), ('in', 8), ('that', 9)]


(['the', 'time', 'machine', 'by', 'h', 'g', 'wells'],
 [1, 20, 51, 41, 2184, 2185, 401],
 ['the', 'time', 'machine', 'by', 'h', 'g', 'wells'])

In [115]:
def load_time_machine(max_token=-1):
    lines = read_timemachine()
    token_word_result = tokenize(lines, mode='word')
    
    vocab = Vocab(token_word_result, min_freq=0)
    corpus = [vocab[token] for line in lines for token in line]
    return corpus, vocab, token_word_result

corpus, vocab, token_word_result = load_time_machine()
len(corpus), len(vocab)

(174617, 6066)

In [117]:
token_word_result[:10]

[['the', 'time', 'machine', 'by', 'h', 'g', 'wells'],
 [''],
 [''],
 [''],
 [''],
 ['i'],
 [''],
 [''],
 ['the',
  'time',
  'traveller',
  'for',
  'so',
  'it',
  'will',
  'be',
  'convenient',
  'to',
  'speak',
  'of',
  'him'],
 ['was',
  'expounding',
  'a',
  'recondite',
  'matter',
  'to',
  'us',
  'his',
  'grey',
  'eyes',
  'shone',
  'and']]