In [6]:
import re

# Load the corpus and normalise it in a single pass: every run of characters
# other than ASCII letters collapses to one space, then the line is trimmed
# and lower-cased. Lines with no letters end up as '' and are kept here.
with open("../data/timemachine.txt") as f:
    lines = [
        re.sub('[^A-Za-z]+', ' ', raw_line).strip().lower()
        for raw_line in f
    ]



In [7]:
# Preview the first ten cleaned lines; lines that held no letters appear as ''.
print(lines[:10])

['the time machine by h g wells', '', '', '', '', 'i', '', '', 'the time traveller for so it will be convenient to speak of him', 'was expounding a recondite matter to us his grey eyes shone and']


In [8]:
def tokenize(lines, token='word'):
    """Split every non-empty text line into a list of tokens.

    Args:
        lines: Iterable of strings, one per line of text.
        token: ``'word'`` splits each line on whitespace; ``'char'`` splits
            each line into its individual characters.

    Returns:
        A list holding one token list per non-empty line. Empty lines are
        skipped.

    Raises:
        ValueError: If ``token`` is neither ``'word'`` nor ``'char'``.
    """
    if token == 'word':
        split_line = str.split
    elif token == 'char':
        split_line = list
    else:
        # Fail loudly: printing the message and falling through used to hand
        # None back to the caller, where it failed later and less clearly.
        raise ValueError(f'错误token:{token}')
    return [split_line(line) for line in lines if len(line) > 0]

# Tokenize the cleaned corpus with the default token type ('word') and
# preview the first ten token lists.
words=tokenize(lines)
print(words[:10])

[['the', 'time', 'machine', 'by', 'h', 'g', 'wells'], ['i'], ['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him'], ['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and'], ['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the'], ['fire', 'burned', 'brightly', 'and', 'the', 'soft', 'radiance', 'of', 'the', 'incandescent'], ['lights', 'in', 'the', 'lilies', 'of', 'silver', 'caught', 'the', 'bubbles', 'that', 'flashed', 'and'], ['passed', 'in', 'our', 'glasses', 'our', 'chairs', 'being', 'his', 'patents', 'embraced', 'and'], ['caressed', 'us', 'rather', 'than', 'submitted', 'to', 'be', 'sat', 'upon', 'and', 'there', 'was', 'that'], ['luxurious', 'after', 'dinner', 'atmosphere', 'when', 'thought', 'roams', 'gracefully']]


In [9]:
import collections

def counter_word(tokens):
    """Count token occurrences across a nested collection of tokens.

    Args:
        tokens: Iterable of iterables; each inner iterable holds the tokens
            of one line.

    Returns:
        A ``collections.Counter`` mapping each token to its total count, with
        keys in first-seen order.
    """
    frequencies = collections.Counter()
    for line_tokens in tokens:
        for item in line_tokens:
            frequencies[item] += 1
    return frequencies

class Vocab:
    """Two-way mapping between tokens and integer indices.

    Index 0 always belongs to the unknown token ``'<unk>'``. Reserved tokens
    follow in the order given, then corpus tokens sorted by descending
    frequency (ties keep the order in which ``counter_word`` yields them).
    """

    def __init__(self, tokens, min_freq=0, reserved_tokens=None):
        """Build the vocabulary from tokenized text.

        Args:
            tokens: Nested collection of tokens (one inner sequence per line)
                as accepted by ``counter_word``; ``None`` means no corpus.
            min_freq: Corpus tokens occurring fewer than this many times are
                left out of the index mapping.
            reserved_tokens: Optional iterable of tokens that receive indices
                right after ``'<unk>'`` regardless of frequency. Duplicates
                (including ``'<unk>'`` itself) are ignored.
        """
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []

        counter = counter_word(tokens)
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)

        self.idx_to_token = []
        self.token_to_idx = {}
        # Register special tokens one at a time so a repeated entry cannot
        # leave the two mappings disagreeing about sizes or indices.
        for token in ['<unk>', *reserved_tokens]:
            self._add_token(token)

        for token, freq in self._token_freqs:
            # Frequencies are sorted in descending order, so everything after
            # the first too-rare token is too rare as well.
            if freq < min_freq:
                break
            self._add_token(token)

    def _add_token(self, token):
        """Give ``token`` the next free index unless it already has one."""
        if token not in self.token_to_idx:
            self.token_to_idx[token] = len(self.idx_to_token)
            self.idx_to_token.append(token)

    def __len__(self):
        """Return the number of indexed tokens, ``'<unk>'`` included."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a (possibly nested) list/tuple of tokens, to indices.

        Tokens that are not in the vocabulary map to ``self.unk``.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a (possibly nested) list/tuple of indices, to tokens.

        Nested input is handled recursively so that
        ``to_tokens(vocab[nested_tokens])`` works for known tokens.
        """
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.to_tokens(index) for index in indices]

    @property
    def unk(self):
        """Index of the unknown token, which is always 0."""
        return 0

    @property
    def token_freqs(self):
        """List of ``(token, count)`` pairs sorted by descending count."""
        return self._token_freqs
    

In [12]:
# Build the vocabulary from the word tokens and show the first ten
# (token, index) pairs in index order.
vocab=Vocab(words)
print(list(vocab.token_to_idx.items())[:10])

[('<unk>', 0), ('the', 1), ('i', 2), ('and', 3), ('of', 4), ('a', 5), ('to', 6), ('was', 7), ('in', 8), ('that', 9)]


In [15]:
# Show two sample token lines (positions 0 and 10) next to their index
# encodings from the vocabulary.
for i in [0,10]:
    print("文本:",words[i])
    print("索引:",vocab[words[i]])

文本: ['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
索引: [1, 19, 50, 40, 2183, 2184, 400]
文本: ['free', 'of', 'the', 'trammels', 'of', 'precision', 'and', 'he', 'put', 'it', 'to', 'us', 'in', 'this']
索引: [827, 4, 1, 2199, 4, 2200, 3, 27, 132, 11, 6, 126, 8, 21]
