In [1]:

class TextReader:
    """Character-indexed cursor over a UTF-8 text file with an LRU block cache.

    The constructor reads the file once in blocks of ``block_size`` characters
    and records, for each block, the stream position returned by ``tell()``.
    Afterwards any character can be fetched by index: ``get()`` seeks to the
    recorded position of the containing block, reads it, and keeps it in an
    ``OrderedDict`` used as a least-recently-used cache.

    The cursor (``idx``) is circular: ``next()`` and ``prev()`` wrap around the
    ends of the file, so iterating over a non-empty reader never terminates on
    its own.

    Args:
        path: Path of the text file; it is opened with UTF-8 decoding.
        cache_size: Total number of characters the cache may hold.
        block_size: Number of characters per cached block.

    Raises:
        ValueError: If ``block_size`` is not positive.
    """

    def __init__(self, path, cache_size=2 ** 28, block_size=2 ** 20):
        if block_size <= 0:
            raise ValueError("block_size must be a positive integer")

        self.fp = open(path, "r", encoding="utf-8")
        self.length = 0
        self.offsets = [0]
        self.cache = OrderedDict()
        self.cache_size = cache_size
        self.block_size = block_size
        # Always allow at least one cached block; a zero-sized cache would make
        # get() try to evict from an empty OrderedDict.
        self.bucket_size = max(1, cache_size // block_size)
        self.idx = 0

        try:
            # Text-mode positions are opaque cookies, not character counts, so
            # the start position of every block is recorded while scanning.
            while True:
                text = self.fp.read(self.block_size)
                if not text:
                    break
                self.length += len(text)
                self.offsets.append(self.fp.tell())
        except BaseException:
            # Do not leak the handle if the file cannot be decoded or read.
            self.fp.close()
            raise

    def close(self):
        """Close the underlying file; the reader must not be used afterwards."""
        self.fp.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __len__(self):
        """Return the number of characters in the file."""
        return self.length

    def __iter__(self):
        return self

    def __next__(self):
        """Return the character under the cursor and advance (wrapping at the end)."""
        if self.length == 0:
            # An empty file has nothing to yield.
            raise StopIteration
        char = self.get()
        self.next()
        return char

    def move(self, idx):
        """Place the cursor at character index ``idx``."""
        self.idx = idx

    def next(self):
        """Advance the cursor by one character, wrapping to 0 after the last one."""
        if self.length:
            self.idx = (self.idx + 1) % self.length

    def prev(self):
        """Move the cursor back by one character, wrapping to the last one from 0."""
        if self.length:
            self.idx = (self.idx - 1) % self.length

    def get(self):
        """Return the character at the cursor without moving it.

        Raises:
            IndexError: If the cursor is outside ``[0, len(self))``.
        """
        if not 0 <= self.idx < self.length:
            raise IndexError(
                f"character index {self.idx} out of range for length {self.length}"
            )

        key, within_block = divmod(self.idx, self.block_size)

        text = self.cache.get(key)
        if text is None:
            # Cache miss: evict the least recently used block if full, then
            # load the block from its recorded stream position.
            if len(self.cache) >= self.bucket_size:
                self.cache.popitem(last=False)
            self.fp.seek(self.offsets[key], 0)
            text = self.fp.read(self.block_size)
            self.cache[key] = text
        else:
            # Cache hit: mark the block as most recently used.
            self.cache.move_to_end(key)

        return text[within_block]

    def move_to_line_by_random_position(self):
        """Move the cursor to the start of the line containing a random character.

        A character index is drawn uniformly with ``np.random.randint`` and the
        cursor is walked backward until the preceding character is a newline
        (or the start of the file is reached). An empty file leaves the cursor
        at 0.
        """
        if self.length == 0:
            self.idx = 0
            return

        position = int(np.random.randint(self.length))

        # Scan backward to find the line start. The lookup goes through get()
        # so that positions are character indices; seeking the text-mode file
        # to a character count is only valid for pure single-byte content.
        while position > 0:
            self.idx = position - 1
            if self.get() == '\n':
                break
            position -= 1
        self.idx = position


In [None]:
from collections import OrderedDict
import numpy as np

# Build the reader over the sample corpus (the constructor scans the whole file once).
reader = TextReader('./resources/corpus/test.txt')

# Report the total length in characters.
print(f"文件总长度: {len(reader)} 字符")

# Jump to the start of a randomly chosen line.
reader.move_to_line_by_random_position()

# Print the line that starts at the chosen position. The scan is bounded by the
# file length because the reader's cursor is circular and would otherwise wrap
# into the first line when the last line has no trailing newline.
start_idx = reader.idx
line_chars = []
for position in range(start_idx, len(reader)):
    reader.move(position)
    char = reader.get()
    if char == '\n':
        break
    line_chars.append(char)
reader.move(start_idx)  # leave the cursor at the start of the chosen line
print(f"随机定位到行: {''.join(line_chars)}")


文件总长度: 41 字符
随机定位到行: abcdefghij-10
