In [None]:
def read_in_chunks(file_path, chunk_size=1024*1024):  # Default chunk size is 1MB
    with open(file_path, "r") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            yield chunk

def process_file(file_path):
    for chunk in read_in_chunks(file_path):
        tokens = chunk.encode("utf-8")  # Convert the chunk to raw bytes
        tokens = list(map(int, tokens))  # Convert to a list of integers
        # Here you can process the tokens as needed
        # For example, you might aggregate results or perform analysis on this chunk

        # For demonstration, let's just print the length of the processed chunk
        print("Chunk length:", len(tokens))
    return tokens

# Usage
tokens = process_file("data/wikitext-103/wiki.train.tokens")

In [None]:
from collections import Counter

def get_stats(ids):
    pairs = zip(ids, ids[1:])  # Pythonic way to iterate consecutive elements
    counts = Counter(pairs)    # Count occurrences using Counter
    return counts

def print_sorted_stats(stats):
    sorted_stats = sorted(((v, k) for k, v in stats.items()), reverse=True)
    for count, pair in sorted_stats:
        print(f"Pair: {pair}, Count: {count}")

# Example usage
stats = get_stats(tokens)
print_sorted_stats(stats)


In [4]:
chr(101),chr(32)

('e', ' ')

In [5]:
# Assuming stats is a dictionary of token pairs and their counts
top_pair = max(stats, key=stats.get)  # Find the most repeated pair
top_count = stats[top_pair]           # Get the count of the most repeated pair

print(f"Most repeated pair: {top_pair} with count: {top_count}")


Most repeated pair: (101, 32) with count: 13076


In [6]:
def merge(ids, pair, idx):
    newids = []
    i = 0
    while i < len(ids):
        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i + 1] == pair[1]:
            newids.append(idx)
            i += 2
        else:
            newids.append(ids[i])
            i += 1
    return newids

# Debugging information
print("Top pair:", top_pair)
print("Sample tokens before merge:", tokens[:20])  # Print first 20 tokens for inspection

tokens2 = merge(tokens, top_pair, 256)
print("Sample tokens after merge:", tokens2[:20])  # Print first 20 tokens for inspection
print("Length of tokens after merge:", len(tokens2))


Top pair: (101, 32)
Sample tokens before merge: [121, 32, 104, 101, 97, 114, 100, 32, 116, 104, 105, 115, 32, 44, 32, 109, 97, 110, 121, 32]
Sample tokens after merge: [121, 32, 104, 101, 97, 114, 100, 32, 116, 104, 105, 115, 32, 44, 32, 109, 97, 110, 121, 32]
Length of tokens after merge: 428674


In [None]:
from collections import Counter

def get_stats(ids):
    pairs = zip(ids, ids[1:])
    counts = Counter(pairs)  # Use Counter for more efficient counting
    return counts

def merge(ids, pair, idx):
    newids = []
    i = 0
    while i < len(ids):
        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i + 1] == pair[1]:
            newids.append(idx)
            i += 2
        else:
            newids.append(ids[i])
            i += 1
    return newids

# ---
vocab_size = 1000  # Desired final vocabulary size
num_merges = vocab_size - 256
ids = list(tokens)  # Copy so we don't destroy the original list

merges = {}  # (int, int) -> int

for i in range(num_merges):
    stats = get_stats(ids)
    pair = max(stats, key=stats.get)
    idx = 256 + i
    print(f"Merging pair {pair} into new token {idx}")
    ids = merge(ids, pair, idx)
    merges[pair] = idx

print(f"Final vocabulary size: {len(set(ids))}")  # Check final vocabulary size


In [8]:
print("tokens length:", len(tokens))
print("ids length:", len(ids))
print(f"compression ratio: {len(tokens) / len(ids):.2f}X")

tokens length: 441750
ids length: 165955
compression ratio: 2.66X


## decoding

In [9]:
vocab = {idx: bytes([idx]) for idx in range(256)}
for (p0, p1), idx in merges.items():
    vocab[idx] = vocab[p0] + vocab[p1]

def decode(ids):
  # given ids (list of integers), return Python string
  tokens = b"".join(vocab[idx] for idx in ids)
  text = tokens.decode("utf-8", errors="replace")
  return text

#example for error replace
print(decode([128]))

�


## encoding

In [10]:
def encode(text):
    # Convert the input string to a list of UTF-8 byte values
    tokens = list(text.encode("utf-8"))
    
    # Perform token merging until no more pairs can be merged
    while len(tokens) >= 2:
        stats = get_stats(tokens)  # Get statistics of consecutive pairs
        # Select the pair with the minimum frequency that can be merged
        pair = min(stats, key=lambda p: merges.get(p, float("inf")))
        if pair not in merges:
            break  # Stop if no more pairs can be merged
        idx = merges[pair]
        tokens = merge(tokens, pair, idx)  # Merge the selected pair
        
    return tokens

# Example usage
print(encode("h"))         # Test with a single character
print(encode(""))          # Test with an empty string
print(encode("hello world"))  # Test with a regular string


[104]
[]
[104, 593, 272, 438, 108, 100]


In [11]:
# Encode and then decode a sample string
sample_string = "hello world"
encoded = encode(sample_string)
decoded = decode(encoded)

# Print to verify correctness
print(decoded == sample_string)  # Should print True if encoding and decoding work correctly
print(decoded)  # Should print "hello world"


True
hello world


In [12]:
def read_in_chunks(file_path, chunk_size=1024*1024):  # 1MB default chunk size
    with open(file_path, "r") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            yield chunk

def process_file_chunks(file_path):
    encoded_chunks = []
    decoded_chunks = []

    # Process chunks for encoding
    for chunk in read_in_chunks(file_path):
        encoded_chunks.append(encode(chunk))

    # Flatten the list of lists into a single list
    encoded_data = [token for chunk in encoded_chunks for token in chunk]

    # Process chunks for decoding
    decoded_chunks.append(decode(encoded_data))

    # Join all decoded chunks to form the final text
    final_decoded_text = ''.join(decoded_chunks)
    return final_decoded_text

# File path to the validation data
file_path = "data/wikitext-103/wiki.valid.tokens"

# Process the file in chunks
final_decoded_text = process_file_chunks(file_path)

# Verify correctness
with open(file_path, "r") as f:
    original_text = f.read()

print(final_decoded_text == original_text)  # Should print True if encoding and decoding work correctly


True


In [13]:
import json

# Assuming 'vocab' is your final dictionary mapping indices to byte sequences
# We want to reverse it to map byte sequences (as strings) to indices

# Convert byte sequences to strings for JSON serialization
export_vocab = {idx: value.decode("utf-8", errors="replace") for idx, value in vocab.items()}

# Define the file path for the JSON file
json_file_path = "tokens_with_indices.json"

# Write the dictionary to a JSON file
with open(json_file_path, "w") as json_file:
    json.dump(export_vocab, json_file, ensure_ascii=False, indent=4)

print(f"Tokens with indices have been exported to {json_file_path}.")


Tokens with indices have been exported to tokens_with_indices.json.


## Testing

In [14]:
import json

# Load the JSON file
json_file_path = "tokens_with_indices.json"

with open(json_file_path, "r") as json_file:
    token_to_index = json.load(json_file)

# Reverse the dictionary for decoding: index -> token
index_to_token = {int(idx): token for idx, token in token_to_index.items()}

print("Loaded token-to-index mapping from JSON.")


Loaded token-to-index mapping from JSON.


In [15]:
def encode(text):
    # Convert text to a list of byte values (tokens)
    tokens = list(text.encode("utf-8"))
    
    encoded_tokens = []
    i = 0
    while i < len(tokens):
        for length in range(2, 0, -1):  # Try to find the longest match first
            if i + length <= len(tokens):
                sub_token = tuple(tokens[i:i+length])
                if sub_token in token_to_index:
                    encoded_tokens.append(token_to_index[sub_token])
                    i += length
                    break
        else:  # If no match is found, just use the single byte value
            encoded_tokens.append(tokens[i])
            i += 1
    
    return encoded_tokens

def decode(encoded_tokens):
    # Convert list of indices back to text
    decoded_bytes = b"".join(index_to_token[idx].encode("utf-8") for idx in encoded_tokens)
    decoded_text = decoded_bytes.decode("utf-8", errors="replace")
    return decoded_text

# Test encoding and decoding
sample_text = "hello world"
encoded = encode(sample_text)
print(f"Encoded: {encoded}")

decoded = decode(encoded)
print(f"Decoded: {decoded}")


Encoded: [104, 101, 108, 108, 111, 32, 119, 111, 114, 108, 100]
Decoded: hello world


In [16]:
from tokenizers import Tokenizer

# Load the existing tokenizer from the JSON file
hf_tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")


In [17]:
# Example text
test_text = "The quick brown fox jumps over the lazy dog."

# Encode the text using the Hugging Face tokenizer
hf_encoded = hf_tokenizer.encode(test_text)
print(f"Hugging Face Encoded: {hf_encoded.ids}")

# Decode the tokens back to text
hf_decoded = hf_tokenizer.decode(hf_encoded.ids)
print(f"Hugging Face Decoded: {hf_decoded}")


Hugging Face Encoded: [1299, 3225, 5595, 13762, 20804, 1401, 1257, 29578, 5243, 18]
Hugging Face Decoded: The quick brown fox jumps over the lazy dog .


In [22]:
import time

# Test text

test_text = """ Shortages of aircraft and serviceability problems greatly retarded pilot training and the ships only had a total of 17 D4Ys and 18 <unk> on hand on 1 October ; of these , only 6 and 16 were operational , respectively . The Japanese plan for the defence of the Philippines envisioned that the surviving carriers would be used to lure the American carrier forces away from the invasion area to a position where the carriers could be attacked by land @-@ based aircraft and the transports by the rest of the IJN . The other carrier air groups were not in much better shape and the Japanese decided to retain the aircraft ashore for use against the American carriers . The Fourth Carrier Division was assigned to the Northern Force under the command of Vice Admiral Jisaburō Ozawa and the sisters sailed from Yashima on 20 October . On the morning of 24 October , the bulk of the few aircraft aboard were launched to attack the American carriers as a distraction . They inflicted no damage and caused the Americans to search in the direction from which they had attacked . The Americans finally spotted the Japanese carriers at 16 : 40 , some 200 miles ( 320 km ) east of Cape Engaño , the northeastern tip of Luzon . The American carriers were spread out and it was very late in the day to launch an airstrike , so Admiral William Halsey , commander of the Third Fleet decided to mass his carriers in a position to attack the following morning . Ozawa reversed course during the night , correctly believing that the Americans would follow him north . 
 Although they had lost contact during the night , the Americans did find the Japanese carriers at 07 : 35 . They had already launched an airstrike of 180 aircraft that was orbiting 50 miles ( 80 km ) ahead of the American carriers while waiting for the Japanese ships to be located . This was just the first of a total of five airstrikes that the Americans launched that day . The sisters were not heavily engaged by the early airstrikes which are focusing on the group 's aircraft carriers . Ise claimed to have shot down five attacking dive bombers from the second wave and one small bomb detonated on Turret No. 2 . Hyūga was lightly damaged by near misses that rupture some hull plating in her bulge and pepper her superstructure with splinters . She took on a 5 @-@ degree list that was quickly corrected before she was ordered to tow the crippled carrier Chiyoda to safety . Her attempt was unsuccessful and Chiyoda had to be abandoned to her fate . 
 Ise was attacked by 80 @-@ odd aircraft from the fourth wave , but they failed to inflict any serious damage . She dodged 11 torpedoes and was only hit by a bomb once , on the bulge outboard of the port catapult . Some 34 other bombs near missed her , spraying her with splinters and ruptured some hull plates that contaminated some fuel oil and caused leaks in her port boiler rooms . While an exact total of her casualties is not available , it has been estimated that 5 men were killed and some 111 – 121 crewmen were wounded during this attack . Hyūga was unsuccessfully attacked by an American submarine at 18 : 43 . Around 19 : 00 Ozawa learned about a force of destroyers and cruisers that drove off the Japanese destroyers rescuing survivors from some of the carriers lost earlier in the day and sank Chiyoda . He ordered the Fourth Carrier Division to reverse course and engage the Americans , but the battleships were unable to find them , and Ozawa ordered them to reverse course and head for Amami Ōshima . When they arrived on 27 October , Ozawa transferred to Hyūga and hoisted his flag aboard her . While en route for Kure , the division was unsuccessfully attacked by another submarine . 
 In early November the catapults were removed from both ships , and they loaded troops and munitions later that month . While en route they were diverted to the Spratly Islands upon reports of heavy air raids at Manila . After off @-@ loading their cargo , they sailed for Lingga Island , near Singapore , on 20 November . They transferred to Cam Ranh Bay , French Indochina and Hyūga became flagship of the 5th Fleet there on 14 December . The division sailed for Singapore on 30 December and Vice Admiral Kiyohide Shima transferred his flag to the light cruiser Ōyodo on arrival there the following day . The division continued onwards to Lingga . Its planned return to Japan was delayed by attacks by the American Third Fleet on targets in Indochina and southern China that sank two oil tankers that were intended to refuel the division . 
 The IJN then decided to use the sisters and their escorts to bring a load of petrol , rubber , tin and other strategic minerals back to Japan after the American carriers departed the South China Sea ( Operation Kita ) . They loaded their cargoes beginning on 6 February at Singapore and departed four days later . Also carrying some 1 @,@ 150 oilfield workers , they were escorted by Ōyodo and three destroyers . <unk> Japanese radio signals revealed the Japanese plan to the Allies , and 15 submarines were positioned along their anticipated route in an attempt to intercept and sink the ships . An additional 11 were moved into position while the group was en route , but only three were ultimately able to attack . None of them were successful before the Japanese reached Kure on 20 February . The Fourth Carrier Division was disbanded on 1 March and the sisters were reduced to 1st rank reserve ships . On 19 March Kure was attacked by aircraft from Task Force 58 and Hyūga was hit three times by bombs that killed 37 men and wounded 52 . Her gunners claimed to have shot down one American dive bomber during the attack . Ise was hit twice during the attack , but her casualties , if any , are unknown . 
 The ships were turned into floating AA batteries over the next several months although it availed them little when they were attacked again by American carrier aircraft in July . On the 24th Ise was struck by five bombs and near missed multiple times ; all told she lost 50 crewmen killed and many others wounded . The bombs started numerous leaks and Ise began to settle by the bow , although she was returned to an even keel after three @-@ days pumping . Hyūga was a primary focus of the attack and she received 10 direct hits and up to 30 near misses . She was badly damaged with some 200 @-@ odd crewmen killed and 600 wounded during the attack . She slowly foundered over the next two days and was not attacked when the Americans returned four days later . This time it was Ise 's turn and she was struck 11 or more times with many near misses that put her on the bottom in shallow water with a 15 degree list . The sisters were struck off the Navy List in November and their wrecks were scrapped after the war . 
 """

# Measure time for your custom tokenizer
start = time.time()
custom_encoded = encode(test_text)
custom_decoded = decode(custom_encoded)
custom_time = time.time() - start

# Measure time for the Hugging Face tokenizer
start = time.time()
hf_encoded = hf_tokenizer.encode(test_text)
hf_decoded = hf_tokenizer.decode(hf_encoded.ids)
hf_time = time.time() - start



In [21]:
# For Custom Tokenizer
custom_tokens = [vocab[idx].decode('utf-8', errors='replace') for idx in custom_encoded]
#print(f"Custom Tokenizer - Encoded: {custom_encoded}\nDecoded: {custom_decoded}\nTokens: {custom_tokens}\n
print(f"Time: {custom_time:.6f} sec\n")

# For Hugging Face Tokenizer
hf_tokens = [hf_tokenizer.id_to_token(idx) for idx in hf_encoded.ids]
#print(f"Hugging Face Tokenizer - Encoded: {hf_encoded.ids}\nDecoded: {hf_decoded}\nTokens: {hf_tokens}\nTime: {hf_time:.6f} sec\n")
print(f"Time: {hf_time:.6f} sec\n")

Time: 0.011239 sec

Time: 0.004477 sec

