In [1]:
import sentencepiece as spm

In [None]:
# Train SentencePiece model
spm.SentencePieceTrainer.Train('--input=../data/isc_sentences.txt --model_prefix=../models/spbpe_isc --vocab_size=2000 --model_type=bpe')
sp_bpe = spm.SentencePieceProcessor()
sp_bpe.load('../models/spbpe_isc.model')

# test

In [4]:
import random

# Read the entire dataset
with open('../data/isc_sentences.txt', 'r', encoding='utf-8') as f:
    sentences = f.readlines()

# Shuffle the sentences
random.seed(42)  # For reproducibility
random.shuffle(sentences)

# Split into training and validation sets (e.g., 90% train, 10% validation)
split_ratio = 0.9
split_index = int(len(sentences) * split_ratio)

train_sentences = sentences[:split_index]
validation_sentences = sentences[split_index:]

# Write the training set to a file
with open('../data/train_sentences.txt', 'w', encoding='utf-8') as f:
    f.writelines(train_sentences)

# Write the validation set to a file
with open('../data/validation_sentences.txt', 'w', encoding='utf-8') as f:
    f.writelines(validation_sentences)

print("Training and validation sets created.")


Training and validation sets created.


In [14]:
import sentencepiece as spm

training_data = '../data/train_sentences.txt'
validation_data = '../data/validation_sentences.txt'

fallback_log_prob = -10.0  # Default for unknown tokens
max_length = 50  # Limit sentence length for validation

for vocab_size in [1000, 2000, 2048, 3000, 4000, 5000]:
    model_prefix = f'./spbpe_isc_{vocab_size}'
    spm.SentencePieceTrainer.Train(
        f'--input={training_data} --model_prefix={model_prefix} --vocab_size={vocab_size} --model_type=bpe --character_coverage=0.995'
    )
    
    sp = spm.SentencePieceProcessor()
    sp.load(f'{model_prefix}.model')
    
    with open(validation_data, 'r', encoding='utf-8') as f:
        validation_sentences = [line.strip() for line in f if line.strip()]
        validation_sentences = [s for s in validation_sentences if len(s.split()) <= max_length]
    
    total_log_prob = 0.0
    total_tokens = 0
    
    for sentence in validation_sentences:
        pieces = sp.encode(sentence, out_type=str)
        log_prob = sum(
            sp.get_score(sp.piece_to_id(piece)) if sp.piece_to_id(piece) != sp.unk_id() else fallback_log_prob
            for piece in pieces
        )
        total_log_prob += log_prob
        total_tokens += len(pieces)
    
    avg_log_prob = total_log_prob / total_tokens  # Normalize by total tokens
    log_perplexity = -avg_log_prob
    print(f"Vocab Size: {vocab_size}, Log-Perplexity: {log_perplexity}")


Vocab Size: 1000, Log-Perplexity: 278.2530888800319
Vocab Size: 2000, Log-Perplexity: 472.85989894350024
Vocab Size: 2048, Log-Perplexity: 480.88162137263936
Vocab Size: 3000, Log-Perplexity: 607.6451301832209
Vocab Size: 4000, Log-Perplexity: 738.2452452452452
Vocab Size: 5000, Log-Perplexity: 914.4640082858623
