In [1]:
import sentencepiece as spm

# Load the trained BPE tokenizer.
# A bare SentencePieceProcessor is created first and the serialized model is
# then read into it from the path below (relative to the notebook's working
# directory). Because `load(...)` is the last expression in the cell, its
# return value is what the notebook displays as the cell output.
bpe_tokenizer = spm.SentencePieceProcessor()
bpe_tokenizer.load('../tokenizer/spbpe_isc.model')

True

In [2]:
import morfessor

# Load the Morfessor model from its binary file.
# `read_binary_model_file` returns the model object itself, so constructing an
# empty `BaselineModel` beforehand is unnecessary (it would be discarded as
# soon as the name is rebound).
# NOTE(review): `io` shadows the standard-library module of the same name for
# the rest of the notebook; the name is kept so that any other cell relying on
# it keeps working.
# NOTE(review): Morfessor binary model files are presumably pickle-based —
# only load files from a trusted source.
io = morfessor.MorfessorIO()
morfessor_model = io.read_binary_model_file('../morfessor/morf_isc_model.bin')

In [3]:
def segment_bpe(sentence, tokenizer):
    """Segment ``sentence`` into subword pieces with a SentencePiece tokenizer.

    Args:
        sentence: The text to segment.
        tokenizer: Object exposing ``encode(text, out_type=...)``, such as a
            loaded ``SentencePieceProcessor``.

    Returns:
        The result of ``tokenizer.encode`` when string output is requested
        (``out_type=str``), i.e. the pieces as text rather than integer ids.
    """
    pieces = tokenizer.encode(sentence, out_type=str)
    return pieces

def segment_morfessor(sentence, model):
    """Segment ``sentence`` into morphs with a Morfessor model.

    The sentence is split on whitespace and every resulting word is segmented
    independently; the morphs of all words are returned as one flat list, in
    order of appearance.

    Args:
        sentence: The text to segment.
        model: Object exposing ``viterbi_segment(word)`` whose result carries
            the list of segments as its first element.

    Returns:
        A flat list of segments covering every word of the sentence.
    """
    return [
        morph
        for token in sentence.split()
        # Only element 0 (the segments) of the Viterbi result is used.
        for morph in model.viterbi_segment(token)[0]
    ]

In [4]:
# Read the corpus: one sentence per line.
# - The encoding is given explicitly so that decoding does not depend on the
#   platform's default locale (assumes the corpus is UTF-8 — TODO confirm).
# - Surrounding whitespace, including the trailing newline that `readlines()`
#   would keep, is stripped, and blank lines are dropped so they do not turn
#   into empty training examples further down the pipeline.
with open('../data/isc_sentences.txt', 'r', encoding='utf-8') as f:
    sentences = [line.strip() for line in f if line.strip()]

In [5]:
# Segment the whole corpus twice, once per segmenter, keeping the results in
# parallel lists (index i of each list corresponds to sentences[i]).
sentences_bpe = [
    segment_bpe(line, bpe_tokenizer)
    for line in sentences
]
sentences_morfessor = [
    segment_morfessor(line, morfessor_model)
    for line in sentences
]

In [6]:
from transformers import BertTokenizer

# Initialize mBERT tokenizer from the pretrained
# "bert-base-multilingual-uncased" checkpoint. This same tokenizer object is
# used below to encode the segmented sentences, handed to the masked-LM data
# collator, and saved next to the fine-tuned model.
mbert_tokenizerBPE = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")

# Tokenize segmented sentences for mBERT
def prepare_tokenized_inputs(sentences_segmented, tokenizer=None, max_length=128):
    """Encode pre-segmented sentences into fixed-length model inputs.

    Each sentence's segments are joined with single spaces (as if
    reconstructing the sentence from its segmented tokens) and the resulting
    string is passed to ``tokenizer``, padded/truncated to ``max_length``.

    Args:
        sentences_segmented: Iterable of sentences, each an iterable of string
            segments.
        tokenizer: Callable used to encode the joined text. Defaults to the
            notebook-level ``mbert_tokenizerBPE`` so existing one-argument
            calls behave exactly as before.
        max_length: Length every encoded sentence is padded or truncated to.
            Defaults to 128, the value previously hard-coded.

    Returns:
        A list with one tokenizer output per input sentence, in input order.
        With ``return_tensors="pt"`` on a single text, each output presumably
        carries a leading batch dimension of 1 — callers squeeze it away.
    """
    if tokenizer is None:
        # Resolved at call time so the notebook-level tokenizer is picked up
        # even if it is (re)defined after this function.
        tokenizer = mbert_tokenizerBPE

    # NOTE(review): the segments are re-tokenized by `tokenizer`; any
    # segmenter-specific boundary markers in them are passed through as-is —
    # confirm this is intended.
    return [
        tokenizer(
            " ".join(segments),
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
        for segments in sentences_segmented
    ]

# Apply tokenization to the BPE-segmented sentences. The resulting list of
# tokenizer outputs is what the training cell converts into a Dataset.
tokenized_bpe = prepare_tokenized_inputs(sentences_bpe)
# Morfessor counterpart, currently disabled:
# tokenized_morfessor = prepare_tokenized_inputs(sentences_morfessor)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
from transformers import BertForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset

import torch  # needed here to decide whether mixed precision can be enabled

# Convert data to Hugging Face Dataset format for training.
# Every entry of `tokenized_bpe` was produced from a single text, so
# squeeze(0) drops the leading batch dimension and leaves one sequence per row.
dataset = Dataset.from_dict({
    "input_ids": [data["input_ids"].squeeze(0) for data in tokenized_bpe],
    "attention_mask": [data["attention_mask"].squeeze(0) for data in tokenized_bpe],
})

# Use DataCollatorForLanguageModeling: it applies the random masking and
# builds the MLM labels at batch time, so the dataset itself carries no labels.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=mbert_tokenizerBPE,
    mlm_probability=0.15,  # 15% masking probability
)

# Load mBERT model with a masked-language-modeling head
modelBPE = BertForMaskedLM.from_pretrained("bert-base-multilingual-uncased")

# Define training arguments, with checkpoint-saving configurations
training_args = TrainingArguments(
    output_dir="./mbert_BPE",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,  # Accumulate gradients over 2 steps
    save_steps=500,        # write a checkpoint every 500 optimizer steps
    save_total_limit=2,    # keep only the two most recent checkpoints
    logging_dir='./logs',
    # Mixed precision is only enabled when a CUDA device is present:
    # requesting fp16 unconditionally makes TrainingArguments raise on a
    # CPU-only machine instead of falling back to CPU.
    # NOTE(review): this also turns fp16 off on non-CUDA accelerators.
    fp16=torch.cuda.is_available(),
    # `no_cuda=False` was dropped: it is the default (GPU is used when
    # available, CPU otherwise) and the argument is deprecated in newer
    # transformers releases.
)

# Initialize the Trainer for fine-tuning
trainer = Trainer(
    model=modelBPE,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Fine-tune the model (checkpoints will automatically be saved at specified steps)
trainer.train()

# Save the final model and tokenizer side by side so the directory can be
# reloaded with from_pretrained(). The tokenizer call is the last expression,
# so the paths it wrote are shown as the cell output.
modelBPE.save_pretrained("./mbert_BPE")
mbert_tokenizerBPE.save_pretrained("./mbert_BPE")

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Step,Training Loss
500,1.7793
1000,0.2913
1500,0.0109




('./mbert_BPE/tokenizer_config.json',
 './mbert_BPE/special_tokens_map.json',
 './mbert_BPE/vocab.txt',
 './mbert_BPE/added_tokens.json')

In [8]:
import torch
# Diagnostic: ask PyTorch how much CUDA memory is currently allocated; the
# value is displayed as the cell output.
torch.cuda.memory_allocated()

0