In [2]:
import sentencepiece as spm
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaModel, XLMRobertaConfig
from hydra import compose, initialize
from omegaconf import OmegaConf
from datasets import load_from_disk


# with initialize(version_base=None, config_path="conf", job_name="test_app"):
#     cfg = compose(config_name="config")
#     print(OmegaConf.to_yaml(cfg))

# Global Hydra initialization (the non-context-manager form, unlike the
# commented-out `with initialize(...)` variant above) so that `compose()` can
# be called from later cells.
# NOTE(review): presumably this can only run once per kernel and re-running the
# cell without a restart raises -- confirm against the Hydra version in use.
initialize(version_base=None, config_path="conf", job_name="test_app")

  from .autonotebook import tqdm as notebook_tqdm


hydra.initialize()

In [3]:
# Compose the "config" configuration. Later cells read `cfg.languages`
# (comma-separated string), `cfg.datasets_path`, `cfg.raw_data_path`,
# `cfg.spm_models_path`, `cfg.vocab_size`, `cfg.original_spm` and
# `cfg.HF_tokenizer_model_name` from it.
cfg = compose(config_name="config")


In [15]:
def dump_train_text(dataset_file: str, output_file_path: str):
    """Write the ``text`` field of every train example to a plain-text file.

    Args:
        dataset_file: Path handed to ``load_from_disk``; the loaded object
            must expose a ``'train'`` split whose items have a ``'text'`` key.
        output_file_path: Destination file. It is opened in ``'w'`` mode
            (so any existing content is replaced) and encoded as UTF-8.

    Each example's text is followed by a single newline.
    """
    corpus = load_from_disk(dataset_file)

    with open(output_file_path, 'w', encoding='utf-8') as sink:
        # Lazily stream "text + newline" for each train record into the file.
        sink.writelines(record['text'] + '\n' for record in corpus['train'])

# Export one raw-text file per language listed (comma-separated) in cfg.languages.
for language in cfg.languages.split(','):
    dataset_dir = f'{cfg.datasets_path}/{language}'
    raw_text_file = f'{cfg.raw_data_path}/{language}.txt'
    dump_train_text(dataset_dir, raw_text_file)

In [23]:
# Train one unigram SentencePiece model per language.
#
# The trainer options are passed as keyword arguments instead of a single
# whitespace-separated flag string: a flag string is tokenized on spaces, so
# any path containing a space would be split into bogus arguments. This also
# matches the keyword-argument style used by the other trainer call in this
# notebook.
for language in cfg.languages.split(','):
    print("###############", language, "#################")
    spm.SentencePieceTrainer.train(
        input=f'{cfg.raw_data_path}/{language}.txt',
        model_prefix=f'{cfg.spm_models_path}/{language}',
        vocab_size=cfg.vocab_size,
        model_type='unigram',
        character_coverage=1.0,
        train_extremely_large_corpus=True,
    )


############### ctd_Latn #################
############### pcm_Latn #################
############### quc_Latn #################
############### wol_Latn #################
############### sme_Latn #################
############### grc_Grek #################
############### ajp_Arab #################
############### lzh_Hani #################
############### hbo_Hebr #################


sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=/mounts/data/proj/ayyoobbig/vocab_map/raw_data//ctd_Latn.txt --model_prefix=/mounts/data/proj/ayyoobbig/vocab_map/spm_models//ctd_Latn --vocab_size=12000 --model_type=unigram --character_coverage=1.0 --train_extremely_large_corpus=true
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: /mounts/data/proj/ayyoobbig/vocab_map/raw_data//ctd_Latn.txt
  input_format: 
  model_prefix: /mounts/data/proj/ayyoobbig/vocab_map/spm_models//ctd_Latn
  model_type: UNIGRAM
  vocab_size: 12000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pie

In [6]:
import sentencepiece as spm
from sentencepiece import sentencepiece_model_pb2 as sp_model


# Add new tokens of each language to original spm creating a new tokenizer for each lang.
#
# For every language this:
#   1. parses a fresh copy of the original SentencePiece model,
#   2. appends every piece of the language-specific model that the original
#      does not already contain (keeping the piece's score),
#   3. serializes the merged model to `extended_<language>.model`, and
#   4. saves a HF tokenizer that loads that merged model.

# Read the original model once; it is identical for every language.
with open(cfg.original_spm, 'rb') as original_file:
    original_spm_bytes = original_file.read()

for language in cfg.languages.split(','):
    print('language: ', language)

    # Re-parse per language: pieces are appended to `original_m` in place
    # below, so each language must start from an unmodified copy.
    original_m = sp_model.ModelProto()
    original_m.ParseFromString(original_spm_bytes)

    new_m = sp_model.ModelProto()
    # `with` guarantees the file handle is closed (the bare open().read()
    # form leaves that to the garbage collector).
    with open(f'{cfg.spm_models_path}/{language}.model', 'rb') as language_file:
        new_m.ParseFromString(language_file.read())

    # Set of piece strings already present in the original model.
    existing_pieces = {piece.piece for piece in original_m.pieces}

    add_cnt = 0
    for new_piece in new_m.pieces:
        if new_piece.piece not in existing_pieces:
            piece_to_add = sp_model.ModelProto.SentencePiece()
            # Add token
            piece_to_add.piece = new_piece.piece
            # Add token log-prob
            piece_to_add.score = new_piece.score
            original_m.pieces.append(piece_to_add)
            add_cnt += 1

    print('Add {} tokens'.format(add_cnt))

    # NOTE: despite the name, this is the path of the merged model *file*.
    new_spm_save_dir = f'{cfg.spm_models_path}/extended_{language}.model'
    with open(new_spm_save_dir, 'wb') as f:
        f.write(original_m.SerializeToString())

    # Point a pretrained HF tokenizer at the merged model and save it.
    # NOTE(review): special-token ids that the tokenizer derived from the
    # original vocabulary size at construction time may not be refreshed by
    # swapping the model afterwards -- confirm e.g. the <mask> id.
    tokenizer = XLMRobertaTokenizer.from_pretrained(cfg.HF_tokenizer_model_name)
    tokenizer.vocab_file = new_spm_save_dir
    tokenizer.sp_model.load(tokenizer.vocab_file)
    tokenizer.save_pretrained(f'{cfg.spm_models_path}/HF/extended_{language}/')


language:  ctd_Latn
Add 9577 tokens
language:  pcm_Latn
Add 4704 tokens
language:  quc_Latn
Add 9708 tokens
language:  wol_Latn
Add 5864 tokens
language:  sme_Latn
Add 8809 tokens
language:  grc_Grek
Add 10973 tokens
language:  ajp_Arab
Add 7200 tokens
language:  lzh_Hani
Add 4251 tokens
language:  hbo_Hebr
Add 10818 tokens


In [3]:
# Load XLMRoberta model and tokenizer.
# The model is created up front: the original cell referenced `model` in the
# freeze loop before it was assigned, which fails on a fresh kernel.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
model = XLMRobertaModel.from_pretrained('xlm-roberta-base')

# Train sentencepiece tokenizer on data.txt
spm.SentencePieceTrainer.train(input='data.txt', model_prefix='spm', vocab_size=10000)

# Load the trained sentencepiece tokenizer
sp = spm.SentencePieceProcessor()
sp.load('spm.model')

# Replace XLMR's tokenizer with the new tokenizer.
# The tokenizer reads its SentencePiece processor from `sp_model` and its
# model file from `vocab_file` (the same attributes the per-language cell
# above uses); assigning to `spm_model` only created an unused attribute.
# NOTE(review): special-token ids computed from the old vocabulary size at
# construction time may be stale after the swap -- confirm e.g. the <mask> id.
tokenizer.vocab_file = 'spm.model'
tokenizer.sp_model = sp

# Train a new embedding layer with the new tokenizer
# NOTE(review): `config` is not passed to the model anywhere in this cell;
# it is kept only in case later cells use it.
config = XLMRobertaConfig.from_pretrained('xlm-roberta-base')
config.vocab_size = len(tokenizer)
model.resize_token_embeddings(len(tokenizer))

# Freeze all parameters except for the embedding layer.
# Done after the resize so the flags apply to the resized embedding matrix.
for name, param in model.named_parameters():
    param.requires_grad = 'embeddings' in name

# Only hand the trainable parameters to the optimizer.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=1e-5)

# Train the model on your task
# ... (your code for training the model)

# Save the new tokenizer and model
tokenizer.save_pretrained('new_tokenizer')
model.save_pretrained('new_model')