In [1]:
from tokenizers import Tokenizer
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import BertConfig, BertForMaskedLM, Trainer, TrainingArguments,PreTrainedTokenizerFast
from datasets import Dataset,DatasetDict
import json
import torch 
import wandb
from transformers import DataCollatorForLanguageModeling

In [2]:
wandb.init(mode="disabled")
# !PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [None]:
file_to_train='strict_model'
file_to_load='3_full_data__sum_freq_gramma__without_mask'
path='./SlovakBabyLM/Curricullum_learning/'

In [None]:
test_tokenizer = Tokenizer.from_file(f"{path}tok_bpe/{file_to_train}_BPE/tokenizer.json")

hf_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=test_tokenizer,
    return_token_type_ids=False,
    truncation=True,
    return_special_tokens_mask=True,
    mask_token='[MASK]',
    pad_token='[PAD]'
)


In [5]:
config = BertConfig(
    vocab_size=60000,
    hidden_size=84,
    num_hidden_layers=6,
    num_attention_heads=12,
    intermediate_size=1446,
    hidden_dropout_prob=0.15,
    attention_probs_dropout_prob=0.3,  
    hidden_act="gelu_new",
)

model = BertForMaskedLM(config)
model.tokenizer = hf_tokenizer

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir=f"{path}saved_model/{file_to_train}/{file_to_load}/checkpoints",
    overwrite_output_dir=True,
    num_train_epochs= 7,
    per_device_train_batch_size =32,
    per_device_eval_batch_size=32,
    evaluation_strategy= "steps",       
    eval_steps= 1000,                   
    save_steps= 1000,                   
    logging_steps=100,
    load_best_model_at_end=True,      
    metric_for_best_model="eval_loss",
    bf16=True,                        
)



# With mask 

In [None]:
# Load masked and unmasked text
with open(f"{path}{file_to_train}_results/{file_to_load}/final_masked_text.txt", "r", encoding="utf-8") as f_masked:
    masked_text = f_masked.read()

with open(f"{path}{file_to_train}_results/{file_to_load}/final_not_masked_text.txt", "r", encoding="utf-8") as f_unmasked:
    unmasked_text = f_unmasked.read()


batch_masked_text_list=masked_text.split('𡨸')
batch_unmasked_text_list=unmasked_text.split('𡨸')

In [7]:
len(batch_masked_text_list)

133064

In [8]:
len(batch_unmasked_text_list)

133064

In [None]:
def split_dataset(text_masked,text_unmasked):
    part_size = len(text_unmasked) // 10

    parts = [text_unmasked[i*part_size:(i+1)*part_size] for i in range(10)]
    parts_m = [text_masked[i*part_size:(i+1)*part_size] for i in range(10)]
    
    train_data = []
    test_data = []
    evaluate_data= []
    
    for unmasked_part, masked_part in zip(parts,parts_m):
        split_point = int(len(unmasked_part) * 0.8)
        
        train_data.extend([{"unmasked_text": unmask,"masked_text": mask} for unmask,mask in zip(unmasked_part[:split_point],masked_part[:split_point])])

        twenty_percent_un=unmasked_part[split_point:]
        twenty_percent_ma=masked_part[split_point:]
        split_point_t_e = int(len(twenty_percent_ma) * 0.5) 

        test_data.extend([{"unmasked_text": unmask,"masked_text": mask}for unmask,mask in zip(twenty_percent_un[:split_point_t_e],twenty_percent_ma[:split_point_t_e])])
        evaluate_data.extend([{"unmasked_text": unmask,"masked_text": mask}for unmask,mask in zip(twenty_percent_un[split_point_t_e:],twenty_percent_ma[split_point_t_e:])])


    return DatasetDict({
        'train': Dataset.from_list(train_data),
        'test': Dataset.from_list(test_data),
        'evaluate': Dataset.from_list(evaluate_data)
    })

final_datasets = split_dataset(batch_masked_text_list,batch_unmasked_text_list)

In [8]:
def tokenize_function(examples):
    outputs = hf_tokenizer(
        examples["masked_text"],
        padding="max_length",
        truncation=True,
        max_length=128,
        return_token_type_ids=False, 
    )
    
    labels = hf_tokenizer(
        examples["unmasked_text"],
        padding="max_length",
        truncation=True,
        max_length=128,
        return_token_type_ids=False, 
    )["input_ids"]

    # Convert to tensors
    mask_token_id = hf_tokenizer.mask_token_id
    for i, batch_token in enumerate(outputs["input_ids"]):
        for j, token_id in enumerate(batch_token):
            if token_id != mask_token_id:
                labels[i][j] = -100
    outputs["labels"] = labels
    return outputs


tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=["unmasked_text","masked_text"])

import gc
torch.cuda.empty_cache()
gc.collect()

Map: 100%|██████████| 106445/106445 [00:47<00:00, 2239.52 examples/s]
Map: 100%|██████████| 13305/13305 [00:05<00:00, 2376.76 examples/s]
Map: 100%|██████████| 13310/13310 [00:05<00:00, 2264.42 examples/s]


18

In [None]:

wandb.init(
    project=f"{file_to_train}",
    name=f'{file_to_load}',
    config=training_args.to_dict()

)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: Currently logged in as: [33mlubos-kris[0m ([33mlubos-kris-comenius-university-in-bratislava[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
save_path=f"{path}saved_model/{file_to_train}/{file_to_load}"



trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["evaluate"], 
)

trainer.train()

model.save_pretrained(save_path)
hf_tokenizer.save_pretrained(save_path)
torch.cuda.empty_cache()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


Step,Training Loss,Validation Loss
1000,8.3151,8.413993
2000,7.8013,8.047735
3000,7.7278,8.00267
4000,7.7287,7.972606
5000,7.6854,7.951394
6000,7.6653,7.941825
7000,7.6628,7.926215
8000,7.6449,7.919032
9000,7.6285,7.913572
10000,7.6391,7.90867


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argumen

# Without mask

In [None]:
with open(f"{path}{file_to_train}_results/{file_to_load}/final_not_masked_text.txt", "r", encoding="utf-8") as text:
    unmasked_text = text.read()

batch_unmasked_text_list = unmasked_text.split('𡨸')

In [None]:
def split_dataset(text_list):
    part_size = len(text_list) // 10
    parts = [text_list[i*part_size:(i+1)*part_size] for i in range(5)]
    
    train_data = []
    test_data = []
    evaluate_data= []
    
    for part in parts:
        split_point = int(len(part) * 0.8)
        
        train_data.extend([{"unmasked_text": text} for text in part[:split_point]])

        twenty_percent=part[split_point:]
        split_point_t_e = int(len(twenty_percent) * 0.5)     

        test_data.extend([{"unmasked_text": text} for text in twenty_percent[:split_point_t_e]])

        evaluate_data.extend([{"unmasked_text": text} for text in twenty_percent[split_point_t_e:]])

    return DatasetDict({
        'train': Dataset.from_list(train_data),
        'test': Dataset.from_list(test_data),
        'evaluate': Dataset.from_list(evaluate_data)
    })

final_datasets = split_dataset(batch_unmasked_text_list)



NameError: name 'DatasetDict' is not defined

In [9]:
def tokenize_function(examples):
    return hf_tokenizer(
        examples["unmasked_text"],
        padding="max_length",
        truncation=True,
        max_length=128,
        return_token_type_ids=False, 
        return_special_tokens_mask=True,
    )

tokenized_datasets = final_datasets.map(tokenize_function, batched=True, remove_columns=["unmasked_text"])

Map:   0%|          | 0/98730 [00:00<?, ? examples/s]

Map:   0%|          | 0/12340 [00:00<?, ? examples/s]

Map:   0%|          | 0/12345 [00:00<?, ? examples/s]

In [11]:

run = wandb.init(
    project=f'{file_to_train}',
    name=f'{file_to_load}',
    config=training_args.to_dict()

)

In [None]:
save_path=f"{path}saved_model/{file_to_train}/{file_to_load}"

data_collator = DataCollatorForLanguageModeling(
    tokenizer=hf_tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["evaluate"], 
)

trainer.train()
torch.cuda.empty_cache()
model.save_pretrained(save_path)
hf_tokenizer.save_pretrained(save_path)


test_results =trainer.evaluate(tokenized_datasets["test"]) 
# test_results = test_results.get("test_results", None) 
# with open(f"{loss_path}/test_results.json", "w") as f:
#     json.dump(test_results, f)
run.finish

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


Step,Training Loss,Validation Loss
1000,8.634,8.51837
2000,8.188,8.112309
3000,8.1149,8.024786
4000,8.0474,7.961143
5000,7.9734,7.89816
6000,7.9197,7.845847
7000,7.8969,7.820701
8000,7.8591,7.785871
9000,7.8692,7.781829
10000,7.8187,7.763749


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argumen

<function wandb.sdk.wandb_init._WandbInit.make_disabled_run.<locals>.<lambda>(*_, **__)>

## How many [MASK] are created

In [None]:
with open(f"{path}{file_to_train}_results/{file_to_load}/final_not_masked_text.txt", "r", encoding="utf-8") as text:
    unmasked_text = text.read()

batch_unmasked_text_list = unmasked_text.split('𡨸')


In [17]:
def split_dataset(text_list):
    data = []
    for text in text_list:
        data.append({"unmasked_text": text})
    return DatasetDict({
        'train': Dataset.from_list(data)
    })

test_data = split_dataset(batch_unmasked_text_list)

In [None]:
test_tokenizer = Tokenizer.from_file(f"{path}tok_bpe/{file_to_train}_BPE/tokenizer.json")

hf_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=test_tokenizer,
    return_token_type_ids=False,
    truncation=True,
    return_special_tokens_mask=True,
    mask_token='[MASK]',
    pad_token='[PAD]'
)


In [19]:
def tokenize_function(examples):
    return hf_tokenizer(
        examples["unmasked_text"],
        padding="max_length",
        truncation=True,
        max_length=128,
        return_token_type_ids=False, 
        return_special_tokens_mask=True,
    )

tokenized_datasets = test_data.map(tokenize_function, batched=True, remove_columns=["unmasked_text"])

Map:   0%|          | 0/138814 [00:00<?, ? examples/s]

In [20]:
def convert_to_tensors(example):
    example['input_ids'] = torch.tensor(example['input_ids'])
    example['attention_mask'] = torch.tensor(example['attention_mask'])
    example['special_tokens_mask'] = torch.tensor(example['special_tokens_mask'])
    return example
tr=tokenized_datasets['train']
train_data = tr.map(convert_to_tensors)

a={'input_ids':train_data['input_ids'],'attention_mask':train_data['attention_mask'], 'special_tokens_mask':train_data['special_tokens_mask']}

Map:   0%|          | 0/138814 [00:00<?, ? examples/s]

In [21]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=hf_tokenizer,
    mlm=True,
    mlm_probability=0.15,
)


In [29]:
col_a= data_collator([a])

In [15]:
torch.sum(col_a['labels'] != -100).item()

2657863

In [24]:
torch.sum(col_a['labels'] != -100).item()

2656394

In [28]:
torch.sum(col_a['labels'] != -100).item()

2658517

In [30]:
torch.sum(col_a['labels'] != -100).item()

2659281