In [2]:
# https://thepythoncode.com/article/pretraining-bert-huggingface-transformers-in-python

import os
# Restrict the process to GPU index "2" before the ML libraries below are imported.
# NOTE(review): presumably this has to run before torch/TensorFlow are first imported
# to take effect — confirm before reordering this cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

# NOTE(review): the wildcard imports hide where names used later in this notebook
# (e.g. load_dataset, BertConfig, Trainer, pipeline) come from; consider explicit imports.
from datasets import *
from transformers import *
from tokenizers import *
# Duplicate of the `import os` at the top of this cell (harmless).
import os
import json

2025-03-03 00:46:02.333351: I tensorflow/core/platform/cpu_feature_guard.cc:181] Beginning TensorFlow 2.15, this package will be updated to install stock TensorFlow 2.15 alongside Intel's TensorFlow CPU extension plugin, which provides all the optimizations available in the package and more. If a compatible version of stock TensorFlow is present, only the extension will get installed. No changes to code or installation setup is needed as a result of this change.
More information on Intel's optimizations for TensorFlow, delivered as TensorFlow extension plugin can be viewed at https://github.com/intel/intel-extension-for-tensorflow.
2025-03-03 00:46:02.333527: I tensorflow/core/platform/cpu_feature_guard.cc:192] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Load the "train" split of the "cc_news" dataset; it is split into train/test in the next cell.
dataset = load_dataset("cc_news", split="train")

In [4]:
# Hold out 10% of the corpus for evaluation.
# A fixed seed keeps the split reproducible across kernel restarts, so the
# train.txt / test.txt files written later (and the tokenizer trained on
# train.txt) always correspond to the same examples as d["train"] / d["test"].
d = dataset.train_test_split(test_size=0.1, seed=42)
# Display both splits (features and row counts).
d["train"], d["test"]

(Dataset({
     features: ['title', 'text', 'domain', 'date', 'description', 'url', 'image_url'],
     num_rows: 637416
 }),
 Dataset({
     features: ['title', 'text', 'domain', 'date', 'description', 'url', 'image_url'],
     num_rows: 70825
 }))

In [1]:
# for t in d["train"]["text"][:1]:
#   print(t)
#   print("="*50)

## Train a Tokenizer

In [5]:
# If you want to train the tokenizer from scratch and your corpus is loaded as a
# `datasets` object, run this cell to dump its text to files on disk.
# If your data already exists as text files, you can skip this cell.
def dataset_to_text(dataset, output_filename="data.txt"):
  """Write the "text" column of `dataset` to a text file, one entry per `print` call.

  Useful for training the tokenizer, which accepts files as input.

  Args:
    dataset: Any object whose `dataset["text"]` yields the texts to write
      (e.g. a `datasets.Dataset` split, or a plain dict of lists).
    output_filename: Path of the file to create; an existing file is overwritten.

  Notes:
    The file is always written as UTF-8 so it does not depend on the platform's
    default locale and matches readers that open it with `encoding="utf-8"`.
    A text that itself contains newlines spans several lines in the output.
  """
  with open(output_filename, "w", encoding="utf-8") as f:
    for t in dataset["text"]:
      # print() appends the trailing newline and stringifies non-str values
      print(t, file=f)

# save the training set to train.txt (this is the file the tokenizer is trained on later)
# NOTE(review): the absolute path is machine-specific, and the file is overwritten on every run.
dataset_to_text(d["train"], "/shared/3/projects/bangzhao/prosodic_embeddings/bert_train/dataset/train.txt")
# save the testing set to test.txt, next to train.txt
dataset_to_text(d["test"], "/shared/3/projects/bangzhao/prosodic_embeddings/bert_train/dataset/test.txt")

In [3]:
from collections import Counter

# Count unique words in corpus with a progress bar
def count_vocab(file_path):
    """Return the number of distinct whitespace-separated tokens in a UTF-8 text file.

    The file is streamed line by line behind a tqdm progress bar. Tokens are
    compared exactly as written: case-sensitive, with any attached punctuation.

    Args:
        file_path: Path of the text file to scan.

    Returns:
        The count of unique tokens found in the file.
    """
    with open(file_path, "r", encoding="utf-8") as corpus:
        progress = tqdm(corpus, desc="Processing Lines", unit=" lines")
        # A set comprehension de-duplicates tokens across all lines.
        vocabulary = {token for row in progress for token in row.split()}
    return len(vocabulary)

# File path of the training corpus written by dataset_to_text
file_path = "/shared/3/projects/bangzhao/prosodic_embeddings/bert_train/dataset/train.txt"

# Count and print vocabulary size.
# The count is over raw whitespace-split tokens (case-sensitive, punctuation attached),
# so it is only a rough upper bound to compare against the tokenizer's vocab_size.
vocab_count = count_vocab(file_path)
print(f"Unique words in corpus: {vocab_count}")

Processing Lines: 0 lines [00:00, ? lines/s]

Unique words in corpus: 3291715


In [5]:
# Special tokens reserved in the vocabulary: the five BERT-style bracketed tokens
# plus two custom markers, "<S>" and "<T>".
# NOTE(review): the purpose of "<S>" and "<T>" is not shown in this notebook — confirm.
special_tokens = [
  "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<S>", "<T>"
]
# To train the tokenizer on both splits instead, use:
# files = ["train.txt", "test.txt"]
# Here the tokenizer is trained on the training set only.
files = ["/shared/3/projects/bangzhao/prosodic_embeddings/bert_train/dataset/train.txt"]
# 30,522 is BERT's default vocabulary size; feel free to tweak it.
vocab_size = 30_522
# Maximum sequence length in tokens; lowering it makes training faster
# (when the batch size is increased accordingly). It is also used as the chunk size
# in group_texts and as the model's max_position_embeddings.
max_length = 512
# Whether to truncate: if True, every sample is truncated/padded to max_length;
# if False, samples are tokenized in full and later concatenated and re-chunked.
truncate_longer_samples = False

In [6]:
# initialize the WordPiece tokenizer with its default constructor arguments
tokenizer = BertWordPieceTokenizer()
# train the tokenizer on the corpus files with the chosen vocabulary size and special tokens
tokenizer.train(files=files, vocab_size=vocab_size, special_tokens=special_tokens)
# enable truncation up to max_length (512) tokens
tokenizer.enable_truncation(max_length=max_length)






In [6]:
# Directory that receives the trained tokenizer files and, later, the Trainer checkpoints.
# NOTE(review): absolute, machine-specific path.
model_path = "/shared/3/projects/bangzhao/prosodic_embeddings/bert_train/pretrained-bert/"

In [None]:
# Make the output directory if it is not already there.
# os.makedirs also creates missing parent directories and, with exist_ok=True,
# does not fail when the directory already exists (no separate isdir check needed).
os.makedirs(model_path, exist_ok=True)
# save the tokenizer
tokenizer.save_model(model_path)
# Some of the tokenizer settings to persist next to the vocabulary:
# special tokens, whether to lower case, and the maximum sequence length.
tokenizer_cfg = {
    "do_lower_case": True,
    "unk_token": "[UNK]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "mask_token": "[MASK]",
    "model_max_length": max_length,
    "max_len": max_length,
}
# NOTE(review): these tokenizer settings are written to "config.json"; the load log of the
# next cell reports this same file being read as a BertConfig model config — confirm
# whether "tokenizer_config.json" was the intended target.
with open(os.path.join(model_path, "config.json"), "w", encoding="utf-8") as f:
  json.dump(tokenizer_cfg, f)

In [8]:
# Reload the saved files from model_path as a `BertTokenizerFast`; this rebinds `tokenizer`,
# replacing the `BertWordPieceTokenizer` object that was trained above.
tokenizer = BertTokenizerFast.from_pretrained(model_path)

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file chat_template.jinja
loading configuration file /shared/3/projects/bangzhao/prosodic_embeddings/bert_train/pretrained-bert/config.json
Model config BertConfig {
  "_name_or_path": "/shared/3/projects/bangzhao/prosodic_embeddings/bert_train/pretrained-bert/",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "cls_token": "[CLS]",
  "do_lower_case": true,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "mask_token": "[MASK]",
  "max_len": 512,
  "max_position_embeddings": 512,
  "model_max_length": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token": "[PAD]",
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "sep_token": "[SEP]",
 

In [9]:
def encode_with_truncation(examples):
    """Tokenize a batch of examples, truncating and padding each text to `max_length`.

    Reads the notebook-level `tokenizer` and `max_length`. The special-tokens
    mask is requested so that it is available alongside the encodings.

    Args:
        examples: Mapping with a "text" entry holding the texts to encode.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    texts = examples["text"]
    encoded_batch = tokenizer(
        texts,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_special_tokens_mask=True,
    )
    return encoded_batch

def encode_without_truncation(examples):
    """Tokenize a batch of examples without passing truncation or padding options.

    Reads the notebook-level `tokenizer`; the special-tokens mask is requested.

    Args:
        examples: Mapping with a "text" entry holding the texts to encode.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    texts = examples["text"]
    encoded_batch = tokenizer(texts, return_special_tokens_mask=True)
    return encoded_batch

# Select the mapping function that matches the truncation setting
encode = encode_with_truncation if truncate_longer_samples else encode_without_truncation

# Tokenize both splits batch-wise. The original columns are kept alongside the new
# ones (see the feature list printed by the next cell).
train_dataset = d["train"].map(encode, batched=True, desc="Tokenizing Train Dataset")
test_dataset = d["test"].map(encode, batched=True, desc="Tokenizing Test Dataset")

if truncate_longer_samples:
    # Fixed-length samples: expose only the model inputs, as torch tensors.
    train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
    test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
else:
    # Variable-length samples: restrict the visible columns but set no tensor type yet;
    # the torch format is applied after the texts have been grouped into chunks.
    test_dataset.set_format(columns=["input_ids", "attention_mask", "special_tokens_mask"])
    train_dataset.set_format(columns=["input_ids", "attention_mask", "special_tokens_mask"])

Tokenizing Train Dataset:   0%|          | 0/637416 [00:00<?, ? examples/s]

Tokenizing Test Dataset:   0%|          | 0/70825 [00:00<?, ? examples/s]

In [10]:
# Display the tokenized training split (features and number of rows).
train_dataset

Dataset({
    features: ['title', 'text', 'domain', 'date', 'description', 'url', 'image_url', 'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 637416
})

In [11]:
from itertools import chain
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
# grabbed from: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py
def group_texts(examples):
    """Concatenate the sequences of a batch and re-cut them into `max_length` chunks.

    Every column of `examples` is flattened into one long list and then sliced
    into consecutive pieces of `max_length` items (a notebook-level setting).
    The length is measured on the first column. When the batch holds at least
    `max_length` items, the trailing remainder is dropped; a smaller batch is
    kept as a single short chunk.

    Args:
        examples: Mapping from column name to a list of sequences.

    Returns:
        A dict with the same keys, each mapped to the list of chunks.
    """
    # Flatten every column into a single long list.
    flattened = {}
    for column in examples.keys():
        flattened[column] = list(chain.from_iterable(examples[column]))

    first_column = list(examples.keys())[0]
    usable_length = len(flattened[first_column])
    # Round down to a whole number of chunks (only when at least one full chunk exists).
    if usable_length >= max_length:
        usable_length -= usable_length % max_length

    # Start offsets of the chunks, shared by all columns.
    chunk_starts = range(0, usable_length, max_length)
    return {
        column: [tokens[start : start + max_length] for start in chunk_starts]
        for column, tokens in flattened.items()
    }

# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
# remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
# might be slower to preprocess.
#
# NOTE(review): no `num_proc` argument is passed to the map calls below, so presumably they run in a
# single process. See the documentation of the map method for how to enable multiprocessing:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
if not truncate_longer_samples:
  # Only needed for untruncated samples: concatenate them and cut into max_length chunks.
  train_dataset = train_dataset.map(group_texts, batched=True,
                                    desc=f"Grouping texts in chunks of {max_length}")
  test_dataset = test_dataset.map(group_texts, batched=True,
                                  desc=f"Grouping texts in chunks of {max_length}")
  # convert them from lists to torch tensors
  train_dataset.set_format("torch")
  test_dataset.set_format("torch")

Grouping texts in chunks of 512:   0%|          | 0/637416 [00:00<?, ? examples/s]

Grouping texts in chunks of 512:   0%|          | 0/70825 [00:00<?, ? examples/s]

In [12]:
# Number of rows in each split after grouping (chunks, not original articles).
len(train_dataset), len(test_dataset)

(73912, 8221)

In [19]:
# Small BERT configuration; the four size values match the "BERT-Tiny" row of the
# reference table at the end of this cell. The vocabulary size and maximum position
# are tied to the tokenizer settings defined earlier.
model_config = BertConfig(
    vocab_size=vocab_size,
    hidden_size=128,
    num_hidden_layers=2,
    num_attention_heads=2,
    intermediate_size=512,
    max_position_embeddings=max_length,
)

# Build the masked-language model from the config alone (nothing is loaded from a checkpoint here).
model = BertForMaskedLM(config=model_config)

# Reference sizes for other BERT variants.
# NOTE(review): columns appear to be hidden_size, num_hidden_layers,
# num_attention_heads, intermediate_size — confirm.
# BERT-Base	768	12	12	3072
# BERT-Small 512	4	8	2048
# BERT-Mini	256	4	4	1024
# BERT-Tiny	128	2	2	512

In [43]:
# Display the module tree of the freshly built model.
model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise

In [44]:
# Collator for masked-language-model training (mlm=True), configured with
# mlm_probability=0.2 and the reloaded tokenizer. It is passed to the Trainer below.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.2
)

In [45]:
# Training configuration.
# `eval_strategy` is used instead of the deprecated `evaluation_strategy` spelling;
# the checkpoint config logged later in this notebook shows transformers 4.49.0,
# where the new name is available.
# NOTE(review): an earlier comment described a schedule of 3e-3 for 6 epochs followed by
# 1e-3 for 6 epochs; that does not match num_train_epochs=3 below — confirm which run
# this cell is meant to reproduce.
training_args = TrainingArguments(
    output_dir=model_path,           # checkpoints are written under the tokenizer directory
    eval_strategy="steps",           # evaluate every eval_steps (initialised from logging_steps)
    overwrite_output_dir=True,       # allow reusing a non-empty output_dir
    num_train_epochs=3,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,   # total train batch = 16 * 2 = 32 on a single device
    per_device_eval_batch_size=16,
    logging_steps=1000,              # log the training loss every 1000 steps
    save_steps=1000,                 # write a checkpoint every 1000 steps
    learning_rate=1e-3,
    warmup_steps=1000,
    weight_decay=0.01,
    report_to='none',                # no external experiment tracker
)

using `logging_steps` to initialize `eval_steps` to 1000
PyTorch: setting up devices


In [46]:
# Wire the model, training arguments, MLM data collator and the two processed splits
# into a Trainer; test_dataset is used for the periodic evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [47]:
# train the model; the returned TrainOutput (global step, training loss, metrics)
# is displayed as the cell result
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask, title, description, url, image_url, text, domain, date. If special_tokens_mask, title, description, url, image_url, text, domain, date are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 73,912
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 6,930
  Number of trainable parameters = 4,416,698


Step,Training Loss,Validation Loss
1000,4.0653,3.837
2000,4.0768,3.838254
3000,4.0764,3.811674
4000,4.0706,3.79621
5000,4.055,3.773549
6000,4.0465,3.759283


The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask, title, description, url, image_url, text, domain, date. If special_tokens_mask, title, description, url, image_url, text, domain, date are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 8221
  Batch size = 16
Saving model checkpoint to /shared/3/projects/bangzhao/prosodic_embeddings/bert_train/pretrained-bert/checkpoint-1000
Configuration saved in /shared/3/projects/bangzhao/prosodic_embeddings/bert_train/pretrained-bert/checkpoint-1000/config.json
Model weights saved in /shared/3/projects/bangzhao/prosodic_embeddings/bert_train/pretrained-bert/checkpoint-1000/model.safetensors
The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask, title, description, url, im

TrainOutput(global_step=6930, training_loss=4.062674280472132, metrics={'train_runtime': 945.2284, 'train_samples_per_second': 234.585, 'train_steps_per_second': 7.332, 'total_flos': 302502114017280.0, 'train_loss': 4.062674280472132, 'epoch': 3.0})

In [48]:
# Reload the model weights from a checkpoint directory under model_path.
# NOTE: "checkpoint-6930" is hard-coded to the global_step reported by trainer.train()
# above; update it if the number of training steps changes.
model = BertForMaskedLM.from_pretrained(os.path.join(model_path, "checkpoint-6930"))
# load the tokenizer (saved in model_path itself, not in the checkpoint sub-directory)
tokenizer = BertTokenizerFast.from_pretrained(model_path)

loading configuration file /shared/3/projects/bangzhao/prosodic_embeddings/bert_train/pretrained-bert/checkpoint-6930/config.json
Model config BertConfig {
  "_name_or_path": "/shared/3/projects/bangzhao/prosodic_embeddings/bert_train/pretrained-bert/checkpoint-6930",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.49.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file /shared/3/projects/bangzhao/prosodic_embeddings/bert_train/pretrained-bert/checkpoint-6930/model.safetensors
All model checkp

In [49]:
# Build a "fill-mask" pipeline from the reloaded model and tokenizer.
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [50]:
# perform predictions: each example sentence contains exactly one [MASK] token to fill
examples = [
  "Today's most trending hashtags on [MASK] is Donald Trump",
  "The [MASK] was cloudy yesterday, but today it's rainy.",
]
for example in examples:
  # each prediction exposes the completed sentence ('sequence') and its 'score'
  for prediction in fill_mask(example):
    print(f"{prediction['sequence']}, confidence: {prediction['score']}")
  # visual separator between examples
  print("="*50)

today's most trending hashtags on twitter is donald trump, confidence: 0.2891486883163452
today's most trending hashtags on facebook is donald trump, confidence: 0.08995313942432404
today's most trending hashtags on trump is donald trump, confidence: 0.06772739440202713
today's most trending hashtags on what is donald trump, confidence: 0.03804310783743858
today's most trending hashtags on today is donald trump, confidence: 0.037062812596559525
the evening was cloudy yesterday, but today it's rainy., confidence: 0.07923946529626846
the morning was cloudy yesterday, but today it's rainy., confidence: 0.07475239038467407
the weather was cloudy yesterday, but today it's rainy., confidence: 0.07051657140254974
the storm was cloudy yesterday, but today it's rainy., confidence: 0.06587118655443192
the afternoon was cloudy yesterday, but today it's rainy., confidence: 0.052087776362895966
