In [1]:
import torch
import random
import numpy as np
from transformers import AutoTokenizer, AutoModelForMaskedLM, T5Tokenizer, T5ForConditionalGeneration
from collections import Counter
import re

In [2]:
class MaskingStrategies:
    def __init__(self, model_name="bert-base-uncased"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForMaskedLM.from_pretrained(model_name)
        self.mask_token = self.tokenizer.mask_token
        self.mask_token_id = self.tokenizer.mask_token_id
        
    def get_word_frequencies(self, texts):
        word_counts = Counter()
        for text in texts:
            words = text.lower().split()
            word_counts.update(words)
        return word_counts
    
    def random_masking(self, text, mask_rate=0.15):
        tokens = self.tokenizer.tokenize(text)
        masked_tokens = tokens.copy()
        
        num_to_mask = int(len(tokens) * mask_rate)
        mask_indices = random.sample(range(len(tokens)), min(num_to_mask, len(tokens)))
        
        for idx in mask_indices:
            masked_tokens[idx] = self.mask_token
            
        return self.tokenizer.convert_tokens_to_string(masked_tokens)
    
    def frequency_based_masking(self, text, word_frequencies, mask_rate=0.15, mask_rare=True):
        tokens = self.tokenizer.tokenize(text)
        masked_tokens = tokens.copy()
        
        # Get frequency scores for each token
        token_scores = []
        for i, token in enumerate(tokens):
            word = token.replace('##', '')  # Handle subword tokens
            freq = word_frequencies.get(word.lower(), 1)
            token_scores.append((i, freq))
        
        # Sort by frequency (ascending for rare words, descending for common)
        if mask_rare:
            token_scores.sort(key=lambda x: x[1])  # Mask rare words first
        else:
            token_scores.sort(key=lambda x: x[1], reverse=True)  # Mask common words first
        
        num_to_mask = int(len(tokens) * mask_rate)
        mask_indices = [idx for idx, _ in token_scores[:num_to_mask]]
        
        for idx in mask_indices:
            masked_tokens[idx] = self.mask_token
            
        return self.tokenizer.convert_tokens_to_string(masked_tokens)
    
    def pos_based_masking(self, text, mask_rate=0.15, prefer_content_words=True):
        tokens = self.tokenizer.tokenize(text)
        masked_tokens = tokens.copy()
        
        # Simple heuristic for content words (nouns, verbs, adjectives, adverbs)
        content_word_patterns = [
            r'\w+ing$',  # gerunds/present participles
            r'\w+ed$',   # past tense verbs
            r'\w+ly$',   # adverbs
            r'\w+tion$', # nouns ending in -tion
            r'\w+ness$', # nouns ending in -ness
        ]
        
        # Function words to avoid masking
        function_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 
                         'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were'}
        
        content_indices = []
        function_indices = []
        
        for i, token in enumerate(tokens):
            word = token.replace('##', '').lower()
            
            if word in function_words:
                function_indices.append(i)
            elif any(re.match(pattern, word) for pattern in content_word_patterns):
                content_indices.append(i)
            elif len(word) > 3:  # Longer words are likely content words
                content_indices.append(i)
            else:
                function_indices.append(i)
        
        num_to_mask = int(len(tokens) * mask_rate)
        
        if prefer_content_words and content_indices:
            # Prioritize content words
            available_indices = content_indices + function_indices
            mask_indices = available_indices[:num_to_mask]
        else:
            # Random selection
            mask_indices = random.sample(range(len(tokens)), min(num_to_mask, len(tokens)))
        
        for idx in mask_indices:
            masked_tokens[idx] = self.mask_token
            
        return self.tokenizer.convert_tokens_to_string(masked_tokens)


In [3]:
class T5MaskingStrategies:
    def __init__(self):
        self.tokenizer = T5Tokenizer.from_pretrained("t5-small")
        self.model = T5ForConditionalGeneration.from_pretrained("t5-small")
    
    def span_masking(self, text, mask_rate=0.15, avg_span_length=3):
        tokens = text.split()
        masked_text = ""
        targets = ""
        
        i = 0
        mask_id = 0
        
        while i < len(tokens):
            if random.random() < mask_rate:
                # Start a masked span
                span_length = np.random.poisson(avg_span_length) + 1
                span_length = min(span_length, len(tokens) - i)
                
                # Add mask token to input
                mask_token = f"<extra_id_{mask_id}>"
                masked_text += mask_token + " "
                
                # Add original tokens to target
                targets += mask_token + " "
                for j in range(span_length):
                    if i + j < len(tokens):
                        targets += tokens[i + j] + " "
                
                i += span_length
                mask_id += 1
            else:
                masked_text += tokens[i] + " "
                i += 1
        
        targets += f"<extra_id_{mask_id}>"
        
        return masked_text.strip(), targets.strip()

In [4]:
def demonstrate_masking_strategies():
    
    # Sample texts
    texts = [
        "The quick brown fox jumps over the lazy dog in the beautiful garden.",
        "Machine learning algorithms require substantial computational resources for training.",
        "Climate change affects global weather patterns and ocean temperatures significantly.",
        "Artificial intelligence revolutionizes healthcare through predictive analytics and automation."
    ]
    
    print("=" * 80)
    print("MASKING STRATEGIES DEMONSTRATION")
    print("=" * 80)
    
    # Initialize masking strategies
    mlm_masker = MaskingStrategies("bert-base-uncased")
    t5_masker = T5MaskingStrategies()
    
    # Calculate word frequencies for intelligent masking
    word_frequencies = mlm_masker.get_word_frequencies(texts)
    
    for i, text in enumerate(texts):
        print(f"\n--- EXAMPLE {i+1} ---")
        print(f"Original: {text}")
        print()
        
        # Test different masking rates
        mask_rates = [0.15, 0.30, 0.50]
        
        for rate in mask_rates:
            print(f"MASKING RATE: {rate:.0%}")
            print("-" * 40)
            
            # 1. MLM - Random Masking
            random_masked = mlm_masker.random_masking(text, mask_rate=rate)
            print(f"MLM Random:     {random_masked}")
            
            # 2. MLM - Frequency-based (rare words)
            freq_rare_masked = mlm_masker.frequency_based_masking(
                text, word_frequencies, mask_rate=rate, mask_rare=True
            )
            print(f"MLM Rare Words: {freq_rare_masked}")
            
            # 3. MLM - POS-based (content words)
            pos_masked = mlm_masker.pos_based_masking(text, mask_rate=rate)
            print(f"MLM Content:    {pos_masked}")
            
            # 4. PLM/T5 - Span masking
            if rate <= 0.30:  # T5 works better with lower masking rates
                span_input, span_target = t5_masker.span_masking(text, mask_rate=rate)
                print(f"PLM Input:      {span_input}")
                print(f"PLM Target:     {span_target}")
            
            print()
    
    # Demonstrate advanced masking heuristics
    print("\n" + "=" * 80)
    print("ADVANCED MASKING HEURISTICS")
    print("=" * 80)
    
    sample_text = "The artificial intelligence system processes natural language with remarkable accuracy."
    
    print(f"Original: {sample_text}")
    print()
    
    # Different intelligent masking strategies
    strategies = [
        ("Random", lambda: mlm_masker.random_masking(sample_text, 0.25)),
        ("Rare Words", lambda: mlm_masker.frequency_based_masking(
            sample_text, word_frequencies, 0.25, mask_rare=True)),
        ("Common Words", lambda: mlm_masker.frequency_based_masking(
            sample_text, word_frequencies, 0.25, mask_rare=False)),
        ("Content Words", lambda: mlm_masker.pos_based_masking(
            sample_text, 0.25, prefer_content_words=True)),
        ("Function Words", lambda: mlm_masker.pos_based_masking(
            sample_text, 0.25, prefer_content_words=False)),
    ]
    
    for name, strategy in strategies:
        result = strategy()
        print(f"{name:15}: {result}")

In [5]:
def training_example():
    print("\n" + "=" * 80)
    print("TRAINING EXAMPLE")
    print("=" * 80)
    
    masker = MaskingStrategies("bert-base-uncased")
    
    # Sample training data
    training_text = "Machine learning models learn patterns from large datasets."
    
    # Create masked input
    masked_text = masker.random_masking(training_text, mask_rate=0.15)
    
    # Tokenize for model input
    inputs = masker.tokenizer(masked_text, return_tensors="pt", padding=True)
    labels = masker.tokenizer(training_text, return_tensors="pt", padding=True)
    
    print(f"Original text: {training_text}")
    print(f"Masked text:   {masked_text}")
    print(f"Input IDs:     {inputs['input_ids']}")
    print(f"Labels:        {labels['input_ids']}")
    
    # For T5 (PLM example)
    print("\nT5 (PLM) Example:")
    t5_masker = T5MaskingStrategies()
    span_input, span_target = t5_masker.span_masking(training_text, mask_rate=0.20)
    
    input_ids = t5_masker.tokenizer(span_input, return_tensors="pt").input_ids
    target_ids = t5_masker.tokenizer(span_target, return_tensors="pt").input_ids
    
    print(f"T5 Input:      {span_input}")
    print(f"T5 Target:     {span_target}")
    print(f"Input IDs:     {input_ids}")
    print(f"Target IDs:    {target_ids}")

In [6]:
if __name__ == "__main__":
    # Set random seed for reproducibility
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)
    
    demonstrate_masking_strategies()
    training_example()

MASKING STRATEGIES DEMONSTRATION


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


--- EXAMPLE 1 ---
Original: The quick brown fox jumps over the lazy dog in the beautiful garden.

MASKING RATE: 15%
----------------------------------------
MLM Random:     the [MASK] brown fox jumps over the lazy dog in [MASK] beautiful garden.
MLM Rare Words: the [MASK] [MASK] fox jumps over the lazy dog in the beautiful garden.
MLM Content:    the [MASK] [MASK] fox jumps over the lazy dog in the beautiful garden.
PLM Input:      <extra_id_0> over the lazy dog in <extra_id_1> garden.
PLM Target:     <extra_id_0> The quick brown fox jumps <extra_id_1> the beautiful <extra_id_2>

MASKING RATE: 30%
----------------------------------------
MLM Random:     [MASK] [MASK] brown [MASK] jumps over the lazy dog in the [MASK] garden.
MLM Rare Words: the [MASK] [MASK] [MASK] [MASK] over the lazy dog in the beautiful garden.
MLM Content:    the [MASK] [MASK] fox [MASK] [MASK] the lazy dog in the beautiful garden.
PLM Input:      The <extra_id_0> <extra_id_1> in the <extra_id_2>
PLM Target:     <

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Original text: Machine learning models learn patterns from large datasets.
Masked text:   machine learning models learn patterns from [MASK] datasets.
Input IDs:     tensor([[  101,  3698,  4083,  4275,  4553,  7060,  2013,   103,  2951, 13462,
          2015,  1012,   102]])
Labels:        tensor([[  101,  3698,  4083,  4275,  4553,  7060,  2013,  2312,  2951, 13462,
          2015,  1012,   102]])

T5 (PLM) Example:
T5 Input:      Machine learning models learn patterns <extra_id_0>
T5 Target:     <extra_id_0> from large datasets. <extra_id_1>
Input IDs:     tensor([[ 5879,  1036,  2250,   669,  4264, 32099,     1]])
Target IDs:    tensor([[32099,    45,   508, 17953,     7,     5, 32098,     1]])
