In [8]:

from transformers import AutoModelForMaskedLM

roberta = AutoModelForMaskedLM.from_pretrained("roberta-large")

print(roberta)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (

In [2]:

from typing import Any
from transformers import AutoModelForMaskedLM

roberta = AutoModelForMaskedLM.from_pretrained("roberta-large")

def visualize_children(
    object : Any,
    level : int = 0,
) -> None:
    """
    Prints the children of (object) and their children too, if there are any.
    Uses the current depth (level) to print things in a ordonnate manner.
    """
    print(f"{'   ' * level}{level}- {type(object).__name__}")
    try:
        for child in object.children():
            visualize_children(child, level + 1)
    except:
        pass

visualize_children(roberta)

0- RobertaForMaskedLM
   1- RobertaModel
      2- RobertaEmbeddings
         3- Embedding
         3- Embedding
         3- Embedding
         3- LayerNorm
         3- Dropout
      2- RobertaEncoder
         3- ModuleList
            4- RobertaLayer
               5- RobertaAttention
                  6- RobertaSelfAttention
                     7- Linear
                     7- Linear
                     7- Linear
                     7- Dropout
                  6- RobertaSelfOutput
                     7- Linear
                     7- LayerNorm
                     7- Dropout
               5- RobertaIntermediate
                  6- Linear
                  6- GELUActivation
               5- RobertaOutput
                  6- Linear
                  6- LayerNorm
                  6- Dropout
            4- RobertaLayer
               5- RobertaAttention
                  6- RobertaSelfAttention
                     7- Linear
                     7- Linear
                     7

In [13]:
total_params = sum(p.numel() for p in roberta.parameters())
param_size = 4

print(total_params /100000)
total_size = total_params * param_size / (1024 * 1024)
print(f'Model size in memory: {total_size:.2f} MB')

3554.12057
Model size in memory: 1355.79 MB


In [3]:

from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel, RobertaConfig

def distill_roberta(
    teacher_model : RobertaPreTrainedModel,
) -> RobertaPreTrainedModel:
    """
    Distilates a RoBERTa (teacher_model) like would DistilBERT for a BERT model.
    The student model has the same configuration, except for the number of hidden layers, which is // by 2.
    The student layers are initilized by copying one out of two layers of the teacher, starting with layer 0.
    The head of the teacher is also copied.
    """
    # Get teacher configuration as a dictionnary
    configuration = teacher_model.config.to_dict()
    # Half the number of hidden layer
    configuration['num_hidden_layers'] //= 2
    # Convert the dictionnary to the student configuration
    configuration = RobertaConfig.from_dict(configuration)
    # Create uninitialized student model
    student_model = type(teacher_model)(configuration)
    # Initialize the student's weights
    distill_roberta_weights(teacher=teacher_model, student=student_model)
    # Return the student model
    return student_model

In [22]:
roberta.state_dict()

OrderedDict([('roberta.embeddings.word_embeddings.weight',
              tensor([[-0.1406, -0.0096,  0.0391,  ...,  0.0508, -0.0059, -0.0360],
                      [ 0.0078, -0.0156,  0.0156,  ..., -0.0156,  0.0231,  0.0156],
                      [-0.0828, -0.0007, -0.1174,  ...,  0.1086,  0.0696, -0.0356],
                      ...,
                      [ 0.0393,  0.0031,  0.0465,  ..., -0.0240, -0.0505,  0.0342],
                      [ 0.0499,  0.0272,  0.0413,  ..., -0.0370, -0.0100,  0.0071],
                      [-0.0149, -0.0114, -0.0222,  ...,  0.0441,  0.0116, -0.0330]])),
             ('roberta.embeddings.position_embeddings.weight',
              tensor([[-0.0038,  0.0253, -0.0092,  ...,  0.0177,  0.0062, -0.0162],
                      [ 0.0117, -0.0019, -0.0267,  ...,  0.0062, -0.0193,  0.0264],
                      [ 0.0316,  0.0148, -0.0549,  ..., -0.0717, -0.0460,  0.0468],
                      ...,
                      [-0.0209, -0.0052,  0.0484,  ..., -0.0394, 

In [18]:
type(roberta)

transformers.models.roberta.modeling_roberta.RobertaForMaskedLM

In [4]:
from transformers.models.roberta.modeling_roberta import RobertaEncoder, RobertaModel
from torch.nn import Module

def distill_roberta_weights(
    teacher : Module,
    student : Module,
) -> None:
    """
    Recursively copies the weights of the (teacher) to the (student).
    This function is meant to be first called on a RobertaFor... model, but is then called on every children of that model recursively.
    The only part that's not fully copied is the encoder, of which only half is copied.
    """
    # If the part is an entire RoBERTa model or a RobertaFor..., unpack and iterate
    if isinstance(teacher, RobertaModel) or type(teacher).__name__.startswith('RobertaFor'):
        for teacher_part, student_part in zip(teacher.children(), student.children()):
            distill_roberta_weights(teacher_part, student_part)
    # Else if the part is an encoder, copy one out of every layer
    elif isinstance(teacher, RobertaEncoder):
            teacher_encoding_layers = [layer for layer in next(teacher.children())]
            student_encoding_layers = [layer for layer in next(student.children())]
            for i in range(len(student_encoding_layers)):
                student_encoding_layers[i].load_state_dict(teacher_encoding_layers[2*i].state_dict())
    # Else the part is a head or something else, copy the state_dict
    else:
        student.load_state_dict(teacher.state_dict())

In [9]:
student_model = distill_roberta(roberta)

In [7]:
total_params = sum(p.numel() for p in student.parameters())
param_size = 4

print(total_params /100000)
total_size = total_params * param_size / (1024 * 1024)
print(f'Model size in memory: {total_size:.2f} MB')

2042.57369
Model size in memory: 779.18 MB


In [13]:
import torch
from transformers import AutoTokenizer
from transformers.models.roberta.modeling_roberta import RobertaForMaskedLM

# Assuming you have already defined the distill_roberta function and the distill_roberta_weights function

# Load the teacher model
teacher_model = RobertaForMaskedLM.from_pretrained("roberta-large")

# Create the student model
student_model = distill_roberta(teacher_model)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-large")

# Prepare the input text with a masked token
input_text = "The capital of France is <mask>."
input_ids = tokenizer.encode(input_text, return_tensors="pt")  # Convert to tensor

# Print the tokenized input IDs and their corresponding tokens
print("Input IDs:", input_ids)
print("Tokens:", tokenizer.convert_ids_to_tokens(input_ids[0]))

# Run the student model
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = student_model(input_ids)
    logits = outputs.logits  # Get the logits (predictions)

# Decode the predictions
# Get the index of the masked token
masked_indices = torch.where(input_ids == tokenizer.mask_token_id)[1]

# Check if any masked tokens were found
if masked_indices.size(0) == 0:
    print("No [MASK] token found in the input.")
else:
    masked_index = masked_indices.item()

    # Get the predicted token logits for the masked position
    predicted_logits = logits[0, masked_index]  # logits for the masked position

    # Get the top predicted token (word) for the masked position
    top_k = 5  # Get top 5 predictions
    top_k_indices = predicted_logits.topk(top_k).indices.tolist()
    predicted_tokens = tokenizer.convert_ids_to_tokens(top_k_indices)

    # Print the predictions
    print(f"Input Text: {input_text}")
    print("Top Predictions for [MASK]:")
    for token in predicted_tokens:
        print(token)


Input IDs: tensor([[    0,   133,   812,     9,  1470,    16, 50264,     4,     2]])
Tokens: ['<s>', 'The', 'Ġcapital', 'Ġof', 'ĠFrance', 'Ġis', '<mask>', '.', '</s>']
Input Text: The capital of France is <mask>.
Top Predictions for [MASK]:
Ġexemplary
Ġundeniable
ĠColumbus
ĠWaterloo
Ġnoteworthy


In [15]:
import torch
from transformers import AutoTokenizer, RobertaForMaskedLM

# Step 1: Load the teacher model
teacher_model = RobertaForMaskedLM.from_pretrained("roberta-large")

# Step 2: Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-large")

# Step 3: Prepare the input text with the correct mask token
input_text = "The capital of France is <mask>."
input_ids = tokenizer.encode(input_text, return_tensors="pt")  # Convert to tensor

# Print the tokenized input IDs and their corresponding tokens
print("Input IDs:", input_ids)
print("Tokens:", tokenizer.convert_ids_to_tokens(input_ids[0]))

# Step 4: Run the teacher model
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = teacher_model(input_ids)
    logits = outputs.logits  # Get the logits (predictions)

# Step 5: Decode the predictions
# Get the index of the masked token
masked_indices = torch.where(input_ids == tokenizer.mask_token_id)[1]

# Check if any masked tokens were found
if masked_indices.size(0) == 0:
    print("No <mask> token found in the input.")
else:
    masked_index = masked_indices.item()

    # Get the predicted token logits for the masked position
    predicted_logits = logits[0, masked_index]  # logits for the masked position

    # Get the top predicted token (word) for the masked position
    top_k = 5  # Get top 5 predictions
    top_k_indices = predicted_logits.topk(top_k).indices.tolist()
    predicted_tokens = tokenizer.convert_ids_to_tokens(top_k_indices)

    # Print the predictions
    print(f"Input Text: {input_text}")
    print("Top Predictions for <mask>:")
    for token in predicted_tokens:
        print(token)


Input IDs: tensor([[    0,   133,   812,     9,  1470,    16, 50264,     4,     2]])
Tokens: ['<s>', 'The', 'Ġcapital', 'Ġof', 'ĠFrance', 'Ġis', '<mask>', '.', '</s>']
Input Text: The capital of France is <mask>.
Top Predictions for <mask>:
ĠParis
ĠLyon
ĠFrance
ĠNice
ĠLondon


In [6]:
from transformers import AutoTokenizer

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-large")

# Get the tokenizer size (vocabulary size)
tokenizer_size = tokenizer.vocab_size

# Print the tokenizer size
print("Tokenizer Size:", tokenizer_size)


Tokenizer Size: 50265


In [4]:
import sys
sys.getsizeof(tokenizer)

48

In [7]:
tokenizer

RobertaTokenizerFast(name_or_path='roberta-large', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}

In [17]:
from transformers import AutoTokenizer

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-large")

# Example text
text = "The capital of France is <mask>."

# Tokenize the input text
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.encode(text, return_tensors='pt')  # Convert to tensor

# Print results
print("Tokens:", tokens)
print("Token IDs:", token_ids)


Tokens: ['The', 'Ġcapital', 'Ġof', 'ĠFrance', 'Ġis', ' <mask>', '.']
Token IDs: tensor([[    0,   133,   812,     9,  1470,    16, 50264,     4,     2]])


In [9]:
import torch.nn as nn

class SimpleModel(nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.layer1 = nn.Linear(10, 20)
        self.layer2 = nn.ReLU()
        self.layer3 = nn.Sequential(
            nn.Linear(20, 30),
            nn.ReLU()
        )
    
    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        return x

model = SimpleModel()
for child in model.children():
    print(child)


Linear(in_features=10, out_features=20, bias=True)
ReLU()
Sequential(
  (0): Linear(in_features=20, out_features=30, bias=True)
  (1): ReLU()
)


In [23]:
model.state_dict()

OrderedDict([('layer1.weight',
              tensor([[ 0.2710,  0.2512,  0.1177, -0.2401, -0.1383, -0.1342, -0.1413,  0.0147,
                       -0.2559,  0.2947],
                      [ 0.1203, -0.0462,  0.2431,  0.2607,  0.2085, -0.0728,  0.2010, -0.0268,
                        0.0496, -0.1994],
                      [-0.2354,  0.1857,  0.1835,  0.1257, -0.2048,  0.2469,  0.2793, -0.2880,
                        0.2274,  0.2749],
                      [ 0.0772, -0.1103,  0.0012, -0.2032,  0.2379, -0.1303,  0.2110, -0.2235,
                       -0.2647, -0.0288],
                      [ 0.0209,  0.1875,  0.2497, -0.1036, -0.0693,  0.2917,  0.1314, -0.2609,
                        0.0766, -0.2290],
                      [ 0.0724,  0.0066,  0.1293, -0.2194, -0.1749,  0.2121, -0.0355, -0.1464,
                       -0.0912, -0.0704],
                      [-0.2104, -0.1687, -0.2671,  0.2304, -0.1537, -0.2162, -0.2994, -0.0780,
                        0.1128,  0.0612],
          

In [11]:
next(model.children())

Linear(in_features=10, out_features=20, bias=True)

In [12]:
next(model.children())

Linear(in_features=10, out_features=20, bias=True)