In [18]:
import math 
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import tiktoken
import numpy as np
from transformers import RobertaModel, RobertaTokenizer
from typing import List, Optional, Tuple, Union
from transformers import AutoTokenizer, RobertaForMaskedLM, RobertaForCausalLM



if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
print(f"using device: {device}")

# No token_type_ids, just separate sequences with tokenizer.sep_token
# So I guess RobertaTokenizer automatically adds <s> and </s> tokens to input
# cls token is aparently <s> 

using device: mps


In [20]:
class RobertaEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        # self.register_buffer(
        #     "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        # )
    
        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(self, input_ids):

        def create_position_ids_from_input_ids(input_ids, padding_idx):
        
            # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
            mask = input_ids.ne(padding_idx).int()
            incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
            return incremental_indices.long() + padding_idx
    

        position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx)
            
        embeddings = self.word_embeddings(input_ids)
                
        position_embeddings = self.position_embeddings(position_ids)
        
        embeddings += position_embeddings
        
        embeddings = self.LayerNorm(embeddings)
        
        return embeddings

class RobertaSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
       
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)


    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward( self, hidden_states, attention_mask):
        mixed_query_layer = self.query(hidden_states)
    
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        
        if attention_mask is not None:
            # Apply the attention mask is (precomputed for all layers in RobertaModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs =  context_layer

        return outputs


# Copied from transformers.models.bert.modeling_bert.BertSdpaSelfAttention with Bert->Roberta
class RobertaSdpaSelfAttention(RobertaSelfAttention):
    def __init__(self, config):
        super().__init__(config)

    def forward( self, hidden_states, attention_mask = None):
        
        bsz, tgt_len, _ = hidden_states.size()

        query_layer = self.transpose_for_scores(self.query(hidden_states))
        
        current_states = hidden_states
        attention_mask = attention_mask

        key_layer = self.transpose_for_scores(self.key(current_states))
        value_layer = self.transpose_for_scores(self.value(current_states))

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_layer,
            key_layer,
            value_layer,
            attn_mask=attention_mask,
            dropout_p=0.0,
            is_causal=False,
        )

        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.all_head_size)

        outputs = attn_output

        return outputs

class RobertaSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states

class RobertaAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.self = RobertaSdpaSelfAttention(config)
        self.output = RobertaSelfOutput(config)

    def forward(self, hidden_states,attention_mask = None):
        
        self_outputs = self.self( hidden_states, attention_mask)
        attention_output = self.output(self_outputs, hidden_states)
        
        return attention_output

class RobertaIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        self.intermediate_act_fn = nn.GELU()

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states

class RobertaOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
        
class RobertaLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        
        self.attention = RobertaAttention(config)
        self.intermediate = RobertaIntermediate(config)
        self.output = RobertaOutput(config)

    def forward( self, hidden_states, attention_mask = None):
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attention_outputs = self.attention( hidden_states, attention_mask)
        attention_output = self_attention_outputs

        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        
        return layer_output


class RobertaEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([RobertaLayer(config) for _ in range(config.num_hidden_layers)])

    def forward( self, hidden_states, attention_mask = None):
        
        for i, layer_module in enumerate(self.layer):
          
            layer_outputs = layer_module(
                hidden_states,
                attention_mask
            )

            hidden_states = layer_outputs

        return hidden_states


class RobertaModel(nn.Module):


    def __init__(self, config):
        super().__init__()
        self.config = config

        self.embeddings = RobertaEmbeddings(config)
        self.encoder = RobertaEncoder(config)

    def forward(self, input_ids, attention_mask = None):
   
        input_shape = input_ids.size()
        batch_size, seq_length = input_shape
        
        device = input_ids.device
    
        embedding_output = self.embeddings(input_ids=input_ids)

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length), device=device)

        use_sdpa_attention_masks = True

        extended_attention_mask = attention_mask
        # # Expand the attention mask
        # if use_sdpa_attention_masks and attention_mask.dim() == 2:
        #     # Expand the attention mask for SDPA.
        #     # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
        #     extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
        #         attention_mask, embedding_output.dtype, tgt_len=seq_length
        #     )
        # else:
        #     # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        #     # ourselves in which case we just need to make it broadcastable to all heads.
        #     extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        encoder_outputs = self.encoder( embedding_output, attention_mask=extended_attention_mask)
        
        return encoder_outputs

class RobertaLMHead(nn.Module):
    """Roberta Head for masked language modeling."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        
        self.gelu = nn.GELU()
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size)

    def forward(self, features):
        x = self.dense(features)
        x = self.gelu(x)
        x = self.layer_norm(x)

        # project back to size of vocabulary with bias
        x = self.decoder(x)

        return x

class RobertaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.out_proj(x)
        return x


In [23]:
class RobertaMaskedLM(nn.Module):

    def __init__(self, config):
        super().__init__()

        self.roberta = RobertaModel(config)
        self.lm_head = RobertaLMHead(config)
        self.config = config

        # weight tying between input embedding and prediction head "de-embedding"
        self.lm_head.decoder.weight = self.roberta.embeddings.word_embeddings.weight 


    def forward( self, input_ids, attention_mask = None, labels = None):

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
        )
        sequence_output = outputs
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(prediction_scores.device)
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

       
        output = prediction_scores
        return output
        # return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
    
    @classmethod
    def from_pretrained(cls, model_type = "FacebookAI/roberta-base"):
        """ Loading pretrained Roberta weights from hugging face """
        # print("loading weights for %s" % model_type)

        # Random init of model
        config = RobertaConfig()
        model = RobertaMaskedLM(config)
        
        sd = model.state_dict()
        sd_keys = sd.keys()

        # Init a Roberta from hugging face 
        model_hf = RobertaForMaskedLM.from_pretrained("FacebookAI/roberta-base")
        sd_hf = model_hf.state_dict()
        sd_hf_keys = [k for k in sd_hf.keys() if not k.endswith('lm_head.bias')]
        # Copy over weights. State Dicts are currently in same order, so I can just blind copy 
        for keys in zip(sd_keys, sd_hf_keys):
            # print(sd[keys[0]].shape)
            # print(sd_hf[keys[1]].shape)
            
            assert(sd[keys[0]].shape == sd_hf[keys[1]].shape)
            assert(keys[0] == keys[1])
            
            with torch.no_grad():
                sd[keys[0]].copy_(sd_hf[keys[1]])

        return model

@dataclass
class RobertaConfig:
    vocab_size = 50265
    hidden_size = 768 
    num_hidden_layers = 12
    num_attention_heads = 12
    intermediate_size = 3072
    max_position_embeddings = 514
    layer_norm_eps = 1e-12
    
    type_vocab_size = 1
    pad_token_id = 1
    bos_token_id = 0
    eos_token_id = 2

In [24]:
model = RobertaMaskedLM.from_pretrained().to(device)
model.eval()
tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")

# x = "Criminals are most likely to have <mask> colored skin"
x = "The man worked as a <mask>."
x = torch.tensor(tokenizer.encode(x)).unsqueeze(0).to(device)

with torch.no_grad():
    logits = model(x)

mask_token_index = (x == tokenizer.mask_token_id)[0].nonzero(as_tuple = True)[0]

predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)

probs = F.softmax(logits[0, mask_token_index], dim=1)

topk = torch.topk(probs, 5)

for i in range(topk.indices.shape[1]):
    print(tokenizer.decode(topk.indices.squeeze()[i].item()), round(topk.values.squeeze()[i].item() * 100, 2))

 mechanic 9.1
 waiter 7.72
 butcher 7.64
 miner 4.75
 guard 4.22


In [12]:
(lm.state_dict()['roberta.embeddings.word_embeddings.weight'] == lm.state_dict()['lm_head.decoder.weight']).all()
print(lm.state_dict()['roberta.embeddings.word_embeddings.weight'].shape)
print(lm.state_dict()['lm_head.decoder.weight'].shape)


torch.Size([50265, 768])
torch.Size([50265, 768])


In [7]:
lm = RobertaForMaskedLM.from_pretrained("FacebookAI/roberta-base")
keys = lm.state_dict().keys()
print(lm)
for key in keys:
    print(key)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNor

In [14]:
config = RobertaConfig()
model = RobertaMaskedLM(config)
print(model)

RobertaMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            )
          )
          (intermedia

In [15]:
model = RobertaForMaskedLM.from_pretrained("FacebookAI/roberta-base")
print(model)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNor

## Roberta Model

In [359]:
class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()

        self.intermediate = nn.Linear(config.hidden_size, config.intermediate_size)
        self.gelu = nn.GELU()
        self.output = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, x):
        x = self.intermediate(x)
        x = self.gelu(x)
        x = self.output(x)
        return x 

class FullSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.query = nn.Linear(config.hidden_size, config.hidden_size)
        self.key = nn.Linear(config.hidden_size, config.hidden_size)
        self.value = nn.Linear(config.hidden_size, config.hidden_size)
        
        self.output = nn.Linear(config.hidden_size, config.hidden_size)
        
        self.num_attention_heads = config.num_attention_heads

        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)

        
        
    def forward(self, x, mask):
        B, T, C = x.size()
        # (batch_size, max_len, hidden_size) 
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)

        q = q.view(B, T, self.num_attention_heads, C // self.num_attention_heads).transpose(1,2) # (B, num_head, T, head_size)
        k = k.view(B, T, self.num_attention_heads, C // self.num_attention_heads).transpose(1,2)
        v = v.view(B, T, self.num_attention_heads, C // self.num_attention_heads).transpose(1,2)

        # Calculate attention scores 
        attn = (q @ k.transpose(2, 3)) * (1.0 / math.sqrt(k.size(-1)))
        attn = attn.masked_fill(mask == 0, float('-inf'))
        attn = F.softmax(attn, dim = -1)

        y = attn @ v # (B, nh, T, T) x (B, nh, T, hs) = (B, nh, T, hs)
        y = y.transpose(1,2).contiguous().view(B,T,C) # Concat head outputs

        # Project
        y = self.output(y)        
        return y


class EncoderBlock(nn.Module):
    def __init__(self,config):
        super().__init__()
        # self.attention = FullSelfAttention(config)
        self.attention = RobertaSdpaSelfAttention(config)
        self.ln_1 = nn.LayerNorm(config.hidden_size)
        self.mlp = MLP(config)
        self.ln_2 = nn.LayerNorm(config.hidden_size)
        

    def forward(self, x, mask):
        x = x + self.ln_1(self.attention(x, mask)) 
        x = x + self.ln_2(self.mlp(x))
        return x

@dataclass
class RobertaConfig:
    vocab_size = 50265
    hidden_size = 768 
    num_hidden_layers = 12
    num_attention_heads = 12
    intermediate_size = 3072
    max_position_embeddings = 512
    
    type_vocab_size = 1
    pad_token_id = 1
    bos_token_id = 0
    eos_token_id = 2
    


class Roberta(nn.Module):

    def __init__(self, config):
        super().__init__()

        self.config = config
        self.embeddings = nn.ModuleDict(dict(
            word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx = config.pad_token_id),
            position_embeddings = nn.Embedding(config.max_position_embeddings + 2, config.hidden_size, padding_idx = config.pad_token_id),
            token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size),
            LayerNorm = nn.LayerNorm(config.hidden_size)
        ))
        
        self.encoder = nn.ModuleDict(dict(
            layer = nn.ModuleList([EncoderBlock(config) for _ in range(config.num_hidden_layers)])
        ))
        
        self.lm_head = nn.ModuleDict(dict(
            dense = nn.Linear(config.hidden_size, config.hidden_size),
            gelu = nn.GELU(),
            LayerNorm = nn.LayerNorm(config.hidden_size),
            decoder = nn.Linear(config.hidden_size, config.vocab_size),
        ))

    def forward(self, x):
        B, T = x.shape
        # attention masking for padded token
        # (batch_size, 1, seq_len, seq_len)
        mask = (x >= 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1).to(x.device)

        # Token embeddings
        tok_emb = self.embeddings.word_embeddings(x)

        # Positional embeddings 
        pos_mask = x.ne(self.config.pad_token_id).int()
        indices = (((torch.cumsum(pos_mask,dim=1)).type_as(pos_mask))* pos_mask) + self.config.pad_token_id
        pos_emb = self.embeddings.position_embeddings(indices)

        # pos = torch.arange(0, T, dtype = torch.long, device = x.device)
        # pos_emb = self.embeddings.position_embeddings(pos)
        

        
        # Token Type embeddings
        # typ = torch.zeros((1,T), dtype = torch.long, device = x.device)
        # type_emb = self.embeddings.token_type_embeddings(typ)
        

        x = tok_emb + pos_emb

        x = self.embeddings.LayerNorm(x)
        
        # Pass batch through transformer 
        for block in self.encoder.layer:
            x = block(x, mask)

        # Pass through prediction head
        x = self.lm_head.dense(x)
        x = self.lm_head.gelu(x)
        x = self.lm_head.LayerNorm(x)
        x = self.lm_head.decoder(x)

        return x
            


    @classmethod
    def from_pretrained(cls, model_type = "FacebookAI/roberta-base"):
        """ Loading pretrained Roberta weights from hugging face """
        print("loading weights for %s" % model_type)

        # Random init of model
        config = RobertaConfig()
        model = Roberta(config)
        
        sd = model.state_dict()
        sd_keys = sd.keys()

        # Init a Roberta from hugging face 
        model_hf = RobertaForMaskedLM.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()
        sd_hf_keys = [k for k in sd_hf.keys() if not k.endswith('lm_head.bias')]
        # Copy over weights. State Dicts are currently in same order, so I can just blind copy 
        for keys in zip(sd_keys, sd_hf_keys):
            # print(sd[keys[0]].shape)
            # print(sd_hf[keys[1]].shape)
            
            assert(sd[keys[0]].shape == sd_hf[keys[1]].shape)
            
            with torch.no_grad():
                sd[keys[0]].copy_(sd_hf[keys[1]])

        return model    


In [394]:
model = RobertaMaskedLM.from_pretrained().to(device)
tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")

x = "The capital of <mask> is Paris"
x = torch.tensor(tokenizer.encode(x)).unsqueeze(0).to(device)
print(x)
with torch.no_grad():
    logits = model(x)

mask_token_index = (x == tokenizer.mask_token_id)[0].nonzero(as_tuple = True)[0]

predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
print(tokenizer.decode(predicted_token_id))

loading weights for FacebookAI/roberta-base
loading weights for FacebookAI/roberta-base
loading weights for FacebookAI/roberta-base
loading weights for FacebookAI/roberta-base
loading weights for FacebookAI/roberta-base
loading weights for FacebookAI/roberta-base
loading weights for FacebookAI/roberta-base
loading weights for FacebookAI/roberta-base
loading weights for FacebookAI/roberta-base
loading weights for FacebookAI/roberta-base
loading weights for FacebookAI/roberta-base


KeyboardInterrupt: 

In [361]:
o = logits[0].argmax(axis=-1)
print(tokenizer.decode(o))
print(logits[0, 6])

,,,,,,,,
tensor([ 0.6646, -5.2787,  9.3434,  ..., -3.4416, -4.1338,  4.4339],
       device='mps:0')


In [346]:
from transformers import RobertaLMHeadModel

ImportError: cannot import name 'RobertaLMHeadModel' from 'transformers' (/opt/anaconda3/envs/pytorch-gpu/lib/python3.13/site-packages/transformers/__init__.py)

In [249]:
from transformers import AutoTokenizer, RobertaForMaskedLM, RobertaForCasualLM
import torch

tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")
model = RobertaForMaskedLM.from_pretrained("FacebookAI/roberta-base")

inputs = tokenizer("The capital of France is <mask>.<pad>", return_tensors="pt")
print(inputs)
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.shape)
# retrieve index of <mask>
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
print(tokenizer.decode(predicted_token_id))

labels = tokenizer("The capital of France is Paris.<pad>", return_tensors="pt")["input_ids"]
# mask labels of non-<mask> tokens
labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)

outputs = model(**inputs, labels=labels)
print(outputs)
round(outputs.loss.item(), 2)

{'input_ids': tensor([[    0,   133,   812,     9,  1470,    16, 50264,     4,     1,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
torch.Size([1, 10, 50265])
 Paris
MaskedLMOutput(loss=tensor(0.0990, grad_fn=<NllLossBackward0>), logits=tensor([[[34.2751, -3.7783, 18.3324,  ...,  2.7908,  5.3139, 11.8587],
         [ 8.6809, -2.8665, 18.9842,  ...,  2.8312,  4.0936,  9.4246],
         [-3.3497, -4.3248,  8.5588,  ..., -1.9859, -2.6968,  0.3392],
         ...,
         [21.2609, -4.2944, 19.6318,  ...,  0.9557,  3.3131,  8.0253],
         [10.4854, -4.2014, 28.6527,  ..., -1.6521, -3.9379,  8.8991],
         [11.1121, -3.5715, 31.1623,  ...,  1.5217, -0.4953,  9.6180]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)


0.1

In [248]:
model = Roberta.from_pretrained().to(device)
tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")

x = "The capital of France is <mask>.<pad>"
x = torch.tensor(tokenizer.encode(x)).unsqueeze(0).to(device)

logits = model.forward(x)
print(logits.shape)




loading weights for FacebookAI/roberta-base


Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([1, 10, 768])


In [238]:
logits[0,6].argmax(axis=-1)
print(tokenizer.decoder(predicted_token_id

tensor(2201)

In [207]:
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    """
    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx

In [205]:
x = torch.tensor([[0,2,3,4,1,1,1],[0,5,2,4,9,1,1]])
mask = x.ne(4).int()

ind = ((torch.cumsum(mask,dim=1)).type_as(mask))* mask 
print(ind)

tensor([[1, 2, 3, 0, 4, 5, 6],
        [1, 2, 3, 0, 4, 5, 6]], dtype=torch.int32)


In [384]:
model = RobertaForMaskedLM.from_pretrained()
print(model)

# x = torch.tensor([[1, 2, 3, 4, 5], [5,6,7,0, 0], [9,9,0,0,0]])
# print(model.forward(x))


loading weights for FacebookAI/roberta-base


AttributeError: 'RobertaConfig' object has no attribute 'layer_norm_eps'

In [104]:
tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")
model_hf = RobertaModel.from_pretrained("FacebookAI/roberta-base")

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [196]:
inputs = tokenizer('The capital of France is <mask>.<pad><pad>')
print(inputs)

{'input_ids': [0, 133, 812, 9, 1470, 16, 50264, 4, 1, 1, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [363]:
# model_hf = RobertaForMaskedLM.from_pretrained("FacebookAI/roberta-base")
from transformers import AutoTokenizer, RobertaForMaskedLM, RobertaForCausalLM, RobertaForSequenceClassification, RobertaForQuestionAnswering

model_hf = RobertaForMaskedLM.from_pretrained("FacebookAI/roberta-base")

print(model_hf)


RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNor

In [255]:
for k in model_hf.state_dict().keys():
    print(k)

roberta.embeddings.word_embeddings.weight
roberta.embeddings.position_embeddings.weight
roberta.embeddings.token_type_embeddings.weight
roberta.embeddings.LayerNorm.weight
roberta.embeddings.LayerNorm.bias
roberta.encoder.layer.0.attention.self.query.weight
roberta.encoder.layer.0.attention.self.query.bias
roberta.encoder.layer.0.attention.self.key.weight
roberta.encoder.layer.0.attention.self.key.bias
roberta.encoder.layer.0.attention.self.value.weight
roberta.encoder.layer.0.attention.self.value.bias
roberta.encoder.layer.0.attention.output.dense.weight
roberta.encoder.layer.0.attention.output.dense.bias
roberta.encoder.layer.0.attention.output.LayerNorm.weight
roberta.encoder.layer.0.attention.output.LayerNorm.bias
roberta.encoder.layer.0.intermediate.dense.weight
roberta.encoder.layer.0.intermediate.dense.bias
roberta.encoder.layer.0.output.dense.weight
roberta.encoder.layer.0.output.dense.bias
roberta.encoder.layer.0.output.LayerNorm.weight
roberta.encoder.layer.0.output.LayerNorm

Parameter containing:
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 

In [8]:
for key in model_hf.state_dict().keys():
    print(key)

embeddings.word_embeddings.weight
embeddings.position_embeddings.weight
embeddings.token_type_embeddings.weight
embeddings.LayerNorm.weight
embeddings.LayerNorm.bias
encoder.layer.0.attention.self.query.weight
encoder.layer.0.attention.self.query.bias
encoder.layer.0.attention.self.key.weight
encoder.layer.0.attention.self.key.bias
encoder.layer.0.attention.self.value.weight
encoder.layer.0.attention.self.value.bias
encoder.layer.0.attention.output.dense.weight
encoder.layer.0.attention.output.dense.bias
encoder.layer.0.attention.output.LayerNorm.weight
encoder.layer.0.attention.output.LayerNorm.bias
encoder.layer.0.intermediate.dense.weight
encoder.layer.0.intermediate.dense.bias
encoder.layer.0.output.dense.weight
encoder.layer.0.output.dense.bias
encoder.layer.0.output.LayerNorm.weight
encoder.layer.0.output.LayerNorm.bias
encoder.layer.1.attention.self.query.weight
encoder.layer.1.attention.self.query.bias
encoder.layer.1.attention.self.key.weight
encoder.layer.1.attention.self.key

In [36]:
x = torch.tensor([[1, 2, 3, 4, 5], [5,6,7,0, 0], [9,9,0,0,0]])
# Create a mask where padding tokens are False 
# Create an extra dimension for each sequence
# Repeat to mimic attention mask
mask = (x > 0).unsqueeze(1).repeat(1, x.size(1),1)
print(mask)
print(mask.shape)

tensor([[[ True,  True,  True,  True,  True],
         [ True,  True,  True,  True,  True],
         [ True,  True,  True,  True,  True],
         [ True,  True,  True,  True,  True],
         [ True,  True,  True,  True,  True]],

        [[ True,  True,  True, False, False],
         [ True,  True,  True, False, False],
         [ True,  True,  True, False, False],
         [ True,  True,  True, False, False],
         [ True,  True,  True, False, False]],

        [[ True,  True, False, False, False],
         [ True,  True, False, False, False],
         [ True,  True, False, False, False],
         [ True,  True, False, False, False],
         [ True,  True, False, False, False]]])
torch.Size([3, 5, 5])


In [130]:
t = nn.Embedding(512, 768, padding_idx = 1)
print(t.weight)


Parameter containing:
tensor([[-0.5500, -0.4970, -2.2028,  ...,  0.8643,  1.3536, -1.7909],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4929,  1.2868, -1.4918,  ...,  0.8632, -1.5980,  0.0521],
        ...,
        [ 0.9052, -1.7088,  1.0249,  ...,  0.9630, -1.0406, -1.2196],
        [-1.9594, -2.2697,  1.3669,  ...,  0.2475, -0.6283,  0.3957],
        [-2.5360, -0.2193,  0.6976,  ...,  0.5029,  0.2769,  1.3981]],
       requires_grad=True)


In [73]:
for key in model.state_dict().keys():
    print(key)

embeddings.word_embeddings.weight
embeddings.position_embeddings.weight
embeddings.token_type_embeddings.weight
embeddings.LayerNorm.weight
embeddings.LayerNorm.bias
encoder.layer.0.attention.query.weight
encoder.layer.0.attention.query.bias
encoder.layer.0.attention.key.weight
encoder.layer.0.attention.key.bias
encoder.layer.0.attention.value.weight
encoder.layer.0.attention.value.bias
encoder.layer.0.attention.output.weight
encoder.layer.0.attention.output.bias
encoder.layer.0.ln_1.weight
encoder.layer.0.ln_1.bias
encoder.layer.0.mlp.intermediate.weight
encoder.layer.0.mlp.intermediate.bias
encoder.layer.0.mlp.output.weight
encoder.layer.0.mlp.output.bias
encoder.layer.0.ln_2.weight
encoder.layer.0.ln_2.bias
encoder.layer.1.attention.query.weight
encoder.layer.1.attention.query.bias
encoder.layer.1.attention.key.weight
encoder.layer.1.attention.key.bias
encoder.layer.1.attention.value.weight
encoder.layer.1.attention.value.bias
encoder.layer.1.attention.output.weight
encoder.layer.1.