In [1]:
from transformers import MarianMTModel, MarianTokenizer, VisionEncoderDecoderModel, TrOCRProcessor, AutoTokenizer
import torch
from transformers import AutoTokenizer

# Pick the compute device: CUDA if present, otherwise Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# RoBERTa tokenizer; later cells use it to encode prompts and decode token ids.
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")

# The TrOCR processor and encoder-decoder model come from the same checkpoint.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")
trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-handwritten")
trocr_model = trocr_model.to(device)

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.46.2"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 1024,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decod

In [78]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForCausalLM
import torch

# RoBERTa loaded with a causal-LM head and `is_decoder=True`, then moved to `device`.
roberta_model = AutoModelForCausalLM.from_pretrained(
    "FacebookAI/roberta-base",
    is_decoder=True,
)
roberta_model = roberta_model.to(device)

In [3]:
from PIL import Image

# Load the input image as RGB and turn it into the pixel tensor the TrOCR encoder consumes.
filepath = 'data/ss2.png'
image = Image.open(filepath)
image = image.convert("RGB")
pixel_values = processor(images=image, return_tensors="pt").pixel_values
pixel_values = pixel_values.to(device)

In [38]:
# create BOS token


tensor([[ 100,   64,  422,   24,  423,    4,  978, 1437]], device='mps:0')

In [88]:
# One greedy next-token step with RoBERTa used as a causal LM.

# Encode the prompt without special tokens and move the ids to `device`.
decoder_input_ids = tokenizer(
    "Capital of France is ",
    add_special_tokens=False,
    return_tensors="pt",
).input_ids.to(device)

lm_logits = roberta_model(input_ids=decoder_input_ids, return_dict=True).logits

# Greedy choice: highest-scoring token at the last position (shape kept as [batch, 1]).
next_decoder_input_ids = torch.argmax(lm_logits[:, -1:], dim=-1)

# Append the chosen token to the running sequence.
decoder_input_ids = torch.cat((decoder_input_ids, next_decoder_input_ids), dim=-1)
tokenizer.decode(decoder_input_ids[0], skip_special_tokens=True), decoder_input_ids

('Capital of France is ',
 tensor([[38632,     9,  1470,    16,  1437,     2]], device='mps:0'))

In [21]:
# Sanity check: the first decoder token must equal the TrOCR config's decoder start token id.
assert (
    int(decoder_input_ids[0, 0]) == trocr_model.config.decoder_start_token_id
), "`decoder_input_ids` should correspond to `model.config.decoder_start_token_id`"

AssertionError: `decoder_input_ids` should correspond to `model.config.decoder_start_token_id`

In [30]:
# STEP 1

# Full forward pass: the image pixels go through the encoder and the current
# decoder prefix through the decoder, yielding logits for every prefix position.
outputs = trocr_model(
    pixel_values=pixel_values,
    decoder_input_ids=decoder_input_ids,
    return_dict=True,
)

In [31]:
# Keep the encoder output (wrapped in a 1-tuple) so later steps can pass it as
# `encoder_outputs` instead of re-encoding the image.
encoded_sequence = (outputs.encoder_last_hidden_state,)
lm_logits = outputs.logits

# Greedy choice: highest-scoring token at the last decoder position.
next_decoder_input_ids = torch.argmax(lm_logits[:, -1:], dim=-1)

next_decoder_input_ids

tensor([[2]], device='mps:0')

In [32]:
# Append the freshly picked token to the decoder prefix.
# NOTE: re-running this cell appends the same token again.
decoder_input_ids = torch.cat((decoder_input_ids, next_decoder_input_ids), dim=-1)
decoder_input_ids

tensor([[  100,    33,    57,  3044,    70,   363,     4,   978,    47,   646,
         31957,   742,     2]], device='mps:0')

In [33]:
# Show the text decoded so far (first sequence in the batch, special tokens stripped).
tokenizer.decode(
    token_ids=decoder_input_ids[0],
    skip_special_tokens=True,
)

'I have been testing all night. Now you [bos]'

In [37]:
# STEP 2

# Reuse the cached encoder output (no pixel values are passed) and feed the
# extended decoder prefix to obtain logits for the following token.
lm_logits = trocr_model(
    encoder_outputs=encoded_sequence,
    decoder_input_ids=decoder_input_ids,
    return_dict=True,
).logits

# Greedy choice at the last position, then append it to the prefix.
next_decoder_input_ids = torch.argmax(lm_logits[:, -1:], dim=-1)
decoder_input_ids = torch.cat((decoder_input_ids, next_decoder_input_ids), dim=-1)

tokenizer.decode(decoder_input_ids[0], skip_special_tokens=True), decoder_input_ids

('I have been testing all night. Now you [bos] get )',
 tensor([[  100,    33,    57,  3044,    70,   363,     4,   978,    47,   646,
          31957,   742,     2,   120,  4839,     2,     2]], device='mps:0'))

In [14]:

# STEP 3
# Same greedy step as before: decode with the cached encoder output, pick the
# best last-position token, and append it to the prefix.
lm_logits = trocr_model(
    encoder_outputs=encoded_sequence,
    decoder_input_ids=decoder_input_ids,
    return_dict=True,
).logits
next_decoder_input_ids = torch.argmax(lm_logits[:, -1:], dim=-1)
decoder_input_ids = torch.cat((decoder_input_ids, next_decoder_input_ids), dim=-1)

# Report the text generated up to this point.
print(f"Generated so far: {tokenizer.decode(decoder_input_ids[0], skip_special_tokens=True)}")

# The repeated steps above could be folded into a loop.


Generated so far: [bos] 5-10


In [115]:
# Use 2-beam search for `generate`.
# Generation settings belong on `generation_config`; setting them only on the model
# config is a deprecated control path and may be ignored. The model-config value is
# still written so any code reading `trocr_model.config.num_beams` sees the same number.
trocr_model.config.num_beams = 2
trocr_model.generation_config.num_beams = 2

In [132]:
def get_model_output(images, num_beams=2, max_new_tokens=30):
    """Run TrOCR beam-search generation on `images`.

    Args:
        images: Image (or list of images) passed to `processor(images=...)`.
        num_beams: Number of beams handed to `generate`. It is passed explicitly so the
            function does not rely on another cell having configured beam search.
            Must be at least 2, because `sequences_scores` is only present on
            beam-search outputs.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A tuple `(generated_texts, sequences_scores, output)`: the decoded strings,
        the per-sequence scores from the generation output, and the full generation
        output (including per-step `scores` and `logits`).

    Raises:
        ValueError: If `num_beams` is smaller than 2.
    """
    if num_beams < 2:
        raise ValueError("num_beams must be >= 2: `sequences_scores` requires beam search")

    pixel_values = processor(images=images, return_tensors="pt").pixel_values.to(device)
    output = trocr_model.generate(
        pixel_values,
        num_beams=num_beams,
        max_new_tokens=max_new_tokens,
        return_dict_in_generate=True,
        output_scores=True,
        output_logits=True,
    )

    generated_texts = processor.batch_decode(output.sequences, skip_special_tokens=True)
    return generated_texts, output.sequences_scores, output

# Run generation on the line image and keep only the raw generation output
# (third element of the returned tuple).
# The result is indexed rather than unpacked into `_`: assigning `_` at notebook
# top level stops IPython from tracking the last cell output in `_`.
img = Image.open("data/fml_line.png").convert("RGB")
output = get_model_output([img])[2]

In [146]:
# Look up the token ids for " cluster" (leading space included; special tokens are added).
tokenizer.encode(text=" cluster")

[0, 18016, 2]

In [182]:
cluster_logit = None

# Inspect generation step 2 only. Slicing (rather than indexing) leaves
# `cluster_logit` as None when fewer than three steps were generated.
for logit in output.logits[2:3]:
    confidence = logit.softmax(-1).max()
    cluster_logit = logit
    # `argmax()` without `dim` indexes the flattened tensor. A later cell reads rows 0
    # and 1 of these logits, so there is more than one row here; reduce the flat index
    # modulo the vocabulary size to get a valid token id whichever row holds the maximum.
    token_id = logit.argmax() % logit.shape[-1]
    word = tokenizer.decode([token_id])
    print(word, confidence, token_id)

 controls tensor(0.4203, device='mps:0') tensor(5656, device='mps:0')


In [None]:
# Four lines of K-Means notes joined by newlines (no trailing newline).
# Presumably the expected text for the line images used above; TODO confirm.
FML_text = "\n".join(
    [
        "K Means clustering algorithm",
        "Assume we have K cluster of points; each point in a cluster",
        "Is closest to its centroid (more than any other cluster centroid)",
        "If cluster assignment is known, it is easy to compute the centroid",
    ]
)



In [2]:
from transformers import pipeline
# Fill-mask pipelines for both RoBERTa sizes.
# They use the `device` selected at the top of the notebook (cuda / mps / cpu) instead
# of a hard-coded "mps", which fails on machines without Apple's MPS backend.
unmasker_base = pipeline('fill-mask', model='roberta-base', device=device)
unmasker_large = pipeline('fill-mask', model='roberta-large', device=device)



In [4]:
# Turn the stored step logits into probabilities over the last dimension.
# Requires `cluster_logit` from the logits-inspection cell to exist in this kernel.
cluster_pred = cluster_logit.softmax(dim=-1)
cluster_pred.shape

NameError: name 'cluster_logit' is not defined

In [165]:
# Ask the roberta-base fill-mask pipeline for the masked word in the last line,
# with the four preceding note lines as context.
res = unmasker_base(
    "K Means clustering algorithm\n"
    "Assume we have K cluster of points; each point in a cluster\n"
    "Is closest to its centroid (more than any other cluster centroid)\n"
    "If cluster assignment is known, it is easy to compute the centroid\n"
    "If <mask> centroid is known, it is easy to do cluster assignment"
)

res

[{'score': 0.8191075921058655,
  'token': 5,
  'token_str': ' the',
  'sequence': 'K Means clustering algorithm\nAssume we have K cluster of points; each point in a cluster\nIs closest to its centroid (more than any other cluster centroid)\nIf cluster assignment is known, it is easy to compute the centroid\nIf the centroid is known, it is easy to do cluster assignment'},
 {'score': 0.034218356013298035,
  'token': 18016,
  'token_str': ' cluster',
  'sequence': 'K Means clustering algorithm\nAssume we have K cluster of points; each point in a cluster\nIs closest to its centroid (more than any other cluster centroid)\nIf cluster assignment is known, it is easy to compute the centroid\nIf cluster centroid is known, it is easy to do cluster assignment'},
 {'score': 0.01694742776453495,
  'token': 117,
  'token_str': ' no',
  'sequence': 'K Means clustering algorithm\nAssume we have K cluster of points; each point in a cluster\nIs closest to its centroid (more than any other cluster centro

In [5]:
# Ask the roberta-large fill-mask pipeline for the word following "If cluster".
res = unmasker_large("""K Means clustering algorithm
Assume we have K cluster of points; each point in a cluster
Is closest to its centroid (more than any other cluster centroid)
If cluster assignment is known, it is easy to compute the centroid
If cluster <mask> is known, it is easy to do cluster assignment""")

# For each candidate, add the language-model score to the larger of the two
# `cluster_pred` row probabilities for the same token id.
# The token string is bound to `token_str`, not `str`, so the builtin `str` is not
# shadowed for the rest of the kernel session.
for pred in res:
    score, token, token_str = pred['score'], pred['token'], pred['token_str']
    ocr_prob = max(cluster_pred[0][token], cluster_pred[1][token])
    confidence = score + ocr_prob
    print(token_str, confidence, ocr_prob)

NameError: name 'cluster_pred' is not defined

In [16]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

# RoBERTa-base tokenizer and masked-LM model (this rebinds the notebook-level `tokenizer`).
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForMaskedLM.from_pretrained("roberta-base")

# Probe sentence with two <mask> tokens; the second is directly followed by a literal <s>.
text = "K Means clustering algorithm. <mask> we have K cluster of <mask><s>; each point in a cluster."

# Encode to PyTorch tensors.
inputs = tokenizer(text, return_tensors="pt")

# Forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Column positions of the mask tokens, and the logits at those positions (first batch item).
is_mask = inputs.input_ids == tokenizer.mask_token_id
mask_token_index = torch.where(is_mask)[1]
mask_token_logits = logits[0, mask_token_index, :]

# Ids of the five highest-scoring tokens for the first mask only (row 0); computed but not printed.
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

# Decode the argmax token at every position, then print it with the encoded inputs.
print(tokenizer.decode(logits.argmax(-1)[0], skip_special_tokens=True), inputs)

Kernel clustering algorithm. So we have K cluster of points; each point in a cluster. {'input_ids': tensor([[    0,   530, 27088, 46644,  2961, 17194,     4, 50264,    52,    33,
           229, 18016,     9, 50264,     0,   131,   349,   477,    11,    10,
         18016,     4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
