In [77]:
from pycparser.ply.yacc import token
from ultralytics import YOLO
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, AutoModelForCausalLM, pipeline
from PIL import Image
import numpy as np
import pandas as pd
from nltk.translate import bleu_score
from nltk.translate.bleu_score import SmoothingFunction
import torch

yolo_weights_path = "final_wts.pt"

device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

processor = TrOCRProcessor.from_pretrained('microsoft/trocr-large-handwritten')
trocr_model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-large-handwritten').to(device)
trocr_model.config.num_beams = 1

yolo_model = YOLO(yolo_weights_path).to('mps')
unmasker_large = pipeline('fill-mask', model='roberta-large', device="mps")


print(f'TrOCR and YOLO Models loaded on {device}')

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.46.2"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 1024,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decod

TrOCR and YOLO Models loaded on mps


In [2]:
CONFIDENCE_THRESHOLD = 0.72
BLEU_THRESHOLD = 0.6


def inference(image_path, debug=False, return_texts='final'):
    def get_cropped_images(image_path):
        results = yolo_model(image_path, save=True)
        patches = []
        ys = []
        for box in sorted(results[0].boxes, key=lambda x: x.xywh[0][1]):
            image = Image.open(image_path).convert("RGB")
            x_center, y_center, w, h  = box.xywh[0].cpu().numpy()
            x, y = x_center - w / 2, y_center - h / 2
            cropped_image = image.crop((x, y, x + w, y + h))
            patches.append(cropped_image)
            ys.append(y)
        bounding_box_path = results[0].save_dir + results[0].path[results[0].path.rindex('/'):-4] + '.jpg'
        return patches, ys, bounding_box_path
        
    def get_model_output(images):
        pixel_values = processor(images=images, return_tensors="pt").pixel_values.to(device)
        output = trocr_model.generate(pixel_values, return_dict_in_generate=True, output_scores=True, max_new_tokens=30)
        generated_texts = processor.batch_decode(output.sequences, skip_special_tokens=True)
        return generated_texts, output.sequences_scores

    def post_process_texts(generated_texts):
        for i in range(len(generated_texts)):
            if len(generated_texts[i]) > 2 and generated_texts[i][:2] == '# ':
                generated_texts[i] = generated_texts[i][2:]
                
            if len(generated_texts[i]) > 2 and generated_texts[i][-2:] == ' #':
                generated_texts[i] = generated_texts[i][:-2]
        return generated_texts

    def get_qualified_texts(generated_texts, scores, y):
        qualified_texts = []
        for text, score, y_i in zip(generated_texts, scores, y):
            if score > CONFIDENCE_THRESHOLD:
                qualified_texts.append({
                    'text': text,
                    'score': score,
                    'y': y_i
                })
        return qualified_texts

    def get_adjacent_bleu_scores(qualified_texts):
        def get_bleu_score(hypothesis, references):
            weights = [0.5, 0.5]
            smoothing = SmoothingFunction()
            return bleu_score.sentence_bleu(references, hypothesis, weights=weights,
                                            smoothing_function=smoothing.method1)

        for i in range(len(qualified_texts)):
            hyp = qualified_texts[i]['text'].split()
            bleu = 0
            if i < len(qualified_texts) - 1:
                ref = qualified_texts[i + 1]['text'].split()
                bleu = get_bleu_score(hyp, [ref])
            qualified_texts[i]['bleu'] = bleu
        return qualified_texts

    def remove_overlapping_texts(qualified_texts):
        final_texts = []
        new = True
        for i in range(len(qualified_texts)):
            if new:
                final_texts.append(qualified_texts[i])
            else:
                if final_texts[-1]['score'] < qualified_texts[i]['score']:
                    final_texts[-1] = qualified_texts[i]
            new = qualified_texts[i]['bleu'] < BLEU_THRESHOLD
        return final_texts

    cropped_images, y, bounding_box_path = get_cropped_images(image_path)
    if debug:
        print('Number of cropped images:', len(cropped_images))
    generated_texts, scores = get_model_output(cropped_images)
    normalised_scores = np.exp(scores.to('cpu').numpy())
    if return_texts == 'generated':
        return pd.DataFrame({
            'text': generated_texts,
            'score': normalised_scores,
            'y': y
        })
    generated_texts = post_process_texts(generated_texts)
    if return_texts == 'post_processed':
        return pd.DataFrame({
            'text': generated_texts,
            'score': normalised_scores,
            'y': y
        })
    qualified_texts = get_qualified_texts(generated_texts, normalised_scores, y)
    if return_texts == 'qualified':
        return pd.DataFrame(qualified_texts)
    qualified_texts = get_adjacent_bleu_scores(qualified_texts)
    if return_texts == 'qualified_with_bleu':
        return pd.DataFrame(qualified_texts)
    final_texts = remove_overlapping_texts(qualified_texts)
    final_texts_df = pd.DataFrame(final_texts, columns=['text', 'score', 'y'])
    return final_texts_df, bounding_box_path


image_path = "data/FML_whiteboard2.png"
df, bounding_path = inference(image_path, debug=False, return_texts='final')
df


image 1/1 /Users/amaljoe/Desktop/Workspace/IITB/NLP/OCR_with_LLMs/data/FML_whiteboard2.png: 384x640 8 handwritten_lines, 371.8ms
Speed: 11.1ms preprocess, 371.8ms inference, 1153.5ms postprocess per image at shape (1, 3, 384, 640)
Results saved to [1m/Users/amaljoe/Desktop/Workspace/IITB/NLP/OCR_with_LLMs/runs/detect/predict33[0m


AttributeError: 'GenerateEncoderDecoderOutput' object has no attribute 'sequences_scores'

In [81]:
def get_cropped_images(image_path):
    results = yolo_model(image_path, save=True)
    patches = []
    ys = []
    for box in sorted(results[0].boxes, key=lambda x: x.xywh[0][1]):
        image = Image.open(image_path).convert("RGB")
        x_center, y_center, w, h  = box.xywh[0].cpu().numpy()
        x, y = x_center - w / 2, y_center - h / 2
        cropped_image = image.crop((x, y, x + w, y + h))
        patches.append(cropped_image)
        ys.append(y)
    bounding_box_path = results[0].save_dir + results[0].path[results[0].path.rindex('/'):-4] + '.jpg'
    return patches, ys, bounding_box_path

def get_model_output(images):
    pixel_values = processor(images=images, return_tensors="pt").pixel_values.to(device)
    output = trocr_model.generate(pixel_values, return_dict_in_generate=True, output_logits=True, max_new_tokens=30)
    generated_texts = processor.batch_decode(output.sequences, skip_special_tokens=True)
    return generated_texts, output.logits


image_path = "data/FML_whiteboard2.png"
cropped_images, y, bounding_box_path = get_cropped_images(image_path)
generated_texts, logits = get_model_output(cropped_images)
for i in range(len(generated_texts)):
    print(generated_texts[i], logits[i].softmax(-1).max(-1).values.mean())


image 1/1 /Users/amaljoe/Desktop/Workspace/IITB/NLP/OCR_with_LLMs/data/FML_whiteboard2.png: 384x640 8 handwritten_lines, 39.2ms
Speed: 3.6ms preprocess, 39.2ms inference, 10.2ms postprocess per image at shape (1, 3, 384, 640)
Results saved to [1m/Users/amaljoe/Desktop/Workspace/IITB/NLP/OCR_with_LLMs/runs/detect/predict34[0m
K-means clustering algorithm tensor(0.8210, device='mps:0')
Assume we have K clusters of points ; each point in a cluster . tensor(0.9846, device='mps:0')
Assume we have K clusters of points ; each point in a cluster . tensor(0.9086, device='mps:0')
is closest to its centroid ( more than any other cluster centroid ) tensor(0.9713, device='mps:0')
If cluster assignment is known , it is easy to compute the centriots . tensor(0.8718, device='mps:0')
It cluster assignment is known , it is easy to compute the centroids . tensor(0.9372, device='mps:0')
If cluster centrids are known , it is easy to do cluster assignment . tensor(0.9796, device='mps:0')
How do we solve 

In [89]:
stacked_logits = torch.stack(logits, dim=1)
processor.batch_decode([stacked_logits[-2].argmax(-1)[:5]], skip_special_tokens=True)

['If cluster centrids']

In [71]:
stacked_logits[-2].softmax(-1).max(-1)

torch.return_types.max(
values=tensor([0.5884, 0.9979, 0.3797, 0.8033, 0.7809, 1.0000, 1.0000, 0.9999, 0.9997, 1.0000, 0.9999, 0.9966, 0.9997, 0.2181, 0.9184, 0.6810, 0.9987, 0.9935, 0.9677], device='mps:0'),
indices=tensor([ 1106, 18016,   715,   338,  7823,    32,   684,  2156,    24,    16,  1365,     7,   109, 18016, 11717,   479,     2,     2,     2], device='mps:0'))

In [103]:
res = unmasker_large("""K Means clustering algorithm
Assume we have K cluster of <mask>; each point in a cluster
Is closest to its centroid (more than any other cluster centroid)
If cluster assignment is known, it is easy to compute the centroid
If cluster <mask> is known, it is easy to do cluster assignment""")
res

[[{'score': 0.8517557978630066,
   'token': 332,
   'token_str': ' points',
   'sequence': '<s>K Means clustering algorithm\nAssume we have K cluster of points; each point in a cluster\nIs closest to its centroid (more than any other cluster centroid)\nIf cluster assignment is known, it is easy to compute the centroid\nIf cluster<mask> is known, it is easy to do cluster assignment</s>'},
  {'score': 0.0489627867937088,
   'token': 32833,
   'token_str': ' nodes',
   'sequence': '<s>K Means clustering algorithm\nAssume we have K cluster of nodes; each point in a cluster\nIs closest to its centroid (more than any other cluster centroid)\nIf cluster assignment is known, it is easy to compute the centroid\nIf cluster<mask> is known, it is easy to do cluster assignment</s>'},
  {'score': 0.01644345372915268,
   'token': 8720,
   'token_str': ' objects',
   'sequence': '<s>K Means clustering algorithm\nAssume we have K cluster of objects; each point in a cluster\nIs closest to its centroid (

[{'score': 0.10382635146379471,
  'token': 17194,
  'token_str': ' algorithm',
  'sequence': '<s>K Means clustering algorithm\nAssume we have K cluster of points; each point in a cluster\nIs closest to its centroid (more than any other cluster centroid)\nIf cluster assignment is known, it is easy to compute the centroid\nIf cluster<mask><mask> algorithm is known, it is easy to do cluster assignment</s>'},
 {'score': 0.03082429990172386,
  'token': 5043,
  'token_str': ' function',
  'sequence': '<s>K Means clustering algorithm\nAssume we have K cluster of points; each point in a cluster\nIs closest to its centroid (more than any other cluster centroid)\nIf cluster assignment is known, it is easy to compute the centroid\nIf cluster<mask><mask> function is known, it is easy to do cluster assignment</s>'},
 {'score': 0.017390219494700432,
  'token': 3854,
  'token_str': ' distribution',
  'sequence': '<s>K Means clustering algorithm\nAssume we have K cluster of points; each point in a clu