In [40]:
import os
from transformers import AutoProcessor, VisionEncoderDecoderModel, AutoTokenizer
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import cv2

In [41]:
# Load your fine-tuned Nepali OCR model and processor
processor = AutoProcessor.from_pretrained("model/part_3/nepali_ocr_processor", use_fast=False)
model = VisionEncoderDecoderModel.from_pretrained("model/part_3/nepali_ocr_model")

Config of the encoder: <class 'transformers.models.deit.modeling_deit.DeiTModel'> is overwritten by shared encoder config: DeiTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 384,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "model_type": "deit",
  "num_attention_heads": 6,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.48.2"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 384,
  "d_model": 256,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 1024,
  "decode

In [42]:
def ocr_detect(path):
    # Open and preprocess the image
    image = Image.open(path).convert("RGB")
    
    # Preprocess image and generate text
    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    
    return generated_text

In [43]:
image_dir = 'ocrdataset/dataset_test/preprocessed_images'
output_file = "ocrdataset/dataset_test/predicted.txt"

nepali_mapping = {'ः': '¶','ऊ': '§','ऋ': '†','ऌ': '‡','ऐ': '‰','ङ': '¤','ञ': '¢','ॐ': '£','ॠ': '¿',
        '०': '∞','१': '∑','२': '∂','३': '∇','४': '∆','५': '⌘','६': '≈','७': '≠','८': '¢','९': '¥'}
nepali_reverse_mapping = {v: k for k, v in nepali_mapping.items()}

def replace_chars(extracted_text, char_map):
    return ''.join(char_map.get(char, char) for char in extracted_text)
    

# Open the file in write mode
with open(output_file, "w", encoding="utf-8") as f:
    for i in range(10000):
        image_name = f"{i}.png"
        image_path = os.path.join(image_dir, image_name)

        if os.path.exists(image_path):  # Check if the image exists
            # Detect text using the OCR model
            extracted_text = ocr_detect(image_path)
            extracted_text = replace_chars(extracted_text, nepali_reverse_mapping)

            print(f"{i}: {extracted_text}")
            
            # Write to the file
            f.write(f"{extracted_text}\n")

0: हटाउन
1: गर्ने वा
2: २३.
3: अन्य
4: बैङ्कले आवश्यकता
5: अन्य
6: विषय वा
7: महिना: १३ गते: ३३
8: स.- ६, एअफ
9: न. पा.- ४, कक्षइं
10: प्रत्येक वाणिज्य
11: त्यवस्थापन,
12: २३-४४-४३
13: सो
14: लेखापरीक्षण
15: निर्धारण गर्ने
16: वा
17: तथा
18: ३४२४३
19: लेखामान बमोजिम गर्नु
20: फर्म,
21: ११
22: ००३०
23: कर शुल्क, दस्तुर,
24: खण्डमा नेपाल सरकार,
25: गते: १८
26: ४०-४०२४२
27: लागि प्रबन्धपत्र तथा
28: वा त्यस्तो कसूर
29: बैङ्क
30: कर्मचारीलाई
31: SSL पूँजी
32: लगाउने,
33: महिना: ६७
34: वा "तोकिए बमोजिम"
35: साल: ४२१४
36: कान्नबमोजिम कुनै वाणिज्य
37: स८्चालक,
38: मौका प्रदान
39: बढी शेयर
40: ४९-९८-४९
41: गा. वि. स.- ५, फओइ
42: बैड
43: गरी इजाजतपत्र प्राप्त
44: नवीकरण सम्वन्धित
45: व्यतित भए नभएको
46: etc/१) सभाको
47: समितिका
48: २०५५५
49: कम्पनी वा
50: औैषत मौज्दातको
51: २, ऊजू
52: देहाय बमोजिम
53: रहेको आफ्नो
54: ३१
55: (ज) बैङ्कको आन्तरिक
56: । (प)
57: बमोजिम चलनचल्तीबाट
58: वडा नं.: ७
59: गर्ने
60: अवधि,
61: गरी सो
62: हुनेछ
63: गर्ने वा निजहरुको
64: मध्येबाट
65: ३१
66: आएको
67: बनार स८्चा