<a href="https://colab.research.google.com/github/alirezaght/tesseract-train-font/blob/main/tesseract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Mount Google Drive so the notebook can access the fonts and data stored there.

In [None]:
# Attach Google Drive to the Colab filesystem; later cells read from paths
# under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Create dataset

In [None]:
# Start from an empty /content/dataset and make it the working directory,
# because create_dataset() writes its files into the current directory.
%cd /content
!rm -rf /content/dataset; mkdir /content/dataset
%cd /content/dataset

from PIL import Image,ImageDraw,ImageFont,ImageFilter
from IPython.display import display
import os
import random

def create_dataset(number_for_each_font, font_dir='/content/drive/MyDrive/fonts', output_dir='.'):
  """Render random digit strings with every TrueType font found in a directory.

  For each file in `font_dir` whose name ends in 'ttf' (case-insensitive),
  `number_for_each_font` single-line images are generated. Every image is
  written as `<n>.tif` next to a `<n>.gt.txt` file holding the rendered text,
  where `<n>` is a counter that runs across all fonts, starting at 1.
  Other files are reported with a 'skipping ...' message and ignored.

  Each sample uses a random font size (12-64) and 12-20 random digits; each
  digit is followed by 0-4 spaces with probability 0.3. About 30% of the
  samples are rotated by up to 5 degrees, and about 30% of the samples with a
  font size above 24 are Gaussian-blurred with a radius of 0-2.

  The first sample of every font is displayed inline as a visual check.

  Args:
    number_for_each_font: Number of samples to generate per font file.
    font_dir: Directory that is scanned for font files.
    output_dir: Directory that receives the image/text pairs; it is created
      if it does not exist. Defaults to the current working directory.
  """
  os.makedirs(output_dir, exist_ok=True)
  count = 0
  for font_file in os.listdir(font_dir):
    if not font_file.lower().endswith('ttf'):
      print(f'skipping {font_file}')
      continue
    font_path = os.path.join(font_dir, font_file)
    for i in range(number_for_each_font):
      count += 1
      size = random.randint(12, 64)
      font = ImageFont.truetype(font_path, size, encoding='unic')
      length = random.randint(12, 20)
      unicode_text = "".join(
          str(random.randint(0, 9))
          + ((" " * random.randint(0, 4)) if random.random() < 0.3 else "")
          for _ in range(length))

      # Measure the rendered line. FreeTypeFont.getsize() was removed in
      # Pillow 10; the right/bottom edges of getbbox() carry the same extent.
      # The fallback keeps the cell working on Pillow versions without getbbox().
      if hasattr(font, 'getbbox'):
        right, bottom = font.getbbox(unicode_text)[2:]
        # Image.new() needs integer dimensions.
        text_width, text_height = int(right), int(bottom)
      else:
        text_width, text_height = font.getsize(unicode_text)

      # White canvas with a 5 px margin on every side of the text.
      canvas = Image.new('RGB', (text_width + 10, text_height + 10), "white")

      # Draw the text in black, offset by the margin.
      draw = ImageDraw.Draw(canvas)
      draw.text((5, 5), unicode_text, 'black', font)
      if random.random() < 0.3:
        # expand=1 grows the image to fit the rotated content; fill the new
        # corners with white instead of the default black so they match the
        # background.
        canvas = canvas.rotate(random.randint(-5, 5), expand=1, fillcolor='white')

      if random.random() < 0.3 and size > 24:
        canvas = canvas.filter(ImageFilter.GaussianBlur(radius=random.randint(0, 2)))

      # Write the rendered image and its ground-truth transcription.
      image_path = os.path.join(output_dir, f"{count}.tif")
      canvas.save(image_path, dpi=(300, 300))
      with open(os.path.join(output_dir, f'{count}.gt.txt'), 'w') as f:
        f.write(unicode_text)

      if i == 0:
        # Show the first sample of each font, read back from disk so the
        # preview reflects what was actually saved. The context manager
        # closes the file handle afterwards.
        print(font_file)
        with Image.open(image_path) as im:
          display(im)

# Number of synthetic samples rendered for every font file.
SAMPLES_PER_FONT = 300
create_dataset(SAMPLES_PER_FONT)

# Install tesseract

First, let's install Tesseract and all the required dependencies.

In [None]:
# apt-get is the script-friendly front end; -y answers the confirmation
# prompt so the install does not stall or abort when run non-interactively.
!apt-get install -y tesseract-ocr libtesseract-dev bc

# Clone tesstrain

In [None]:
%cd /content
# Clone only when the checkout is missing, so re-running this cell does not
# fail with "destination path 'tesstrain' already exists".
!test -d tesstrain || git clone https://github.com/tesseract-ocr/tesstrain

# Prepare data

Run this cell if you want to load an existing dataset archive from Google Drive:

In [None]:
# Copy the dataset archive from Google Drive, extract it, and move the
# extracted bnk-ground-truth folder into tesstrain/data.
%cd /content
!rm -rf /content/bnk-ground-truth
!cp /content/drive/MyDrive/dataset.rar /content/
!unrar x /content/dataset.rar
# -p: do not fail when tesstrain/data already exists (e.g. on a re-run).
!mkdir -p tesstrain/data
!rm -rf tesstrain/data/bnk-ground-truth
!mv bnk-ground-truth tesstrain/data/bnk-ground-truth
%cd /content/tesstrain

Run this cell if you want to use the dataset we just created:

In [None]:
# Move the freshly generated /content/dataset folder into tesstrain/data as
# the bnk-ground-truth set, replacing any previous copy.
%cd /content
# -p: do not fail when tesstrain/data already exists (e.g. on a re-run).
!mkdir -p tesstrain/data
!rm -rf tesstrain/data/bnk-ground-truth
!mv dataset tesstrain/data/bnk-ground-truth
%cd /content/tesstrain

# Train

In [None]:
# Fetch the English "best" model that training starts from.
%cd /content/tesstrain
# -p: do not fail when the directory already exists.
!mkdir -p tessdata
# -P saves into tessdata/; -nc skips the download when the file is already
# there instead of writing a duplicate eng.traineddata.1 on re-runs.
!wget -nc -P tessdata https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata

In [None]:
# Make variables for the tesstrain run, passed to the shell through IPython's
# {name} interpolation.
MODEL_NAME = 'bnk'
START_MODEL = 'eng'
TESSDATA = '/content/tesstrain/tessdata'

# Run the clean target for this model first, then the training target.
!make clean MODEL_NAME={MODEL_NAME}
!make training MODEL_NAME={MODEL_NAME} START_MODEL={START_MODEL} TESSDATA={TESSDATA}

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Tesseract Open Source OCR Engine v4.0.0-beta.1 with Leptonica
Page 1
PYTHONIOENCODING=utf-8 python3 generate_line_box.py -i "data/bnk-ground-truth/41704.tif" -t "data/bnk-ground-truth/41704.gt.txt" > "data/bnk-ground-truth/41704.box"
+ tesseract data/bnk-ground-truth/41704.tif data/bnk-ground-truth/41704 --psm 13 lstm.train
Tesseract Open Source OCR Engine v4.0.0-beta.1 with Leptonica
Page 1
PYTHONIOENCODING=utf-8 python3 generate_line_box.py -i "data/bnk-ground-truth/41705.tif" -t "data/bnk-ground-truth/41705.gt.txt" > "data/bnk-ground-truth/41705.box"
+ tesseract data/bnk-ground-truth/41705.tif data/bnk-ground-truth/41705 --psm 13 lstm.train
Tesseract Open Source OCR Engine v4.0.0-beta.1 with Leptonica
Page 1
PYTHONIOENCODING=utf-8 python3 generate_line_box.py -i "data/bnk-ground-truth/41706.tif" -t "data/bnk-ground-truth/41706.gt.txt" > "data/bnk-ground-truth/41706.box"
+ tesseract data/bnk-ground-truth/41706.tif data/

In [None]:
# Run tesstrain's `traineddata` target for the bnk model.
# NOTE(review): presumably this turns the training checkpoints into
# .traineddata files; confirm against the tesstrain Makefile.
TRAINED_MODEL_NAME = 'bnk'
!make traineddata MODEL_NAME={TRAINED_MODEL_NAME}