# Optical Character Recognition using pytesseract

### Import libraries

In [1]:
# Prerequisite (run once in the kernel's environment): %pip install langdetect

In [2]:
import os
import shutil

import cv2
import pytesseract
from pytesseract import Output

In [3]:
# Locate the Tesseract executable without tying the notebook to one machine:
# an explicit TESSERACT_CMD environment variable wins, then whatever
# `tesseract` is found on PATH, and finally the Windows install location this
# notebook was originally written against.
pytesseract.pytesseract.tesseract_cmd = (
    os.environ.get('TESSERACT_CMD')
    or shutil.which('tesseract')
    or r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)

In [4]:
# Sanity check: echo the imported module so its install location is displayed.
pytesseract

<module 'pytesseract' from 'C:\\Users\\LENOVO\\anaconda3\\lib\\site-packages\\pytesseract\\__init__.py'>

In [5]:
# OCR the sample image and show the result on a single line.
# Config flags (Tesseract CLI): `--oem 3` selects the default OCR engine mode,
# `--psm 6` treats the page as a single uniform block of text.
img = cv2.imread('sample.png')
custom_config = r'--oem 3 --psm 6'

raw_text = pytesseract.image_to_string(img, config=custom_config)
raw_text.replace('\n', ' ')

'This is the first line of this text example.  This is the second line of the same text. '

### Preprocessing for Tesseract

In [6]:
import numpy as np

# NOTE(review): `img` is not read again before it is reassigned in the
# "Boxes around texts" cell, so this load looks unused — confirm and remove.
img=cv2.imread('img.jpg')

# grayscale conversion
def get_grayscale(image):
    """Return a grayscale version of a BGR image (via ``cv2.cvtColor``)."""
    conversion = cv2.COLOR_BGR2GRAY
    return cv2.cvtColor(image, conversion)

# thresholding (Otsu binarisation)
def thresholding(image):
    """Binarise ``image`` with Otsu's automatically selected threshold.

    Pixels end up as either 0 or 255. The explicit threshold argument (0) is
    only a placeholder because the Otsu flag makes OpenCV choose the value.
    Only the thresholded image is returned; the chosen threshold is discarded.
    """
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _, binary = cv2.threshold(image, 0, 255, mode)
    return binary

# dilation
def dilate(image, kernel_size=(5, 5), iterations=1):
    """Dilate ``image`` with an all-ones rectangular kernel.

    Args:
        image: Image to dilate.
        kernel_size: ``(rows, cols)`` shape of the structuring element.
            Defaults to the 5x5 kernel this notebook has always used.
        iterations: Number of times the dilation is applied.

    Returns:
        The dilated image as returned by ``cv2.dilate``.
    """
    kernel = np.ones(kernel_size, np.uint8)
    return cv2.dilate(image, kernel, iterations=iterations)
    
# erosion
def erode(image, kernel_size=(5, 5), iterations=1):
    """Erode ``image`` with an all-ones rectangular kernel.

    Args:
        image: Image to erode.
        kernel_size: ``(rows, cols)`` shape of the structuring element.
            Defaults to the 5x5 kernel this notebook has always used.
        iterations: Number of times the erosion is applied.

    Returns:
        The eroded image as returned by ``cv2.erode``.
    """
    kernel = np.ones(kernel_size, np.uint8)
    return cv2.erode(image, kernel, iterations=iterations)

# opening - erosion followed by dilation
def opening(image, kernel_size=(5, 5)):
    """Apply a morphological opening (``cv2.MORPH_OPEN``) to ``image``.

    Args:
        image: Image to open.
        kernel_size: ``(rows, cols)`` shape of the all-ones structuring
            element. Defaults to the 5x5 kernel this notebook has always used.

    Returns:
        The opened image as returned by ``cv2.morphologyEx``.
    """
    kernel = np.ones(kernel_size, np.uint8)
    return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)

# canny edge detection
def canny(image, low_threshold=100, high_threshold=200):
    """Run Canny edge detection on ``image``.

    Args:
        image: Image to process.
        low_threshold: First threshold passed to ``cv2.Canny``.
        high_threshold: Second threshold passed to ``cv2.Canny``.
            The defaults (100, 200) are the values this notebook has always
            used.

    Returns:
        The edge map returned by ``cv2.Canny``.
    """
    return cv2.Canny(image, low_threshold, high_threshold)

# skew correction
def deskew(image):
    """Rotate ``image`` so that its non-zero content becomes axis-aligned.

    The skew angle is taken from the minimum-area rectangle enclosing every
    pixel with a value greater than zero, and the image is rotated about its
    centre by the correcting angle (same size, cubic interpolation, border
    pixels replicated).

    Args:
        image: Image whose non-zero pixels are the content to straighten.
            NOTE(review): presumably a single-channel image with the text as
            the non-zero values; a multi-channel input would produce
            3-column coordinates — confirm against callers.

    Returns:
        The rotated image, or an unmodified copy when ``image`` contains no
        non-zero pixels (there is nothing to estimate the skew from).
    """
    coords = np.column_stack(np.where(image > 0))
    if coords.size == 0:
        return image.copy()

    angle = cv2.minAreaRect(coords)[-1]
    # OpenCV >= 4.5.1 reports the rectangle angle in (0, 90] where older
    # releases used [-90, 0). Shift the newer convention onto the older one so
    # the correction below is valid for both (otherwise upright text, reported
    # as 90, would be rotated by a quarter turn).
    if angle > 0:
        angle -= 90
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(
        image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE
    )
    return rotated

# template matching
def match_template(image, template):
    """Match ``template`` against ``image`` using ``cv2.TM_CCOEFF_NORMED``.

    Returns whatever ``cv2.matchTemplate`` produces for that method.
    """
    method = cv2.TM_CCOEFF_NORMED
    return cv2.matchTemplate(image, template, method)

In [7]:
image = cv2.imread('abcd.jpg')

gray = get_grayscale(image)
thresh = thresholding(gray)
opening = opening(gray)
canny = canny(gray)

In [8]:
pytesseract.image_to_string(gray).replace('\n', ' ')

"ABCDEFGHIJKLM NOPQRSTUVWXYZ abcdefghijklm nopaqrstuvwxyz 01234567849 'DQ#SZAR*() "

In [9]:
pytesseract.image_to_string(thresh).replace('\n', ' ')

'ABCDEFGHIJKLM NOPQRSTUVUXYZ abcdef ghijklm nopqrstuvwxyz O1234567849 TDHSZAR*C) '

In [10]:
pytesseract.image_to_string(opening).replace('\n', ' ')

'ABCDEFGHIJKLM NOP@RSTUVEXYZ abcdefghijkla nopqrstuvwexyz 014234567449 !eeexaax() '

In [11]:
pytesseract.image_to_string(canny).replace('\n', ' ')

'ABCDEFGHIJKLM NOPA@RSTUVUXYZ abcdef ghijklm nopgrstuvuxyz GLe34Sb789 LMIFSZAR% CD '

### Bounding boxes around text

In [12]:
# Run OCR on the sample image and request the results as a dictionary
# (Output.DICT), then list which fields that dictionary provides.
img = cv2.imread('sample.png')
d = pytesseract.image_to_data(img, output_type=Output.DICT)

available_fields = d.keys()
print(available_fields)

dict_keys(['level', 'page_num', 'block_num', 'par_num', 'line_num', 'word_num', 'left', 'top', 'width', 'height', 'conf', 'text'])


In [13]:
# Draw a green rectangle around every detected element whose confidence
# is above 60.
n_boxes = len(d['text'])
for i in range(n_boxes):
    # float() rather than int(): some Tesseract/pytesseract combinations report
    # fractional confidences (e.g. '96.06'), which int() rejects with ValueError.
    if float(d['conf'][i]) > 60:
        (x, y, w, h) = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])
        img = cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)

cv2.imshow('img', img)
# Press any key (with the image window focused) to continue execution.
pressed_key = cv2.waitKey(0)
# Close the window explicitly so it does not linger after the cell finishes.
cv2.destroyAllWindows()
pressed_key

107

### Bounding boxes around dates

In [15]:
import re

img = cv2.imread('inv.png')
d = pytesseract.image_to_data(img, output_type=Output.DICT)
keys = list(d.keys())  # NOTE(review): not used in this cell — confirm before removing

# dd/mm/yyyy with day 01-31, month 01-12 and a year in 1900-2099.
# Written as a raw string so `\d` reaches the regex engine untouched instead of
# being parsed as an (invalid) string escape.
date_pattern = r'^(0[1-9]|[12][0-9]|3[01])/(0[1-9]|1[012])/(19|20)\d\d$'
date_regex = re.compile(date_pattern)

n_boxes = len(d['text'])
for i in range(n_boxes):
    # float() rather than int(): some Tesseract/pytesseract combinations report
    # fractional confidences (e.g. '96.06'), which int() rejects with ValueError.
    if float(d['conf'][i]) > 50 and date_regex.match(d['text'][i]):
        (x, y, w, h) = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])
        img = cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)

cv2.imshow('img', img)
# Press any key (with the image window focused) to continue execution.
pressed_key = cv2.waitKey(0)
# Close the window explicitly so it does not linger after the cell finishes.
cv2.destroyAllWindows()
pressed_key

106

### List of supported languages

In [16]:
# Ask pytesseract which languages the local Tesseract installation supports
# (no extra config is passed).
languages = pytesseract.get_languages()
print(languages)

['afr', 'amh', 'ara', 'asm', 'aze', 'aze_cyrl', 'bel', 'ben', 'bod', 'bos', 'bre', 'bul', 'cat', 'ceb', 'ces', 'chi_sim', 'chi_sim_vert', 'chi_tra', 'chi_tra_vert', 'chr', 'cos', 'cym', 'dan', 'deu', 'div', 'dzo', 'ell', 'eng', 'enm', 'epo', 'equ', 'est', 'eus', 'fao', 'fas', 'fil', 'fin', 'fra', 'frk', 'frm', 'fry', 'gla', 'gle', 'glg', 'grc', 'guj', 'hat', 'heb', 'hin', 'hrv', 'hun', 'hye', 'iku', 'ind', 'isl', 'ita', 'ita_old', 'jav', 'jpn', 'jpn_vert', 'kan', 'kat', 'kat_old', 'kaz', 'khm', 'kir', 'kmr', 'kor', 'lao', 'lat', 'lav', 'lit', 'ltz', 'mal', 'mar', 'mkd', 'mlt', 'mon', 'mri', 'msa', 'mya', 'nep', 'nld', 'nor', 'oci', 'ori', 'osd', 'pan', 'pol', 'por', 'pus', 'que', 'ron', 'rus', 'san', 'sin', 'slk', 'slv', 'snd', 'spa', 'spa_old', 'sqi', 'srp', 'srp_latn', 'sun', 'swa', 'swe', 'syr', 'tam', 'tat', 'tel', 'tgk', 'tha', 'tir', 'ton', 'tur', 'uig', 'ukr', 'urd', 'uzb', 'uzb_cyrl', 'vie', 'yid', 'yor']


### Detect Hindi words

In [17]:
# OCR the Hindi sample with the Hindi and English language data (`-l hin+eng`)
# and show the recognised text on a single line.
img = cv2.imread('hindi.jpg')
custom_config = r'-l hin+eng --psm 6'

hindi_text = pytesseract.image_to_string(img, config=custom_config)
hindi_text.replace('\n', ' ')

"पूरा होता रहता है। दिन प्रतिदिन का आहार तू आज हमें दे, अपराधों को क्षमा दान कर जैसे हमने अपने अपराधी क्षमा किये। भारी कठिन परीक्षा मत ले हमें उससे बचा जो बुरा है ।'  [क्योंकि राज्य और महिमा सदा तेरी है। "

### Detect language

In [18]:
from langdetect import DetectorFactory, detect_langs

# langdetect's detection is non-deterministic by default; pinning the seed
# makes the reported probabilities reproducible across runs.
DetectorFactory.seed = 0

# `img` is the image loaded by the preceding "Detect hindi words" cell.
custom_config = r'-l eng+hin --psm 6'
txt = pytesseract.image_to_string(img, config=custom_config)

detect_langs(txt)

[hi:0.9999978990577625]

### Whitelisting (alphabets)

In [19]:
# Load the test image; the whitelist, blacklist and digits-only cells below
# all run OCR on this same `img`.
img=cv2.imread('alphanum.png')

In [20]:
# Whitelist: restrict the recognised characters to the lowercase letters a-z.
custom_config = r'-c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyz --psm 6'
whitelisted_text = pytesseract.image_to_string(img, config=custom_config)
print(whitelisted_text)

etterrequencyletterrequency

ezom
tsoseow
a
ozsovomgo
np
szb
h
rsoxk
dazsaomjo
iaorsoxx
czoqo
u



### Blacklisting (numbers)

In [21]:
# Blacklist: exclude the digits 0-9 from the recognised characters.
custom_config = r'-c tessedit_char_blacklist=0123456789 --psm 6'
blacklisted_text = pytesseract.image_to_string(img, config=custom_config)
blacklisted_text

'Letter Frequency Letter Frequency\ne | z.To%|m__ | .%%|\nt__ | s.oseo%/w | .¢%\na\no | zsovo%[g | .o%\nn__ | .%[p | .%%|\ns__| .z/%[b —|_.%\nh\nr_ | s.o%|k | _.%%|\nd_ | azsao%|j | o.%\ni | a.orso%|x | .%%|\nc__ | zo%|q | __.o%%|\nu\n'

### Detect only digits

In [22]:
# Digits-only recognition: the trailing `outputbase digits` arguments are
# forwarded to Tesseract as-is (presumably selecting its `digits` config —
# see the pytesseract README).
custom_config = r'--oem 3 --psm 6 outputbase digits'
digits_text = pytesseract.image_to_string(img, config=custom_config)
print(digits_text)

-
12.70202.4060
9.05602.360094
750702.01503
6.74901.929086
6.32701.49209
5.98700.772086
425200.15308
14.02500.150086
2782040.095086

