Installing and importing the necessary libraries

In [None]:
# system dependency: the Tesseract OCR engine that pytesseract calls into
# (apt-get with -y so the install never waits for a confirmation prompt)
!sudo apt-get install -y tesseract-ocr
# Python dependencies; %pip installs into the environment of the running kernel
# versions seen in the last recorded run: pytesseract 0.3.10, Levenshtein 0.21.0
%pip install pytesseract
%pip install Levenshtein

Reading package lists... Done
Building dependency tree       
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2build2).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.10
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Levenshtein
  Downloading Levenshtein-0.21.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (175 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.5/175.5 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=2.3.0
  Downloading rapidfuzz-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

In [None]:
# import the necessary libraries
import os # for handling the directory
from google.colab import drive # to access the drive
import csv # to parse the dataset
import cv2 # to read and process images
from google.colab.patches import cv2_imshow # for visualization
import pytesseract # OCR model that converts images to string
import Levenshtein as lev # for performance analysis

Configuring the directory and importing the dataset(s)

In [None]:
# NOTE: the "Apply AI Group 30" folder has to be saved as a shortcut under "MyDrive" !!!
drive_mount_point = '/content/drive'
project_dir = '/content/drive/MyDrive/Apply AI Group 30/OCR_project/'

# make Google Drive visible to the Colab runtime
drive.mount(drive_mount_point)

# work from inside the shared project folder and remember it as the base path
os.chdir(project_dir)
cwd = os.getcwd() # cwd = current working directory

Mounted at /content/drive


In [None]:
# folder (relative to the project directory) that holds the images and labels.csv
dataset_dir = f'{cwd}/resources/handmade_dataset/'

Parsing the dataset(s)

In [None]:
# parse the dataset(s)

csv_dir = dataset_dir + 'labels.csv'

# tuple_list is a list of 2-tuples in the format (image_directory, text)
#   image_directory - image file name, later appended to dataset_dir to load the image
#   text            - the text we expect the model to output for that image
tuple_list = []

# newline='' hands line-ending handling over to the csv module, which is needed
# for quoted fields that contain embedded newlines (multi-line expected text)
with open(csv_dir, mode = 'r', newline = '') as file:

  # reading the CSV file
  csv_reader = csv.reader(file)

  # collecting the contents of the CSV file
  for row in csv_reader:
    # blank lines come back as empty rows; skip them instead of failing to unpack
    if not row:
      continue
    image_directory, text = row
    tuple_list.append((image_directory, text))

Model code

In [None]:
# OCR model code
# https://www.geeksforgeeks.org/text-detection-and-extraction-using-opencv-and-ocr/

In [None]:
# container for the OCR results: one (img_dir, observed, expected) 3-tuple per image
#   img_dir  - points to the image
#   observed - the string the image-to-text model outputs
#   expected - the string that the model should output
string_tuple_list = list()

In [None]:
# run every image listed in tuple_list through the OCR pipeline

# start from an empty list so that re-running this cell does not append
# duplicate (img_dir, observed, expected) entries
string_tuple_list = []

for img_dir, text in tuple_list:

  # reads the image directly as grayscale
  img_path = dataset_dir + img_dir
  grayscale_img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)

  # cv2.imread returns None (it does not raise) when the file cannot be read
  if grayscale_img is None:
    raise FileNotFoundError('could not read image: ' + img_path)

  # blurs the image
  # NOTE: the third positional argument of cv2.GaussianBlur is sigmaX, not the
  # border type. This cell used to pass cv2.BORDER_DEFAULT (== 4) in that slot,
  # so sigmaX=4 is spelled out here to keep exactly the same blur.
  blurred_img = cv2.GaussianBlur(grayscale_img, (11, 11), sigmaX=4)

  # binarizes the image with an adaptive (local mean) threshold:
  # block size 11, constant 2 subtracted from the local mean
  # works well for images where illumination varies between parts of image
  adaptive_tresh_img = cv2.adaptiveThreshold(
      blurred_img, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 2)

  # convert image into string
  string = pytesseract.image_to_string(adaptive_tresh_img)

  # drop the trailing '\x0c' (form feed), but only when it is actually there,
  # so that a real character is never cut off
  if string.endswith('\x0c'):
    string = string[:-1]

  # remove all instances of '\n' from the outputted string
  string = string.replace('\n', '')

  # remove all instances of '\n' from the expected string
  expected = text.replace('\n', '')

  # append to 3-tuple list
  string_tuple_list.append((img_dir, string, expected))

Performance Analysis

In [None]:
# score each image by how closely the observed string (output of the model)
# matches the expected string; lev.ratio gives a normalized similarity ratio
# (higher means more similar), not a raw edit distance
results = [
  (img_dir, lev.ratio(observed, expected))
  for img_dir, observed, expected in string_tuple_list
]

In [None]:
# display the (img_dir, similarity_ratio) pair computed for every image
# note: further analysis is done in evaluation.ipynb
results

[('img_1.jpg', 0.37215589096643387),
 ('img_2.jpg', 0.17262881522809326),
 ('img_3.jpg', 0.3975993998499625),
 ('img_4.jpg', 0.4133993148077655),
 ('img_5.jpg', 0.44095216085047373),
 ('img_6.jpg', 0.45629370629370625),
 ('img_7.jpg', 0.4462650602409639),
 ('img_8.jpg', 0.4658273381294964),
 ('img_9.jpg', 0.4423748544819558),
 ('img_10.jpg', 0.3675777568331763),
 ('img_11.jpg', 0.5429061784897025),
 ('img_12.jpg', 0.6772546795235395),
 ('img_13.jpg', 0.7890698109676212),
 ('img_14.jpg', 0.7046545606345231),
 ('img_15.jpg', 0.8258513931888545),
 ('img_16.jpg', 0.8034360788276907),
 ('img_17.jpg', 0.505859375),
 ('img_18.jpg', 0.7318811881188119),
 ('img_19.jpg', 0.8352849336455894),
 ('img_20.jpg', 0.7602427921092565),
 ('img_21.jpg', 0.6798722044728435),
 ('img_22.jpg', 0.7043756670224119),
 ('img_23.jpg', 0.6697345132743363),
 ('img_24.jpg', 0.812206572769953),
 ('img_25.jpg', 0.8178438661710037),
 ('img_26.jpg', 0.9276437847866419),
 ('img_27.jpg', 0.9380413057961359),
 ('img_28.jpg'