# Model training

We use a pre-trained Tesseract model for this task. Data extracted in the EDA notebook will be incorporated to improve accuracy.

In [1]:
import pandas as pd 
import numpy as np 
import os
import warnings 
warnings.filterwarnings("ignore")

In [2]:
# Walk the image folder once, recording every directory visited and
# every file name found beneath it.
image_file_path, image_filenames = [], []

images_root = r'C:\Users\andre\Documents\Data science portfolio projects\trinity\data\images'
for current_dir, _subdirs, names_in_dir in os.walk(images_root):
    image_file_path += [current_dir]
    image_filenames += names_in_dir

In [3]:
# Show the visited directories first, then the collected image file names.
for listing in (image_file_path, image_filenames):
    print(listing)

['C:\\Users\\andre\\Documents\\Data science portfolio projects\\trinity\\data\\images']
['0000223278.tif', '0001136521.tif', '0001139626.tif', '0001139716.tif', '0001144288.tif', '0001431487.tif', '00043445_00043449.tif', '0011500947.tif', '0011899826.tif', '0013040651.tif', '0013043193.tif', '0013046347.tif', '0013404965.tif', '00555341.tif', '0060026250.tif', '0060026253.tif', '0060026293.tif', '0060027362.tif', '0060027384.tif', '0060027402.tif', '0060027425.tif', '0060027746.tif', '0060027768.tif', '0060027778.tif', '0060031680.tif', '0060031691.tif', '0060031745.tif', '0060032065.tif', '0060047831.tif', '0060053761.tif', '0060053785.tif', '0060059322.tif', '0060060439.tif', '0060060466.tif', '0060060480.tif', '0060060805.tif', '0060063491.tif', '0060063704.tif', '0060068160.tif', '0060068899.tif', '0060075996.tif', '0060076160.tif', '0060076186.tif', '0060076201.tif', '0060076899.tif', '0060076945.tif', '0060085465.tif', '0060087268.tif', '0060087309.tif', '0060088007.tif', '00600

In [4]:
# Walk the label folder once, recording every directory visited and
# every file name found beneath it.
label_file_path, label_filenames = [], []

labels_root = r'C:\Users\andre\Documents\Data science portfolio projects\trinity\data\labels'
for current_dir, _subdirs, names_in_dir in os.walk(labels_root):
    label_file_path += [current_dir]
    label_filenames += names_in_dir

In [5]:
# Show the visited directories first, then the collected label file names.
for listing in (label_file_path, label_filenames):
    print(listing)

['C:\\Users\\andre\\Documents\\Data science portfolio projects\\trinity\\data\\labels']
['0000223278_gt.txt', '0000223278_ocr.txt', '0001136521_gt.txt', '0001136521_ocr.txt', '0001139626_gt.txt', '0001139626_ocr.txt', '0001139716_gt.txt', '0001139716_ocr.txt', '0001144288_gt.txt', '0001144288_ocr.txt', '0001431487_gt.txt', '0001431487_ocr.txt', '00043445_00043449_gt.txt', '00043445_00043449_ocr.txt', '0011500947_gt.txt', '0011500947_ocr.txt', '0011899826_gt.txt', '0011899826_ocr.txt', '0013040651_gt.txt', '0013040651_ocr.txt', '0013043193_gt.txt', '0013043193_ocr.txt', '0013046347_gt.txt', '0013046347_ocr.txt', '0013404965_gt.txt', '0013404965_ocr.txt', '00555341_gt.txt', '00555341_ocr.txt', '0060026250_gt.txt', '0060026250_ocr.txt', '0060026253_gt.txt', '0060026253_ocr.txt', '0060026293_gt.txt', '0060026293_ocr.txt', '0060027362_gt.txt', '0060027362_ocr.txt', '0060027384_gt.txt', '0060027384_ocr.txt', '0060027402_gt.txt', '0060027402_ocr.txt', '0060027425_gt.txt', '0060027425_ocr.txt'

## Converting TIFF images and text into LSTM training files (`.lstmf`)

The conversion runs the `tesseract` command-line tool in a subprocess and writes one `.lstmf` training file per image.

In [6]:
import os
import subprocess

import shutil

# Directories for images and labels
image_dir = r'C:\Users\andre\Documents\Data science portfolio projects\trinity\data\images'
label_dir = r'C:\Users\andre\Documents\Data science portfolio projects\trinity\data\labels'
# NOTE(review): label_dir is defined but never read in this cell. Tesseract's
# lstm.train config presumably needs ground truth (.box files) beside each
# image to emit training data -- confirm how the labels are meant to be fed in.

# Training files get their own folder so that image_dir keeps containing only
# images (the OCR cells further down open every entry of image_dir with PIL).
lstmf_dir = os.path.join(os.path.dirname(image_dir), 'lstmf')
os.makedirs(lstmf_dir, exist_ok=True)

# Prefer a tesseract found on PATH; otherwise fall back to the install
# location that the OCR cells of this notebook also use.
tesseract_cmd = shutil.which('tesseract') or r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Convert Images and Text to Tesseract's Format
failed_conversions = []
for image_file in sorted(os.listdir(image_dir)):
    # Assuming the image and label filenames (without extension) are the same
    base_name, extension = os.path.splitext(image_file)
    if extension.lower() not in ('.tif', '.tiff'):
        # Skip anything that is not a TIFF scan (stray files, sub-folders, ...).
        continue
    image_path = os.path.join(image_dir, image_file)
    output_name = os.path.join(lstmf_dir, base_name)

    # Generate .lstmf file for each image.
    # The arguments are passed as a list (no shell) because the paths contain
    # spaces; a single command string would be split at every space.
    result = subprocess.run(
        [tesseract_cmd, image_path, output_name, '--psm', '6', 'lstm.train'],
        capture_output=True,
        text=True,
        errors='replace',
    )
    if result.returncode != 0:
        # Report the failure instead of silently carrying on.
        failed_conversions.append(image_file)
        print(f"tesseract failed for {image_file}: {result.stderr.strip()}")

# Create a List of LSTM Files
lstmf_files = sorted(f for f in os.listdir(lstmf_dir) if f.endswith('.lstmf'))
with open('all-lstmf', 'w', encoding='utf-8') as f:
    for file in lstmf_files:
        # Full paths: the list file lives in the working directory, not in
        # lstmf_dir, so bare file names would not resolve from there.
        f.write(os.path.join(lstmf_dir, file) + '\n')

print(f"Wrote {len(lstmf_files)} .lstmf paths to all-lstmf ({len(failed_conversions)} conversions failed).")

## Using the pre-trained Tesseract model with spaCy for named-entity recognition (NER)

In [7]:
import os
import spacy
from PIL import Image
import pytesseract

# OCR every file in image_dir with the pre-trained Tesseract model, save the
# raw text, then run spaCy NER on that text and save the entities found.

# Set Tesseract path
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

image_dir = r'C:\Users\andre\Documents\Data science portfolio projects\trinity\data\images'
ocr_output_dir = r'C:\Users\andre\Documents\Data science portfolio projects\trinity\data\ocr_results'
ner_output_dir = r'C:\Users\andre\Documents\Data science portfolio projects\trinity\data\ner_results'

# Create the output directories if they do not exist yet
# (exist_ok avoids the check-then-create race and the shadowing of builtin `dir`).
for results_dir in (ocr_output_dir, ner_output_dir):
    os.makedirs(results_dir, exist_ok=True)

# List the directory once and keep only regular files, so a sub-folder inside
# image_dir cannot crash Image.open and the final count matches what was processed.
image_files = [
    name for name in os.listdir(image_dir)
    if os.path.isfile(os.path.join(image_dir, name))
]

for image_file in image_files:
    image_path = os.path.join(image_dir, image_file)
    base_name = os.path.splitext(image_file)[0]

    # Extract text from the image; the context manager closes the file handle
    # as soon as OCR is done instead of leaving hundreds of images open.
    with Image.open(image_path) as image:
        extracted_text = pytesseract.image_to_string(image)

    # Save the extracted text to a .txt file.
    # UTF-8 is set explicitly: the platform default code page may not be able
    # to encode every character Tesseract returns.
    ocr_output_file_path = os.path.join(ocr_output_dir, base_name + '.txt')
    with open(ocr_output_file_path, 'w', encoding='utf-8') as f:
        f.write(extracted_text)

    # Process the extracted text with spaCy for NER
    doc = nlp(extracted_text)
    named_entities = [f"{entity.text} ({entity.label_})" for entity in doc.ents]

    # Save the named entities to a .txt file, one "text (LABEL)" entry per line
    ner_output_file_path = os.path.join(ner_output_dir, base_name + '.txt')
    with open(ner_output_file_path, 'w', encoding='utf-8') as f:
        for entity in named_entities:
            f.write(entity + '\n')

print(f"Processed {len(image_files)} images. OCR results stored in {ocr_output_dir}. NER results stored in {ner_output_dir}.")

Processed 520 images. OCR results stored in C:\Users\andre\Documents\Data science portfolio projects\trinity\data\ocr_results. NER results stored in C:\Users\andre\Documents\Data science portfolio projects\trinity\data\ner_results.


Below is the same OCR pipeline without the NER step.

In [8]:
import pytesseract
from PIL import Image

# OCR every file in image_dir with the pre-trained Tesseract model and save
# the raw text of each image to a .txt file of the same base name.

# Because Tesseract is installed in the Program Files directory, specify the
# location of the executable before executing the model.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

image_dir = r'C:\Users\andre\Documents\Data science portfolio projects\trinity\data\images'
output_dir = r'C:\Users\andre\Documents\Data science portfolio projects\trinity\data\ocr_results'

# Create the output directory if it does not exist yet
# (exist_ok avoids the check-then-create race).
os.makedirs(output_dir, exist_ok=True)

# List the directory once and keep only regular files, so a sub-folder inside
# image_dir cannot crash Image.open and the final count matches what was processed.
image_files = [
    name for name in os.listdir(image_dir)
    if os.path.isfile(os.path.join(image_dir, name))
]

for image_file in image_files:
    image_path = os.path.join(image_dir, image_file)

    # Extract text from the image; the context manager closes the file handle
    # as soon as OCR is done instead of leaving hundreds of images open.
    with Image.open(image_path) as image:
        extracted_text = pytesseract.image_to_string(image)

    # Save the extracted text to a .txt file.
    # UTF-8 is set explicitly: the platform default code page may not be able
    # to encode every character Tesseract returns.
    output_file_path = os.path.join(output_dir, os.path.splitext(image_file)[0] + '.txt')
    with open(output_file_path, 'w', encoding='utf-8') as f:
        f.write(extracted_text)

print(f"Processed {len(image_files)} images. Results stored in {output_dir}.")

Processed 520 images. Results stored in C:\Users\andre\Documents\Data science portfolio projects\trinity\data\ocr_results.
