## Introduction
This notebook provides a utility for converting images to text using Optical Character Recognition (OCR). It takes a folder of images (.jpg, .png, .tiff) as input, preprocesses each image with OpenCV, and writes one text file per image using the Tesseract engine (via the pytesseract library).

# Image to text

In [None]:
#Import libraries 
import glob
import os

import cv2
import numpy as np
import pytesseract as pt
from tqdm import tqdm

### Convert image to text

In [None]:
def convert_image_to_text(image_path, output_folder):
    """Run OCR on one image and save the recognized text as a ``.txt`` file.

    The image is converted to grayscale, binarized with Otsu's threshold,
    inverted, and passed through a small erode/dilate step before being
    handed to Tesseract. The recognized text is written to
    ``<output_folder>/<image base name>.txt``.

    Paths whose extension is not ``.jpg``, ``.png`` or ``.tiff`` (compared
    case-insensitively) are skipped without doing anything.

    Args:
        image_path (str): The path to the image file.
        output_folder (str): The folder where the text file will be saved.
            It is created if it does not already exist.

    Raises:
        ValueError: If the path has a supported extension but OpenCV could
            not read it as an image.
    """
    # Use only the file's base name: keeping the directory part of
    # image_path would nest the input path underneath output_folder.
    stem, ext = os.path.splitext(os.path.basename(image_path))
    if ext.lower() not in (".jpg", ".png", ".tiff"):
        return

    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise ValueError(f"Could not read image: {image_path}")

    # cv2.imread returns pixels in BGR order, so convert from BGR.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # With THRESH_OTSU the threshold is chosen from the image itself; the
    # 128 passed here is not used as the cut-off.
    _, img_bin = cv2.threshold(
        gray, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
    )
    inverted = cv2.bitwise_not(img_bin)

    # Erode then dilate with a 2x1 kernel to drop isolated specks.
    kernel = np.ones((2, 1), np.uint8)
    cleaned = cv2.erode(inverted, kernel, iterations=1)
    cleaned = cv2.dilate(cleaned, kernel, iterations=1)

    text = pt.image_to_string(cleaned)

    os.makedirs(output_folder, exist_ok=True)
    out_path = os.path.join(output_folder, f"{stem}.txt")
    # OCR output may contain non-ASCII characters; fix the encoding so the
    # write does not depend on the platform default.
    with open(out_path, "w", encoding="utf-8") as file:
        file.write(text)

In [None]:
# Destination for the generated .txt files.
output_folder = "./data/processed/"
# Every entry directly under the raw-data folder; entries with unsupported
# extensions are filtered out inside convert_image_to_text.
image_folder = glob.glob('./data/raw/*')

# OCR each entry in turn, with a tqdm progress bar.
for file in tqdm(image_folder):
    convert_image_to_text(file, output_folder)