# Document processing with OCR and PDF parsing

##### 1. Extracting text from document images using Tesseract

In [None]:
import pytesseract
from PIL import Image

def _read_image_text(path):
    """Return the result of pytesseract.image_to_string for the image at `path`."""
    # The image is only kept open for the duration of the OCR call.
    with Image.open(path) as picture:
        return pytesseract.image_to_string(picture)


image_path = "images/document.jpg"
text = _read_image_text(image_path)

print(text)

##### 2. Reading receipt images using the Asprise receipt OCR API

In [None]:
import json
import requests

url = "https://ocr.asprise.com/api/v1/receipt"
image = "images/receipt.jpg"

# Upload the receipt inside a `with` block so the image file handle is closed
# as soon as the request has been sent, instead of leaking until the kernel exits.
with open(image, "rb") as image_file:
    res = requests.post(
        url,
        data={
            "api_key": "TEST",
            "recognizer": "auto",
            "ref_no": "ocr_python_123",
        },
        files={"file": image_file},
    )

# Stop here on an HTTP error (4xx/5xx) rather than trying to parse an error
# page as a receipt further down.
res.raise_for_status()

# Save the response as a JSON file, then read it back for parsing, so the raw
# result stays available on disk.
with open("response1.json", "w") as f:
    json.dump(json.loads(res.text), f)

with open("response1.json", "r") as f:
    data = json.load(f)

# Only the first receipt of the response is reported; look it up once.
receipt = data["receipts"][0]
currency = receipt["currency"]

print(receipt.keys())

items = receipt["items"]

print(f"Your purchase at {receipt['merchant_name']}")

for item in items:
    print(f"{item['description']} - {currency} {item['amount']}")

print("-" * 30)
print(f"Subtotal:  {currency} {receipt['subtotal']}")
print(f"Tax:  {currency} {receipt['tax']}")
print("-" * 30)
print(f"Total:  {currency} {receipt['total']}")

##### 3. Making images readable

In [None]:
import cv2

image_file = "images/pic.jpg"

img = cv2.imread(image_file)
# cv2.imread does not raise on a missing or unreadable file, it returns None;
# fail here with a clear message instead of an obscure error in cvtColor.
if img is None:
    raise FileNotFoundError(f"Could not read image: {image_file}")

img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)        # turning colors into grayscale
# Resize to a fixed 560x900 (width, height). NOTE: this only preserves the
# aspect ratio if the source image already has that ratio.
img = cv2.resize(img, (560, 900))

cv2.imshow("title", img)
cv2.waitKey(0)                                     # block until a key is pressed

# Global threshold: every pixel above 150 becomes 255 (white), all others become 0 (black)
_, result = cv2.threshold(img, 150, 255, cv2.THRESH_BINARY)

# Adaptive threshold picks a separate threshold for every pixel based on its
# surroundings (mean_c or gaussian_c weighting).
# 41 is the size of the pixel neighbourhood used to compute each threshold;
# 5 is a constant subtracted from the weighted mean, which helps suppress noise.
adaptive_result = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                        cv2.THRESH_BINARY, 41, 5)

# Show the 3 images (they pop up in separate windows)
cv2.imshow("result", result)
cv2.imshow("original", img)
cv2.imshow("adaptive", adaptive_result)
cv2.waitKey(0)

# Close the windows once a key has been pressed so they do not linger.
cv2.destroyAllWindows()

##### 4. Parsing PDFs into usable text and saving it as docx

In [None]:
from PyPDF2 import PdfReader
from docx import Document

file_pdf_path = "images/pdf_file.pdf"


def extract_text_from_pdf(file_path: str) -> str:
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages, in page order, with a newline between
        consecutive pages so the last line of one page does not run into
        the first line of the next.
    """
    # Extraction happens inside the `with` block because the reader pulls
    # page data from the open file object.
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        # `or ""` keeps the join safe if a page yields no text at all.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_texts)

# Extract the PDF text and split it into individual lines
pdf_text = extract_text_from_pdf(file_pdf_path)
slides = pdf_text.split("\n")

print(slides)

# Create a Word document
doc = Document()

# add_paragraph takes a single string, so add one paragraph per extracted
# line rather than handing it the whole list.
for line in slides:
    doc.add_paragraph(line)

doc.save("extracted_pdf.docx")