In [None]:
#!pip install fitz
#!pip install pymupdf pytesseract pillow
#!pip install easyocr

In [59]:
import fitz
import pytesseract
import easyocr
from PIL import Image
import io
from openai import OpenAI
import os
from dotenv import load_dotenv

In [67]:
class ContextualOCR:
    def __init__(self, pdf_path):
        """
        Initialize the OCR system with a PDF file.
        :param pdf_path: Path to the input PDF file.
        """
        self.pdf_path = pdf_path
        self.images = self._pdf_to_images()
        self.text_data = []
    
    def _pdf_to_images(self):
        """
        Convert the PDF pages to images.
        :return: List of images extracted from the PDF.
        """
        pdf_document = fitz.open(self.pdf_path)
        images = []
        
        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes()))
            images.append(img)
        
        return images
    
    def _extract_text_ocr(self):
        """
        Perform OCR on extracted images to get text.
        :return: List of extracted text from each page.
        """
        text_data = []
        print("Extracting text using OCR!!")
        for img in self.images:
            text = pytesseract.image_to_string(img)
            text_data.append(text)
        return text_data
    
    def _clean_text(self, string):
        if type(string) != str:
            string = str(string)
        print("Cleaning text!!")
        return string.rstrip(' \n')

    def _extract_text(self):

        some_text = self._extract_text_ocr()
        client = OpenAI(
            base_url="https://huggingface.co/api/inference-proxy/together",
            api_key="" #API KEY
        )

        messages = [
            {
                "role": "user",
                "content": f'''
                The following text ({some_text}) has been extracted from a PDF named "{path}" using PyTesseract.  
                Due to OCR inaccuracies, the extracted text may contain errors such as misspellings, missing words, incorrect formatting, or garbled content.  

                Your task:
                - Analyze the text and extract all the questions that are present in it.  
                - Identify the topic of the text.  
                - Return the results in a CSV format where each row contains the topic and a question found in the text.  
                - Do NOT refine or alter the text.  
                - Do NOT add any commentary, explanations, or labels—just return the CSV.  
                - The format should be: `topic,question`.

                Now, extract the questions and the topic in CSV format:  
                '''
            }
        ]
        print("Refining text using DeepSeek!!")
        completion = client.chat.completions.create(
            model="deepseek-ai/DeepSeek-R1", 
            messages=messages, 
            max_tokens=1500
        )
        
        return self._clean_text(str(completion.choices[0].message).split('</think>')[1])

In [None]:
path = "AI Exam.pdf"
ocr = ContextualOCR(f"../pdf/{path}")
some_text = ocr._extract_text()

Extracting text using OCR!!
Refining text using DeepSeek!!


In [60]:
some_text

'\\n\\ntopic,question\\n"Search Algorithms (BFS + A*)","Problem 1 [BFS +A 7] 17 points}\\\\n\\\\nConsider the search space below, where Sis the start node and and\\\\n\\\\nGre goal nodes.\\\\nAres are labeled withthe value ofa cos function; the number gives the cost of traversing the ac\\\\n\\\\n‘Along each node isthe valu of a heuristic funtion; the number gives the estimate ofthe distance othe goal\\\\nAssume that uninformed seach algorithms always choose the lef branch fist when there isa choice.\\\\n\\\\nAlso assume that the algorithms do not Keep tack of and recognize repeated stats.\\\\n\\\\nQos sue\\\\n@ v=: 500\\\\n\\\\n12 Heursc estimate\\\\n\\\\nFor each ofthe following seach suategis:\\\\n\\\\n‘What path would be found by the algorithm\\\\n‘+ Listin omer, al the states that are popped off the OPEN lis. (Sequence of path)\\\\n‘+ Nodes in the Fringe when the goal was found\\\\n\\\\noer [+1 +1 Points}\\\\n\\\\noar [24141 Pains"\\n"Search Algorithms (DFS)","Problem\', refusal=No