In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive. The image, OCR weights, fine-tuned
# language model and output file used by the later cells all live under
# '/content/drive/My Drive/...', so this cell has to run first.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
# `%%capture` hides the pip output of this cell.
# NOTE(review): only xformers and trl carry version bounds; the other packages are
# unpinned, so a fresh run may resolve different versions than the recorded outputs.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# `--no-deps`: install exactly these packages without pulling in their dependencies.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
# EasyOCR is used by `process_image` for text detection.
!pip install easyocr
# The OpenAI client is used by `grammar_check`.
!pip install openai

In [3]:
# Path of the handwriting image that the final cell feeds into `pipeline`.
# Need to use Google Drive or replace image path to valid JPEG handwriting image
image_path = '/content/drive/My Drive/NLP Final Project Data/tarun_writing.jpeg'

In [4]:
# Need to use Google Drive or replace model path to valid model.pth file
# NOTE(review): nothing in the visible cells passes this variable to the OCR stage
# (the final cell calls `pipeline(image_path)` only), so these weights appear to be
# unused -- confirm whether that is intentional.
model_path = '/content/drive/My Drive/NLP Final Project Data/IAM/best_model.pth'

# Instruction block that `generate_text` places at the top of every prompt.
# NOTE(review): the text contradicts itself about equations -- one paragraph asks for
# LaTeX while others forbid it -- and the "avoid generating code" sentence appears twice.
# The comment in `generate_text` says these are the instructions used in training, so
# presumably the string has to stay verbatim to match the fine-tuned model; confirm
# before cleaning it up.
instructions = """

    You are a computer science expert and a skilled writer.

    Craft detailed content about the given computer science subtopic for university-level lecture notes, targeting a total of about 500 words distributed over a few paragraphs.

    Begin with an introductory paragraph that lays the foundation of the subtopic. Follow this with detailed paragraphs focusing on the critical aspects of the subtopic. Include applications only if they are essential for understanding the concept; otherwise, concentrate on explaining the concept itself and its nuances.

    You can selectively, if necessary, use examples, tables in Markdown format to illustrate key points, ensuring that any code provided is concise and directly demonstrates the concept, otherwise you don't need to include it.

    Please also avoid overly detailed explanations of complex algorithms unless they are central to the subtopic. Do not go overboard with technical details that may overwhelm students.

    Let's try to avoid generating code unless its short and obvious, otherwise, focus on detailed explanations and if you use equations, please use inline HTML. Quick and simple inline equations can utilize HTML ampersand entity codes, such as:

        h<sub>&theta;</sub>(x) = &theta;<sub>o</sub> x + &theta;<sub>1</sub>x

    This method works in practically all Markdown and does not require any external libraries. Avoid using LaTeX. If you cannot express it in HTML, please avoid using equations. Unless the symbol is simple and can be represented in HTML and Markdown, avoid using those symbols.

    Let's try to avoid generating code unless its short and obvious, otherwise, focus on detailed explanations and if you use equations, please use LaTeX format.

    Maintain clear and concise language suitable for a 10th-grade reading level, using academic language where appropriate. Avoid overly technical jargon unless it is necessary for clarity.

    Also avoid your conclusion paragraph in the end since the content should be detailed throughout.

    The entire response must be in valid Markdown format and avoid the use of diagrams unless they can be effectively represented in Markdown. You must stay in our limit of 500 words.

    LaTeX is impossible to use in Markdown, so please use HTML for equations. Do not use LaTeX.

    Your input will always be a single computer science subtopic, and your output should not conclude with a summarizing paragraph but rather emphasize detailed explanation throughout.


    Now, please generate detailed content about the subtopic in Markdown:

    """

In [5]:
import sys
import easyocr
import torch
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

def load_ocr_model(model_path = None):
    """
    Build the TrOCR model/processor pair used for handwritten text recognition.

    Both objects are always created from the 'microsoft/trocr-base-handwritten'
    checkpoint. When `model_path` is given, the state dictionary stored at that
    path is then loaded on top of the base model's weights.

    Args:
        model_path (str, optional): Path to a saved model state dictionary.
                                    Defaults to None, which keeps the base
                                    checkpoint weights unchanged.

    Returns:
        model (VisionEncoderDecoderModel): The TrOCR model.
        processor (TrOCRProcessor): The processor used to preprocess images
                                    and decode model outputs.
    """
    checkpoint_name = 'microsoft/trocr-base-handwritten'

    processor = TrOCRProcessor.from_pretrained(checkpoint_name)
    model = VisionEncoderDecoderModel.from_pretrained(checkpoint_name)

    # No custom weights requested: hand back the base checkpoint as-is.
    if not model_path:
        return model, processor

    # NOTE(review): torch.load unpickles the file, so only point this at trusted
    # checkpoints. Tensors are mapped to the CPU regardless of where they were saved.
    state_dict = torch.load(model_path, map_location=torch.device('cpu'))
    model.load_state_dict(state_dict)
    return model, processor

def _bbox_to_crop_box(bbox, image_width, image_height):
    """Turn a polygon of (x, y) points into an in-bounds (left, upper, right, lower) box, or None if it has no area."""
    xs = [point[0] for point in bbox]
    ys = [point[1] for point in bbox]

    # Truncate to integers, then clamp to the image so the crop never extends outside it.
    left = max(0, int(min(xs)))
    upper = max(0, int(min(ys)))
    right = min(image_width, int(max(xs)))
    lower = min(image_height, int(max(ys)))

    if right <= left or lower <= upper:
        return None
    return left, upper, right, lower


def process_image(image_path, model_path = None):
    """
    Detect text regions with EasyOCR and recognise each region with the TrOCR model.

    EasyOCR supplies the bounding boxes (its own text and confidence are only printed
    for comparison); every box is cropped out of the image and passed through the
    TrOCR model returned by `load_ocr_model`.

    Args:
        image_path (str): Path to the image file to be processed.
        model_path (str, optional): Path to a TrOCR model state dictionary, forwarded
                                    to `load_ocr_model`. Defaults to None, in which case
                                    the base 'microsoft/trocr-base-handwritten' weights
                                    are used.

    Returns:
        extracted_words (list): The whitespace-separated words recognised by TrOCR,
                                in the order EasyOCR reported the regions. Empty when
                                no usable text region was found.
    """
    # Initialize EasyOCR for text detection
    reader = easyocr.Reader(['en'])

    # Load the TrOCR model (with custom weights when model_path is given)
    model, processor = load_ocr_model(model_path)

    # Perform text detection and recognition using EasyOCR
    easyocr_results = reader.readtext(image_path)

    # Open the full image. The context manager releases the file handle; convert()
    # returns an in-memory copy, so the image stays usable after the file is closed.
    # Converting to RGB follows the Hugging Face TrOCR examples and keeps grayscale,
    # palette or RGBA inputs from reaching the processor in an unexpected mode.
    with Image.open(image_path) as source_image:
        full_image = source_image.convert("RGB")
    image_width, image_height = full_image.size

    extracted_words = []

    print("Processing results:")
    for idx, (bbox, easyocr_text, prob) in enumerate(easyocr_results, 1):
        print(f"\nProcessing text region {idx}:")
        print(f"EasyOCR Text: {easyocr_text}")
        print(f"Probability: {prob}")

        # Calculate integer, in-bounds coordinates for cropping
        crop_box = _bbox_to_crop_box(bbox, image_width, image_height)
        if crop_box is None:
            # A zero-area crop cannot be recognised; skip it instead of failing the whole run.
            print("Skipping region: empty bounding box after clamping to the image.")
            continue

        # Crop the image based on the bounding box
        cropped_image = full_image.crop(crop_box)

        # Preprocess the cropped image
        pixel_values = processor(cropped_image, return_tensors="pt").pixel_values

        # Generate text using the TrOCR model
        generated_ids = model.generate(pixel_values)

        # Decode the generated ids
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        print(f"Your OCR Model Generated text: {generated_text}")

        # Add words to the list
        extracted_words.extend(generated_text.split())

    print("\nExtracted Words:")
    print(extracted_words)

    return extracted_words

In [6]:
from unsloth import FastLanguageModel

# Reuse the same parameters from training
max_seq_length = 2048
dtype = None  # None for auto detection
load_in_4bit = True

# Load the fine-tuned language model and its tokenizer from the saved directory.
# `model` and `tokenizer` are notebook-level names read by `generate_text`;
# `process_image` binds its own local `model` for TrOCR, so the two do not collide.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/content/drive/My Drive/NLP Final Project Data/lora_model",  # The directory where your model was saved
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# Define the prompt template
# Placeholders: {instructions} is the shared instruction block, {input} is the
# note text to expand, and {output} is filled with an empty string by
# `generate_text` so the model continues from the OUTPUT header.
prompt_template = """{instructions}

###INPUT (Notes):
{input}

###OUTPUT (Expected Generations):
{output}"""

# Function to generate text
def generate_text(input_text, max_new_tokens=1000):
    """
    Generate text based on the input text using our fine-tuned Llama model.

    The prompt is built from the notebook-level `prompt_template` and `instructions`,
    tokenized with the notebook-level `tokenizer`, and completed by the notebook-level
    `model` on the CUDA device.

    Args:
        input_text (str): The input text or notes that will be expanded upon.
        max_new_tokens (int, optional): The maximum number of new tokens to generate.
                                        Defaults to 1000.

    Returns:
        str: The decoded output sequence. The whole sequence is decoded, so the text
             starts with the prompt (instructions and input) followed by the model's
             continuation -- the notebook's printed result begins with the instruction
             block for this reason.
    """
    prompt = prompt_template.format(
        instructions=instructions,  # Use the instructions from your training
        input=input_text,
        output="",  # Leave this blank for generation
    )

    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            # Honour the caller's limit (this used to be hard-coded to 1000, which
            # silently ignored the function argument).
            max_new_tokens=max_new_tokens,
            use_cache=True,
            # NOTE(review): in Hugging Face generation, temperature/top_p only take
            # effect when sampling is enabled. `do_sample` is not set here, so whether
            # they apply depends on the model's generation config -- confirm.
            temperature=0.7,  # Adjust as needed
            top_p=0.9,  # Adjust as needed
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.0.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [7]:
import openai

def grammar_check(ocr_text, model="gpt-4o-mini", max_tokens=200):
  """
  Corrects the spelling and grammar of text generated by an OCR model using OpenAI's GPT model.

  Args:
    ocr_text (str): The text generated by the OCR model that needs to be corrected.
    model (str, optional): Name of the OpenAI chat model to use. Defaults to "gpt-4o-mini".
    max_tokens (int, optional): Upper bound on the length of the correction. Defaults
      to 200; raise it for longer notes, otherwise the corrected text is cut off.

  Returns:
    str: The corrected text with improved grammar and spelling, while maintaining the
      original meaning and structure. An empty string is returned when `ocr_text` is
      empty or whitespace-only (no API call is made). If the API returns no content,
      the original text (stripped) is returned instead.
  """
  # Nothing to correct: skip the API call, which would otherwise answer with a
  # remark about the missing text that then flows into the next pipeline stage.
  if not ocr_text or not ocr_text.strip():
    return ""

  # Initialize the OpenAI client. No key is passed here, so the client presumably
  # picks it up from its environment configuration (e.g. OPENAI_API_KEY) -- confirm.
  client = openai.OpenAI()

  OCR_CORRECTION_PROMPT = """
  Correct the spelling and grammar in the following text, which was generated by an OCR model. Maintain the original meaning and structure of the sentence as much as possible. Only make changes necessary for clarity and correctness. Do not add new information or significantly alter the sentence structure. Here's the text to correct:

  "{text}"

  Provide the corrected version of the text.
  """

  prompt = OCR_CORRECTION_PROMPT.format(text=ocr_text)

  response = client.chat.completions.create(
      model=model,
      messages=[
          {"role": "system", "content": "You are a helpful assistant."},
          {"role": "user", "content": prompt}
      ],
      max_tokens=max_tokens
  )

  # The message content can be None; fall back to the uncorrected text rather
  # than failing on `.strip()`.
  corrected_text = (response.choices[0].message.content or ocr_text).strip()

  return corrected_text


In [8]:
def load_image(image_path):
    """
    Open the image stored at `image_path`.

    Args:
        image_path (str): Location of the image file to open.

    Returns:
        PIL.Image.Image: The opened image.

    Any exception raised while opening the file is reported on stdout and the
    interpreter is asked to exit with status 1 (SystemExit).
    """
    try:
        return Image.open(image_path)
    except Exception as load_error:
        print(f"Error loading image: {load_error}")
        sys.exit(1)

def pipeline(image_path, model_path=None,
             output_file_path="/content/drive/My Drive/NLP Final Project Data/output/expansion_text.md"):
    """
    Process an image through a text recognition and expansion pipeline.

    This function checks that the image can be opened, runs OCR on it, performs grammar
    validation on the OCR text, and generates an expanded text. The final output is
    saved to a markdown file.

    Args:
        image_path (str): The path to the image file to be processed.
        model_path (str, optional): Path to a TrOCR state dictionary forwarded to
            `process_image`. Defaults to None (base TrOCR weights).
        output_file_path (str, optional): Where the generated markdown is written.
            Missing parent directories are created. Defaults to the project's
            Drive output file.

    Returns:
        str: The final generated text after processing and expansion.

    Workflow:
        1. Verify the image opens using `load_image`.
        2. Process the image file with the vision model to extract words.
        3. Join the extracted words into a full OCR output text.
        4. Validate the grammar of the OCR text using `grammar_check`.
        5. Generate expanded text using a language model.
        6. Save the final expanded text to a markdown file.

    Example:
        generated_text = pipeline("path/to/image.jpg")
    """
    import os  # local import so this cell does not rely on another cell importing it

    # Fail fast on a missing or unreadable file. The handle is closed right away
    # because `process_image` works from the path and opens the file itself.
    load_image(image_path).close()

    # Process the image with the vision model. `process_image` expects a file path
    # (it calls both EasyOCR and Image.open on it), not an already opened image.
    processed_image_word_list = process_image(image_path, model_path)

    # Join the list of output values from the vision model pipeline
    full_ocr_output_text = " ".join(processed_image_word_list)

    print(f"Pre-validated OCR text: {full_ocr_output_text}")

    # Grammar check
    grammar_validated_text = grammar_check(full_ocr_output_text)

    print(f"Post-validated OCR text: {grammar_validated_text}")

    # Into Llama model
    generated_text = generate_text(grammar_validated_text)

    # Make sure the destination folder exists before writing
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the generated text to the markdown file; the explicit encoding keeps
    # non-ASCII characters in the generated text from depending on the platform default.
    with open(output_file_path, "w", encoding="utf-8") as file:
        file.write(generated_text)

    return generated_text

In [9]:
# Run the full pipeline on `image_path` (the absolute Drive path set in an earlier
# cell) and print the text it returns.
print(pipeline(image_path))



    You are a computer science expert and a skilled writer.

    Craft detailed content about the given computer science subtopic for university-level lecture notes, targeting a total of about 500 words distributed over a few paragraphs.

    Begin with an introductory paragraph that lays the foundation of the subtopic. Follow this with detailed paragraphs focusing on the critical aspects of the subtopic. Include applications only if they are essential for understanding the concept; otherwise, concentrate on explaining the concept itself and its nuances.

    You can selectively, if necessary, use examples, tables in Markdown format to illustrate key points, ensuring that any code provided is concise and directly demonstrates the concept, otherwise you don't need to include it.

    Please also avoid overly detailed explanations of complex algorithms unless they are central to the subtopic. Do not go overboard with technical details that may overwhelm students.

    Let's try to avoi