In [2]:
import glob
import math
import os
from typing import List

import pymupdf
import torch
from IPython.display import display, Audio
from kokoro import KPipeline
from ollama import pull, generate

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Ollama model tag; passed to `pull` below and to `generate` inside `summarize`.
MODEL_NAME = "gemma3:4b"
# Ask Ollama to pull the model before it is used. As the last expression in the
# cell, the returned progress/status object is rendered as the cell output.
pull(MODEL_NAME)

ProgressResponse(status='success', completed=None, total=None, digest=None)

In [8]:
def process_pdf(input_path: str) -> list[str]:
    """Extract the text of every page of a PDF file.

    Args:
        input_path: Path to the PDF file to open with ``pymupdf``.

    Returns:
        A list with one string per page (the result of ``page.get_text()``),
        in the order the document yields its pages.

    Raises:
        Exception: If the document cannot be opened or a page cannot be read.
            The original error is attached as ``__cause__``.
    """
    try:
        with pymupdf.open(input_path) as pdf:
            return [page.get_text() for page in pdf]
    except Exception as e:
        # Chain explicitly so the traceback reports the underlying failure as
        # the direct cause rather than as an error raised "during handling".
        raise Exception(f"PDF file processing failed. Please ensure that the file path is correct and that the file is not corrupted. Full Error: {e}") from e

def chunk_input(input: list, n_chunks: int = 5) -> list[list]:
    """Split a list into consecutive sub-lists of at most ``n_chunks`` items.

    Note that, despite its name, ``n_chunks`` is the maximum *size* of each
    chunk, not the number of chunks produced. The final chunk may be shorter
    when ``len(input)`` is not a multiple of ``n_chunks``.

    Args:
        input: The items to split. An empty list yields an empty result.
        n_chunks: Maximum number of items per chunk; must be at least 1.

    Returns:
        The chunks, in order, each a slice of ``input``.

    Raises:
        ValueError: If ``n_chunks`` is smaller than 1. Without this check a
            zero size fails with a division error and a negative size
            silently discards every item.
    """
    if n_chunks < 1:
        raise ValueError(f"`n_chunks` must be a positive integer, got {n_chunks}.")

    return [input[start:start + n_chunks] for start in range(0, len(input), n_chunks)]

def get_file_ext(fpath: str) -> str:
    """Return the lower-cased extension of a path, without the leading dot.

    Args:
        fpath: File path or file name.

    Returns:
        The extension of the final path component (e.g. ``"pdf"`` for
        ``"docs/Paper.PDF"``), or an empty string when that component has no
        extension. Dots in directory names are ignored.
    """
    # splitext only considers the last path component, so "dir.v2/README"
    # yields no extension instead of "v2/README".
    _, ext = os.path.splitext(fpath)
    return ext[1:].lower()

In [37]:
def summarize(input_text: str|list[str], target_word_ct: int, context_length: int = 8192) -> str:
    summary_prompt = f"You are an assistant that is tasked with summarizing a set of pages of documents that are given to you. Write a summary using the information provided. Do not use bullet points or any other formatting. The summary should be between {target_word_ct-50} and {target_word_ct+50} words long. The text is as follows: "

    if isinstance(input_text, str):
        summary_prompt += input_text
    elif isinstance(input_text, list) and all(isinstance(element, str) for element in input_text):
        full_input = "\n".join([chunk for chunk in input_text])
        summary_prompt += full_input
    else:
        raise TypeError("The input text does not align with the intended types of `str` or `list[str]`. Please verify that your input text is either a string or a list of strings.")

    summary_response = generate(
        model=MODEL_NAME,
        prompt=summary_prompt,
        options={
            "num_ctx": context_length
        }
    )

    return summary_response["response"]

In [28]:
def create_chunked_summaries(input_fpath: str, max_chunks_per_group: int = 30) -> list[str]:
    """Summarize a PDF in chunks of pages, grouping chunks for long documents.

    The pages are extracted with ``process_pdf`` and split with
    ``chunk_input`` (using its default chunk size). Each chunk is summarized
    with a 200-word target. If there are more than ``max_chunks_per_group``
    chunks, the chunks are processed in consecutive groups of that size and
    each group's chunk summaries are condensed into a single summary with a
    500-word target.

    Args:
        input_fpath: Path to the input file. Only files whose extension is
            reported as ``"pdf"`` by ``get_file_ext`` are supported.
        max_chunks_per_group: Largest number of chunks summarized directly;
            also the size of each group when the document exceeds it.

    Returns:
        One summary per chunk for short documents, or one summary per chunk
        group for long documents.

    Raises:
        ValueError: If the file extension is not supported, or if
            ``max_chunks_per_group`` is smaller than 1.
    """
    if max_chunks_per_group < 1:
        raise ValueError(f"`max_chunks_per_group` must be a positive integer, got {max_chunks_per_group}.")

    file_ext = get_file_ext(input_fpath)
    if file_ext != "pdf":
        raise ValueError(f"Uploaded file type of `{file_ext}` is not supported for text parsing. Please try again without using any files with the specified file extension.")
    parsed_input = process_pdf(input_fpath)

    summary_input = chunk_input(parsed_input)

    if len(summary_input) <= max_chunks_per_group:
        return [summarize(input_text=chunk, target_word_ct=200) for chunk in summary_input]

    output_summaries = []
    # Group by the group *size*. Slicing with the number of groups instead
    # would cover only the first few chunks and drop the rest of the document.
    for chunk_group in chunk_input(summary_input, max_chunks_per_group):
        chunk_group_summaries = [
            summarize(input_text=chunk, target_word_ct=200) for chunk in chunk_group
        ]
        output_summaries.append(
            summarize(input_text=chunk_group_summaries, target_word_ct=500)
        )

    return output_summaries

In [29]:
# TODO: Create function to summarize chunk summaries into main summary

In [38]:
# Run the chunked summarization on a sample PDF; the path is relative to the
# notebook's working directory.
test = create_chunked_summaries("../sample_inputs/transformers_paper.pdf")
# Bare last expression so the notebook displays the resulting list of summaries.
test

['The Transformer, proposed by Vaswani et al., presents a novel neural network architecture for sequence transduction, eschewing recurrence and convolution entirely in favor of a self-attention mechanism. This paper details the model’s design, highlighting its superior performance and parallelizability compared to existing models like those utilizing recurrent neural networks or convolutional networks. The core innovation lies in the multi-head self-attention mechanism, allowing the model to draw global dependencies between input and output sequences. This architecture achieved state-of-the-art results on machine translation tasks, significantly improving translation quality while reducing training time.\n\nThe Transformer consists of stacked encoder and decoder layers, each incorporating multi-head self-attention and position-wise feed-forward networks. Crucially, the model utilizes scaled dot-product attention, mitigating issues with large dot product magnitudes and facilitating effi