In [1]:
from pathlib import Path
from re import sub
from time import sleep
from typing import Optional

from requests import post



In [2]:
# Chunking: default chunk size and overlap (in characters) used by
# summarize_document when it splits text for the summarizer.
CHUNK_LEN = 5000
CHUNK_OVERLAP_LEN = 1000
# Model name sent in the "model" field of every generate request.
DEFAULT_ASSISTANT_MODEL = "llama3.2"
# The input document is read from DOCUMENT_FILEPATH_BASE/DOCUMENT_FILENAME.
DOCUMENT_FILEPATH_BASE = "documents"
DOCUMENT_FILENAME = "the_three_musketeers.01-05.txt"
# DOCUMENT_FILENAME = "the_three_musketeers.original.txt"
# Summarization rounds stop once the concatenated summaries are at most this
# many characters long (default for summarize_document's len_limit).
LEN_LIMIT = 5000
# Throttling: sleep PREVENT_OVERLOAD_PAUSE seconds after every
# PREVENT_OVERLOAD_PAUSE_PERIOD-th chunk; setting either value to 0 turns the
# pause off.
PREVENT_OVERLOAD_PAUSE = 10  # 0 to disable
PREVENT_OVERLOAD_PAUSE_PERIOD = 25  # 0 to disable
# The final summary path is built from this base, the run settings and the
# extension below.
SUMMARY_FILEPATH_BASE = "results/summary"
SUMMARY_FILEPATH_EXT = "txt"
# File overwritten with the concatenated summaries after each round.
# NOTE(review): "TEM" looks like a typo for "TEMP"; the name is kept because
# summarize_document refers to it.
TEM_RESULT_FILEPATH = "results/temp_result.txt"

In [3]:
def make_post_request(url: str, data: Optional[dict] = None) -> dict:
    """
    Send a POST request with an optional JSON body and return the decoded JSON reply.

    :param url: Target URL of the POST request.
    :param data: Dictionary sent as the JSON request body. Defaults to None.
    :return: The server's response body decoded from JSON.
    """
    reply = post(url, json=data)
    # Turn HTTP error statuses into exceptions before the body is decoded.
    reply.raise_for_status()
    payload = reply.json()

    return payload

In [4]:
def generate_completion(query: str, model: str, api_url: str = "http://localhost:11434/api/generate") -> str:
    """
    Ask a model to summarize ``query`` in no more than three sentences.

    The text is wrapped in a fixed instruction prompt and sent as one
    non-streaming request via ``make_post_request``.

    :param query: The text to be summarized.
    :param model: Name of the model, sent as the "model" field of the request.
    :param api_url: URL of the generate endpoint. Defaults to the local address
        this notebook was written against (port 11434 with ``/api/generate``
        looks like a default Ollama install -- confirm for your setup).
    :return: The value of the "response" field of the JSON reply.
    :raises KeyError: If the decoded reply has no "response" field.
    """
    prompt_text = f"""
You are a precise text summarizer. Your task is to create concise, informative summaries of any given text in no more than 3 sentences. Follow these rules strictly:
- Capture the main idea or argument in the first sentence
- Include the most important supporting details or evidence in the second sentence
- Conclude with key implications, outcomes, or conclusions in the third sentence
- If the text can be adequately summarized in fewer than 3 sentences, use fewer
- Maintain objectivity and use clear, straightforward language
- Preserve the original tone (academic, casual, technical) while remaining concise
- Do not introduce new information not present in the original text
- Format the summary as a single paragraph
- Do not include any other explanation or context except the summary itself
- Do not include any introductory text, nor concluding sentences, nor outroductory text, like "Here is the text with the requested formatting:"
Following the rules described above, please summarize the following text:
<text>
{query}
</text>
"""

    payload = {
        "model": model,
        "prompt": prompt_text,
        # Ask for the completion as a single JSON reply rather than a stream.
        "stream": False,
    }

    reply = make_post_request(url=api_url, data=payload)

    return reply["response"]

In [5]:
def load_doc(filepath: str, encoding: str = "utf-8") -> str:
    """
    Load the contents of a document from a specified file path.

    The file is decoded with an explicit encoding so the result does not
    depend on the locale of the machine running the notebook.

    :param filepath: The path to the file to be read.
    :param encoding: Text encoding of the file. Defaults to "utf-8".
    :return: The text content of the file.
    :raises OSError: If the file cannot be opened.
    :raises UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(filepath, 'r', encoding=encoding) as file:
        return file.read()

In [6]:
def save_text_to_file(text: str, filepath: str, encoding: str = "utf-8") -> None:
    """
    Save a string of text to a specified file path, replacing any existing file.

    Missing parent directories are created first, and the text is written with
    an explicit encoding so non-ASCII characters do not fail on machines whose
    locale encoding cannot represent them.

    :param text: The text to be saved.
    :param filepath: The path to the file to be written.
    :param encoding: Text encoding used for the file. Defaults to "utf-8".
    :raises OSError: If the directory or file cannot be created or written.
    """
    target = Path(filepath)
    # Create the output directory up front; a bare file name has "." as its
    # parent, for which this is a no-op.
    target.parent.mkdir(parents=True, exist_ok=True)

    with open(target, 'w', encoding=encoding) as file:
        file.write(text)

In [7]:
def split_text_into_chunks(text: str, chunk_size: int, overlap_size: int) -> list[str]:
    """
    Split a text into chunks of a specified size with a specified overlap.

    Consecutive chunks start ``chunk_size - overlap_size`` characters apart.
    Splitting stops as soon as a chunk reaches the end of the text, so the
    result never ends with a chunk made only of characters that the previous
    chunk already contains.

    :param text: The input text to be split into chunks.
    :param chunk_size: The size of each chunk. Must be positive.
    :param overlap_size: The number of characters that overlap between consecutive chunks. Must be smaller than chunk_size.
    :return: A list of text chunks; empty if the text is empty.
    :raises ValueError: If chunk_size is not positive, or overlap_size is not smaller than chunk_size.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    step = chunk_size - overlap_size
    if step <= 0:
        # With a non-positive step the start index would never advance and the
        # loop below would append chunks forever.
        raise ValueError(
            f"overlap_size ({overlap_size}) must be smaller than chunk_size ({chunk_size})")

    text_len = len(text)
    chunks: list[str] = []
    start = 0
    while start < text_len:
        end = min(start + chunk_size, text_len)
        chunks.append(text[start:end])
        if end == text_len:
            # The rest of the text is covered; another chunk would only repeat
            # the tail of this one.
            break
        start += step

    return chunks

In [8]:
def summarize_document(document_filepath: str, chunk_len: int = CHUNK_LEN, overlap_len: int = CHUNK_OVERLAP_LEN, len_limit: int = LEN_LIMIT) -> str:
    """
    Summarize a document by splitting it into chunks and summarizing each chunk, repeating the process on the concatenated summaries until their total length is within the specified limit.

    After every round the concatenated summaries are written to TEM_RESULT_FILEPATH.

    NOTE: there is no cap on the number of rounds. If the summaries never
    shrink to ``len_limit`` characters or fewer, this function does not return.

    :param document_filepath: The path to the document to be summarized.
    :param chunk_len: The length of each chunk.
    :param overlap_len: The length of the overlap between consecutive chunks.
    :param len_limit: The maximum allowed length of the concatenated summaries.
    :return: The original text if it already fits within len_limit; otherwise a single string in which each summary of the final round is prefixed with '  ø ' and summaries are separated by blank lines.
    """
    # Read the contents of the document
    content = load_doc(document_filepath)

    # A document that already fits within the limit needs no summarization.
    # The text itself is measured and returned: chunk lengths would count each
    # overlap twice, and joining the chunks would repeat the overlapped passages.
    if len(content) <= len_limit:
        return content

    # Initial split of the document into chunks
    chunks = split_text_into_chunks(content, chunk_len, overlap_len)

    # Summarize round after round until the concatenated summaries fit the limit
    iter_count = 0
    while True:
        iter_count += 1
        print(f"::: Iteration {iter_count} :::")
        # Despite the label, this first figure is the number of chunks.
        print(f"::: Total length of chunks: {len(chunks)}")
        print(
            f"::: Total length of concatenated chunks: {sum(len(chunk) for chunk in chunks)}")

        # Summaries produced in the current round
        processed_chunks = []

        for chunk_iter, chunk in enumerate(chunks, start=1):
            print(
                f"::: Processing chunk {iter_count}-{chunk_iter}/{len(chunks)}...")
            processed_chunks.append(
                generate_completion(query=chunk, model=DEFAULT_ASSISTANT_MODEL))

            # Pause after every PREVENT_OVERLOAD_PAUSE_PERIOD-th chunk to avoid
            # overloading the server; a value of 0 for either setting disables it.
            if PREVENT_OVERLOAD_PAUSE != 0 and PREVENT_OVERLOAD_PAUSE_PERIOD != 0 and chunk_iter % PREVENT_OVERLOAD_PAUSE_PERIOD == 0:
                print(
                    f"::: Pausing for {PREVENT_OVERLOAD_PAUSE} seconds...")
                sleep(PREVENT_OVERLOAD_PAUSE)

        # Concatenate this round's summaries and keep a copy on disk
        concatenated_chunks = ''.join(processed_chunks)
        save_text_to_file(concatenated_chunks, TEM_RESULT_FILEPATH)

        # Stop once the concatenated summaries fit within the limit
        if len(concatenated_chunks) <= len_limit:
            break

        # Re-split concatenated text into overlapping chunks for the next iteration
        chunks = split_text_into_chunks(
            concatenated_chunks, chunk_len, overlap_len)

    return '  ø ' + '\n\n  ø '.join(processed_chunks)

In [9]:
# Summarize the configured document.
document_filepath = f"{DOCUMENT_FILEPATH_BASE}/{DOCUMENT_FILENAME}"
summaryText = summarize_document(document_filepath=document_filepath)

# Build the output path from the run settings. Runs of non-word characters are
# replaced with "_" in the document name and removed from the model name.
escapedFilename = sub(r'\W+', '_', DOCUMENT_FILENAME)
escapedModelname = sub(r'\W+', '', DEFAULT_ASSISTANT_MODEL)
summary_filepath = f"{SUMMARY_FILEPATH_BASE}_{escapedFilename}_{escapedModelname}_{CHUNK_LEN}_{CHUNK_OVERLAP_LEN}_{LEN_LIMIT}.{SUMMARY_FILEPATH_EXT}"
save_text_to_file(summaryText, summary_filepath)

print("\n\n\n")
# Python f-strings interpolate with {...} alone; the former "${...}" printed a
# literal "$" in front of the summary.
print(f"::: Final Summary {summaryText} :::")


::: Iteration 1 :::
::: Total length of chunks: 30
::: Total length of concatenated chunks: 147599
::: Processing chunk 1-1/30...
::: Processing chunk 1-2/30...
::: Processing chunk 1-3/30...
::: Processing chunk 1-4/30...
::: Processing chunk 1-5/30...
::: Processing chunk 1-6/30...
::: Processing chunk 1-7/30...
::: Processing chunk 1-8/30...
::: Processing chunk 1-9/30...
::: Processing chunk 1-10/30...
::: Processing chunk 1-11/30...
::: Processing chunk 1-12/30...
::: Processing chunk 1-13/30...
::: Processing chunk 1-14/30...
::: Processing chunk 1-15/30...
::: Processing chunk 1-16/30...
::: Processing chunk 1-17/30...
::: Processing chunk 1-18/30...
::: Processing chunk 1-19/30...
::: Processing chunk 1-20/30...
::: Processing chunk 1-21/30...
::: Processing chunk 1-22/30...
::: Processing chunk 1-23/30...
::: Processing chunk 1-24/30...
::: Processing chunk 1-25/30...
::: Pausing for 10 seconds...
::: Processing chunk 1-26/30...
::: Processing chunk 1-27/30...
::: Processing c