In [1]:
import os
import pandas as pd
import numpy as np
from BookSpliter import split_book
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
#from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the Pegasus seq2seq model and its matching tokenizer; both are read
# from the same checkpoint identifier so they stay in sync.
model_checkpoint = "pegasus_summaryflow_model"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Disabled alternatives: stock summarization pipelines (need the `pipeline` import).
#summarizer = pipeline("summarization", model="google/pegasus-large")
#summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

In [3]:
# Split a count into one or two parts that are each divisible by 2 or 3
def divide_number(number):
    """Express ``number`` as one or two parts that are each divisible by 2 or 3.

    Returns:
        A 1-tuple ``(number,)`` when ``number`` is 1, 2 or 3, or is itself
        divisible by 2 or 3. Otherwise a 2-tuple ``(large_part, small_part)``
        whose members sum to ``number`` and are each divisible by 2 or 3.
    """
    def _is_groupable(value):
        # True when the value can be cut evenly into pairs or triples.
        return value % 3 == 0 or value % 2 == 0

    # 1 is not divisible by 2 or 3 but is still returned whole.
    if number in (1, 2, 3) or _is_groupable(number):
        return (number,)

    # Move one unit at a time from the large part to the small part until
    # both parts can be grouped.
    large_part, small_part = number - 2, 2
    while not (_is_groupable(large_part) and _is_groupable(small_part)):
        large_part, small_part = large_part - 1, small_part + 1
    return large_part, small_part

In [4]:
# Summarize the book recursively
def summarize_book(splited_book):
    """Recursively condense a list of paragraphs into a single summary string.

    Each paragraph is summarized on its own, the summaries are merged two or
    three at a time into new paragraphs, and the process repeats on the merged
    list until one paragraph is left; the summary of that paragraph is returned.

    Args:
        splited_book: Sequence of paragraph strings.

    Returns:
        The final summary string. For an empty ``splited_book`` the fixed
        message 'Something went wrong. The book is empty.' is returned.

    Relies on the notebook-level ``model``, ``tokenizer`` and ``divide_number``.
    """
    # Guard first: indexing splited_book[0] below would raise IndexError on an
    # empty book, and the error text must not be passed through the model.
    if len(splited_book) == 0:
        return 'Something went wrong. The book is empty.'

    print('Length of splited book is : ', len(splited_book))

    if len(splited_book) == 1:  # Base case (summary of the whole book)
        final_summary = _summarize_text(splited_book[0])
        print('Last paragraph : ', splited_book[0])
        return final_summary

    print('First paragraph : ', splited_book[0])
    summary_list = []
    for position, paragraph in enumerate(splited_book, start=1):
        summary = _summarize_text(paragraph)
        print('Summary of paragraph number ', position, ' : ')
        print(summary)
        summary_list.append(summary)

    # Every merge shrinks the list by a factor of 2 or 3, so the recursion
    # always reaches the single-paragraph base case.
    summarized_book = _merge_summaries(summary_list)
    print('A level is summarized successfully!!')
    return summarize_book(summarized_book)


def _summarize_text(text):
    """Summarize one piece of text with the notebook-level model and tokenizer."""
    # Input is truncated to 1024 tokens; anything beyond that is not seen by the model.
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs.input_ids.to(model.device))
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def _merge_summaries(summary_list):
    """Concatenate consecutive summaries in groups of 2 or 3.

    Expects at least two summaries. When the count is not divisible by 2 or 3,
    ``divide_number`` supplies a leading group of 2 or 3 so that the remainder
    can be split evenly.
    """
    # NOTE(review): summaries are concatenated with no separator, so the last
    # word of one summary touches the first word of the next - confirm intended.
    merged = []
    start = 0
    remaining = len(summary_list)

    result = divide_number(remaining)
    if len(result) == 2:  # The number of summaries is not divisible by 2 or 3
        large_part, small_part = result
        print(f"The summaries size {len(summary_list)} is divided into {large_part} and {small_part}")
        # The first small_part (2 or 3) summaries form their own paragraph.
        merged.append(''.join(summary_list[:small_part]))
        start = small_part
        remaining = large_part

    # Prefer triples when the remaining count allows it, otherwise use pairs.
    group_size = 3 if remaining % 3 == 0 else 2
    for index in range(start, len(summary_list), group_size):
        merged.append(''.join(summary_list[index:index + group_size]))
    return merged


In [5]:
def save_summary(book_name, summary, summaries_folder=r"D:\Gethub\SummaryFlow\Summaries"):
    """Write a book summary to ``<summaries_folder>/<book_name> summary.txt``.

    Args:
        book_name: Title used to build the output file name.
        summary: Summary text to write.
        summaries_folder: Directory that receives the file; it is created if
            missing. Defaults to the project's original Summaries folder, so
            existing two-argument calls behave as before.
    """
    # Create the output folder (and any missing parents) if it doesn't already exist
    os.makedirs(summaries_folder, exist_ok=True)

    # Generate the file path for the summary text file
    summary_file_path = os.path.join(summaries_folder, book_name + " summary.txt")

    # UTF-8 is set explicitly so non-ASCII characters in the summary are written
    # the same way regardless of the platform's default encoding.
    with open(summary_file_path, 'w', encoding='utf-8') as f:
        f.write(summary)

In [7]:
# Trial run on a book whose content is already known

# Title of the book and location of its text file
book_name = "The Alchemist"
book_path = r"D:\Gethub\SummaryFlow\WholeNovels\The Alchemist.txt"

# Break the book into paragraphs
splited_book = split_book(book_path)

# Build the summary and display it
final_summary = summarize_book(splited_book)
print("Book Summary : ")
print(final_summary)

# Store the summary in a text file
save_summary(book_name, final_summary)


Max size of paragraph is  1024
The Alchemist  splited successfully !!
Length of splited book is :  25
First paragraph :  And little by little, my dream was becoming reality. Ten, a hundred, a thousand, a million copies sold in America. One day, a Brazilian journalist phoned to say that President Clinton had been photographed reading the book. Some time later, when I was in Turkey, I opened the magazine Vanity Fair and there was Julia Roberts declaring that she adored the book. Walking alone down a street in Miami, I heard a girl telling her mother: “You must read The Alchemist!” The book has been translated into fifty-six languages, has sold more than twenty million copies, and people are beginning to ask: What’s the secret behind such a huge success? The only honest response is: I don’t know. All I know is that, like Santiago the shepherd boy, we all need to be aware of our personal calling. What is a personal calling? It is God’s blessing, it is the path that God chose for you here o

In [None]:
# Trial run on a large book (2.6 MB)

# Title of the book and location of its text file
book_name = "The Count of Monte Cristo"
book_path = r"D:\Gethub\SummaryFlow\WholeNovels\The Count of Monte Cristo.txt"

# Break the book into paragraphs
splited_book = split_book(book_path)

# Build the summary and display it
final_summary = summarize_book(splited_book)
print("Book Summary : ")
print(final_summary)

# Store the summary in a text file
save_summary(book_name, final_summary)