<a href="https://colab.research.google.com/github/ana-bharadwaj/Textbook_Summarizer/blob/chief/SummarizerFinalPhy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install PyMuPDF
!pip install nltk
!pip install Pillow
!pip install transformers


Collecting PyMuPDF
  Downloading PyMuPDF-1.23.5-cp310-none-manylinux2014_x86_64.whl (4.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.23.5 (from PyMuPDF)
  Downloading PyMuPDFb-1.23.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.6/30.6 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.23.5 PyMuPDFb-1.23.5
Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31

In [None]:
!pip install spacy
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
import json
import fitz  # PyMuPDF
from nltk.corpus import stopwords
from nltk import sent_tokenize
from nltk import word_tokenize
from PIL import Image
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import io
import os
import heapq


# Function to extract summaries from a list of paragraphs
def summarize_text(paragraphs):
    """Build a short extractive summary for every paragraph.

    Each paragraph is scored sentence by sentence from its normalized word
    frequencies; the three highest-scoring sentences are then joined with
    single spaces, highest score first.

    Args:
        paragraphs: Iterable of paragraph strings.

    Returns:
        A list holding one summary string per input paragraph. A paragraph
        without any scored sentence yields an empty string.
    """
    def _summarize(text):
        # Score the sentences, then keep the three best-scoring ones.
        scores = findSentScore(findWordFreq(text), text)
        best_sentences = heapq.nlargest(3, scores, key=scores.get)
        return ' '.join(best_sentences)

    return [_summarize(text) for text in paragraphs]

# Find the word frequencies
def findWordFreq(paragraph):
    """Return normalized frequencies of the meaningful words in a paragraph.

    Tokens are lower-cased before filtering and counting. The NLTK stop-word
    list is lower-case and findSentScore looks words up in lower case, so
    counting case-sensitively would let capitalised stop words through and
    would make capitalised words impossible to match later.

    Args:
        paragraph: The paragraph text to analyse.

    Returns:
        A dict mapping each lower-cased, non-stop-word token to its count
        divided by the highest count in the paragraph (so values lie in
        (0, 1]). Empty when the paragraph has no countable token.
    """
    # A set keeps the stop-word test O(1) per token.
    stopwordEng = set(stopwords.words('english'))
    word_frequencies = {}

    for token in word_tokenize(paragraph):
        word = token.lower()
        if word in stopwordEng:
            continue
        # Punctuation-only tokens (".", ",", "(") carry no meaning and would
        # otherwise dominate the maximum used for normalization.
        if not any(ch.isalnum() for ch in word):
            continue
        word_frequencies[word] = word_frequencies.get(word, 0) + 1

    if not word_frequencies:
        return word_frequencies

    # Scale every count by the most frequent word's count.
    maximum_frequency = max(word_frequencies.values())
    return {
        word: count / maximum_frequency
        for word, count in word_frequencies.items()
    }


# Find sentence score
def findSentScore(wordFrequencies, paragraph):
    """Score each sentence of a paragraph by the weights of its words.

    Args:
        wordFrequencies: Mapping of word -> weight.
        paragraph: The paragraph text to split into sentences.

    Returns:
        A dict mapping a sentence to the sum of the weights of its
        lower-cased tokens that appear in ``wordFrequencies``. Sentences
        without any such token are left out of the dict.
    """
    scores = {}
    for sentence in sent_tokenize(paragraph):
        # Tokens are compared in lower case against the frequency table.
        for token in word_tokenize(sentence.lower()):
            if token not in wordFrequencies:
                continue
            weight = wordFrequencies[token]
            if sentence in scores:
                scores[sentence] += weight
            else:
                scores[sentence] = weight
    return scores

# Function to extract paragraphs from a PDF
def extract_paragraphs_from_pdf(pdf_document, heading_prefix="8."):
    """Split a PDF's text into sections delimited by numbered heading lines.

    A line that starts with ``heading_prefix`` opens a new section; every
    other line is appended (preceded by a single space) to the section that
    is currently open. Lines seen before the first heading are discarded.

    Args:
        pdf_document: Iterable of pages, each exposing ``get_text("text")``.
        heading_prefix: Prefix that marks a heading line. Defaults to "8.",
            the value that used to be hard-coded.

    Returns:
        A ``(headings, paragraphs, page_numbers)`` tuple where ``headings``
        is the list of heading lines, ``paragraphs`` is a list of
        ``(heading, content, page_number)`` tuples and ``page_numbers`` holds
        the 1-based page on which each heading was found.
    """
    headings = []
    paragraphs = []
    page_numbers = []
    current_heading = ""
    # Collect lines in a list and join once per section instead of growing a
    # string with += on every line.
    content_lines = []

    for page_number, page in enumerate(pdf_document, start=1):
        for line in page.get_text("text").strip().split('\n'):
            if not line.startswith(heading_prefix):
                content_lines.append(line)
                continue

            # A new heading closes the section that was open so far.
            if current_heading:
                content = "".join(f" {text}" for text in content_lines)
                paragraphs.append((current_heading, content, page_numbers[-1]))
            current_heading = line
            headings.append(line)
            page_numbers.append(page_number)
            content_lines = []

    # Flush the section that was still open at the end of the document.
    if current_heading:
        content = "".join(f" {text}" for text in content_lines)
        paragraphs.append((current_heading, content, page_numbers[-1]))

    return headings, paragraphs, page_numbers

# Function to check if a page has an image
def check_page_has_image(pdf_document, page_number):
    """Tell whether a page lists at least one image.

    Args:
        pdf_document: Indexable collection of pages exposing ``get_images``.
        page_number: 1-based page number.

    Returns:
        True when ``get_images(full=True)`` returns a non-empty list for the
        page, False otherwise.
    """
    # Page numbers are 1-based while document indexing is 0-based.
    target_page = pdf_document[page_number - 1]
    return bool(target_page.get_images(full=True))

# Function to save images from a PDF
def save_images_from_pdf(pdf_document,
                         output_dir="/content/drive/MyDrive/Peoples/images/Chem12Im",
                         prefix="CE8"):
    """Save every image listed on every page of a PDF as an RGB PNG file.

    Files are named ``{prefix}{page}_{index}.png`` where ``page`` is the
    1-based page number and ``index`` the 1-based position of the image in
    that page's image list.

    Args:
        pdf_document: Open PyMuPDF document.
        output_dir: Directory that receives the PNG files; it is created if
            it does not exist yet. Defaults to the previously hard-coded
            folder.
        prefix: Filename prefix. Defaults to the previously hard-coded
            "CE8".

    Returns:
        The list of paths written, in page order.
    """
    # Saving into a missing directory would raise FileNotFoundError.
    os.makedirs(output_dir, exist_ok=True)
    image_paths = []

    for page_number, page in enumerate(pdf_document, start=1):
        for img_index, image_info in enumerate(page.get_images(full=True), start=1):
            # The first item of each image entry is passed to extract_image.
            image_bytes = pdf_document.extract_image(image_info[0])["image"]
            image_path = os.path.join(
                output_dir, f"{prefix}{page_number}_{img_index}.png"
            )

            with Image.open(io.BytesIO(image_bytes)) as embedded:
                # Convert to RGB before saving so every file uses one mode.
                embedded.convert('RGB').save(image_path)
            image_paths.append(image_path)

    return image_paths

def _merge_consecutive_sections(paragraphs):
    """Merge neighbouring (heading, text, page) tuples that share a heading.

    Returns a list of ``(heading, text, page, paragraph_count)`` tuples; the
    page is the one on which the heading first appeared.
    """
    merged = []
    for heading, text, page in paragraphs:
        if merged and merged[-1][0] == heading:
            prev_heading, prev_text, prev_page, prev_count = merged[-1]
            merged[-1] = (prev_heading, prev_text + text, prev_page, prev_count + 1)
        else:
            merged.append((heading, text, page, 1))
    return merged


def _image_paths_for_page(pdf_document, page):
    """Return the image paths for a 1-based page, named as save_images_from_pdf names them."""
    image_count = len(pdf_document[page - 1].get_images(full=True))
    return [
        f"/content/drive/MyDrive/Peoples/images/Chem12Im/CE8{page}_{index}.png"
        for index in range(1, image_count + 1)
    ]


def _build_page_config(heading, text, page, paragraph_count, has_image, image_list):
    """Build the JSON "data" entry describing one section."""
    return {
        "pageNo": str(page),
        "page_meta": {
            "headers": "",
            "footers": "",
            "header_image": "",
            "footer_image": "",
            "number_of_paragraphs": paragraph_count
        },
        "content": [
            {
                "idx": heading,
                "text": text,
                "has_image": has_image,
                "image_list": image_list
            }
        ]
    }


if __name__ == "__main__":
    pdf_path = "/content/drive/MyDrive/Peoples/Chem12/lech203.pdf"
    # NOTE(review): the document is left open because later notebook cells
    # may still use `pdf_document`; close it once nothing else needs it.
    pdf_document = fitz.open(pdf_path)
    headings, paragraphs, page_numbers = extract_paragraphs_from_pdf(pdf_document)
    has_images = []

    if headings and paragraphs:
        start_page = page_numbers[0]
        end_page = page_numbers[-1]
        # page_numbers holds one entry per heading, so its length is a
        # heading count; the page count comes from the document itself.
        total_pages = len(pdf_document)
        # NOTE(review): headings are detected with the "8." prefix; confirm
        # that the chapter title below matches the PDF being processed.
        output_json = {
            "name": "Chemistry",
            "class": "12",
            "chapter": "The d & f Block Elements",
            "meta": {
                "title": "The d & f Block Elements",
                "description": "",
                "cover_image": "",
                "start_page": start_page,
                "end_page": end_page,
                "total_pages": total_pages
            },
            "config": []
        }

        # Each tuple in `paragraphs` already carries its own heading, text
        # and page, so every section is reported with the page it starts on.
        sections = _merge_consecutive_sections(paragraphs)
        for i, (heading, text, page, paragraph_count) in enumerate(sections, start=1):
            has_image = check_page_has_image(pdf_document, page)
            has_images.append(has_image)
            image_list = _image_paths_for_page(pdf_document, page) if has_image else []
            summary = summarize_text([text])[0]

            config_data = _build_page_config(
                heading, text, page, paragraph_count, has_image, image_list
            )
            output_json["config"].append({"data": [config_data]})

            print(f"Paragraph {i} (Page {page}):")
            print(f"Heading: {heading}")
            print(f"Content: {text}\n")
            print(f"Summarized Content: {summary}\n")

        # Save the JSON structure to a file
        with open("/content/drive/MyDrive/Peoples/Chem12json/lech203", "w") as json_file:
            json.dump(output_json, json_file, indent=4)

        # Save images from the PDF
        image_paths = save_images_from_pdf(pdf_document)

    else:
        print("No headings and paragraphs found in the PDF.")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Paragraph 2 (Page 2):
Heading: 8.1.1
Content:  Nomenclature I. Aldehydes and ketones Aldehydes and ketones are the simplest and most important carbonyl compounds. There are two systems of nomenclature of aldehydes and ketones. (a) Common names Aldehydes and ketones are often called by their common names instead of IUPAC names. The common names of most aldehydes are derived from the common names of the corresponding carboxylic acids [Section 8.6.1] by replacing the ending –ic of acid with aldehyde. At the same time, the names reflect the Latin or Greek term for the original source of the acid or aldehyde. The location of the substituent in the carbon chain is indicated by Greek letters a, b, g, d, etc. The a-carbon being the one directly linked to the aldehyde group, b- carbon the next, and so on. For example

Summarized Content: The location of the substituent in the carbon chain is indicated by Greek letters a, b, g, d, etc. The common names of most aldehydes are derived from the comm