In [5]:
!pip install pdfplumber pandas PyMuPDF nltk




In [6]:
!python -m pip install --upgrade pip




In [7]:
# Standard library
import os
import warnings

# Third-party
import fitz  # PyMuPDF for highlighting
import nltk
import pandas as pd
import pdfplumber
from nltk.tokenize import sent_tokenize

# Fetch the NLTK Punkt resources required for sentence tokenization.
for resource_name in ("punkt", "punkt_tab"):
    nltk.download(resource_name)

# NOTE(review): this silences every warning for the rest of the session,
# including ones that may point at real problems; consider narrowing it.
warnings.filterwarnings("ignore")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ARYAN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ARYAN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [8]:
# Folder that holds the PDFs to search. Set the PDF_SEARCH_DIR environment
# variable to point the notebook at another location; when it is not set the
# original project folder is used, so existing runs behave as before.
folder_path = os.environ.get(
    "PDF_SEARCH_DIR",
    r"C:\Users\ARYAN\Documents\PDF_Search_Summary_Project\pdfs",
)

In [9]:
# Collect the PDF files found directly inside folder_path.
# - sorted(): os.listdir returns entries in arbitrary order, so sorting keeps
#   the processing order (and the row order of the results) reproducible.
# - isfile(): skips directories whose name happens to end in ".pdf".
pdf_files = sorted(
    name
    for name in os.listdir(folder_path)
    if name.lower().endswith(".pdf")
    and os.path.isfile(os.path.join(folder_path, name))
)
print(" Found PDF files:")
for file in pdf_files:
    print("-", file)

 Found PDF files:
- CS-101-1.3.1_History-of-Java-Programming-Language.pdf
- interface and abstract class.pdf
- OLevel_2_B4_CLang_26Mar_SS.pdf


In [10]:
# Read a comma-separated keyword list, lower-case each entry and drop blanks.
search_input = input("Enter keywords : ")
normalized_terms = (term.strip().lower() for term in search_input.split(","))
# dict.fromkeys removes repeats while keeping first-seen order; a keyword
# entered twice would otherwise yield duplicate result rows and doubled counts.
search_terms = list(dict.fromkeys(term for term in normalized_terms if term))
print(search_terms)


Enter keywords :  java,programming,history


['java', 'programming', 'history']


In [11]:

# Scan every page of every PDF sentence by sentence. Each keyword hit is stored
# as one row in `results`; `keyword_counts` accumulates the total number of
# occurrences per keyword across all files.
results = []
keyword_counts = {}

# Pair each keyword with a whitespace-normalised form used only for matching;
# the original spelling is what gets reported.
term_patterns = [(term, " ".join(term.split())) for term in search_terms]

for file in pdf_files:
    file_path = os.path.join(folder_path, file)
    with pdfplumber.open(file_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue  # extract_text() returned nothing for this page

            for sentence in sent_tokenize(text):
                # Extracted sentences keep the PDF's line breaks. Collapsing
                # all whitespace runs to single spaces lets a multi-word
                # keyword match even when the phrase wraps onto the next line.
                lowered = " ".join(sentence.lower().split())
                for term, pattern in term_patterns:
                    if pattern not in lowered:
                        continue
                    results.append({
                        "Filename": file,
                        "Page": page_number,
                        "Keyword": term,
                        "Sentence": sentence.strip()
                    })
                    keyword_counts[term] = keyword_counts.get(term, 0) + lowered.count(pattern)


Cannot set gray non-stroke color because /'Pattern1' is an invalid float value


In [12]:
# Build the results table. Naming the columns explicitly means a search with
# no hits still yields an empty frame with the expected header, instead of a
# column-less frame (and a header-less CSV when it is saved).
df = pd.DataFrame(results, columns=["Filename", "Page", "Keyword", "Sentence"])
df.head(10)



Unnamed: 0,Filename,Page,Keyword,Sentence
0,CS-101-1.3.1_History-of-Java-Programming-Langu...,1,java,“History of Java Programming Language”\nFree J...
1,CS-101-1.3.1_History-of-Java-Programming-Langu...,1,programming,“History of Java Programming Language”\nFree J...
2,CS-101-1.3.1_History-of-Java-Programming-Langu...,1,history,“History of Java Programming Language”\nFree J...
3,CS-101-1.3.1_History-of-Java-Programming-Langu...,1,java,Unlike conventional languages\nwhich are gener...
4,CS-101-1.3.1_History-of-Java-Programming-Langu...,1,java,"Java is only distantly related to JavaScript, ..."
5,CS-101-1.3.1_History-of-Java-Programming-Langu...,1,java,History\nJava was started as a project called ...
6,CS-101-1.3.1_History-of-Java-Programming-Langu...,1,history,History\nJava was started as a project called ...
7,CS-101-1.3.1_History-of-Java-Programming-Langu...,1,java,The first public\nimplementation was Java 1.0 ...
8,CS-101-1.3.1_History-of-Java-Programming-Langu...,1,java,New versions for large and small platforms (J2...
9,CS-101-1.3.1_History-of-Java-Programming-Langu...,1,java,"Sun has not announced any\nplans for a ""Java 3""."


In [17]:
# Write all matched sentences to disk; index=False leaves the row index out.
results_csv = "search_results.csv"
df.to_csv(results_csv, index=False)
print(f" Search results saved to '{results_csv}'")


 Search results saved to 'search_results.csv'


In [15]:
# Summarise how often each keyword occurred. Every requested keyword starts at
# zero so that terms without a single match still appear in the summary; the
# counts gathered during the search are then laid over those defaults.
counts_by_keyword = {**dict.fromkeys(search_terms, 0), **keyword_counts}
count_df = pd.DataFrame(list(counts_by_keyword.items()), columns=["Keyword", "Count"])
count_df.to_csv("keyword_counts.csv", index=False)
count_df.head()


Unnamed: 0,Keyword,Count
0,java,108
1,programming,21
2,history,10


In [16]:
# Write a copy of every PDF with all keyword occurrences highlighted.
highlighted_folder = "highlighted_pdfs"
os.makedirs(highlighted_folder, exist_ok=True)

for file in pdf_files:
    file_path = os.path.join(folder_path, file)
    skipped = 0

    # The context manager closes the document even if searching or saving fails.
    with fitz.open(file_path) as doc:
        for page in doc:
            for term in search_terms:
                for match in page.search_for(term):
                    try:
                        page.add_highlight_annot(match)
                    except Exception:
                        # Best effort: one bad match must not abort the file.
                        # Unlike a bare "except:", this lets KeyboardInterrupt
                        # and SystemExit through, and the skip is counted.
                        skipped += 1

        output_path = os.path.join(highlighted_folder, f"highlighted_{file}")
        doc.save(output_path)

    if skipped:
        print(f"{file}: skipped {skipped} highlight(s) that could not be added")

print("Highlighted PDFs saved in 'highlighted_pdfs' folder")

Highlighted PDFs saved in 'highlighted_pdfs' folder
