In [None]:
import os
import fitz
import pytesseract 
from pdf2image import convert_from_path  
from transformers import pipeline 
import pandas as pd
import shutil 
import textwrap  

# Fail fast when the Tesseract binary is not on PATH: the pytesseract-based OCR
# fallback (extract_text_from_images) shells out to it.
# SystemExit is raised directly instead of calling the interactive ``exit``
# helper so that execution really stops here and the reason is reported; a
# string argument is printed to stderr and yields exit status 1.
if shutil.which("tesseract") is None:
    raise SystemExit(
        "Tesseract OCR is not installed or not on PATH; "
        "install it before running this script."
    )

# Summarization pipeline shared by every summarize_text() call; created once
# because loading the model is expensive.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Directory scanned for PDF files (the current working directory).
pdf_dir = os.getcwd()
# Result rows collected by process_pdfs(), one dict per processed PDF.
output_data = []

def extract_text_from_pdf(pdf_path):
    """Return the embedded text of a PDF with outer whitespace stripped.

    Args:
        pdf_path: Path of the PDF file, opened with PyMuPDF (``fitz``).

    Returns:
        The ``"text"`` extraction of every page, concatenated in page order
        and stripped. The result is an empty string when no page yields any
        text, which callers use as the signal to fall back to OCR.
    """
    # The context manager closes the document even if a page fails to parse;
    # previously the handle was never released.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]
    return "".join(page_texts).strip()

def extract_text_from_images(pdf_path):
    """OCR every page of a PDF and return the combined, stripped text.

    Args:
        pdf_path: Path of the PDF file handed to ``convert_from_path``.

    Returns:
        The ``pytesseract.image_to_string`` output of each rendered page,
        concatenated in page order with outer whitespace removed.
    """
    page_images = convert_from_path(pdf_path)
    ocr_parts = [
        pytesseract.image_to_string(page_image) for page_image in page_images
    ]
    return "".join(ocr_parts).strip()

def chunk_text(text, chunk_size=512):
    """Split text into pieces of at most ``chunk_size`` characters.

    Splitting uses ``textwrap`` line wrapping with its default settings, so
    breaks fall on whitespace where possible, whitespace characters such as
    newlines are turned into spaces, and words longer than ``chunk_size``
    are cut.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per piece.

    Returns:
        A list of strings; empty when ``text`` holds no non-whitespace
        characters.
    """
    wrapper = textwrap.TextWrapper(width=chunk_size)
    return wrapper.wrap(text)

def summarize_text(text, max_length=200):
    """Summarize text chunk by chunk and join the partial summaries.

    Text shorter than 50 characters is returned unchanged. Longer text is
    split by ``chunk_text`` into pieces of at most 500 characters; each piece
    is passed to the module-level ``summarizer`` and the results are joined
    with single spaces.

    The generation limits are derived per chunk instead of using a fixed
    ``max_length``/``min_length`` pair: a 500-character chunk is typically
    shorter than 200 tokens, so the fixed values made the pipeline warn on
    every call and allowed "summaries" longer than their input.

    Args:
        text: The text to summarize.
        max_length: Upper bound on the generated length per chunk. It is
            lowered for chunks that contain fewer words than this value.

    Returns:
        The joined chunk summaries. A chunk that is too small to summarize,
        or whose summarization raises, is included verbatim.
    """
    if len(text) < 50:
        return text

    min_words_to_summarize = 20  # below this a chunk is kept as-is
    default_min_length = 50

    summaries = []
    for chunk in chunk_text(text, chunk_size=500):
        word_count = len(chunk.split())
        if word_count < min_words_to_summarize:
            # Nothing meaningful to condense; keep the content instead of
            # asking the model for a summary of a few words.
            summaries.append(chunk)
            continue

        # The word count is used as a cheap stand-in for the input length
        # (presumably every word maps to at least one model token), so the
        # output limit never exceeds the input size.
        chunk_max = min(max_length, word_count)
        # Keep min_length feasible: never above the effective max_length.
        chunk_min = min(default_min_length, chunk_max // 2)

        try:
            result = summarizer(
                chunk,
                max_length=chunk_max,
                min_length=chunk_min,
                do_sample=False,
            )
            summaries.append(result[0]["summary_text"])
        except Exception as e:
            # Best effort: a failing chunk is kept verbatim rather than lost.
            print(f"Skipping chunk due to error: {e}")
            summaries.append(chunk)

    return " ".join(summaries)

def process_pdfs():
    """Summarize every PDF in ``pdf_dir`` and write the results to CSV.

    For each PDF file the embedded text is extracted; when none is found the
    pages are OCR'd instead. The text is summarized and one row per file is
    stored in the module-level ``output_data`` list, which is then written to
    ``summarized_pdfs.csv`` in the current working directory.

    ``output_data`` is emptied first so that calling this function again
    (for example by re-running the notebook cell) does not duplicate rows.
    Files are matched case-insensitively on the ``.pdf`` extension and
    processed in sorted name order so the CSV is reproducible.

    Returns:
        None. When no PDF files are found a message is printed and no CSV
        is written.
    """
    output_data.clear()

    pdf_files = sorted(
        name
        for name in os.listdir(pdf_dir)
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(pdf_dir, name))
    )

    if not pdf_files:
        print("No PDF files found.")
        return

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_dir, pdf_file)
        print(f"📄 Processing: {pdf_file}")

        extracted_text = extract_text_from_pdf(pdf_path)

        if not extracted_text:
            # No text layer (e.g. a scanned document): fall back to OCR.
            print(f"Applying OCR on {pdf_file} as no direct text was found.")
            extracted_text = extract_text_from_images(pdf_path)

        summary = summarize_text(extracted_text)

        output_data.append({
            "pdf_file": pdf_file,
            # Only a 500-character preview of the source text is stored.
            "extracted_text": extracted_text[:500],
            "summary": summary,
        })

    df = pd.DataFrame(output_data)
    df.to_csv("summarized_pdfs.csv", index=False)

# Run the full pipeline when this cell executes; results are written to
# summarized_pdfs.csv in the current working directory.
process_pdfs()


❌ Tesseract OCR is not installed. Install it using: brew install tesseract


Device set to use mps:0


🔍 Found 6 PDF files. Starting processing...
📄 Processing: astrazeneca.pdf


Your max_length is set to 200, but your input_length is only 107. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)
Your max_length is set to 200, but your input_length is only 101. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 200, but your input_length is only 106. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)
Your max_length is set to 200, but your input_length is only 112. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


📄 Processing: ltimindtree.pdf


Your max_length is set to 200, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 200, but your input_length is only 98. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)
Your max_length is set to 200, but your input_length is only 113. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)
Your max_length is set to 200, but your input_length is only 97. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)
Yo

📄 Processing: nxtra airtel.pdf


Your max_length is set to 200, but your input_length is only 151. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=75)
Your max_length is set to 200, but your input_length is only 144. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=72)
Your max_length is set to 200, but your input_length is only 105. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)
Your max_length is set to 200, but your input_length is only 98. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)
Y

📄 Processing: intel.pdf


Your max_length is set to 200, but your input_length is only 134. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=67)
Your max_length is set to 200, but your input_length is only 158. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=79)
Your max_length is set to 200, but your input_length is only 143. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=71)
Your max_length is set to 200, but your input_length is only 157. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=78)


📄 Processing: tata.pdf


Your max_length is set to 200, but your input_length is only 132. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=66)
Your max_length is set to 200, but your input_length is only 111. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)
Your max_length is set to 200, but your input_length is only 122. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)
Your max_length is set to 200, but your input_length is only 144. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=72)


📄 Processing: wipro.pdf


Your max_length is set to 200, but your input_length is only 107. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)
Your max_length is set to 200, but your input_length is only 97. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)
Your max_length is set to 200, but your input_length is only 97. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)
Your max_length is set to 200, but your input_length is only 94. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)
You

Summarization completed. Results saved to summarized_pdfs.csv


In [None]:
import pandas as pd

# Post-process the summary CSV: blank out missing values and cap the length
# of the two text columns, then save the result under a new name.
input_csv = "summarized_pdfs.csv"
output_csv = "cleaned_summarized_pdfs.csv"
max_text_length = 2000  # Adjust this limit as needed

# Read every column as text so the length checks below work on strings.
df = pd.read_csv(input_csv, dtype=str, encoding="utf-8")
df.fillna("", inplace=True)  # Replace NaN values with empty strings

# Values longer than the limit are cut and marked with a trailing ellipsis.
for column in ("extracted_text", "summary"):
    df[column] = df[column].apply(
        lambda value: value
        if len(value) <= max_text_length
        else value[:max_text_length] + "..."
    )

df.to_csv(output_csv, index=False, encoding="utf-8")
print(f"Fixed CSV saved as: {output_csv}")

✅ Fixed CSV saved as: cleaned_summarized_pdfs.csv
