# Install libraries for PDF processing, concurrency, and MongoDB integration. These libraries will handle PDF parsing (PyPDF2 or pdfminer.six), database integration (pymongo), and concurrency (concurrent.futures). For natural language processing tasks like summarization and keyword extraction, use nltk or create custom algorithms.

In [1]:
!pip install PyPDF2 pdfminer.six pymongo nltk

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pdfminer.six
  Downloading pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Collecting pymongo
  Downloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20240706-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloa

# Connect to MongoDB: Use the connection string to connect to your MongoDB database.

In [2]:
!pip install pymongo



In [3]:
from pymongo import MongoClient
from urllib.parse import quote_plus
connection_string = 'mongodb+srv://anujchauhan05130:123456789io@anujchauhan001.zdwry.mongodb.net/?retryWrites=true&w=majority&appName=AnujChauhan001'
client = MongoClient(connection_string)
db = client['Anujchauhan001']
collection = db['pdf_metadata']

print("Connected to MongoDB Atlas!")


Connected to MongoDB Atlas!


# Creating and Storing PDF Metadata in MongoDB


In [5]:
from datetime import datetime

pdf_metadata = {
    "name": "Sample PDF",
    "path": "/content/sample.pdf",
    "size": 2048,
    "summary": "This is a summary of the PDF content.",
    "keywords": ["keyword1", "keyword2", "keyword3"],
    "uploaded_at": datetime.utcnow()
}

print("PDF metadata has been successfully created!")


PDF metadata has been successfully created!


# This code is designed to process PDF files in a specified folder, extract text from them, generate summaries, and identify relevant keywords. It then stores this information in a MongoDB database for easy retrieval and analysis.

In [11]:
from datetime import datetime
import os
import json
from PyPDF2 import PdfReader
from pymongo import MongoClient
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
import threading
import logging
import time

nltk.download('punkt')

# Set up logging for error handling
logging.basicConfig(filename='pdf_processing.log', level=logging.ERROR)

# MongoDB Connection
client = MongoClient('mongodb://localhost:27017/')  # Update with your MongoDB connection string
db = client['pdf_database']  # Replace with your database name
collection = db['pdf_metadata']  # Replace with your collection name

def summarize_text(text):
    sentences = sent_tokenize(text)
    if len(sentences) == 0:
        return ""
    elif len(sentences) <= 3:
        return text
    elif len(sentences) <= 10:
        return ' '.join(sentences[:3])
    else:
        return ' '.join(sentences[:5])

def extract_keywords(text, num_keywords=5):
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalpha()]
    word_counts = Counter(words)
    keywords = word_counts.most_common(num_keywords)
    return [word for word, _ in keywords if word not in {"the", "is", "at", "on", "and"}]

def process_pdf(pdf_path):
    try:
        with open(pdf_path, 'rb') as file:
            reader = PdfReader(file)
            full_text = ""
            for page in reader.pages:
                text = page.extract_text()
                if text:
                    full_text += text + "\n"

        if not full_text.strip():  # Check if full_text is empty
            raise ValueError(f"No text found in {pdf_path}")

        summary = summarize_text(full_text)
        keywords = extract_keywords(full_text)

        pdf_metadata = {
            "name": os.path.basename(pdf_path),
            "path": pdf_path,
            "size": os.path.getsize(pdf_path),
            "summary": summary,
            "keywords": keywords,
            "uploaded_at": datetime.utcnow()
        }

        # Update MongoDB
        try:
            collection.update_one(
                {"path": pdf_path},
                {"$set": {
                    "summary": summary,
                    "keywords": keywords,
                    "size": pdf_metadata["size"],
                    "uploaded_at": pdf_metadata["uploaded_at"]
                }},
                upsert=True
            )
        except Exception as mongo_error:
            logging.error(f"MongoDB update failed for {pdf_path}: {mongo_error}")

        print(f"Processed and updated: {pdf_metadata['name']}")
        print(f"Size: {pdf_metadata['size']} bytes")
        print(f"Uploaded at: {pdf_metadata['uploaded_at']}")
        print("Summarized Successfully")

    except Exception as e:
        logging.error(f"Error processing {pdf_path}: {e}")
        print(f"Failed to process {pdf_path}. Check log for details.")

def process_pdfs_in_folder(folder_path):
    pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
    threads = []

    start_time = time.time()  # Start the timer for performance metrics

    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)
        thread = threading.Thread(target=process_pdf, args=(pdf_path,))
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    end_time = time.time()  # End the timer
    total_time = end_time - start_time
    print(f"Total time taken to process PDFs: {total_time:.2f} seconds")

folder_path = '/content/pdf_files'
process_pdfs_in_folder(folder_path)
print("***************Summarized Every PDF Dataset Successfully ***************")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
ERROR:root:MongoDB update failed for /content/pdf_files/Sample_Set5.pdf: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6706370663b3e5a6506e4e4e, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>


Processed and updated: Sample_Set5.pdf
Size: 335126 bytes

ERROR:root:MongoDB update failed for /content/pdf_files/Sample_Set12.pdf: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6706370663b3e5a6506e4e4e, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>



Uploaded at: 2024-10-09 07:55:53.368261
Summarized Successfully
Processed and updated: Sample_Set12.pdf
Size: 268943 bytes
Uploaded at: 2024-10-09 07:56:04.524121
Summarized Successfully


ERROR:root:MongoDB update failed for /content/pdf_files/Sample_Set8.pdf: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6706370663b3e5a6506e4e4e, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>


Processed and updated: Sample_Set8.pdf
Size: 298845 bytes
Uploaded at: 2024-10-09 07:56:07.056469


ERROR:root:MongoDB update failed for /content/pdf_files/Sample_Set14.pdf: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6706370663b3e5a6506e4e4e, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>


Summarized Successfully
Processed and updated: Sample_Set14.pdf
Size: 393198 bytes
Uploaded at: 2024-10-09 07:56:07.410379
Summarized Successfully


ERROR:root:MongoDB update failed for /content/pdf_files/Sample_Set9.pdf: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6706370663b3e5a6506e4e4e, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>


Processed and updated: Sample_Set9.pdf
Size: 310088 bytes
Uploaded at: 2024-10-09 07:56:10.738137
Summarized Successfully


ERROR:root:MongoDB update failed for /content/pdf_files/Sample_Set18.pdf: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6706370663b3e5a6506e4e4e, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>


Processed and updated: Sample_Set18.pdf
Size: 13767577 bytes
Uploaded at: 2024-10-09 07:56:15.704082
Summarized Successfully


ERROR:root:MongoDB update failed for /content/pdf_files/Sample_Set6.pdf: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6706370663b3e5a6506e4e4e, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>


Processed and updated: Sample_Set6.pdf
Size: 468975 bytes
Uploaded at: 2024-10-09 07:56:16.092429
Summarized Successfully


ERROR:root:MongoDB update failed for /content/pdf_files/Sample_Set11.pdf: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6706370663b3e5a6506e4e4e, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>
ERROR:root:MongoDB update failed for /content/pdf_files/Sample_Set7.pdf: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6706370663b3e5a6506e4e4e, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connec

Processed and updated: Sample_Set11.pdf
Size: 551321 bytes
Uploaded at: 2024-10-09 07:56:17.115096
Summarized Successfully
Processed and updated: Sample_Set7.pdf
Size: 468890 bytes
Uploaded at: 2024-10-09 07:56:17.219684
Summarized Successfully


ERROR:root:MongoDB update failed for /content/pdf_files/Sample_Set1.pdf: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6706370663b3e5a6506e4e4e, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>


Processed and updated: Sample_Set1.pdf
Size: 650025 bytes
Uploaded at: 2024-10-09 07:56:18.286098
Summarized Successfully


ERROR:root:MongoDB update failed for /content/pdf_files/Sample_Set15.pdf: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6706370663b3e5a6506e4e4e, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>


Processed and updated: Sample_Set15.pdf
Size: 502052 bytes
Uploaded at: 2024-10-09 07:56:21.044150
Summarized Successfully


ERROR:root:MongoDB update failed for /content/pdf_files/Sample_Set16.pdf: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6706370663b3e5a6506e4e4e, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>


Processed and updated: Sample_Set16.pdf
Size: 234382 bytes
Uploaded at: 2024-10-09 07:56:21.993693
Summarized Successfully


ERROR:root:MongoDB update failed for /content/pdf_files/Sample_Set13.pdf: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6706370663b3e5a6506e4e4e, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>


Processed and updated: Sample_Set13.pdf
Size: 471370 bytes
Uploaded at: 2024-10-09 07:56:23.283902
Summarized Successfully


ERROR:root:MongoDB update failed for /content/pdf_files/Sample_Set17.pdf: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6706370663b3e5a6506e4e4e, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>


Processed and updated: Sample_Set17.pdf
Size: 2232079 bytes
Uploaded at: 2024-10-09 07:56:38.443219
Summarized Successfully


ERROR:root:MongoDB update failed for /content/pdf_files/Sample_Set10.pdf: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6706370663b3e5a6506e4e4e, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>


Processed and updated: Sample_Set10.pdf
Size: 2126778 bytes
Uploaded at: 2024-10-09 07:56:55.872765
Summarized Successfully


ERROR:root:MongoDB update failed for /content/pdf_files/Sample_Set4.pdf: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6706370663b3e5a6506e4e4e, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>


Processed and updated: Sample_Set4.pdf
Size: 1324200 bytes
Uploaded at: 2024-10-09 07:57:03.439433
Summarized Successfully


ERROR:root:MongoDB update failed for /content/pdf_files/Sample_Set2.pdf: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6706370663b3e5a6506e4e4e, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>


Processed and updated: Sample_Set2.pdf
Size: 5323822 bytes
Uploaded at: 2024-10-09 07:57:21.252667
Summarized Successfully


ERROR:root:MongoDB update failed for /content/pdf_files/Sample_Set3.pdf: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6706370663b3e5a6506e4e4e, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>


Processed and updated: Sample_Set3.pdf
Size: 2413611 bytes
Uploaded at: 2024-10-09 07:57:29.006653
Summarized Successfully
Total time taken to process PDFs: 129.10 seconds
***************Summarized Every PDF Dataset Successfully ***************
