In [None]:
import PyPDF2
import json

# File paths
# PDF of the textbook whose pages are searched for the course topics.
book_pdf_path = "./Forozon.pdf"
# PDF of the course syllabus.
# NOTE(review): this file name refers to a VLSI course, while the topics and
# book title used further down are about computer networking -- confirm that
# this is the intended syllabus file.
course_pdf_path = "../Syllabus/01CT1514_VLSI.pdf"

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A ``(pages, page_count)`` tuple. ``pages`` is a list with one string
        per page, in document order; ``page_count`` is the number of pages.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Coerce an empty/None extraction result to "" so callers can always
        # treat every entry as a string (e.g. call .lower() on it).
        text = [page.extract_text() or "" for page in reader.pages]
    # Exactly one entry is collected per page, so the list length is the page
    # count; this avoids touching the reader after its file has been closed.
    return text, len(text)

# Extract text from both PDFs; each call returns the per-page text list and
# the total page count of the document.
book_text, book_page_count = extract_text_from_pdf(book_pdf_path)
# NOTE(review): course_text / course_page_count are not referenced anywhere
# else in the visible code -- confirm whether the syllabus text is still needed.
course_text, course_page_count = extract_text_from_pdf(course_pdf_path)

# Example course content: each unit name maps to the subtopic phrases of that unit
course_topics = {
    "Introduction": [
        "Use of Computer Networks",
        "Network Hardware",
        "Network Software",
        "OSI",
        "TCP/IP Reference Model",
    ],
    "Data Link Layer": [
        "Error Detection",
        "Flow Control",
        "Sliding Window Protocols",
        "HDLC",
    ],
    "Medium Access Control": [
        "Multiple Access Protocols",
        "Ethernet",
        "Wireless LANs",
        "Virtual LANs",
    ],
    "Network Layer": [
        "Routing Algorithms",
        "Congestion Control",
        "QoS",
        "Internetworking",
    ],
    "Transport Layer": [
        "UDP",
        "TCP",
        "Congestion Control",
    ],
    "Application Layer": [
        "DNS",
        "Email",
        "World Wide Web",
        "Multimedia",
    ],
}

# Mapping course topics to book page ranges


def _collapse_to_ranges(pages):
    """Collapse page numbers into sorted, inclusive (start, end) runs.

    Duplicate page numbers are ignored, so every run is reported once,
    e.g. [2, 3, 3, 4, 7] -> [(2, 4), (7, 7)].
    """
    ranges = []
    for page in sorted(set(pages)):
        if ranges and page == ranges[-1][1] + 1:
            # Page directly follows the current run: extend that run.
            ranges[-1] = (ranges[-1][0], page)
        else:
            ranges.append((page, page))
    return ranges


# Lower-case every page once instead of once per (topic, subtopic) pair.
# Pages without extracted text are treated as empty strings.
book_text_lower = [(content or "").lower() for content in book_text]

topic_page_mapping = {}

for topic, subtopics in course_topics.items():
    needles = [subtopic.lower() for subtopic in subtopics]
    # Record a page at most once per topic, even when several subtopics occur
    # on it; repeated page numbers would otherwise split the runs and yield
    # duplicate/overlapping ranges.
    # NOTE(review): this is a plain substring search, so short names such as
    # "OSI" also match inside longer words (e.g. "position").
    matched_pages = [
        page_num  # Pages are 1-based index
        for page_num, content in enumerate(book_text_lower, start=1)
        if any(needle in content for needle in needles)
    ]

    # Topics without any matching page are left out of the mapping.
    if matched_pages:
        topic_page_mapping[topic] = _collapse_to_ranges(matched_pages)

# Book metadata: title and author are fixed values, the page count comes from the parsed PDF
book_metadata = dict(
    Title="Data Communications and Networking",
    Author="Behrouz A. Forouzan",
    PageCount=book_page_count,
)

# Top-level result: book metadata plus the per-topic page ranges
output_json = dict(
    BookMetadata=book_metadata,
    TopicMappings=topic_page_mapping,
)

# Serialize with a 4-space indent; the bare expression below displays the
# resulting string as the notebook cell output
json_output = json.dumps(output_json, indent=4)
json_output
# Save to the JSON file



'{\n    "BookMetadata": {\n        "Title": "Data Communications and Networking",\n        "Author": "Behrouz A. Forouzan",\n        "PageCount": 1171\n    },\n    "TopicMappings": {\n        "Introduction": [\n            [\n                7,\n                7\n            ],\n            [\n                13,\n                13\n            ],\n            [\n                25,\n                25\n            ],\n            [\n                31,\n                31\n            ],\n            [\n                36,\n                36\n            ],\n            [\n                39,\n                39\n            ],\n            [\n                42,\n                42\n            ],\n            [\n                49,\n                51\n            ],\n            [\n                62,\n                62\n            ],\n            [\n                66,\n                72\n            ],\n            [\n                74,\n                74\n            ],\

In [2]:
# Persist the serialized JSON string to output.json (path is relative to the working directory)
with open("output.json", mode="w") as out_file:
    out_file.write(json_output)