# MongoDB - Parsing, Storage, and Vector Search of PDFs

1. Import the parsing libraries and the MongoDB driver (`pymongo`).

In [2]:
import csv
import os
from io import BytesIO
from urllib.parse import urlsplit

import nltk
import requests
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams
from pymongo import MongoClient
nltk.download("punkt")  # Download the necessary data for tokenization


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/brady.byrd/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

2. Parse out paragraphs from PDFs

In [3]:

def parse_pdf(url, timeout=30):
    """Download a PDF and break its text into paragraphs and sentences.

    Args:
        url: Location of the PDF to fetch with ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled server would
            block the notebook indefinitely.

    Returns:
        A list with one dict per paragraph, each of the form
        ``{"raw": <paragraph text>, "sentences": [<sentence>, ...]}``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # pdfminer reads from and writes to file-like objects, so keep both the
    # downloaded PDF and the extracted text in memory buffers.
    pdf_file = BytesIO(response.content)
    text_file = BytesIO()
    extract_text_to_fp(pdf_file, text_file, laparams=LAParams())

    extracted_text = text_file.getvalue().decode("utf-8")

    # A blank line in the extracted text is treated as a paragraph break.
    paragraphs = extracted_text.split("\n\n")

    # Keep the raw paragraph next to its sentence-level tokenization.
    return [
        {"raw": paragraph, "sentences": nltk.sent_tokenize(paragraph)}
        for paragraph in paragraphs
    ]

3. Parse out paragraphs from HTML

In [30]:
def parse_html(url, timeout=30):
    """Download an HTML page and break its ``<p>`` text into sentences.

    Args:
        url: Location of the HTML document to fetch with ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled server would
            block the notebook indefinitely.

    Returns:
        A list with one dict per ``<p>`` element, each of the form
        ``{"raw": <paragraph text>, "sentences": [<sentence>, ...]}``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")

    # Only the text of <p> elements is kept; other markup is ignored.
    paragraphs = [p.get_text() for p in soup.find_all("p")]

    # Keep the raw paragraph next to its sentence-level tokenization.
    return [
        {"raw": paragraph, "sentences": nltk.sent_tokenize(paragraph)}
        for paragraph in paragraphs
    ]

4. Get documents from the URLs specified in the aetna_plan_docs.csv file

In [None]:
csv_file_path = "aetna_plan_docs.csv"

# Credentials must not live in the notebook: read the full connection string
# (e.g. "mongodb+srv://user:password@host") from the environment instead.
mongodb_uri = os.environ.get("MONGODB_URI")
if not mongodb_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string"
    )

print("Opening csv file")
# newline="" is what the csv module expects so quoted fields containing line
# breaks are parsed correctly. Using the client as a context manager closes
# the connection even if a download or parse step raises part-way through.
with open(csv_file_path, "r", newline="") as file, MongoClient(mongodb_uri) as client:
    reader = csv.reader(file)
    header = next(reader)
    print(f'Got header: {header}')
    db = client['vector_search_demo']
    collection = db['mpsf_plan_docs']

    # Loop through each row in the CSV file
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; nothing to ingest.
            continue

        row_dict = dict(zip(header, row))
        url = row_dict['Content Location']

        # Decide the parser from the URL path only, case-insensitively, so
        # "FILE.PDF" and "doc.pdf?version=2" are still recognised.
        url_path = urlsplit(url).path.lower()
        if url_path.endswith(".pdf"):
            row_dict["paragraphs"] = parse_pdf(url)
        elif url_path.endswith((".html", ".htm")):
            row_dict["paragraphs"] = parse_html(url)
        else:
            raise ValueError("Unsupported document type")

        # One MongoDB document per CSV row: the row's columns plus the
        # parsed paragraphs.
        collection.insert_one(row_dict)


5. Create the MongoDB client, loop through the list of URLs, and store the data in MongoDB.