This notebook contains steps for the DDQ Document Ingestion Process


### Import Libraries


In [None]:
import pickle
import json
from datetime import datetime
from typing import Union
import re

from document_processor_subclasses import (
    MasterDDQProcessor,
    OMProcessor,
    ClientResponsesProcessor,
    ClientResponseProcessor,
)

from constants import (
    TARGET_PDF_PATH,
    CONNECTION_STRING,
    DATABASE_NAME,
    COLLECTION_NAME,
    OPENAI_API_KEY,
    OPENAI_API_VERSION,
    OPENAI_ENDPOINT,
    DI_ENDPOINT,
    DI_API_KEY,
    SHAREPOINT_USR,
    SHAREPOINT_PWD,
    SHAREPOINT_URL,
    SHAREPOINT_FOLDER,
)

from docx import Document
from docx.document import Document as DocumentType
from docx.table import Table
from docx.text.paragraph import Paragraph

from functions import (
    get_openai_client,
    get_db_client,
    get_service_management_client,
    get_models,
    get_access_token,
    get_sharepoint_headers,
    get_sharepoint_site_id,
    get_sharepoint_drive_id
)

from extractor import analyze_layout

from embeddings import generate_embeddings, convert_chunks_to_json

from classes import DocumentChunk, DocumentFlow

from document_parser import DocumentParser

from document_parser_utils import is_similar_color, remove_non_alphanumeric

from azure.ai.formrecognizer import AnalyzeResult, DocumentParagraph

### Import Clients


In [None]:
# Database client; later cells use it to set up the collection/indices and to
# store the vectorized chunks.
db_client = get_db_client()

# OpenAI client; passed to the embedding step in the upload cell.
openai_client = get_openai_client()

# Embedding and completions models as returned by get_models().
# Only embedding_model is used by the cells in this notebook.
embedding_model, completions_model = get_models()

### Database setup

- Only needs to be run when setting up database collection and collection indices


In [None]:
# One-time setup: creates the database collection.
# NOTE(review): whether this is safe to re-run against an existing collection
# is not visible from this notebook — confirm before including it in "Run All".
db_client.setup_collection()

# One-time setup: creates the indices on the collection.
db_client.create_indices()

### Initial Document Parsing

- Makes API call to Azure DI for initial document parsing
- Only needs to be run once per document; the analysis result is then stored in a pkl file for future access


In [None]:
# Run layout analysis on the target PDF using the Azure DI endpoint and key.
# NOTE(review): the chunking cell reads "layout_backup.pkl", but nothing in this
# cell writes that file — presumably analyze_layout persists the result; confirm.
document_analysis_result = analyze_layout(
    TARGET_PDF_PATH, DI_ENDPOINT, DI_API_KEY)

# Uncomment to inspect the raw analysis result.
# print(document_analysis_result)

### Document Chunking

Modify this code as needed based on the document being parsed


In [None]:
# Loads in saved document analysis result from pkl file.
# NOTE: pickle.load can execute arbitrary code while deserializing — only load
# a backup file that was produced by this pipeline, never one from an untrusted source.

with open("layout_backup.pkl", "rb") as file:
    result = pickle.load(file)

# Custom document parser class built on top of Azure DI output.
# Required by document processor classes for chunking process

document_parser = DocumentParser(result=result)

# Python-docx object
# Required for determining headings based on styling
#
# The .docx version of the document is expected at the same path as the PDF,
# differing only in extension. Only the trailing ".pdf" is swapped: a plain
# str.replace would also rewrite any ".pdf" occurring earlier in the path
# (e.g. in a directory name) and point at a file that does not exist.

if TARGET_PDF_PATH.endswith(".pdf"):
    docx_path = TARGET_PDF_PATH[: -len(".pdf")] + ".docx"
else:
    docx_path = TARGET_PDF_PATH

document: DocumentType = Document(docx_path)

# File name without its directory part (path is assumed to use "/" separators).
filename = TARGET_PDF_PATH.split("/")[-1]

client_response_processor = ClientResponseProcessor(document_parser, filename, document)

# Custom document flow class that holds a list of chunks
# Print document flow object to see processed chunks
document_flow = client_response_processor.process_document()

print(document_flow)

### Uploading chunks to vector db


In [None]:
# Write the parsed (not yet vectorized) chunks to a JSON backup whose name is
# derived from the client and document names.
parsing_backup_path = (
    f"{document_flow.client_name}_{document_flow.document_name}_parsing_backup.json"
)
with open(parsing_backup_path, "w") as file:
    file.write(json.dumps(document_flow.to_dict()))

# Generate embeddings for every chunk.
# NOTE(review): a vectorized backup JSON is presumably written inside
# convert_chunks_to_json — nothing in this cell writes it; confirm.
vectorized_chunks = convert_chunks_to_json(document_flow.chunks, openai_client, embedding_model)

# Kept for reference: reads the vectorized backup file back in.
# with open(f"{document_flow.client_name}_{document_flow.document_name}_parsing_vectorized_backup.json", "r", encoding="utf-8") as file:
#     data = file.read()

# Store the vectorized chunks in the database collection.
db_client.add_data_to_collection(vectorized_chunks)

### Uploading PDF documents to SharePoint


In [9]:
# Uploads the target PDF to the "General" folder of the SharePoint site's drive
# through the Microsoft Graph API and prints the resulting status and web URL.
import requests
from urllib.parse import quote

# NOTE(review): the token is not passed to anything in this cell; presumably
# get_sharepoint_headers() obtains its own — confirm before removing this call.
access_token = get_access_token()

headers = get_sharepoint_headers()

graph_api_endpoint = "https://graph.microsoft.com/v1.0/sites/forumequitypartners.sharepoint.com:/sites/REIIFDDQAssistant"

site_id = get_sharepoint_site_id(graph_api_endpoint)

drive_id = get_sharepoint_drive_id(site_id)

# Destination inside the drive: "General/<pdf file name>".
file_path = f"General/{TARGET_PDF_PATH.split('/')[-1]}"

# Percent-encode the destination path before placing it in the URL. Without this,
# a "#" or "?" in the file name would be parsed as a URL fragment/query and the
# upload would target the wrong (truncated) path. "/" is left as the separator.
upload_api_endpoint = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives/{drive_id}/root:/{quote(file_path)}:/content"

with open(TARGET_PDF_PATH, "rb") as file:
    file_content = file.read()

# NOTE(review): this is a single-request upload; Microsoft Graph documents a size
# limit for it, so very large PDFs may need an upload session instead — confirm.
response = requests.put(upload_api_endpoint, headers=headers, data=file_content)

# Fail with the real HTTP error (401/403/404/...) instead of a misleading
# KeyError on 'webUrl' or a JSON decode error when the upload is rejected.
response.raise_for_status()
uploaded_file_info = response.json()

print(response.status_code, uploaded_file_info['webUrl'])

200 https://forumequitypartners.sharepoint.com/sites/REIIFDDQAssistant/Shared%20Documents/General/GallantMacDonald_Responses_05-05-2023.pdf
