In [1]:
import os  
import json  
import openai
from Utilities.envVars import *
import uuid

# Configure the openai SDK for Azure OpenAI from the settings star-imported
# from Utilities.envVars (OpenAiVersion, OpenAiKey, OpenAiEndPoint).
openai.api_type = "azure"
openai.api_version = OpenAiVersion

# Fail early when no key is configured.
openai_api_key = OpenAiKey
assert openai_api_key, "ERROR: Azure OpenAI Key is missing"
openai.api_key = openai_api_key

# Validate the raw setting BEFORE converting it to a string: f"{None}" is the
# truthy text "None", which would make the "missing" check below unreachable.
assert OpenAiEndPoint, "ERROR: Azure OpenAI Endpoint is missing"
openAiEndPoint = f"{OpenAiEndPoint}"
# The endpoint must point at an Azure OpenAI host.
assert "openai.azure.com" in openAiEndPoint.lower(), "ERROR: Azure OpenAI Endpoint should be in the form: \n\n\t<your unique endpoint identifier>.openai.azure.com"
openai.api_base = openAiEndPoint

In [2]:
# Run-time parameters read by the cells below
tokenLength: int = 1000  # passed to the chat model as max_tokens
temperature: int = 0  # passed to the chat model as temperature
embeddingModelType: str = "azureopenai"  # selects the Azure OpenAI or OpenAI setup branch

In [3]:
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.utilities import BingSearchAPIWrapper
from langchain.docstore.document import Document
import pandas as pd
from langchain.prompts import PromptTemplate
from datetime import datetime
from pytz import timezone
from dateutil.relativedelta import relativedelta
from datetime import timedelta
import typing
from langchain.chat_models import AzureChatOpenAI, ChatOpenAI
from langchain.chains import LLMChain
# Import required libraries
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import (
    PDFMinerLoader,
    UnstructuredFileLoader,
)
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from IPython.display import display, HTML
from langchain.chains.summarize import load_summarize_chain
from Utilities.pibCopilot import createSearchIndex, indexSections, findFileInIndex, performCogSearch, mergeDocs, createProspectusSummary, findTopicSummaryInIndex

In [4]:
# Build the chat model (`llm`) and the embedding client (`embeddings`) for the
# provider selected by `embeddingModelType`: Azure OpenAI or OpenAI.
# `logging` is imported explicitly here so this cell does not depend on the
# star-import from Utilities.envVars happening to expose it.
import logging

if embeddingModelType == 'azureopenai':
    # Point the openai SDK at the Azure OpenAI resource.
    openai.api_type = "azure"
    openai.api_key = OpenAiKey
    openai.api_version = OpenAiVersion
    openai.api_base = OpenAiEndPoint

    llm = AzureChatOpenAI(
        openai_api_base=openai.api_base,
        openai_api_version=OpenAiVersion,
        deployment_name=OpenAiChat16k,
        temperature=temperature,
        openai_api_key=OpenAiKey,
        openai_api_type="azure",
        max_tokens=tokenLength)

    logging.info("LLM Setup done")
    embeddings = OpenAIEmbeddings(deployment=OpenAiEmbedding, openai_api_key=OpenAiKey, openai_api_type="azure")
elif embeddingModelType == "openai":
    # Point the openai SDK at the public OpenAI API.
    openai.api_type = "open_ai"
    openai.api_base = "https://api.openai.com/v1"
    openai.api_version = '2020-11-07'
    openai.api_key = OpenAiApiKey
    embeddings = OpenAIEmbeddings(openai_api_key=OpenAiApiKey)

    llm = ChatOpenAI(temperature=temperature,
        openai_api_key=OpenAiApiKey,
        model_name="gpt-3.5-turbo",
        max_tokens=tokenLength)
else:
    # Fail fast: otherwise `llm` and `embeddings` stay undefined and a later
    # cell breaks with a NameError that hides the real cause.
    raise ValueError(
        f"Unsupported embeddingModelType {embeddingModelType!r}; "
        "expected 'azureopenai' or 'openai'"
    )

In [5]:
# PDF to ingest; fileName is also what the later cells look up in the search index
fileName = "Bumble Bee.pdf"
pdfPath = f"Data/PDF/{fileName}"

# Read the PDF through LangChain's PDFMiner-based loader
loader = PDFMinerLoader(pdfPath)
rawDocs = loader.load()

# Record the originating path on every loaded document
for doc in rawDocs:
    doc.metadata['source'] = pdfPath

# Split the loaded documents into overlapping chunks
textSplitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=1000)
docs = textSplitter.split_documents(rawDocs)

In [6]:
# Report how many chunks the splitter produced
chunkTotal = len(docs)
print("Number of documents chunks generated from PDF : ", chunkTotal)

Number of documents chunks generated from PDF :  162


In [7]:
# Index the chunks only when the lookup for this file returns no hits
indexName = 'prospectus'
r = findFileInIndex(SearchService, SearchKey, indexName, fileName)
if r.get_count() != 0:
    print("Found in index")
else:
    # Nothing found for this file: call the helpers that create the index and index the sections
    createSearchIndex(SearchService, SearchKey, indexName)
    indexSections(
        OpenAiEndPoint, OpenAiKey, OpenAiVersion, OpenAiApiKey,
        SearchService, SearchKey, embeddingModelType, OpenAiEmbedding,
        fileName, indexName, docs,
    )

Found in index


In [8]:
def summarizeTopic(llm, query, promptTemplate):
    """Summarize the search hits for ``query`` with a map-reduce summarize chain.

    Args:
        llm: Chat model handed to ``load_summarize_chain``.
        query: Search text passed to ``performCogSearch``.
        promptTemplate: Prompt text with a ``{text}`` placeholder; the same
            prompt is used for both the map and the combine step.

    Returns:
        The ``output_text`` entry of the chain result.

    Notes:
        Reads the module-level ``indexName`` and the service settings. When the
        search returns ``None`` or yields no hits, a single placeholder document
        ("No results found") is summarized so the chain never runs on an empty
        document list.
    """
    # The trailing 3 is presumably the number of hits to fetch -- TODO confirm
    # against performCogSearch.
    r = performCogSearch(OpenAiEndPoint, OpenAiKey, OpenAiVersion, OpenAiApiKey, SearchService, SearchKey, embeddingModelType, OpenAiEmbedding, query, indexName, 3)

    if r is None:
        resultsDoc = []
    else:
        resultsDoc = [
            Document(page_content=doc['content'], metadata={"id": doc['id'], "source": doc['sourcefile']})
            for doc in r
        ]

    # Same fallback for "no result object" and "result object with zero hits".
    if not resultsDoc:
        resultsDoc = [Document(page_content="No results found")]

    customPrompt = PromptTemplate(template=promptTemplate, input_variables=["text"])
    chainType = "map_reduce"
    summaryChain = load_summarize_chain(llm, chain_type=chainType, return_intermediate_steps=True,
                                        map_prompt=customPrompt, combine_prompt=customPrompt)
    summary = summaryChain({"input_documents": resultsDoc}, return_only_outputs=True)
    outputAnswer = summary['output_text']
    return outputAnswer

In [9]:
# Prompt text handed to summarizeTopic, which uses it for both the map and the
# combine step of the summarize chain; {text} is its single input variable.
# NOTE(review): "minumum" below is a typo inside the prompt sent to the model;
# it is left untouched here because editing the prompt text can change the output.
promptTemplate = """You are an AI assistant tasked with summarizing documents from large documents that contains information about Initial Public Offerings. 
            IPO document contains sections with information about the company, its business, strategies, risk, management structure, financials, and other information.
            Your summary should accurately capture the key information in the document while avoiding the omission of any domain-specific words. 
            Please generate a concise and comprehensive summary that includes details. 
            Ensure that the summary is easy to understand and provides an accurate representation. 
            Begin the summary with a brief introduction, followed by the main points.
            Generate the summary with minumum of 7 paragraphs and maximum of 10 paragraphs.
            Please remember to use clear language and maintain the integrity of the original information without missing any important details:
            {text}

            """

In [11]:
# Index that stores one generated summary per (file, topic) pair
prospectusSummaryIndexName = 'prospectussummary'
# presumably creates the summary index when needed -- TODO confirm against the helper
createProspectusSummary(SearchService, SearchKey, prospectusSummaryIndexName)

# Topics to summarize; each one is used as the search query
selectedTopics = ["Strengths", "Growth Strategy", "Investment Risk", "Intellectual Property"]

topicSummary = []
for topic in selectedTopics:
    r = findTopicSummaryInIndex(SearchService, SearchKey, prospectusSummaryIndexName, fileName, 'prospectus', topic)
    # Skip topics for which the lookup already returns a stored summary
    if r.get_count() != 0:
        continue

    answer = summarizeTopic(llm, topic, promptTemplate)
    # Skip answers where the model reports it does not know
    if "I don't know" in answer:
        continue

    topicSummary.append(dict(
        id=str(uuid.uuid4()),
        fileName=fileName,
        docType='prospectus',
        topic=topic,
        summary=answer,
    ))

# Push the newly generated summaries to the summary index
mergeDocs(SearchService, SearchKey, prospectusSummaryIndexName, topicSummary)

Creating prospectussummary search index
Total docs: 4
	Indexed 4 sections, 4 succeeded
