# Financial PDS (Product Disclosure Statement) Analyser

#### This sample code is based on the article published at https://blog.fabric.microsoft.com/en-us/blog/harness-the-power-of-langchain-in-microsoft-fabric-for-advanced-document-summarization?ft=All and on the SynapseML LangChain notebook at https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Langchain.ipynb

In [None]:
# Install the third-party packages this notebook needs into the current session.
# openai, langchain, unstructured and numpy are pinned to exact versions;
# pdf2image, pdfminer.six and pytesseract are installed unpinned.
# NOTE(review): the PDF/OCR packages presumably back the OnlinePDFLoader used
# for PDS extraction — confirm before changing or removing any of them.
%pip install openai==0.28.1 langchain==0.0.331 pdf2image pdfminer.six unstructured==0.10.24 pytesseract numpy==1.22.4

In [None]:
# Import required libraries
import os, openai, langchain, uuid
from langchain.llms import AzureOpenAI, OpenAI
from langchain.agents import load_tools, initialize_agent, AgentType
from langchain.chains import TransformChain, LLMChain, SimpleSequentialChain
from langchain.document_loaders import OnlinePDFLoader
from langchain.tools.bing_search.tool import BingSearchRun, BingSearchAPIWrapper
from langchain.prompts import PromptTemplate
import pyspark.sql.functions as f
from synapse.ml.cognitive.langchain import LangchainTransformer
from synapse.ml.core.platform import running_on_synapse, find_secret

In [None]:
# Provide keys for services
# Fill in the blank values below before running the rest of the notebook.

# Azure OpenAI settings
openai_api_type = ""
openai_api_base = ""
openai_api_version = "2023-03-15-preview"
openai_api_key = ""
deployment_name = ""

# Bing Web Search settings
bing_search_url = "https://api.bing.microsoft.com/v7.0/search"
bing_subscription_key = ""

# Publish every setting as an environment variable in one call.
os.environ.update(
    {
        "BING_SUBSCRIPTION_KEY": bing_subscription_key,
        "BING_SEARCH_URL": bing_search_url,
        "OPENAI_API_TYPE": openai_api_type,
        "OPENAI_API_VERSION": openai_api_version,
        "OPENAI_API_BASE": openai_api_base,
        "OPENAI_API_KEY": openai_api_key,
    }
)

In [None]:
# Initialize Azure OpenAI class
# The keyword arguments are collected first so each setting can be annotated.
_llm_options = {
    "deployment_name": deployment_name,
    "model_name": deployment_name,  # the deployment name is reused as the model name
    "temperature": 0.1,
    "verbose": True,
}
llm = AzureOpenAI(**_llm_options)

In [None]:
# Extract content from PDS
def pds_content_extraction(inputs: dict) -> dict:
    """Load a PDS PDF from a link and return the text of its fifth chunk.

    Args:
        inputs: Chain input mapping. ``inputs["pds_link"]`` is handed to
            ``OnlinePDFLoader`` as the location of the PDF.

    Returns:
        A dict ``{"pds_content": text}`` where ``text`` is the
        ``page_content`` of the element at index 4 of the loader's
        ``load_and_split()`` result.

    Raises:
        KeyError: If ``inputs`` has no ``"pds_link"`` entry.
        IndexError: If the loader yields fewer than five chunks, so that
            index 4 does not exist.
    """
    main_chunk_index = 4  # Main page chosen from the PDS document
    pds_link = inputs["pds_link"]
    chunks = OnlinePDFLoader(pds_link).load_and_split()
    # Fail with an actionable message instead of a bare "list index out of
    # range" when the document is shorter than expected.
    if len(chunks) <= main_chunk_index:
        raise IndexError(
            f"PDS document at {pds_link!r} produced only {len(chunks)} chunk(s); "
            f"chunk index {main_chunk_index} is required."
        )
    return {"pds_content": chunks[main_chunk_index].page_content}

# Generate prompt
def prompt_generation(inputs: dict) -> dict:
    """Wrap a PDS summary in the instruction prompt for the web-search step.

    Args:
        inputs: Mapping whose ``"Output"`` entry is the text to embed
            between the document start/end markers.

    Returns:
        A dict ``{"prompt": text}`` holding the assembled prompt string.

    Raises:
        KeyError: If ``inputs`` has no ``"Output"`` entry.
        TypeError: If ``inputs["Output"]`` is not a string.
    """
    summary_text = inputs["Output"]
    opening = (
        "You are an assistant that helps users understand details on banking financial products. Summarise the content in the financial product disclosure statement document below and output it. After that, use websearch to find the names of 3 other banks in Australia that offer the same financial product and append it to the summary generated before: <PDS Document Start>\n"
    )
    closing = "<PDS Document End>."
    # str.join keeps the strict "strings only" behaviour of concatenation.
    return {"prompt": "".join((opening, summary_text, closing))}

In [None]:
# Chain to extract content
# Wraps pds_content_extraction as a chain step: it takes "pds_link" in and
# hands "pds_content" out.
pds_content_extraction_chain = TransformChain(
    transform=pds_content_extraction,
    input_variables=["pds_link"],
    output_variables=["pds_content"],
    verbose=False,
)

# Prompt text for the summarisation step. "{pds_content}" is the single
# placeholder; the trailing empty entry keeps the final newline of the text.
pds_summarizer_template = "\n".join(
    [
        "You are a banking financial product disclosure statement summarizer, given the financial product disclosure statement content, it is your job to summarize the document into a short summary.",
        "Here is the financial product disclosure statement content:",
        "{pds_content}",
        "Output:",
        "summary.",
        "",
    ]
)
# Turn the summariser template into a prompt with one input variable and
# bind it to the LLM as the summarisation chain.
prompt = PromptTemplate(
    template=pds_summarizer_template,
    input_variables=["pds_content"],
)
summarize_chain = LLMChain(
    prompt=prompt,
    llm=llm,
    verbose=False,
)

# Wraps prompt_generation as a chain step: it takes "Output" in and hands
# "prompt" out.
prompt_generation_chain = TransformChain(
    transform=prompt_generation,
    input_variables=["Output"],
    output_variables=["prompt"],
    verbose=False,
)

In [None]:
# Web-search agent: a zero-shot ReAct agent whose only tool is Bing search.
# NOTE(review): k=20 is presumably the number of search results requested
# per query — confirm against the BingSearchAPIWrapper documentation.
bing = BingSearchAPIWrapper(k=20)
tools = [BingSearchRun(api_wrapper=bing)]
web_search_agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=False,
)

# Create a sequential chain
# Steps, in the order they are listed: extract the PDS text, summarise it,
# build the web-search prompt, then hand that prompt to the search agent.
_pipeline_steps = [
    pds_content_extraction_chain,
    summarize_chain,
    prompt_generation_chain,
    web_search_agent,
]
sequential_chain = SimpleSequentialChain(chains=_pipeline_steps)

In [None]:
# Add the PDSs to be analysed here
# Each row is (label, pds_link). "URL1" and "URL2" are placeholders: replace
# them with the links of the PDS PDF documents to analyse.
paper_df = spark.createDataFrame(
    [
        (0, "URL1"),
        (1, "URL2"),
    ],
    ["label", "pds_link"],
)

# Construct LangChain transformer using the PDS summarizer chain defined above
# The setters are applied in two steps: column wiring and chain first, then
# the Azure OpenAI key and endpoint.
_transformer = (
    LangchainTransformer()
    .setInputCol("pds_link")
    .setOutputCol("pds_info")
    .setChain(sequential_chain)
)
pds_info_extractor = _transformer.setSubscriptionKey(openai_api_key).setUrl(
    openai_api_base
)

# Run the transformer over the input DataFrame and render the result.
_pds_info_df = pds_info_extractor.transform(paper_df)
display(_pds_info_df)