## Search and summarize - GPT-4

#### Imports 

In [1]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import MapReduceDocumentsChain
from langchain.chains import ReduceDocumentsChain
from langchain.document_loaders import JSONLoader
from langchain.chains import StuffDocumentsChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.vectorstores import FAISS
import logging
import yaml
import os 

##### Setup logging

In [2]:
# Notebook-level logger; DEBUG lets every logger.debug/info call below through.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same logger object for the same name, so re-running
# this cell would attach one more StreamHandler each time and every message would be
# emitted repeatedly. Attach a handler only when the logger has none yet.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

#### Setup

In [3]:
# Read the OpenAI API key from a local YAML file (so the key never appears in the
# notebook) and export it through the OPENAI_API_KEY environment variable.
with open('./credentials/oai-key.yml', 'rb') as f:
    credentials = yaml.safe_load(f)

# yaml.safe_load returns None for an empty file, and the mapping may lack `key` or
# hold an empty value. Fail here with an explicit message instead of a bare
# TypeError/KeyError, or an empty key that only surfaces at the first API call.
if not isinstance(credentials, dict) or not credentials.get('key'):
    raise KeyError("'./credentials/oai-key.yml' must be a YAML mapping with a non-empty 'key' entry")

api_key = credentials['key']
os.environ['OPENAI_API_KEY'] = api_key

In [4]:
# Notebook-wide configuration.
MODEL_NAME = 'gpt-4'    # model name handed to ChatOpenAI
FILE_NAME = 'file-2'    # document base name; appears in the JSONL input path and the summary output path
LOCAL_OUTPUT_DIR = './DATA/OUTPUT'
LOCAL_INPUT_DIR = './DATA/INPUT'

In [5]:
# Embedding model (default settings) passed to FAISS.from_documents when the page index is built.
# NOTE(review): presumably authenticates through the OPENAI_API_KEY environment variable set earlier — confirm.
text_embeder = OpenAIEmbeddings()

#### Encode and index text content for pages 

In [6]:
def extract_metadata(record: dict, metadata: dict) -> dict:
    """Copy the `doc_name` and `page_num` fields of a record into `metadata`.

    Args:
        record: one parsed JSON object from the input file.
        metadata: metadata dict to enrich; it is modified in place.

    Returns:
        The same `metadata` object. A field absent from `record` is stored as None.
    """
    for field in ('doc_name', 'page_num'):
        metadata[field] = record.get(field)
    return metadata


# JSON-lines file holding the extracted text; the text itself is read from the
# `page_content` key and extract_metadata supplies doc_name / page_num.
pages_jsonl_path = f'{LOCAL_OUTPUT_DIR}/{FILE_NAME}/TEXT/{FILE_NAME}.jsonl'

loader = JSONLoader(
    file_path=pages_jsonl_path,
    jq_schema='.',
    content_key='page_content',
    metadata_func=extract_metadata,
    json_lines=True,
)

segments = loader.load()  # here segments correspond to pages

In [7]:
# Build a FAISS vector store from the loaded pages, embedding them with text_embeder.
vector_store = FAISS.from_documents(segments, text_embeder)

#### Setup Retrieval Chain

In [8]:
# Retriever over the FAISS store using the 'mmr' search type.
# NOTE(review): k=10 is presumably the number of pages returned per query — confirm.
mmr_search_kwargs = {'k': 10}
retriever = vector_store.as_retriever(search_type='mmr', search_kwargs=mmr_search_kwargs)
retriever

VectorStoreRetriever(tags=['FAISS'], metadata=None, vectorstore=<langchain.vectorstores.faiss.FAISS object at 0x12714c3a0>, search_type='mmr', search_kwargs={'k': 10})

In [9]:
# Chat model shared by the map and reduce LLMChains defined further down.
llm = ChatOpenAI(
    model_name=MODEL_NAME,
    max_tokens=2048,
    temperature=0.0,
)

#### Retrieve matching pages related to `derivative contracts`

In [10]:
# Search phrase handed to the retriever.
query = 'derivative contracts'

In [11]:
# Fetch the pages matching `query`, then log each page's text and metadata
# followed by a separator line.
pages = retriever.get_relevant_documents(query)
divider = f"\n{'-' * 200}\n"  # identical for every page, so built once

for page in pages:
    logger.info(f'Content => {page.page_content}')
    logger.info(f'Metadata => {page.metadata}')
    logger.info(divider)

Content =>   
 
 savings association elected to treat the derivative  contract as multiple derivative contracts under 
§ 3.113(e)(6); 
(D) For derivative contracts that are not cleared through a QCCP, the cash collateral 
received by the recipient counterparty is not segregated (by law, regulation, or an agreement with 
the counterparty); 
(E) Variation margin is calculated and transferred on a daily basis based on the mark-to-
fair value of the derivative contract; 
(F) The variation margin transferred under th e derivative contract or the governing rules 
of the CCP or QCCP for a cleared transaction is the full amount that is necessary to fully extinguish the net current credit exposure to th e counterparty of the derivative contracts, subject 
to the threshold and minimum transfer amounts applicable to the counterparty under the terms of 
the derivative contract or the governing rules for a cleared transaction; 
(G) The variation margin is in the form of cash in the same currency as

#### Summarize the matched pages into a detailed summary using `MapReduce`

In [12]:
# Map chain: prompt + LLM that summarises a single page. The page text is
# substituted for the {PAGE} placeholder (see document_variable_name='PAGE' in
# MapReduceDocumentsChain at the bottom of this cell).
map_template = """You are a Derivatives Risk Analyst. Your job is to create a detailed SUMMARY of financial documents.
Your summary should include all important facts, math formulas, and technical details, and leave out extra information.
Your goal is to provide a clear and precise SUMMARY of the PAGE below that shows your knowledge of finance, with high accuracy and quality.
---
PAGE => {PAGE}
---
SUMMARY =>"""
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, 
                     prompt=map_prompt, 
                     verbose=True)

# Reduce chain: prompt + LLM that merges several summaries into one. The
# summaries are substituted for the {SUMMARIES} placeholder.
reduce_template = """The following is set of SUMMARIES:
---
SUMMARIES => {SUMMARIES}
---
Combine the summaries into a single cohesive CONSOLIDATED summary without distilling or removing any information. Add sections wherever applicable to clearly delineate topics or themes.
CONSOLIDATED SUMMARY =>"""
reduce_prompt = PromptTemplate.from_template(reduce_template)
reduce_chain = LLMChain(llm=llm, 
                        prompt=reduce_prompt, 
                        verbose=True)

# Passes its input documents to reduce_chain through the SUMMARIES prompt variable.
combine_documents_chain = StuffDocumentsChain(llm_chain=reduce_chain, 
                                              document_variable_name='SUMMARIES', 
                                              verbose=True)

# The same stuffing chain serves as both the combine step and the collapse step.
# NOTE(review): token_max=2048 is presumably the token budget for the documents sent
# to one combine call, with collapsing applied when it is exceeded — confirm against
# the LangChain documentation.
reduce_documents_chain = ReduceDocumentsChain(combine_documents_chain=combine_documents_chain, 
                                              collapse_documents_chain=combine_documents_chain, 
                                              token_max=2048, 
                                              verbose=True)

# Full pipeline: map_chain on each page, then reduce_documents_chain on the results.
# return_intermediate_steps=False: the per-page outputs are not part of the result.
map_reduce_chain = MapReduceDocumentsChain(llm_chain=map_chain, 
                                           reduce_documents_chain=reduce_documents_chain, 
                                           document_variable_name='PAGE', 
                                           return_intermediate_steps=False, 
                                           verbose=True)

In [13]:
# Run the map-reduce pipeline over the retrieved pages; the result is used below as
# the summary text (it is stripped, logged and written to disk).
completion = map_reduce_chain.run(pages)



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a Derivatives Risk Analyst. Your job is to create a detailed SUMMARY of financial documents.
Your summary should include all important facts, math formulas, and technical details, and leave out extra information.
Your goal is to provide a clear and precise SUMMARY of the PAGE below that shows your knowledge of finance, with high accuracy and quality.
---
PAGE =>   
 
 savings association elected to treat the derivative  contract as multiple derivative contracts under 
§ 3.113(e)(6); 
(D) For derivative contracts that are not cleared through a QCCP, the cash collateral 
received by the recipient counterparty is not segregated (by law, regulation, or an agreement with 
the counterparty); 
(E) Variation margin is calculated and transferred on a daily basis based on the mark-to-
fair value of the derivative contract; 
(F) The variation margin tr

In [14]:
# Drop leading/trailing whitespace from the summary, then log it.
completion = completion.strip()
logger.info(completion)

The document provides a comprehensive discussion on the treatment of derivative contracts, particularly those not cleared through a Qualified Central Counterparty (QCCP). It outlines the rules for cash collateral, variation margin, and the governing agreements for derivative contracts. Cash collateral received by the recipient counterparty is not segregated unless required by law, regulation, or an agreement with the counterparty. Variation margin is calculated and transferred daily based on the mark-to-fair value of the derivative contract. The transferred variation margin should be enough to fully extinguish the net current credit exposure to the counterparty, subject to the threshold and minimum transfer amounts applicable to the counterparty. The derivative contract and the variation margin should be governed by a qualifying master netting agreement between the legal entities that are the counterparties to the derivative contract or by the governing rules for a cleared transaction.

In [15]:
# Persist the summary under <LOCAL_OUTPUT_DIR>/<FILE_NAME>/SUMMARY/.
SUMMARY_OUT_PATH = f'{LOCAL_OUTPUT_DIR}/{FILE_NAME}/SUMMARY'
os.makedirs(SUMMARY_OUT_PATH, exist_ok=True)  # no error if the directory already exists

# Explicit UTF-8: without `encoding`, open() falls back to the platform's locale
# encoding, so the written bytes would vary between machines, and characters that
# encoding cannot represent would raise UnicodeEncodeError (the source pages contain
# symbols such as '§').
with open(f'{SUMMARY_OUT_PATH}/{FILE_NAME}-summary-oai.txt', 'w', encoding='utf-8') as f:
    f.write(completion)