In [1]:
import json
import os
import sys
import boto3
import streamlit as st
from dotenv import load_dotenv

load_dotenv()

## We will be using Titan Embeddings Model To generate Embedding

from langchain_community.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock
from langchain_community.chat_models.bedrock import BedrockChat

## Data Ingestion

import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader

# Vector Embedding And Vector Store
# from langchain_community.vectorstores import FAISS
from langchain.vectorstores import FAISS

## LLm Models
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

from pypdf import PdfReader

In [36]:

## Bedrock Clients
bedrock=boto3.client(service_name="bedrock-runtime")
bedrock_embeddings=BedrockEmbeddings(model_id="amazon.titan-embed-text-v1",client=bedrock)


#Extract PDF Data
def extract_pdf(filename):
    reader = PdfReader(filename)
    page = reader.pages[0]
    return page.extract_text()

## Data ingestion
def data_ingestion(inp):
    loader=PyPDFDirectoryLoader(inp)
    documents=loader.load()

    # - in our testing Character split works better with this PDF data set
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=200,
                                                 chunk_overlap=20)
    
    docs=text_splitter.split_documents(documents)
    return docs


In [3]:
def get_claude_llm():
    ##create the Anthropic Model
    llm=BedrockChat(model_id="anthropic.claude-3-sonnet-20240229-v1:0",client=bedrock,
                model_kwargs={'max_tokens':1000})
    
    return llm

In [84]:
from langchain.schema import Document

In [99]:
def load_pdf_as_chunk(pdf_path):
    # Open the PDF file\
    loader = PyPDFLoader(pdf_path)
    pages = loader.load_and_split()
    all_text = ""
    
    # Iterate through the pages and extract text
    for page in pages:
        all_text += page.page_content
    
    # Create a Document object with the entire text
    doc = Document(page_content=all_text)
    
    return doc

In [100]:
docs = load_pdf_as_chunk("pdfs/Final draft Guidelines on ICT and security risk management.pdf")

In [101]:
import uuid

from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

chain = (
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
    | get_claude_llm()
    | StrOutputParser()
)

summaries = chain.batch([docs], {"max_concurrency": 5})


In [105]:
summaries[0]

"Here is a summary of the key points from the document:\n\n- The guidelines provide requirements for financial institutions on how to manage ICT and security risks. They integrate and expand upon previous guidelines on security measures for operational and security risks of payment services under PSD2.\n\n- The scope covers payment service providers for payment services, credit institutions for all activities, and investment firms for all activities. \n\n- Key areas covered include:\n    - Governance and strategy for ICT and security risk management\n    - Risk management framework to identify, assess, and mitigate ICT and security risks\n    - Information security measures like logical security, physical security, operations security\n    - ICT operations management \n    - ICT project and change management\n    - Business continuity management\n    - Relationship management requirements for payment service users\n\n- The guidelines aim to be technology and methodology agnostic to all

In [106]:
def get_vector_store(docs, inp):
    vectorstore_faiss=FAISS.from_documents(
        docs,
        bedrock_embeddings
    )
    vectorstore_faiss.save_local(f"faiss_index_{inp}")

In [111]:
get_vector_store([Document(summaries[0])], 'ict_s')

In [116]:
from langchain.storage import InMemoryByteStore
from langchain.retrievers.multi_vector import MultiVectorRetriever

In [136]:
faiss_index = FAISS.load_local("faiss_index_ict_s", bedrock_embeddings, allow_dangerous_deserialization=True)

In [137]:
store = InMemoryByteStore()
id_key = "doc_id"

# The retriever
retriever_inp = MultiVectorRetriever(
    vectorstore=faiss_index,
    byte_store=store,
    id_key=id_key,
)
doc_ids = [str(uuid.uuid4()) for _ in [docs]]

In [138]:
# Docs linked to summaries
summary_docs = [
    Document(page_content=s, metadata={id_key: doc_ids[i]})
    for i, s in enumerate(summaries)
]

# Add
retriever_inp.vectorstore.add_documents(summary_docs)
retriever_inp.docstore.mset(list(zip(doc_ids, [docs])))

In [142]:
reader = load_pdf_as_chunk('B.pdf')
user_question = reader.page_content

In [139]:
prompt_template_compliance = """
Imagine you are a compliance officer for a bank checking if policies and guidelines are being met.
Check the sections of the following document on whether the policies are being met.
<question>
{question}
</question

The following are the poilicies to be checked against:
<context>
{context}
</context

Provide the summary of the non-compliant sections 
and a high level yes, no or partially compliant
in tabular form with the summary of the non-compliant section in one column, 
yes or no in the other column and 
the high level reason of non compliance or partial compliance in less than 5 words. 
Also provide the detailed summary under the table with the non compliant or partially compliant 
sections with quoted reference and suggested change. 
Please refer only to the document. 
Please be formal in your response. 
Please avoid any biases.
Assistant:"""

In [140]:

PROMPT1 = PromptTemplate(
    template=prompt_template_compliance, input_variables=["context", "question"]
)

In [141]:
def get_response_llm(llm,vectorstore_faiss,query, PROMPT):
    qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)
    answer=qa({"query":query})
    return answer['result']

In [143]:
get_response_llm(get_claude_llm(),faiss_index,user_question, PROMPT1)

  warn_deprecated(


'I will provide a formal and unbiased response based solely on the document provided, without any biases.\n\nSummarizing non-compliance in tabular form:\n\nNon-Compliant Section Summary | Compliance Status | Reason (< 5 words)\n-----------------------------|--------------------|-----------------------\nNo specific sections identified as non-compliant | - | -\n\nBased on the feedback provided in the document, no specific sections are explicitly identified as non-compliant with the guidelines. The comments and responses focus on clarifications, suggestions for improvements, and requests for additional guidance, but there is no clear indication of any sections being outright non-compliant.\n\nDetailed Summary:\n\nThe document does not explicitly quote any sections as being non-compliant with the guidelines. The comments and responses primarily revolve around the following:\n\n1. Requests for clarification on specific terms, definitions, or requirements.\n2. Suggestions for rewording or re