In [1]:
import json
import os
import sys
import boto3
import streamlit as st
from dotenv import load_dotenv

load_dotenv()

## We use the Titan Embeddings model to generate embeddings

from langchain_community.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock
from langchain_community.chat_models.bedrock import BedrockChat

## Data Ingestion

import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader, PyPDFLoader

# Vector Embedding And Vector Store
# from langchain_community.vectorstores import FAISS
from langchain.vectorstores import FAISS

## LLM models
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

from pypdf import PdfReader

In [36]:

## Bedrock clients
# Runtime client used to invoke Bedrock-hosted models. Only the service name is
# passed here; no region or credentials are given explicitly.
bedrock = boto3.client(service_name="bedrock-runtime")

# Embeddings wrapper for the Titan text-embedding model, bound to the client above.
bedrock_embeddings = BedrockEmbeddings(
    client=bedrock,
    model_id="amazon.titan-embed-text-v1",
)


# Extract PDF data
def extract_pdf(filename, page_index=0):
    """Return text extracted from a PDF via ``PdfReader``.

    Args:
        filename: Passed straight to ``PdfReader``.
        page_index: Zero-based index of the page to extract. Defaults to ``0``
            (first page only), which matches the original behaviour. Pass
            ``None`` to extract every page, joined with newlines.

    Returns:
        The value(s) of ``page.extract_text()`` for the selected page, or for
        all pages joined with ``"\\n"`` when ``page_index`` is ``None``.

    Raises:
        Whatever indexing ``reader.pages`` raises for an out-of-range
        ``page_index`` (e.g. a document with no pages).
    """
    reader = PdfReader(filename)
    if page_index is None:
        return "\n".join(page.extract_text() for page in reader.pages)
    return reader.pages[page_index].extract_text()

## Data ingestion
def data_ingestion(inp, chunk_size=200, chunk_overlap=20):
    """Load documents with ``PyPDFDirectoryLoader`` and split them into chunks.

    Args:
        inp: Passed to ``PyPDFDirectoryLoader`` as the location to load from.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
            Defaults to 200, the value previously hard-coded here.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 20,
            the value previously hard-coded here.

    Returns:
        The result of ``split_documents`` applied to the loaded documents.
    """
    documents = PyPDFDirectoryLoader(inp).load()

    # - in our testing Character split works better with this PDF data set
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    return text_splitter.split_documents(documents)


In [3]:
def get_claude_llm():
    """Build the ``BedrockChat`` wrapper for the Claude 3 Sonnet model.

    Returns:
        A ``BedrockChat`` instance bound to the module-level ``bedrock``
        client, with ``max_tokens`` set to 1000 in ``model_kwargs``.
    """
    generation_settings = {'max_tokens': 1000}
    return BedrockChat(
        client=bedrock,
        model_id="anthropic.claude-3-sonnet-20240229-v1:0",
        model_kwargs=generation_settings,
    )

In [84]:
from langchain.schema import Document

In [99]:
def load_pdf_as_chunk(pdf_path):
    """Load a PDF and merge the text of all its pages into one ``Document``.

    Args:
        pdf_path: Passed to ``PyPDFLoader`` as the file to load.

    Returns:
        A single ``Document`` whose ``page_content`` is the concatenation of
        every loaded page's ``page_content``, in order, with no separator.
        No metadata is set on the returned document.
    """
    # Use load() rather than load_and_split(): the latter runs a text splitter
    # over the pages first, and its chunks can overlap, which would duplicate
    # text once the pieces are concatenated back together below.
    pages = PyPDFLoader(pdf_path).load()

    # Join once instead of growing a string with += inside a loop.
    all_text = "".join(page.page_content for page in pages)

    # Create a Document object with the entire text
    return Document(page_content=all_text)

In [100]:
# Load the guidelines PDF as one merged Document. Note: despite the plural
# name, `docs` holds a single Document (the return value of
# load_pdf_as_chunk), not a list.
docs = load_pdf_as_chunk("pdfs/Final draft Guidelines on ICT and security risk management.pdf")

In [101]:
import uuid

from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# Summarisation pipeline, composed left to right with `|`:
#   1. map the input object to {"doc": <its page_content>}
#   2. fill the summarisation prompt template with that text
#   3. send the prompt to the model returned by get_claude_llm()
#   4. run the model output through StrOutputParser
chain = (
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
    | get_claude_llm()
    | StrOutputParser()
)

# batch() takes a list of inputs, so the single document is wrapped in a list;
# the second argument is the run config (here: at most 5 concurrent calls).
summaries = chain.batch([docs], {"max_concurrency": 5})


In [105]:
# Display the summary produced for the one document passed to chain.batch().
summaries[0]

"Here is a summary of the key points from the document:\n\n- The guidelines provide requirements for financial institutions on how to manage ICT and security risks. They integrate and expand upon previous guidelines on security measures for operational and security risks of payment services under PSD2.\n\n- The scope covers payment service providers for payment services, credit institutions for all activities, and investment firms for all activities. \n\n- Key areas covered include:\n    - Governance and strategy for ICT and security risk management\n    - Risk management framework to identify, assess, and mitigate ICT and security risks\n    - Information security measures like logical security, physical security, operations security\n    - ICT operations management \n    - ICT project and change management\n    - Business continuity management\n    - Relationship management requirements for payment service users\n\n- The guidelines aim to be technology and methodology agnostic to all

In [106]:
def get_vector_store(docs, inp):
    """Embed documents into a FAISS index and save it locally.

    Args:
        docs: Documents handed to ``FAISS.from_documents``; they are embedded
            with the module-level ``bedrock_embeddings``.
        inp: Suffix for the save path; the index is written to
            ``faiss_index_<inp>``.

    Returns:
        None. The index is persisted via ``save_local`` as a side effect.
    """
    vectorstore_faiss = FAISS.from_documents(docs, bedrock_embeddings)
    vectorstore_faiss.save_local(f"faiss_index_{inp}")

In [111]:
# Wrap the generated summary string in a Document and index it; the store is
# saved under the "ict_s" suffix by get_vector_store.
get_vector_store(
    [Document(page_content=summaries[0])],
    'ict_s',
)

In [112]:
from langchain.storage import InMemoryByteStore
from langchain.retrievers.multi_vector import MultiVectorRetriever