based on [link1](https://python.langchain.com/v0.2/docs/tutorials/rag/) and 
[link2](https://python.langchain.com/v0.2/docs/integrations/vectorstores/chroma/)

In [None]:
%pip install -q langchain langchain_community langchain_chroma langchain-openai 

In [1]:
import os

from langchain_openai import ChatOpenAI

LANGCHAIN_TRACING_V2=os.getenv("LANGCHAIN_TRACING_V2")
LANGCHAIN_ENDPOINT=os.getenv("LANGCHAIN_ENDPOINT")
LANGCHAIN_API_KEY=os.getenv("LANGCHAIN_API_KEY")
LANGCHAIN_PROJECT=os.getenv("LANGCHAIN_PROJECT")
OPENAI_API_KEY=os.getenv("OPENAI_API_KEY")


In [2]:
llm = ChatOpenAI(model="gpt-3.5-turbo")

In [7]:
from langchain import hub
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
from uuid import uuid4

import sys
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

import argparse
from src.preprocess.main_preprocess import preprocess_raw_files

parser = argparse.ArgumentParser()
parser.add_argument('--input_files_path', type=str, help='An integer parameter')
parser.add_argument('--action', type=str, help='A string parameter')
args = parser.parse_args(['--input_files_path', 'data/raw_filled_security_questionnaire', "--action", "indexdb"])


In [4]:
PROJECT_PATH=os.getenv("PROJECT_PATH")
os.path.join(PROJECT_PATH, args.input_files_path)

# Load, chunk and index the contents of the blog.
structured_qa_pairs = preprocess_raw_files(args)


Model 'en_core_web_md' is already installed, loading...


In [9]:
# create documents
documents = []
for file in structured_qa_pairs:
    quest, answ = tuple(structured_qa_pairs[file].keys())
    # print(structured_qa_pairs[file][quest], structured_qa_pairs[file][answ])
    
    ids=0
    for quest_, answ_ in zip(structured_qa_pairs[file][quest], structured_qa_pairs[file][answ]):
        # print(quest_, answ_)
        documents += [Document(
            page_content = quest_,
            metadata = {
                "source": file,
                "answer": answ_
            },
            id = ids
        )]
        ids += 1


# Create vector store
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not neccesary
)
uuids = [str(uuid4()) for _ in range(len(documents))]
vector_store.add_documents(documents=documents, ids=uuids)
# vectorstore = Chroma.from_documents(documents=documents, embedding=OpenAIEmbeddings())


['0094529b-7527-4769-a959-2853d925adf2',
 'b6649652-c6e1-4491-b04a-27ec77fa7f69',
 '2f804f62-de99-416b-ae84-433e6c9c422f',
 'cdfaaafd-ffe0-45bb-ae90-5e4528657fce',
 '4462f468-0011-4936-aa2e-a371a2f98807',
 '599d8866-7bde-489d-bd97-a47bf6d09aad',
 'bf3b7373-59bd-418c-83eb-8d9275079b18',
 '745b7191-051f-4af0-9feb-a7eb452fe586',
 '3b8cb0ce-eb4f-416c-a5f5-70c2cfd4176e',
 '35d66ce6-448c-403b-b317-171756592fc0',
 '12d94805-99e1-458f-b89c-f551bab33cd2',
 '4611573e-473f-4d49-b315-664e3ce59ea1',
 '523b5019-ad1a-4d47-bc40-777154a535a7',
 'fc088ce3-b987-4437-b7a9-a889199f521d',
 'f7e1d7a1-128f-4344-bdee-89c6c32c1c20',
 '7237923c-6a14-4a0a-89a2-87af25da0b6f',
 '9573dcfd-affb-4d7f-a940-e84bfdc92ed9',
 '3bec60ad-c14d-4d97-b7af-c2661c75ccd6',
 '558923c3-54f9-4344-bf3d-60ea876c0939',
 'b0683772-4f90-4709-82e6-f422c453d2e3',
 '36c20088-a5b6-4461-8d7d-b5b0981f3ca5',
 '75def41c-5cb1-4196-83d8-96ab4c3e098b',
 'a1d6c1bb-23a1-489a-9603-7d256d563050',
 '2c29f31a-97be-4372-b2ea-afbb96ff5113',
 'e13e072d-1937-

In [11]:

# Retrieve and generate using the relevant snippets of the blog.
retriever = vector_store.as_retriever()
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("Do you have a capability to patch vulnerabilities across all of your computing devices, applications, and systems?")

# не то пальто

'Yes, we have the capability to patch vulnerabilities across all of our computing devices, applications, and systems. We conduct application penetration tests regularly and have anti-malware programs installed on all IT infrastructure components. We review applications for security vulnerabilities before deployment to production.'

In [12]:
documents

[Document(id='0', metadata={'source': 'Adyen_SQ.xls', 'answer': 'Comment'}, page_content='Question'),
 Document(id='1', metadata={'source': 'Adyen_SQ.xls', 'answer': 'Adyen automates this in accordance with PCI DSS requirement 6.6.'}, page_content='Do you use an automated source code analysis tool to detect security defects in code prior to production?'),
 Document(id='2', metadata={'source': 'Adyen_SQ.xls', 'answer': 'Adyen reviews our applications and address issues in accordance with PCI DSS requirement 6.3.2.'}, page_content='(SaaS only) Do you review your applications for security vulnerabilities and address any issues prior to deployment to production?'),
 Document(id='3', metadata={'source': 'Adyen_SQ.xls', 'answer': 'Adyen addresses and remediates requirements for access accordance with PCI DSS requirement 10.8.1, 11.3, 11.3.3.'}, page_content='Are all identified security, contractual, and regulatory requirements for customer access contractually addressed and remediated prior 

In [13]:
results = vector_store.similarity_search(
    "Do you have a capability to patch vulnerabilities across all of your computing devices, applications, and systems?",
    k=2,
    filter={"source": "Adyen_SQ.xls"},
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* Do you have a capability to patch vulnerabilities across all of your computing devices, applications, and systems? [{'answer': 'Adyen has a vulnerability management program in accordance with PCI DSS requirement 6.1, 6.2.', 'source': 'Adyen_SQ.xls'}]
* Do you conduct application penetration tests of your cloud infrastructure regularly as prescribed by industry best practices and guidance? [{'answer': 'Adyen conducts application penetration tests in accordance with PCI DSS requirement 11.3.1, 11.3.2.', 'source': 'Adyen_SQ.xls'}]
