In [1]:
import os
import re
import time
import hashlib

import tqdm
import boto3
import pandas as pd
import pinecone

from typing import List

from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document

from langchain.prompts import PromptTemplate
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock

from langchain.output_parsers import PydanticOutputParser
from langchain.pydantic_v1 import BaseModel, Field

from langchain.vectorstores.pinecone import Pinecone
from langchain_community.document_loaders import UnstructuredPDFLoader

  from tqdm.autonotebook import tqdm


In [2]:
# Pinecone credentials come from the environment: export PINECONE_API_KEY and
# PINECONE_ENV before starting the kernel (a missing variable yields None).

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV")

In [3]:
# AWS named profile and region used for every Bedrock call in this notebook.
profile_name = 'genese-llm-acc'
bedrock_region = 'us-west-2'

session = boto3.Session(profile_name=profile_name)
# No endpoint_url override: boto3 resolves the regional "bedrock-runtime"
# endpoint on its own. An explicit "https://bedrock.<region>.amazonaws.com"
# override would point this runtime client at the control-plane host instead.
bedrock = session.client('bedrock-runtime', region_name=bedrock_region)
# Inference parameters forwarded to the model on each call.
model_kwargs = {
    "max_tokens_to_sample": 4096,
    "temperature": 0,
    "top_k": 250,
    "top_p": 1,
    "stop_sequences": ["\n\nHuman"],
}
llm = Bedrock(model_id="anthropic.claude-v2", client=bedrock, model_kwargs=model_kwargs)
# No model_id is passed, so BedrockEmbeddings falls back to its own default model.
embeddings = BedrockEmbeddings(client=bedrock)

In [4]:
# Configure the Pinecone client with the credentials read from the environment.
pinecone.init(environment=PINECONE_ENV, api_key=PINECONE_API_KEY)

In [5]:
# Name of the Pinecone index this notebook creates (when missing) and queries.
index_name = "llm-application"

In [6]:
# Create the index only when it is not already listed, so re-running this cell
# does not attempt to create it twice.
# NOTE(review): dimension=1536 presumably matches the embedding vector size of
# the embedding model in use — confirm if the model is changed.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, metric='cosine', dimension=1536)

In [7]:
# List the available indexes to confirm that the index now exists.
pinecone.list_indexes()

['llm-application']

In [8]:
# Open a handle to the index and display its current statistics.
index = pinecone.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.00035,
 'namespaces': {'qa_chunks_vs': {'vector_count': 35}},
 'total_vector_count': 35}

In [9]:
def calculate_md5(text):
    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8.

    The digest of a chunk's content is used as its vector id, so that identical
    content does not produce duplicate entries in the vector database.
    """
    return hashlib.md5(text.encode('utf-8')).hexdigest()

In [10]:
def get_pinecone_vectorstore(index, embeddings, text_key="text", namespace=None):
    """Wrap a Pinecone index in a LangChain ``Pinecone`` vector store.

    All four arguments are forwarded positionally, in this order, to the
    ``Pinecone`` constructor; the constructed object is returned.
    """
    return Pinecone(index, embeddings, text_key, namespace)

In [11]:
def pinecone_upsert(vectorstore, texts, metadatas=None, ids=None):
    """Add ``texts`` to ``vectorstore`` and return whatever ``add_texts`` returns.

    ``metadatas`` and ``ids`` are passed through unchanged as keyword arguments.
    """
    upsert_result = vectorstore.add_texts(texts, ids=ids, metadatas=metadatas)
    return upsert_result

In [12]:
def pinecone_similarity_search(vectorstore, query, filter=None, k=4):
    """Run a similarity search against ``vectorstore``.

    Args:
        vectorstore: Object exposing ``similarity_search(query, k=..., filter=...)``.
        query: Text to search for.
        filter: Optional metadata filter dict. When omitted, an empty dict is
            passed to the vector store.
        k: Number of results to request.

    Returns:
        Whatever ``vectorstore.similarity_search`` returns.
    """
    # Build a fresh dict per call. A ``{}`` default in the signature is a single
    # object shared by every call, so a mutation made anywhere downstream would
    # silently leak into all later searches.
    if filter is None:
        filter = {}
    return vectorstore.similarity_search(query, k=k, filter=filter)

In [13]:
# Vector store bound to the "qa_chunks_vs" namespace of the index; the upsert
# loop and the similarity searches in this notebook all go through it.
pinecone_vs = get_pinecone_vectorstore(index, embeddings, text_key="text", namespace="qa_chunks_vs")

In [14]:
def get_cleaned_page_content(page_content):
    """Remove URLs and everything from the first "Facebook <digits>" marker on.

    Args:
        page_content: Text of a loaded document.

    Returns:
        ``page_content`` with every ``http://``/``https://``/``www.`` token
        removed, truncated immediately before the first occurrence of
        ``"Facebook <digits>"`` followed by a blank line. When the marker is
        absent, the URL-stripped text is returned in full.
    """
    # Remove URL patterns.
    without_urls = re.sub(r'https?://\S+|www\.\S+', '', page_content)

    # Keep only the text before the first marker (presumably the start of a
    # social-share footer — confirm against the source pages). No try/except is
    # needed: re.split on a string always returns a non-empty list, and a
    # blanket ``except: pass`` would only hide genuine programming errors.
    return re.split(r'Facebook \d+\n\n', without_urls, maxsplit=1)[0]

In [15]:
def make_document(docs):
    """Build a cleaned ``Document`` from the first element of ``docs``.

    Only ``docs[0]`` is read: its text is passed through
    ``get_cleaned_page_content`` and its metadata is reused unchanged.
    """
    first = docs[0]
    return Document(
        page_content=get_cleaned_page_content(first.page_content),
        metadata=first.metadata,
    )

In [16]:
def get_pdf_paths(pdf_directory):
    """Recursively collect the paths of all PDF files under ``pdf_directory``.

    Args:
        pdf_directory: Directory to walk (sub-directories included).

    Returns:
        A list of file paths, in ``os.walk`` order, with backslashes replaced
        by forward slashes. Empty when nothing matches or the directory does
        not exist.
    """
    pdf_paths = []
    for root, _dirs, files in os.walk(pdf_directory):
        for file_name in files:
            # Compare the extension case-insensitively so files such as
            # "REPORT.PDF" are not silently skipped.
            if file_name.lower().endswith(".pdf"):
                # Normalise separators so paths look the same on every OS.
                pdf_paths.append(os.path.join(root, file_name).replace('\\', '/'))
    return pdf_paths

In [17]:
# Collect every PDF under ./docs and display the resulting list.
pdf_paths = get_pdf_paths('./docs')
pdf_paths

['./docs/Why Is Email Marketing Still Relevant in 2023_.pdf',
 './docs/The Critical Significance of Cloud Security and Cybersecurity in Nepal.pdf',
 './docs/Unveiling the Future_ Top Cloud Computing Trends Shaping 2024 and Beyond.pdf',
 './docs/Which Zoom Security Features Are Best for Your Industry_.pdf',
 './docs/Unlocking the Power of Cloud Computing_ Impactful Uses Across Industries.pdf',
 './docs/Boost Your Business Security with Cloud-Based Security as a Service (SECaaS).pdf',
 './docs/What is Business Intelligence (BI)_.pdf',
 './docs/6 Reasons to choose Zoho Workplace.pdf',
 './docs/Importance of Quality Assurance for Your Business.pdf']

In [18]:
# Load every PDF; each entry of `documents` is the list returned by one
# loader's load() call, in the same order as `pdf_paths`.
documents = [UnstructuredPDFLoader(pdf_path).load() for pdf_path in pdf_paths]

In [19]:
# One cleaned Document per loaded PDF.
new_documents = [make_document(loaded_docs) for loaded_docs in documents]

In [20]:
# Number of cleaned documents (one per entry in `documents`).
len(new_documents)

9

In [21]:
# Peek at the first 500 characters of the first cleaned document.
new_documents[0].page_content[:500]

'1/4/24, 8:17 AM\n\nWhy Is Email Marketing Still Relevant in 2023?\n\nInternational 3\n\n\ue093\ue093 \ue094\ue094 \ue09d\ue09d \ue09a\ue09a\n\naa\n\n\uf0e0 Contact us\n\nWhy Is Email Marketing Still Relevant in 2023?\n\nby sakar | Apr 25, 2023 | Email Marketing | 0 comments\n\nEmail marketing is a popular and reliable marketing method for businesses due to\n\nits stability, a\x00ordability, and control. Since email marketing is an owned media\n\nchannel, marketers have greater control over the messaging and reach of their\n\ncampaigns.\n\nAlthough trendy marketing t'

In [22]:
# Splitter configuration: split on blank lines (a literal separator, not a
# regex), with a chunk size of 2048 and an overlap of 250, both measured with
# len(), i.e. in characters.
text_splitter = CharacterTextSplitter(
    chunk_size=2048,
    chunk_overlap=250,
    length_function=len,
    separator="\n\n",
    is_separator_regex=False,
)

In [23]:
# Split each document into chunks and upsert them into the vector store.
# Ids are MD5 hashes of the chunk text (intended to avoid duplicate entries);
# metadata records the chunk's position and the source document.
for doc in tqdm.tqdm(new_documents):
    text_chunks = text_splitter.split_text(doc.page_content)
    chunk_ids = list(map(calculate_md5, text_chunks))
    chunk_metadatas = [
        {"chunk": position, "source": doc.metadata["source"]}
        for position, _ in enumerate(text_chunks)
    ]
    pinecone_upsert(pinecone_vs, text_chunks, metadatas=chunk_metadatas, ids=chunk_ids)
    # Pause between documents (presumably to stay under service rate limits — confirm).
    time.sleep(1)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:56<00:00,  6.25s/it]


In [24]:
# Re-open the index handle and display the statistics after the upsert loop.
index = pinecone.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.00035,
 'namespaces': {'qa_chunks_vs': {'vector_count': 35}},
 'total_vector_count': 35}

In [25]:
# Retrieval check: alternative questions are kept commented out for manual
# experimentation; only the active `question` below is searched.
# question = "Why is mobile optimization considered crucial for email marketing success?"
# question = "Why is it mentioned that customers prefer email over other forms of communication?"
question = "In what ways does AI contribute to the effectiveness of email marketing campaigns?"
# Fetch the 2 most similar chunks and display them.
sim_docs = pinecone_similarity_search(pinecone_vs, question, k=2)
sim_docs

[Document(page_content='conversions, click-through rates, and the ratio of unsubscribes to subscriptions.\n\n8. Curated Content: In 2023, the vast amount of content available online makes it\n\nchallenging for businesses to grab people’s attention, and it can be challenging to\n\nkeep up with the volume of content creation. Curated content is an e\x00cient way to\n\nstand out in this noisy digital world by collecting existing information and\n\npresenting it in a meaningful way. Email marketing is an ideal channel for sharing\n\ncurated content as it allows businesses to feature blog posts, industry news, third-\n\nparty research, videos, and podcasts, which can improve brand authority and\n\neducate the audience on topics relevant to the business. Since email marketing\n\nallows for more long-form content than other marketing channels, businesses have\n\na better opportunity to delve deeper into the curated content.\n\n9. Automation & AI: AI has become a valuable tool for marketers in

In [26]:
# Retrieval check: alternative questions are kept commented out for manual
# experimentation; only the active `question` below is searched.
# question = "How does the document highlight the growing integration of cloud technology in business operations in Nepal?"
# question = "Why is a multi-layered approach crucial for cloud security in Nepal?"
# question = "What sensitive data do Nepal-based businesses typically handle, and why is safeguarding it in the cloud important?"
# question = "What advantages do real-time monitoring and enhanced visibility bring to businesses in Nepal?"
question = "Why is scalability mentioned as a key feature of cloud security solutions in Nepal?"

# Fetch the 2 most similar chunks and display them.
sim_docs = pinecone_similarity_search(pinecone_vs, question, k=2)
sim_docs

[Document(page_content='prompt detection and response to potential threats, thereby reducing the impact\n\nof security incidents.\n\n7. Scalability for Growing Businesses in Nepal:\n\nCloud security solutions in Nepal are scalable, allowing them to grow with your\n\nbusiness. As you expand your operations and data storage, these security\n\nmeasures adapt to the evolving landscape, ensuring consistent protection.\n\nConclusion\n\nIn an era where businesses in Nepal increasingly rely on cloud technology,\n\ninvesting in cloud security and cybersecurity is not just a choice but an imperative.\n\nThe ramiﬁcations of a security breach can be catastrophic, encompassing\n\n3/7\n\n1/4/24, 8:11 AM\n\nThe Critical Significance of Cloud Security and Cybersecurity in Nepal\n\nﬁnancial losses and damage to reputation. By giving paramount importance to\n\ncloud security in Nepal, businesses can protect their digital assets, ensure\n\ncompliance with regulations, and maintain the trust of customers 

In [27]:
# Retrieval check: the alternative question is kept commented out for manual
# experimentation; only the active `question` is searched.
question = "What impact is AIaaS expected to have on the accessibility and cost-effectiveness of AI technologies?"
# question = "How does cloud computing act as a gateway to transformative technologies like IoT, blockchain, and quantum computing?"

# Fetch the 2 most similar chunks and display them.
sim_docs = pinecone_similarity_search(pinecone_vs, question, k=2)
sim_docs

[Document(page_content='With evolving cyber threats, the demand for enhanced encryption, authentication,\n\nand disaster recovery services is set to soar. Cloud providers are prioritizing\n\nsecurity measures to safeguard sensitive data.\n\nSustainable Cloud Computing\n\nService providers are committing to net-zero goals, fostering greener cloud\n\ncomputing in 2024. This aligns with global e\x00orts to combat climate change,\n\nemphasizing sustainability within cloud operations and customer activities.\n\nSimpliﬁed Cloud Computing for All\n\nCloud providers are making deployment and management accessible through low-\n\ncode and no-code tools. Democratizing cloud services enables users with limited\n\ntechnical knowledge to leverage the power of the cloud.\n\n2/7\n\n1/4/24, 8:15 AM\n\nUnveiling the Future: Top Cloud Computing Trends Shaping 2024 and Beyond\n\nPrivacy Focus in the Cloud\n\nContinuous e\x00orts are underway to develop technological and legislative solutions\n\nfor maint

In [28]:
def get_text_chunks_from_pinecone(question):
    """Return the text of the 4 chunks most similar to ``question`` as one string.

    The search runs against the notebook-level ``pinecone_vs`` store. Each
    chunk's ``page_content`` is followed by a blank line (``"\\n\\n"``),
    including the last one; no matches yields an empty string.
    """
    matches = pinecone_similarity_search(pinecone_vs, question, k=4)
    return "".join(match.page_content + "\n\n" for match in matches)

In [29]:
# Build the retrieval context for the current `question` and print it for inspection.
context = get_text_chunks_from_pinecone(question)
print(context)

With evolving cyber threats, the demand for enhanced encryption, authentication,

and disaster recovery services is set to soar. Cloud providers are prioritizing

security measures to safeguard sensitive data.

Sustainable Cloud Computing

Service providers are committing to net-zero goals, fostering greener cloud

computing in 2024. This aligns with global e orts to combat climate change,

emphasizing sustainability within cloud operations and customer activities.

Simpliﬁed Cloud Computing for All

Cloud providers are making deployment and management accessible through low-

code and no-code tools. Democratizing cloud services enables users with limited

technical knowledge to leverage the power of the cloud.

2/7

1/4/24, 8:15 AM

Unveiling the Future: Top Cloud Computing Trends Shaping 2024 and Beyond

Privacy Focus in the Cloud

Continuous e orts are underway to develop technological and legislative solutions

for maintaining data privacy and building trust with customers. Data pr

In [30]:
# Schema for the LLM's structured reply: a single string field, `answer`.
# NOTE(review): documented with comments rather than a class docstring on
# purpose — a docstring on a pydantic model may end up in the generated schema
# and therefore in the prompt's format instructions; confirm before adding one.
class AnswerParser(BaseModel):
    answer : str = Field(description="Answer of the user question based on context provided.")

# Parser built from the schema; it supplies the prompt's format instructions
# and parses the raw LLM output back into an AnswerParser instance.
answer_parser = PydanticOutputParser(pydantic_object=AnswerParser)

In [31]:
def get_qa_prompt(question, context, answer_parser):
    """Build the question-answering ``PromptTemplate``.

    Args:
        question: Accepted for interface compatibility; not used here. The
            value is supplied later, when the template is formatted.
        context: Accepted for interface compatibility; not used here. The
            value is supplied later, when the template is formatted.
        answer_parser: Output parser whose ``get_format_instructions()`` text
            is bound to the ``{format_instructions}`` placeholder.

    Returns:
        A ``PromptTemplate`` with ``question`` and ``context`` as input
        variables and ``format_instructions`` pre-filled.
    """
    # The fallback sentence the model is told to reply with is user-facing, so
    # it is spelled correctly ("I do not have ...") and the instruction line
    # carries no stray trailing quote character.
    qa_prompt = PromptTemplate(
            template="""Human: You are a world class algorithm for extracting information in structured formats.
            You are provided with the user question along with the relevant text chunks to the user question from vector store.
            Your task is to answer user question based on the relevant text chunks.
            
            {format_instructions}
            
            Use the given format to extract answer for the following user's question:
            {question}
            
            The relevant text chunks are as follow:
            {context}
            
            Note:- Make sure to answer in the correct format specified above. If you don't know the answer from the provided text context don't try to generate on your own. In that case reply with 'I do not have enough context to answer your question.'
            
            Assistant: 
            
            
            """,
            input_variables=["question", "context"],
            partial_variables={"format_instructions": answer_parser.get_format_instructions()},
        )
    return qa_prompt

In [32]:
def get_question_answer_from_llm(question, parser):
    """Answer ``question`` using retrieved context and the notebook-level ``llm``.

    Args:
        question: The user's question.
        parser: Output parser used both to build the prompt's format
            instructions and to parse the raw LLM output.

    Returns:
        The ``answer`` attribute of the parsed output, or ``None`` when the
        LLM output cannot be parsed (the error is printed).
    """
    context = get_text_chunks_from_pinecone(question)
    qa_prompt = get_qa_prompt(question, context, parser)
    # Pass only the template's declared input variables. The template has no
    # "parser" placeholder, and prompt formatters that check for unused
    # arguments reject unknown keyword arguments.
    _input = qa_prompt.format_prompt(question=question, context=context)
    _output = llm(_input.to_string())
    try:
        obj = parser.parse(_output)
        return obj.answer
    except Exception as ex:
        # Best effort: report the parsing failure and signal it with None.
        print("Failed to parse answer response from LLM", ex)
        return None

In [33]:
# Run the full retrieve-and-answer pipeline for one question and print the answer.
question = "In what ways does AI contribute to the effectiveness of email marketing campaigns?"
answer = get_question_answer_from_llm(question, answer_parser)
print(answer)

AI contributes to the effectiveness of email marketing campaigns by helping with tasks such as organizing email lists, personalizing content to target specific audiences, and automating the writing of emails. This saves time for marketers and allows for better optimization and ROI of campaigns.


In [34]:
# Run the full retrieve-and-answer pipeline for one question and print the answer.
question = "Why is scalability mentioned as a key feature of cloud security solutions in Nepal?"
answer = get_question_answer_from_llm(question, answer_parser)
print(answer)

Scalability is mentioned as a key feature of cloud security solutions in Nepal because cloud security solutions are scalable, allowing them to grow with businesses in Nepal as they expand their operations and data storage. The security measures adapt to the evolving landscape as businesses grow, ensuring consistent protection.


In [35]:
# Run the full retrieve-and-answer pipeline for one question and print the answer.
question = "What impact is AIaaS expected to have on the accessibility and cost-effectiveness of AI technologies?"
answer = get_question_answer_from_llm(question, answer_parser)
print(answer)

The text suggests that AIaaS (AI as a Service) will make AI technologies more accessible and cost-effective for businesses of all sizes. By providing AI capabilities through the cloud, AIaaS can democratize access to advanced AI without the high costs of developing in-house AI expertise and infrastructure.
