In [156]:
import os
import re
import tqdm
import boto3
import pandas as pd

from typing import List

from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document

from langchain.prompts import PromptTemplate
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock

from langchain.output_parsers import PydanticOutputParser
from langchain.pydantic_v1 import BaseModel, Field

from langchain_community.document_loaders import UnstructuredPDFLoader

In [112]:
# --- AWS / Bedrock configuration -------------------------------------------
# Named AWS CLI profile and region used for every Bedrock call in this notebook.
profile_name = 'genese-llm-acc'
bedrock_region = 'us-west-2'

session = boto3.Session(profile_name=profile_name)

# NOTE(review): the client is created for the 'bedrock-runtime' service but the
# endpoint host is 'bedrock.<region>' -- confirm this override is intended;
# omitting endpoint_url would let boto3 resolve the service's default endpoint.
bedrock = session.client(
    service_name='bedrock-runtime',
    region_name=bedrock_region,
    endpoint_url=f'https://bedrock.{bedrock_region}.amazonaws.com',
)

# Generation settings forwarded to the model on every call; temperature 0 keeps
# sampling as deterministic as the model allows.
model_kwargs = dict(
    max_tokens_to_sample=4096,
    temperature=0,
    top_k=250,
    top_p=1,
    stop_sequences=["\n\nHuman"],
)

llm = Bedrock(model_id="anthropic.claude-v2", client=bedrock, model_kwargs=model_kwargs)
embeddings = BedrockEmbeddings(client=bedrock)

In [17]:
def get_pdf_paths(pdf_directory):
    """Recursively collect the paths of all PDF files under ``pdf_directory``.

    Args:
        pdf_directory: Root directory to search; sub-directories are walked too.

    Returns:
        list[str]: One path per file whose name ends in ``.pdf`` (any letter
        case), written with forward slashes. The order follows ``os.walk`` and
        is therefore filesystem dependent. The list is empty when the
        directory does not exist or holds no PDFs.
    """
    pdf_paths = []
    for root, _dirs, files in os.walk(pdf_directory):
        for file in files:
            # Case-insensitive so files named e.g. "REPORT.PDF" are not skipped.
            if not file.lower().endswith(".pdf"):
                continue
            pdf_path = os.path.join(root, file)
            # Normalise separators only where backslash IS the separator; on
            # POSIX a backslash is a legal filename character and rewriting it
            # would produce a path that does not exist.
            if os.sep == '\\':
                pdf_path = pdf_path.replace('\\', '/')
            pdf_paths.append(pdf_path)
    return pdf_paths

In [18]:
# Collect every PDF under ./docs (resolved against the kernel's working directory).
pdf_paths = get_pdf_paths('./docs')
# Bare expression so the notebook displays the discovered paths.
pdf_paths

['./docs/Why Is Email Marketing Still Relevant in 2023_.pdf',
 './docs/The Critical Significance of Cloud Security and Cybersecurity in Nepal.pdf',
 './docs/Unveiling the Future_ Top Cloud Computing Trends Shaping 2024 and Beyond.pdf',
 './docs/Which Zoom Security Features Are Best for Your Industry_.pdf',
 './docs/Unlocking the Power of Cloud Computing_ Impactful Uses Across Industries.pdf',
 './docs/Boost Your Business Security with Cloud-Based Security as a Service (SECaaS).pdf',
 './docs/What is Business Intelligence (BI)_.pdf',
 './docs/6 Reasons to choose Zoho Workplace.pdf',
 './docs/Importance of Quality Assurance for Your Business.pdf']

In [98]:
# Compiled once when the cell runs instead of on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
# "Facebook <digits>" followed by a blank line; everything from the first such
# marker onwards is dropped (presumably the social-counter footer of the
# exported pages -- confirm against the source PDFs).
_SOCIAL_MARKER_PATTERN = re.compile(r'Facebook \d+\n\n')


def get_cleaned_page_content(page_content):
    """Strip URLs and the trailing social-counter section from page text.

    Args:
        page_content (str): Raw text extracted from a PDF.

    Returns:
        str: ``page_content`` with every ``http(s)://...`` / ``www....`` token
        removed and truncated at the first ``Facebook <digits>`` marker that is
        followed by a blank line. Text without such a marker is returned in
        full (minus URLs).
    """
    without_urls = _URL_PATTERN.sub('', page_content)
    # split() always yields at least one element, so no exception handling is
    # needed; maxsplit=1 stops scanning after the first marker.
    return _SOCIAL_MARKER_PATTERN.split(without_urls, maxsplit=1)[0]

In [100]:
def make_document(docs):
    """Build a cleaned ``Document`` from the first entry of ``docs``.

    Args:
        docs: Indexable collection whose first element exposes
            ``page_content`` and ``metadata``; only that first element is used.

    Returns:
        Document: New document holding the cleaned text of ``docs[0]`` and the
        same metadata object.
    """
    first = docs[0]
    return Document(
        page_content=get_cleaned_page_content(first.page_content),
        metadata=first.metadata,
    )

In [101]:
# One entry per PDF path. Each entry is whatever `loader.load()` returns for
# that file (later indexed with [0] by make_document), so `documents` is a
# collection of per-file results rather than a flat list of documents.
documents = []

for pdf_path in pdf_paths:
    loader = UnstructuredPDFLoader(pdf_path)
    docs = loader.load()
    documents.append(docs)

In [102]:
# Sanity check: number of loaded entries (one is appended per PDF path).
len(documents)

9

In [103]:
# Sanity check: number of discovered PDF paths.
len(pdf_paths)

9

In [104]:
# One cleaned Document per loaded PDF entry, in the same order as `documents`.
new_documents = [make_document(loaded) for loaded in documents]

In [105]:
# Inspect the first cleaned document to eyeball the result of the cleaning step.
new_documents[0]

Document(page_content="1/4/24, 8:17 AM\n\nWhy Is Email Marketing Still Relevant in 2023?\n\nInternational 3\n\n\ue093\ue093 \ue094\ue094 \ue09d\ue09d \ue09a\ue09a\n\naa\n\n\uf0e0 Contact us\n\nWhy Is Email Marketing Still Relevant in 2023?\n\nby sakar | Apr 25, 2023 | Email Marketing | 0 comments\n\nEmail marketing is a popular and reliable marketing method for businesses due to\n\nits stability, a\x00ordability, and control. Since email marketing is an owned media\n\nchannel, marketers have greater control over the messaging and reach of their\n\ncampaigns.\n\nAlthough trendy marketing tactics like gamiﬁcation and inﬂuencer marketing can\n\nGot any questions? We're happy\n\nbe useful for engaging customers online, they often require a larger budget and\n\nto help.\n\nmay not generate enough conversions to make a signiﬁcant impact. These\n\nstrategies also fall under earned or paid media, which o\x00er businesses less control\n\nand require more time and testing to see a positive retur

### Summarize the PDF content

In [135]:
def get_summary_prompt(text):
    """Build the Human/Assistant summarisation prompt for one document.

    Args:
        text: Text content of the document to summarise; substituted for the
            ``{text}`` placeholder of the template.

    Returns:
        str: The formatted prompt: a ``Human:`` turn containing the document
        and the formatting instructions, followed by an ``Assistant:`` turn
        pre-filled with the opening line of the answer.
    """
    # The instruction wording below is what the model actually reads, so it is
    # kept short and grammatical; the indentation inside the literal is part of
    # the prompt text.
    summary_prompt_template = PromptTemplate(
        template="""Human: You are provided with the text content from a pdf document.
        Your task is to summarize the text content provided below.
        {text}
        
        Note: Discard any PII (personally identifiable information) if present.
        
        NO PREAMBLE and NO POSTAMBLE

        Always follow the instruction below when generating the summary:
        
        Start with a paragraph of 3 to 4 lines that summarizes the content, then list the key points as bullets.

        For example:

        The article discusses the relevance of email marketing in 2023 and explores various trends impacting this marketing strategy and so on.
        
        Key Points:
        - point 1
        - point 2
        ...
        
        
        Assistant: Here is the summary of the text provided \n\n
        """,
        input_variables=["text"]
    )
    summary_prompt = summary_prompt_template.format_prompt(text=text).to_string()
    return summary_prompt

In [136]:
# Trial run on the first cleaned document only, to check the prompt and the
# shape of the model's answer.
summary_prompt = get_summary_prompt(new_documents[0].page_content)
summary_result = llm(summary_prompt)

In [137]:
# print() so the newlines in the model output are rendered rather than escaped.
print(summary_result)


The article discusses the continued relevance of email marketing in 2023 due to its affordability, stability, and control. It explores various trends impacting email marketing strategies this year.

Key Points:

- Mobile optimization is crucial as most email activity is now on mobile devices. 

- Customers prioritize companies that take a stand on social issues. Email allows communicating CSR initiatives.

- Aligning sales and marketing teams via email improves consistent messaging. 

- Email signups are inbound marketing where customers initiate engagement.

- Email improves customer service by personalized, real-time communication.

- Interactive email content increases engagement and website traffic.

- Tighter privacy laws give customers control, but email is still trusted.

- Curated content shared via email improves brand authority.

- AI and automation optimize campaigns and save resources.


In [138]:
def get_llm_summary(page_content):
    """Summarise ``page_content`` with the notebook-level ``llm``.

    Args:
        page_content: Document text to summarise.

    Returns:
        Whatever ``llm`` returns for the prompt built by
        ``get_summary_prompt(page_content)``.
    """
    return llm(get_summary_prompt(page_content))

In [139]:
# Summarise every cleaned document (one get_llm_summary call each) and collect
# {"source", "summary"} records; tqdm shows progress while the calls run.
summaries = []

for doc in tqdm.tqdm(new_documents):
    summary = get_llm_summary(doc.page_content)
    summaries.append({"source": doc.metadata["source"], "summary": summary})

# Persist the records as a JSON array of objects in the working directory.
df = pd.DataFrame(summaries)
df.to_json('summaries.json', orient='records')
# Alternative CSV export, currently disabled:
# df.to_csv('summaries.csv', index=False)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [01:41<00:00, 11.23s/it]


In [154]:
# Peek at the last two summary records.
df.tail(2)

Unnamed: 0,source,summary
7,./docs/6 Reasons to choose Zoho Workplace.pdf,\nThe text discusses the benefits of using Zoh...
8,./docs/Importance of Quality Assurance for You...,\nThe article discusses the importance of qual...


In [155]:
# Spot check: show only the first record. The [:1] slice limits the loop to at
# most one pass and is a no-op when `summaries` is empty.
for summary in summaries[:1]:
    print(
        summary["source"],
        "\n",
        summary["summary"],
        "\n**************************************************************************************************************\n\n",
        sep="\n",
    )

./docs/Why Is Email Marketing Still Relevant in 2023_.pdf



The article discusses the continued relevance of email marketing in 2023 due to its affordability, stability, and control. It explores various trends impacting email marketing strategies this year.

Key Points:

- Mobile optimization is crucial as most email activity is now on mobile devices. 

- Customers prioritize social responsibility so email communication can provide details on a company's CSR initiatives.  

- Sales and marketing alignment is needed to ensure consistent messaging. Email can serve as a collaboration channel.

- Inbound marketing through email list sign-ups allows willing engagement versus disruptive outbound marketing.

- Excellent customer service boosts additional purchases so email communication improves the experience.

- Interactive content like videos and infographics increases engagement. 

- Tighter privacy laws give customers control but email is still more trusted than other channels.

- Curat