# LLM 101 PoC

Goal: Upload a pdf and ask a question about its content, and LLM+LangChain+VectorStore will answer based on the pdf.

# Installs

In [2]:
# !pip install -q --upgrade google-generativeai

In [1]:
# !pip install langchain-google-genai

In [2]:
# !pip install langchain_pinecone

In [3]:
# !pip install python-dotenv

In [4]:
# !pip install PyPDF2

In [5]:
#!pip install langchain --upgrade

# Library

In [6]:
import google.generativeai as genai
import os

from dotenv import load_dotenv
from IPython.display import display
from IPython.display import Markdown
import textwrap
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains import SimpleSequentialChain
import requests
from bs4 import BeautifulSoup
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
# from langchain.chains.question_answering import load_qa_chain
# from langchain.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from pprint import pprint
from PyPDF2 import PdfReader

In [7]:
def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

def get_text_from_web_article(url):
    try:
        response = requests.get(url)
        # Check if request was successful
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            text = soup.get_text()
            parsed_text = text.replace('\n', ' ')
            return parsed_text
        else:
            print("Failed to fetch content from URL:", url)
            return None
    except Exception as e:
        print("An error occurred:", e)
        return None

def read_pdf(file_path):
    ''' Read a pdf file and get its content '''

    pdf_reader = PdfReader(file_path)
    file_text_content = pdf_reader
    text_content = ""
    number_of_pages = len(pdf_reader.pages)
    for i_page in range(number_of_pages):
        page = pdf_reader.pages[i_page]
        text_content += page.extract_text()
        
    return text_content

# Parameters

In [113]:
par__vector_store_index_name = "llm-101-poc"
par__pdf_file_path = "../data/articleAccessibleDesign.pdf"
par__verbose_mode = True

# Authentication

## Setting API key

In [15]:
load_dotenv()

True

In [10]:
# os.environ.get("GOOGLE_API_KEY")

In [11]:
# os.environ.get("PINECONE_API_KEY")

In [12]:
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))

# Get content from PDF file

In [16]:
pdf_raw_text_content = read_pdf(par__pdf_file_path)

In [119]:
# Splitting up texts into document chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap  = 0,
)

texts = text_splitter.create_documents([pdf_raw_text_content])

In [115]:
texts

[Document(page_content='Accessible design is easier than you think\nEasy-to-follow guidelines can make accessibility part of your everyday design\nprocess\nCintia Romero·Follow\nPublished inPinterest Design\n5 min read·Oct 27, 2021\nListen Share More\nAssistive device | Photo by Sigmund on UnsplashGet unlimited access to the best of Medium for less than $1/week.Become a member\n4/29/24, 5:47 PM Accessible design is easier than you think | by Cintia Romero | Pinterest Design', metadata={'text': 'Accessible design is easier than you think\nEasy-to-follow guidelines can make accessibility part of your everyday design\nprocess\nCintia Romero·Follow\nPublished inPinterest Design\n5 min read·Oct 27, 2021\nListen Share More\nAssistive device | Photo by Sigmund on UnsplashGet unlimited access to the best of Medium for less than $1/week.Become a member\n4/29/24, 5:47 PM Accessible design is easier than you think | by Cintia Romero | Pinterest Design'}),
 Document(page_content='https://pinterest

In [120]:
texts[5].page_content

'performance; instead, it helps the engineering team implement significant\naccessibility standards in the earliest stages, encouraging good coding practices.\nFixing an inaccessible product later may require extra effort, time and impact the\nbusiness plans and deliverables.\nI want to share easy-to-follow WCAG 2.1 design guidelines I use in my daily design\nprocess when evaluating accessibility in my projects. I organized these guidelines by'

In [87]:
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [88]:
embeddings

GoogleGenerativeAIEmbeddings(model='models/embedding-001', task_type=None, google_api_key=None, credentials=None, client_options=None, transport=None, request_options=None)

In [89]:
# Test embeddings
query_result = embeddings.embed_query(texts[0].page_content)
print(query_result)
     

[0.03696355, -0.048819743, -0.071134, -0.000659496, 0.096602514, 0.025297236, 0.04581643, -0.0070661874, 0.002829002, 0.041267667, 0.055633888, 0.029038604, -0.046973612, 0.02533029, -0.008227784, -0.036708344, 0.03761857, 0.024367457, -0.012174477, -0.034076054, 0.039838426, 0.0008575216, -0.02729256, -0.012594638, 0.047864903, -0.0032034782, 0.008483473, -0.050146427, -0.03825963, 0.018665303, -0.05137839, 0.033277135, 0.024017477, -0.005128521, -0.020429695, -0.028645296, -0.008169635, -0.018580435, 0.03139887, 0.052537683, -0.012141003, -0.054011457, -0.048670255, 0.047711387, -0.012443699, -0.005845518, 0.00812543, 0.018928312, 0.003458731, -0.014729743, 0.01407957, -0.009320859, 0.049470797, -0.006506783, 0.008152698, -0.036389835, 0.014015797, -0.00837734, -0.041277733, 0.041430674, 0.012842301, 0.014836599, 0.021147886, 0.011722856, -0.018962584, -0.08296827, -0.048350587, 0.015909199, 0.034880966, 0.012065526, 0.013454056, -0.026599122, 0.051010944, -0.030010102, 0.022194711, 

In [90]:
# os.environ['PINECONE_API_KEY'] = ""

In [91]:
pc = Pinecone(api_key=str(os.getenv('PINECONE_API_KEY')).strip("\""))

In [92]:
pc.create_index(
    name=par__vector_store_index_name,
    dimension=768, # Replace with your model dimensions
    metric="euclidean", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

In [93]:
# Upload vectors to Pinecone
index_name = par__vector_store_index_name
# search = Pinecone.from_documents(texts, embeddings, index_name=index_name)

vectorstore_from_docs = PineconeVectorStore.from_documents(
        texts,
        index_name=index_name,
        embedding=embeddings
    )

In [95]:
# Do a simple vector similarity search

query = "Hearing"
result = vectorstore_from_docs.similarity_search(query)

print(result)

[Document(page_content='Besides that, we have the PinAble community, committed to creating a welcoming,\naccessible Pinterest experience by ensuring everyone can explore and achieve their\nfull potential no matter their ability.Open in app\nSearch2\n4/29/24, 5:47 PM Accessible design is easier than you think | by Cintia Romero | Pinterest Design'), Document(page_content='Accessible design is easier than you think\nEasy-to-follow guidelines can make accessibility part of your everyday design\nprocess\nCintia Romero·Follow\nPublished inPinterest Design\n5 min read·Oct 27, 2021\nListen Share More\nAssistive device | Photo by Sigmund on UnsplashGet unlimited access to the best of Medium for less than $1/week.Become a member\n4/29/24, 5:47 PM Accessible design is easier than you think | by Cintia Romero | Pinterest Design'), Document(page_content='icons to support your use case. Color-only variations do not work well for those\nwho may be color blind or have vision impairments.\nAccount for

## LLM

In [30]:
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))

In [31]:
model = ChatGoogleGenerativeAI(model="gemini-pro",
                             temperature=0.3)

In [32]:
model

ChatGoogleGenerativeAI(model='gemini-pro', temperature=0.3, client=genai.GenerativeModel(
    model_name='models/gemini-pro',
    generation_config={},
    safety_settings={},
    tools=None,
    system_instruction=None,
))

In [96]:
context = "\n".join(str(p.page_content) for p in texts)
print("The total words in the context: ", len(context))

The total words in the context:  12939


In [97]:
prompt_template = """Answer the question as precise as possible using the provided context. If the answer is
                    not contained in the context, say "Answer not available in context" \n\n
                    Context: \n {context}?\n
                    Question: \n {input} \n
                    Answer:
                  """

prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "input"]
)

In [44]:
retriever = vectorstore_from_docs.as_retriever()

In [45]:
combine_docs_chain = create_stuff_documents_chain(
    model
    ,prompt
)

In [46]:
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

In [114]:
question = "How to support hearing disability?"
# question = "How to support cognitive disability?"
question = "What is the benefit of designing accessible products?"
question = "What kinds of disability the article is about?"
# question = "How clear hierarchy can help?"
# question = "Does accessibility impact delay deliveries?"
# question = "Give me practical guidelines regarding disabilities."
retrieval_chain.invoke({"input": question})

stuff_answer = retrieval_chain.invoke({"input": question})
if par__verbose_mode : print(stuff_answer) 
to_markdown(stuff_answer["answer"])

{'input': 'What kinds of disability the article is about?', 'context': [Document(page_content='each disability type to simplify understanding and help you reach out to more users4/29/24, 5:47 PM Accessible design is easier than you think | by Cintia Romero | Pinterest Design\nhttps://pinterest.design/accessible-design-is-easier-than-you-think-29c1d3200d87 2/13by considering accessibility. In addition, they are easy to fit in your design process\nbandwidth.\nIn general, designing accessible products doesn’t\ndelay deliveries or impact team performance;'), Document(page_content='possible. It can be difficult for users to access or remember since it might\nappear outside the proximity of the user’s current focus.\nPhysical disability\nGive alternatives to skip links: Provide a way for skipping over long lists or long\ncontent. Some users cannot reach the keyboard or mouse, so they could use\nvoice controls to use your product.\nClearly communicate error messages: Provide large links and c

> Physical disability and Hearing disability

---
# End.