In [27]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langsmith import traceable
import os
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings 
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate

In [2]:

# Define your variables
variables = {
    "LANGCHAIN_TRACING_V2": "True",
    "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
    "LANGCHAIN_API_KEY": "ls__e70c98c314364c01999d3b06f1d6cea6",
    "LANGCHAIN_PROJECT": "docbot_llama3",
    "GROQ_API_KEY": "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"
}

# Set each variable as an environment variable
for key, value in variables.items():
    os.environ[key] = value


In [3]:
from langchain.callbacks.tracers import LangChainTracer

tracer = LangChainTracer(project_name="docbot_llama3")

In [4]:
def pipeline(user_input: str):
    chat = ChatGroq(temperature=0, model_name="llama3-8b-8192")
    prompt = ChatPromptTemplate.from_messages([("human", "Write a code for {user_input}")])
    chain = prompt | chat
    return chain.invoke({"user_input": user_input})

pd = pipeline("read a pdf using pypdf2 and split it into chunks")

In [18]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

loader = PyPDFLoader("../data/Adobe-CSR-Report-2022.pdf")
text_splitter = CharacterTextSplitter(chunk_size=400, chunk_overlap=0)
pages = loader.load_and_split(text_splitter=text_splitter)


In [30]:
def create_embeddings(docs,pdf_name):

    embeddings = HuggingFaceEmbeddings()

    vec_check = os.listdir('../doc_embedding_store')

    if pdf_name in vec_check:

            print(f'Embedding for document {pdf_name} already present in storage')

            print(f'*******-----Using Saved Embeddings-------******')

            vectorStore = FAISS.load_local(f'../doc_embedding_store/{pdf_name}', embeddings, 
                                           allow_dangerous_deserialization=True)

    else:

        print(f'*******-----Creating Embeddings-------******')

        print(f'Embedding for document {pdf_name} not created in storage')

        vectorStore = FAISS.from_documents(docs, embeddings)

        vectorStore.save_local(f'../doc_embedding_store/{pdf_name}')

    return vectorStore

In [31]:
embed = create_embeddings(docs=pages,pdf_name="Adobe")

Embedding for document Adobe already present in storage
*******-----Using Saved Embeddings-------******


In [47]:
llmtemplate = """[INST]
As an AI, provide accurate and relevant information based on the provided document. Your responses should adhere to the following guidelines:
- Answer the question based on the provided documents.
- Be direct and factual, limited to 50 words and 2-3 sentences. Begin your response without using introductory phrases like yes, no etc.
- Maintain an ethical and unbiased tone, avoiding harmful or offensive content.
- If the document does not contain relevant information, state "I cannot provide an answer based on the provided document."
- Avoid using confirmatory phrases like "Yes, you are correct" or any similar validation in your responses.
- Do not fabricate information or include questions in your responses.
- do not prompt to select answers. do not ask me questions
{question}
[/INST]
"""
chat = ChatGroq(temperature=0, model_name="llama3-8b-8192")
retriever = embed.as_retriever()
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(llmtemplate)

memory = ConversationBufferMemory(
    memory_key='chat_history', return_messages=True, output_key='answer')

conversation_chain = (ConversationalRetrievalChain.from_llm
                        (llm=chat,
                        retriever=retriever,
                        # condense_question_prompt=CONDENSE_QUESTION_PROMPT,
                        memory=memory,
                        return_source_documents=True,verbose=True))
print("Conversational Chain created for the LLM using the vector store")


Conversational Chain created for the LLM using the vector store


In [None]:
res = conversation_chain.invoke("What is the gross GHG emission?")

In [49]:
print(res["answer"])

According to the provided data, the gross CO2 emissions can be calculated as follows:

Scope 1 GHG emissions: 6,568 tonnes CO2e (2022)
Scope 2 GHG emissions (location-based): 57,168 tonnes CO2e (2022)
Scope 3 GHG emissions: 463,438 tonnes CO2e (2022)

Total gross CO2 emissions: 6,568 + 57,168 + 463,438 = 527,174 tonnes CO2e (2022)

Note that this calculation only includes the Scope 1, Scope 2, and Scope 3 emissions reported in the provided data. The total gross CO2 emissions may be higher if other sources of emissions are not reported.


In [50]:
res = conversation_chain.invoke("What is the total Scope 1 emission?")



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: What is the gross co2 emission?
Assistant: According to the provided data, the gross CO2 emissions can be calculated as follows:

Scope 1 GHG emissions: 6,568 tonnes CO2e (2022)
Scope 2 GHG emissions (location-based): 57,168 tonnes CO2e (2022)
Scope 3 GHG emissions: 463,438 tonnes CO2e (2022)

Total gross CO2 emissions: 6,568 + 57,168 + 463,438 = 527,174 tonnes CO2e (2022)

Note that this calculation only includes the Scope 1, Scope 2, and Scope 3 emissions reported in the provided data. The total gross CO2 emissions may be higher if other sources of emissions are not reported.
Follow Up Input: What is the total Scope 1 emission?
Standalone question:[0m

[1m> Finished chain.[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m

In [51]:
print(res["answer"])

According to the provided data, the Scope 1 GHG emissions from Adobe's Corporate Social Responsibility Report are:

* 2022: 4,815 tonnes CO2e
* 2021: 3,469 tonnes CO2e
* 2020: 7,534 tonnes CO2e

These emissions are from the combustion of natural gas and diesel.


In [53]:
print(res['chat_history'])

[HumanMessage(content='What is the gross co2 emission?'), AIMessage(content='According to the provided data, the gross CO2 emissions can be calculated as follows:\n\nScope 1 GHG emissions: 6,568 tonnes CO2e (2022)\nScope 2 GHG emissions (location-based): 57,168 tonnes CO2e (2022)\nScope 3 GHG emissions: 463,438 tonnes CO2e (2022)\n\nTotal gross CO2 emissions: 6,568 + 57,168 + 463,438 = 527,174 tonnes CO2e (2022)\n\nNote that this calculation only includes the Scope 1, Scope 2, and Scope 3 emissions reported in the provided data. The total gross CO2 emissions may be higher if other sources of emissions are not reported.'), HumanMessage(content='What is the total Scope 1 emission?'), AIMessage(content="According to the provided data, the Scope 1 GHG emissions from Adobe's Corporate Social Responsibility Report are:\n\n* 2022: 4,815 tonnes CO2e\n* 2021: 3,469 tonnes CO2e\n* 2020: 7,534 tonnes CO2e\n\nThese emissions are from the combustion of natural gas and diesel.")]


In [None]:


def prepare_docs(pdf_docs):
    docs = []
    metadata = []
    content = []

    
    pdf_reader = PyPDF2.PdfReader(pdf_docs)
    for index, text in enumerate(pdf_reader.pages):
        doc_page = {'title': pdf_docs + " page " + str(index + 1),
                    'content': pdf_reader.pages[index].extract_text()}
        docs.append(doc_page)
    for doc in docs:
        content.append(doc["content"])
        metadata.append({
            "title": doc["title"]
        })
    return content, metadata


def get_text_chunks(content, metadata):
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=512,
        chunk_overlap=256,
    )
    split_docs = text_splitter.create_documents(content, metadatas=metadata)
    print(f"Split documents into {len(split_docs)} passages")
    return split_docs


def ingest_into_vectordb(split_docs):
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', model_kwargs={'device': 'cpu'})
    db = FAISS.from_documents(split_docs, embeddings)

    DB_FAISS_PATH = 'vectorstore/db_faiss'
    db.save_local(DB_FAISS_PATH)
    return db


def get_conversation_chain(vectordb):
    chat = ChatGroq(temperature=0, model_name="llama3-8b-8192")
    retriever = vectordb.as_retriever()
    CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(llmtemplate)

    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True, output_key='answer')

    conversation_chain = (ConversationalRetrievalChain.from_llm
                          (llm=chat,
                           retriever=retriever,
                           #condense_question_prompt=CONDENSE_QUESTION_PROMPT,
                           memory=memory,
                           return_source_documents=True))
    print("Conversational Chain created for the LLM using the vector store")
    return conversation_chain




In [None]:
pdf_reader = PyPDF2.PdfReader(stream="../data/Adobe-CSR-Report-2022.pdf")

In [None]:
prepare_docs(pdf_docs="../data/Adobe-CSR-Report-2022.pdf")

([' Adobe Corporate   \nSocial Responsibility  \nReport 2022 \nArtwork by  Yulia Vus  ',
  '2 \n2022 Adobe Corporate Social Responsibility Report\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n  \n \n \n \n \n \n \n \n  \n \n \n \n \n \n \n \n A Message From Our CEO \nAdobe was founded on simple yet enduring principles that remain with us to this day:  \nInnovation is at our core, employees are our greatest asset, and our customers,  \ncommunities and stakeholders are central to our success.  \nAs digital continues to rapidly reshape how we work, learn and \nplay, Adobe’s products are fueling the digital economy and empowering everyone, everywhere to imagine, create and deliver any digital experience. Purpose has been core to the company’s DNA from its earliest days. Tere are three key areas in which we are uniquely positioned and motivated to make a diference: Adobe for All, Creativity for All and Technology to Transform. \nAdobe for All \nAdobe’s purpose starts with our commitment 