# Install Libraries

In [1]:
#!pip install unstructured
#!pip install tiktoken
#!pip install pinecone-client
#!pip install pypdf

In [32]:
import os
import openai
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.llms import OpenAI
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain_huggingface import HuggingFaceEndpoint
from dotenv import load_dotenv
load_dotenv()

True

# 1. Document Loader

In [19]:
directory = "Docs/"
loader = PyPDFDirectoryLoader(directory)
documents = loader.load()
print("Number of original documents: ", len(documents))  # It says 3 because one page is one document (metadata contains page_number)
print("Document example: ", documents[-1])

Number of original documents:  3
Document example:  page_content="India's diplomatic influence is also growing on the global stage. The country actively \nparticipates in international forums and has strong bilateral relations with nations around the \nworld. India is a founding member of the Non-Aligned Movement and plays an active role in \nvarious international organizations, such as the United Nations and World Trade Organization.\nIn conclusion, India is a vast and diverse country with a rich cultural heritage, stunning \nlandscapes, and a rapidly growing economy. It is a nation where ancient traditions coexist with \nmodern aspirations. Despite its challenges, India continues to evolve and leave an indelible \nmark on the world, making it a fascinating and dynamic country to explore." metadata={'source': 'Docs\\Doc 2.pdf', 'page': 1}


# 2. Document Transformer
Here, we will use *RecursiveCharacterTextSplitter* instead of a simple *CharacterTextSplitter*, becasue the recursive one will first start to split the text naturally (at breakpoints on paragraphs, sentences, or other specified delimeters). After that, it will split the text on smaller chunks based on characters. **It ensures that each chunk is as meaningful as possible by preserving the text structure.**

In [20]:
chunk_size = 1_000
chunk_overlap = 20
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
docs = text_splitter.split_documents(documents)
print("Documents length: ", len(docs))

Documents length:  7


# 3. Data Embedding

In [21]:
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Test the embeddings model
embedding = embedding_model.embed_query("My test query to get embedded.")
print("Length of the embedding vector: ",len(embedding))

Length of the embedding vector:  384


# 4. Vector Database
For this we will use PINECONE DB. First, go to their website and create an index there. Here, we will just connect to that index.
Keep in mind, instead of using **original Pinecone**, we will use **langchain's proxy for Pinecone**.

In [28]:
# We will use only first two docs initially, and then the last two, but this is just to show how can Pinecone perform lazy adding
index_name = "mcq-creator"
index_pc = PineconeVectorStore.from_documents(docs[:5], embedding=embedding_model, index_name=index_name)
index_pc.add_documents(docs[5:])

['26c9725d-ef43-4e64-8600-d4d6cf5a864a',
 '5ab2dab8-fbd8-44d7-af89-6db7243804cc']

## 5. Retrieval
Now, we will use the retrieval from **langchain's proxy for Pinecone** index.

In [38]:
def get_similar_docs(query, k=2):
    similar_docs = index_pc.similarity_search(query, k=2)
    return similar_docs

## EXTRA: Instantiating LLM that will use the Pinecone DB

In [39]:
# Let's instantiate the LLM that will use the results of the similarity search on out PineconeDB in order to create prompts
llm = HuggingFaceEndpoint(repo_id="bigscience/bloom", temperature=1e-10)
llm

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\User\.cache\huggingface\token
Login successful


HuggingFaceEndpoint(repo_id='bigscience/bloom', temperature=1e-10, model='bigscience/bloom', client=<InferenceClient(model='bigscience/bloom', timeout=120)>, async_client=<InferenceClient(model='bigscience/bloom', timeout=120)>)

In [42]:
chain = load_qa_chain(llm, chain_type="stuff")  # We will learn more about chains in future lecture

# This function will help us get the answer to the question that we raise
def get_answer(query, verbose=False):
    relevant_docs = get_similar_docs(query)
    if verbose:
        print("RELEVANT DOCS: ", relevant_docs)
    response = chain.run(input_documents=relevant_docs, question=query)
    return response

## Questions

In [47]:
our_query = "How is India's economy? Answer VERY shortly."
answer = get_answer(our_query)
answer

" India's economy is growing rapidly. It is a service-oriented and industrialized economy. Major cities like Mumbai, Delhi, Bangalore, and Chennai are hubs of business and commerce, attracting investments and fostering innovation."

## Structure the Output

In [57]:
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

import re
import json

In [79]:
# This helps us create a schema for the desired output format (prompt engineering)

# Response schema defines the JSON of the output
response_schemas = [
    ResponseSchema(name="question", description="Question generated from provided input text data."),
    ResponseSchema(name="choices", description="Available options for a multiple-choice question in comma separated."),
    ResponseSchema(name="answer", description="Correct answer for the asked question.")
]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
print("OUTPUT PARSER: ", output_parser)  # This is an object (we won't use this directly), but instead its format instruction
format_instructions = output_parser.get_format_instructions()
print("FORMAT INSTRUCTIONS: ", format_instructions)  # This will be added to the prompt (to tell the LLM how to format output)

OUTPUT PARSER:  response_schemas=[ResponseSchema(name='question', description='Question generated from provided input text data.', type='string'), ResponseSchema(name='choices', description='Available options for a multiple-choice question in comma separated.', type='string'), ResponseSchema(name='answer', description='Correct answer for the asked question.', type='string')]
FORMAT INSTRUCTIONS:  The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"question": string  // Question generated from provided input text data.
	"choices": string  // Available options for a multiple-choice question in comma separated.
	"answer": string  // Correct answer for the asked question.
}
```


In [80]:
# Let's create a prompt template
prompt = ChatPromptTemplate(
    messages=[
        HumanMessagePromptTemplate.from_template("""
            When a text input is given by the user, please generate multiple choice questions from it along with the correct answer.
            \n{format_instructions}\n{user_prompt}
        """)
    ],
    input_variables=["user_prompt"],
    partial_variables={"format_instructions": format_instructions}  # This means that this will be determined right away, but also can be changed later (just pass 'format_instructions=format_instructions' to the user prompt)
) # unlike PromptTemplate, ChatPromptTemplate can contain the conversation history such as SystemMessage, HumanMessage and AssistantMessage

# Fill in the prompt
final_query = prompt.format_prompt(user_prompt=answer)
final_query  # To only print the messages, use 'final_query.to_messages()'

ChatPromptValue(messages=[HumanMessage(content='\n            When a text input is given by the user, please generate multiple choice questions from it along with the correct answer.\n            \nThe output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":\n\n```json\n{\n\t"question": string  // Question generated from provided input text data.\n\t"choices": string  // Available options for a multiple-choice question in comma separated.\n\t"answer": string  // Correct answer for the asked question.\n}\n```\n India\'s economy is growing rapidly. It is a service-oriented and industrialized economy. Major cities like Mumbai, Delhi, Bangalore, and Chennai are hubs of business and commerce, attracting investments and fostering innovation.\n        ')])

In [77]:
chat_model = ChatOpenAI()

final_query_output = chat_model.invoke(final_query.to_messages()).content
print(final_query_output)

```json
{
	"question": "Which country's economy is growing rapidly?",
	"choices": "A. China, B. India, C. USA, D. Brazil",
	"answer": "B. India"
}
```


In [78]:
# Structure the output into an actual json
markdown_text = final_query_output
json_string = re.search(r'{(.*?)}', markdown_text, re.DOTALL).group(1)
json_string

'\n\t"question": "Which country\'s economy is growing rapidly?",\n\t"choices": "A. China, B. India, C. USA, D. Brazil",\n\t"answer": "B. India"\n'