<a href="https://colab.research.google.com/github/axeltanjung/qa_rag_public_policy/blob/main/src/nlp_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Task 1 : Adding Document

**Install All Dependencies**

In [68]:
!pip install langchain
!pip install unstructured
!pip install pdf2image
!pip install pdfminer.six
!pip install unstructured_inference
!pip install pikepdf
!pip install pydf
!pip install sentence-transformers
!pip install chromadb
!pip install unstructured_pytesseract
!pip install python-dotenv
!pip install huggingface_hub
!pip install pillow_heif
!pip install pypdf
!pip install huggingface_hub
!pip install faiss-gpu
!pip install sentence-transformers
!pip install gradio
!pip install openai
!pip install python-dotenv
!pip install tiktoken



In [None]:
from langchain.document_loaders import OnlinePDFLoader

In [None]:
# URL of the PDF paper used as the source document for the whole notebook.
parties_report_url_paper = 'http://www.kumoro.staff.ugm.ac.id/file_artikel/Full%20paper-Serving%20the%20Political%20Parties.pdf'
# Loader bound to that URL; the documents themselves are obtained by the
# `document_loader.load()` call in the next cell.
document_loader = OnlinePDFLoader(parties_report_url_paper)

In [None]:
doc_data = document_loader.load()

In [None]:
doc_data

**Task 2 : Process Document**

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk the loaded document with chunk_size=1000 and chunk_overlap=200.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
text_splits = text_splitter.split_documents(doc_data)

In [None]:
# Replace every line break inside each chunk with a space, modifying the chunks in place.
for split in text_splits:
    split.page_content = split.page_content.replace("\n", " ")

In [None]:
len(text_splits)

In [None]:
print(text_splits[1].page_content)

**Task 3 : Adding Embedding Model**

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings

In [None]:
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [None]:
dir(embedding_model)

In [None]:
# List of question

q1 = 'What significant historical events have shaped the political landscape in Indonesia, particularly regarding decentralization and democratization?'
q2 = 'How do political parties influence the policy-making process at the local level in Indonesia, especially in relation to elections for regional government positions?'
q3 = 'What challenges faced by local governments in Indonesia in terms of ensuring accountability and responsiveness to the needs of their constituents?'
q4 = 'How does the issue of money politics manifest itself in Indonesian elections, particularly in the context of decentralization and democratization?'
q5 = 'What controversies surround the construction of the Musi III bridge in Palembang?'

list_question = [q1, q2, q3, q4, q5]


In [None]:
# Check the length of the embedding vector produced for each question

for question in list_question:
    question_vec = embedding_model.embed_query(question)
    print(f'Size of embbedded model q: {len(question_vec)}')

In [None]:
doc_text = text_splits[5].page_content

In [None]:
# Embed every question and store the vectors under 'query_vec_q<n>', n counted from 1.
embeddings_dict = {}

for i, question in enumerate(list_question, 1):
    # `query_vec` stays bound after the loop (holding the last question's vector);
    # the length comparison against `doc_vec` in a later cell reads it.
    query_vec = embedding_model.embed_query(question)
    embeddings_dict[f'query_vec_q{i}'] = query_vec

# Embed one document chunk with the same model so it can be compared with the query vectors.
doc_vec = embedding_model.embed_query(doc_text)

In [None]:
embeddings_dict

In [None]:
len(doc_vec) == len(query_vec)

**Task 4 : Adding Vector Store**

In [None]:
from langchain_community.vectorstores import Chroma

In [None]:
vector_db = Chroma.from_documents(documents=text_splits,
                                    embedding=embedding_model)

In [None]:
help(vector_db.similarity_search)

In [None]:
dir(vector_db)

In [None]:
# Similarity search (k=10) for each question, keyed 'result_q<n>' with n counted from 1.
results = {
    f'result_q{idx}': vector_db.similarity_search(query=query, k=10)
    for idx, query in enumerate(list_question, 1)
}

# The same searches again, this time returning each hit together with its score.
result_with_score = {
    f'result_q{idx}': vector_db.similarity_search_with_score(query=query, k=10)
    for idx, query in enumerate(list_question, 1)
}

In [None]:
result_with_score['result_q1']

In [None]:
results['result_q1'][0].page_content

In [None]:
results['result_q1'][-1].page_content

**Task 5 : Create Retriever**

In [None]:
retriever = vector_db.as_retriever(search_type="similarity",
                                   search_kwargs={"k": 10})

In [None]:
# Run the retriever once per question; hits are keyed 'result_q<n>' with n counted from 1.
retrieved_docs = {
    f'result_q{idx}': retriever.invoke(query)
    for idx, query in enumerate(list_question, 1)
}

In [None]:
len(retrieved_docs['result_q1'])

In [None]:
print(retrieved_docs['result_q1'][0].page_content)

In [None]:
print(retrieved_docs['result_q1'][2].page_content)

**Task 6 : Adding Language Model as Generator**

In [None]:
import os
from google.colab import userdata

os.environ['HUGGINGFACEHUB_API_TOKEN'] = userdata.get('TOKEN')

In [None]:
from langchain_community.llms import HuggingFaceHub

# Generator model served through the Hugging Face Hub.
# `max_new_tokens` is the text-generation parameter that bounds the completion length;
# `max_token` is not a recognised generation parameter, so it did not set that limit.
llm = HuggingFaceHub(
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    model_kwargs={'max_new_tokens': 1000},
    task="text-generation")

In [None]:
help(llm.generate)

In [None]:
print(llm.generate(prompts=['mention the highest mountain in the world']).generations[0][0].text)

**Task 7 : Query**

In [None]:
from langchain_core.prompts import PromptTemplate

# NOTE(review): the string literal below is a bare expression — it is not assigned to
# anything and is not part of `template`, so these instructions are NOT included in the
# prompts built from `custom_rag_prompt`. Confirm whether leaving them out is intended.
"""
Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.
"""

# Prompt layout: the context first, then the question, then the "Helpful Answer:" cue.
template = """

{context}

Question: {question}

Helpful Answer:"""
# The {context} and {question} placeholders are filled in when the prompt is formatted.
custom_rag_prompt = PromptTemplate.from_template(template)

In [None]:
# Example input model
input_model = custom_rag_prompt.format(context='Cristiano Ronaldo is player of Barcelona',question='Where is Cristiano Ronaldo play football ?')

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs):
    """Return the `page_content` of every item in *docs*, joined by blank lines."""
    contents = [item.page_content for item in docs]
    return "\n\n".join(contents)

# Inputs of the prompt: the context is the retriever output rendered by format_docs,
# and the question is forwarded unchanged.
prompt_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Pipeline: gather the inputs, fill the prompt, call the LLM, parse the output to a string.
qa_chain = prompt_inputs | custom_rag_prompt | llm | StrOutputParser()

In [None]:
# Send each raw question directly to the LLM; answers are keyed 'result_q<n>', n from 1.
llm_invoke = {
    f'result_q{idx}': llm.invoke(query)
    for idx, query in enumerate(list_question, 1)
}

In [None]:
llm_invoke

In [None]:
# Retrieve the documents for each question; results are keyed 'result_q<n>', n from 1.
docs = {
    f'result_q{idx}': retriever.invoke(query)
    for idx, query in enumerate(list_question, 1)
}

In [None]:
# Render the retrieved documents of questions 1-5 as one text block each.
formated_q1, formated_q2, formated_q3, formated_q4, formated_q5 = (
    format_docs(docs[f'result_q{n}']) for n in range(1, 6)
)

In [None]:
# Fill the prompt template with each question and its matching retrieved context.
input_model_q1, input_model_q2, input_model_q3, input_model_q4, input_model_q5 = (
    custom_rag_prompt.format(context=context_text, question=question_text)
    for context_text, question_text in zip(
        (formated_q1, formated_q2, formated_q3, formated_q4, formated_q5),
        (q1, q2, q3, q4, q5),
    )
)

In [None]:
llm.generate(prompts=[input_model_q1])

In [None]:
llm.generate(prompts=[input_model_q2])

In [None]:
llm.generate(prompts=[input_model_q3])

In [None]:
llm.generate(prompts=[input_model_q4])

In [None]:
llm.generate(prompts=[input_model_q5])

In [None]:
print(qa_chain.invoke(q1))

In [None]:
print(qa_chain.invoke(q2))

In [None]:
print(qa_chain.invoke(q3))

In [None]:
print(qa_chain.invoke(q4))

In [None]:
print(qa_chain.invoke(q5))

In [None]:
# Send each question to the LLM on its own (no prompt template, no retrieved context)
# and print the raw generation result.
for question in (q1, q2, q3, q4, q5):
    print(llm.generate(prompts=[question]))

## Using FAISS

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores import FAISS

### Split the Document

In [None]:
chunk_size =10
chunk_overlap = 2

r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

In [None]:
# Keep this demo output under its own name: `text_splits` already holds the Document
# chunks that the Chroma cells were built from, and `split_text` returns plain strings,
# so rebinding it would break those cells if they are run again.
recursive_splits = r_splitter.split_text(doc_data[0].page_content)
print(recursive_splits)

In [None]:
c_splitter = CharacterTextSplitter(
    separator = '\n',
    chunk_size=100,
    chunk_overlap=10
)

In [None]:
charSplit = c_splitter.split_text(doc_data[0].page_content)

print(charSplit)

### Vector Store in LangChain

In [None]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

# Read the key from the environment; a missing OPENAI_API_KEY_2 raises KeyError here.
openai.api_key = os.environ['OPENAI_API_KEY_2']

In [None]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 100,
    chunk_overlap = 25
)

In [None]:
splits = text_splitter.split_text(doc_data[0].page_content)

In [None]:
type(doc_data[0].page_content)

In [None]:
len(splits)

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

import torch

model_name = "sentence-transformers/all-mpnet-base-v2"
# Use the GPU when one is available; otherwise fall back to the CPU so this cell
# also runs on a runtime without CUDA.
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [None]:
# Index the chunks in `splits` (made from doc_data[0].page_content by the splitter above)
# rather than the unsplit `doc_data`, so the retriever returns individual passages
# instead of whole loaded documents.
db = FAISS.from_texts(splits, hf)

### Querying Data

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.llms import HuggingFaceHub
from getpass import getpass

In [None]:
HUGGINGFACEHUB_API_TOKEN = getpass()

In [None]:
repo_id = "google/flan-t5-xxl"

# Hand the token collected with getpass() to the client explicitly; otherwise the
# prompted value is never used and authentication relies on the environment alone.
llm = HuggingFaceHub(
    repo_id=repo_id,
    model_kwargs={"temperature": 0.5},
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
)

# Question-answering chain that answers from passages retrieved out of the FAISS index.
qa = RetrievalQA.from_chain_type(llm=llm, retriever=db.as_retriever())

In [None]:
import gradio as gr

In [None]:
#write your Gradio implementation here


def chat_bot(Textbox: str) -> str:
    """Answer a question typed into the Gradio text input.

    Args:
        Textbox: The question text. The interface is declared with
            ``inputs="text"``, so the value received is a plain string,
            not a Gradio component.

    Returns:
        The result of ``qa.run`` for that question.
    """
    return qa.run(Textbox)

# Expose chat_bot through an interface with one text input and one text output,
# then launch it.
iface = gr.Interface(fn=chat_bot, inputs="text", outputs="text")
iface.launch()