<a href="https://colab.research.google.com/github/aidyai/rag-project/blob/main/documentchat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## INSTALLING DEPENDENCIES

In [None]:
!pip install langchain
!pip install gpt4all
!pip install qdrant-client
!pip install sentence-transformers
!pip install torch
!pip install pypdf

## Import Libraries

In [None]:
import re

from gpt4all import GPT4All
from langchain import PromptTemplate, LLMChain
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
# LangChain-compatible GPT4All wrapper, aliased so it does not shadow the
# native `gpt4all.GPT4All` client imported above.
from langchain_community.llms import GPT4All as LangChainGPT4All
from langchain_community.vectorstores import Qdrant

## LOADING DOCUMENTS

In [6]:
#downloading pdf
!wget "https://inecnigeria.org/wp-content/uploads/2024/02/2023-GENERAL-ELECTION-REPORT-1.pdf"

--2024-04-13 14:43:35--  https://inecnigeria.org/wp-content/uploads/2024/02/2023-GENERAL-ELECTION-REPORT-1.pdf
Resolving inecnigeria.org (inecnigeria.org)... 54.155.1.49
Connecting to inecnigeria.org (inecnigeria.org)|54.155.1.49|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17422886 (17M) [application/pdf]
Saving to: ‘2023-GENERAL-ELECTION-REPORT-1.pdf’


2024-04-13 14:43:38 (9.84 MB/s) - ‘2023-GENERAL-ELECTION-REPORT-1.pdf’ saved [17422886/17422886]



In [10]:
# Load the report from the working directory, which is where the download cell saves it.
# A relative path keeps the cell runnable outside Colab; on Colab the working
# directory is /content, so this resolves to the same file as before.
loader = PyPDFLoader("2023-GENERAL-ELECTION-REPORT-1.pdf")
# `documents` is the list of loaded Document objects; their text lives in `.page_content`.
documents = loader.load()



### Data Preprocessing

In [11]:
def preprocess_text(text):
    """Normalise raw document text before chunking and embedding.

    The text is lower-cased, every character outside a small allow-list
    (word characters, whitespace and ``$ % . , " ' ! ? ( )``) is dropped,
    and each run of tab characters is collapsed to a single space.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned, lower-cased text. Newlines are preserved so that a
        newline-based splitter can still find its separators.
    """
    text_lower = text.lower()
    # Keep only the allow-listed characters.
    text_allowed = re.sub(r'[^\w\s\$\%\.\,\"\'\!\?\(\)]', '', text_lower)
    # Collapse tab runs to one space. Deleting them outright would glue the
    # words on either side of the tab together ("a\tb" -> "ab").
    return re.sub(r'\t+', ' ', text_allowed)

In [12]:
# Clean the text of every loaded document in place, so that later cells
# only ever see the normalised content.
for page in documents:
    page.page_content = preprocess_text(page.page_content)

In [15]:
# Split the cleaned documents on newlines into chunks targeting 1000 characters,
# with no overlap between neighbouring chunks.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=0,
)
docs = text_splitter.split_documents(documents)

## Qdrant Vector Database and BAAI Embeddings

In [None]:
# Embedding model used for both indexing and querying; forced onto the CPU.
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs={'device': "cpu"},
)

# Embed every chunk and index it in a Qdrant collection held purely in memory
# (local mode, nothing is persisted). force_recreate=True is passed so the
# collection is rebuilt rather than reused.
qdrant = Qdrant.from_documents(
    docs,
    embeddings,
    collection_name="election_analytics",
    location=":memory:",
    force_recreate=True,
)

In [None]:
def format_docs(query, k=1):
    """Retrieve the chunks most similar to ``query`` and join them into one context string.

    Args:
        query: Natural-language question used to search the vector store.
        k: Number of chunks to retrieve. Defaults to 1, the value that was
            previously hard-coded, so existing calls behave as before.

    Returns:
        The ``page_content`` of the retrieved chunks, separated by blank
        lines (an empty string when the search returns nothing).
    """
    # Each hit is a (document, score) pair; only the document text is used.
    hits = qdrant.similarity_search_with_score(query, k=k)
    return "\n\n".join(document.page_content for document, _score in hits)

## MAKING USE OF LARGE LANGUAGE MODELS

In [None]:
!mkdir models  #creating a directory called models

# download model
!wget https://gpt4all.io/models/gguf/mistral-7b-instruct-v0.1.Q4_0.gguf -O models/mistral-7b-instruct-v0.1.Q4_0.gguf

In [None]:
# Build the LLM through LangChain's GPT4All wrapper. The keyword arguments
# below (model, max_tokens, temp, n_batch, ...) are fields of that wrapper, and
# LLMChain needs a LangChain LLM object rather than the raw `gpt4all.GPT4All`
# client.
llm = LangChainGPT4All(
    # Path of the weights saved under models/ by the download cell, so the
    # local file is used instead of fetching the model a second time.
    model="models/mistral-7b-instruct-v0.1.Q4_0.gguf",
    # Generation and sampling settings, kept exactly as originally configured.
    max_tokens=300,
    n_threads=4,
    temp=0.3,
    top_p=0.2,
    top_k=40,
    n_batch=8,
    seed=100,
    allow_download=True,
    verbose=True,
)

In [None]:
# Prompt for the instruct model. The whole user turn (instructions, retrieved
# context and question) sits between [INST] and [/INST]; the model's answer
# follows the closing tag.
# The closing tag was previously written as "[\INST]", which is not the expected
# "[/INST]" tag and is also an invalid "\I" escape sequence in a Python string.
# It also closed the instruction before the context and question were given.
template = '''[INST]: You are an Electoral Analytic bot from the UN Analyzing the just concluded 2023 General Elections In Nigeria, below presents a context from which the a question will be asked, give your valuable insights as well.\n
Context: {context}.\n
Question: {question} [/INST]\n
Answer: '''

In [None]:
# Stdout streaming handler.
# NOTE(review): `callbacks` is only defined here; nothing in this cell passes
# it to the prompt, the LLM or the chain — confirm whether it should be wired in.
callbacks = [StreamingStdOutCallbackHandler()]

# Prompt object exposing the two placeholders that `template` contains.
rag_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# Chain that fills the prompt with the inputs and sends it to the LLM.
llm_chain = LLMChain(
    llm=llm,
    prompt=rag_prompt,
    verbose=True,
)

In [None]:
# Question to ask about the report.
query = "list the votes each party had from lowest to highest with percentages??"

# Retrieve the supporting context for the question, run the chain with both
# prompt variables, then print the generated answer.
resp = llm_chain.invoke(
    input=dict(question=query, context=format_docs(query))
)
print(resp["text"])