In [None]:
%pwd

'c:\\Code\\Project Prototypes\\GENAI MEDBOT\\research'

In [1]:
import os

# The notebook is stored in research/, while the relative paths used later
# (e.g. 'data/') are resolved from the project root one level up.
# Only step up while we are still inside research/, so that re-running this
# cell does not keep climbing the directory tree.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [2]:
%pwd

'c:\\Code\\Project Prototypes\\GENAI MEDBOT'

In [3]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [4]:

#Extract Data From the PDF File
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; files are
            selected with the ``*.pdf`` glob.

    Returns:
        The result of ``DirectoryLoader.load()``, with each matched file
        parsed by ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [5]:
# Load the PDFs from the data/ folder; the path is relative to the current
# working directory (the project root after the chdir cell near the top).
extracted_data=load_pdf_file(data='data/')

In [6]:
# Number of document objects returned by the loader.
len(extracted_data)

37

In [7]:
#Split the Data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller overlapping chunks.

    Args:
        extracted_data: Documents to split, as returned by ``load_pdf_file``.
        chunk_size: Maximum chunk size passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            this notebook has always used.
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter. Defaults to 20.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [8]:

# Chunk the loaded documents and report how many chunks were produced.
text_chunks=text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 130


In [9]:
from langchain.embeddings import HuggingFaceEmbeddings

In [10]:
#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    """Create the embedding model used for indexing and querying.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sartifyllc/MultiLinguSwahili-bge-small-en-v1.5-nli-matryoshka`` model.
    """
    return HuggingFaceEmbeddings(
        model_name='sartifyllc/MultiLinguSwahili-bge-small-en-v1.5-nli-matryoshka'
    )

In [11]:
# Instantiate the embedding model once; it is reused for the sanity check
# below and for the Pinecone vector store.
embeddings = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sartifyllc/MultiLinguSwahili-bge-small-en-v1.5-nli-matryoshka')
  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Sanity check: embed a sample string and print the vector length. The value
# should match the `dimension=384` used in the (commented-out) Pinecone
# create_index call further down.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [13]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment so the API
# keys read in the next cell are available.
load_dotenv()

True

In [14]:
# Read the service credentials from the process environment; each value is
# None when the corresponding variable is not defined.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [15]:

from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

# Pinecone client authenticated with the key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Name of the index used by every vector-store cell below.
index_name = "swahili-test"


# One-time setup, intentionally left commented out: creates the serverless
# index. `dimension` must equal the embedding length printed earlier (384).
# Uncomment and run once when the index does not exist yet.
# pc.create_index(
#     name=index_name,
#     dimension=384,
#     metric="cosine",
#     spec=ServerlessSpec(
#         cloud="aws",
#         region="us-east-1"
#     )
# )

In [16]:

import os

# Re-export the keys so libraries that read credentials from the environment
# can find them. Assigning None to os.environ raises an opaque
# "TypeError: str expected, not NoneType", so fail early with a message that
# names the missing variable instead.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("OPENAI_API_KEY", OPENAI_API_KEY),
):
    if _key_value is None:
        raise RuntimeError(
            f"{_key_name} is not set; define it in the environment or the .env file before running this cell."
        )
    os.environ[_key_name] = _key_value

In [17]:

# One-time ingestion step: embed each chunk and upsert the embeddings into the
# Pinecone index. Left commented out so a normal run does not upload the
# chunks again; uncomment and run once to populate the index.
from langchain_pinecone import PineconeVectorStore

# docsearch = PineconeVectorStore.from_documents(
#     documents=text_chunks,
#     index_name=index_name,
#     embedding=embeddings,
# )

In [18]:
# Load Existing index

from langchain_pinecone import PineconeVectorStore
# Connect to the existing Pinecone index; nothing is embedded or upserted
# here. The same embedding model is passed so queries are embedded the same
# way as the stored chunks.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [19]:
# Display the vector store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x269631dfd30>

In [20]:
# Similarity-search retriever returning the 3 best matches per query (k=3).
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [21]:
# Smoke-test retrieval with a Swahili question
# ("What are the benefits of avocado for a pregnant woman?").
retrieved_docs = retriever.invoke("Ni faida zipi za parachichi kwa mjamzito?")

In [22]:
# Inspect the retrieved documents (content and metadata).
retrieved_docs

[Document(id='1a9bd965-e8d6-4325-b165-a38b64db81c5', metadata={'creationdate': 'D:20250404205122', 'creator': 'PyPDF', 'page': 0.0, 'page_label': '1', 'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'source': 'data\\Faida_Ya_Parachichi_Kwa_Mjamzito.pdf', 'total_pages': 1.0}, page_content='Faida Ya Parachichi Kwa Mjamzito.\n1) Kuimarisha Kinga Ya Mwili.\nVitamini C na E kwenye parachichi husaidia kuimarisha kinga ya mwili, ambayo ni muhimu kwa\nmjamzito ili kuzuia magonjwa.\n2) Kudhbiti Shinikizo La Damu.\nParachichi lina kiwango cha juu cha potasiamu, ambayo husaidia kudhibiti shinikizo la\ndamu.Shinikizo la damu lisilodhibitiwalinaweza kusababisha matatizo wakati wa ujauzito.\n3) Kuboresha Afya Ya Moyo.'),
 Document(id='ccee8586-20e7-47fb-a43c-c468d9a9716a', metadata={'creationdate': 'D:20250404205122', 'creator': 'PyPDF', 'page': 0.0, 'page_label': '1', 'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'source': 'data\\Faida_Ya_Parachichi_Kwa_Mjamzito.pdf', 'total_p

In [23]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEndpoint
from langchain.chains import RetrievalQA

In [29]:
# Setup LLM (Mistral with HuggingFace)
import os
HF_TOKEN=os.environ.get("HF_TOKEN")
HUGGINGFACE_REPO_ID="mistralai/Mistral-7B-Instruct-v0.3"

def load_llm(huggingface_repo_id):
    """Create a HuggingFace Inference endpoint LLM for the given repository.

    Args:
        huggingface_repo_id: Hub repository id of the model to call,
            e.g. ``HUGGINGFACE_REPO_ID``.

    Returns:
        A ``HuggingFaceEndpoint`` configured with temperature 0.5 and a
        512-token generation limit.
    """
    llm = HuggingFaceEndpoint(
        repo_id=huggingface_repo_id,
        temperature=0.5,
        # Generation limit as an integer through the dedicated field; the
        # previous code sent the string "512" as `max_length` in model_kwargs.
        max_new_tokens=512,
        # Credentials belong in the dedicated auth field rather than in
        # model_kwargs, which is meant for generation parameters.
        huggingfacehub_api_token=HF_TOKEN,
    )
    return llm

In [30]:
# Swahili prompt used by the RetrievalQA chains. Rough English rendering:
#   "Use the pieces of information provided in the context to answer the
#    user's question. If you don't know the answer, just say you don't know;
#    don't try to make one up. Don't give anything outside the given context.
#    Context: {context}
#    Question: {question}
#    The answer must be in Swahili only. Do not use any other language.
#    Start the answer directly, without any preamble."
# {context} and {question} are the two placeholders filled in at query time.
CUSTOM_PROMPT_TEMPLATE = """
Tumia vipande vya taarifa vilivyotolewa katika muktadha kujibu swali la mtumiaji.
Kama hujui jibu, sema tu hujui—usijaribu kutunga jibu.  
Usitoe chochote nje ya muktadha uliotolewa.  

Muktadha: {context}  
Swali: {question}  

Jibu lazima liwe kwa Kiswahili pekee. Usitumie lugha nyingine yoyote.  
Anza jibu moja kwa moja bila mazungumzo ya awali.
"""

def set_custom_prompt(custom_prompt_template):
    """Wrap a raw template string in a ``PromptTemplate``.

    Args:
        custom_prompt_template: Template text containing the ``{context}``
            and ``{question}`` placeholders.

    Returns:
        A ``PromptTemplate`` whose input variables are ``context`` and
        ``question``.
    """
    return PromptTemplate(
        template=custom_prompt_template,
        input_variables=["context", "question"],
    )

In [31]:
# Create QA chain
# Build the three ingredients first (LLM, top-3 retriever, Swahili prompt),
# then wire them into a "stuff" RetrievalQA chain that also returns the
# source documents alongside the answer.
_qa_llm = load_llm(HUGGINGFACE_REPO_ID)
_qa_retriever = docsearch.as_retriever(search_kwargs={'k':3})
_qa_prompt = set_custom_prompt(CUSTOM_PROMPT_TEMPLATE)

qa_chain = RetrievalQA.from_chain_type(
    llm=_qa_llm,
    chain_type="stuff",
    retriever=_qa_retriever,
    return_source_documents=True,
    chain_type_kwargs={'prompt': _qa_prompt},
)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [32]:
# Now invoke with a single query
# user_query=input("Write Query Here: ")
# response=qa_chain.invoke({'query': user_query})
# print("RESULT: ", response["result"])
# print("SOURCE DOCUMENTS: ", response["source_documents"])

In [33]:
# Read a question interactively and run it through the Mistral QA chain.
user_input = input("Input Prompt:")
# Calling the chain object directly (Chain.__call__) is deprecated in
# LangChain; invoke() is the supported entry point and returns the same
# dictionary with the "result" key.
result = qa_chain.invoke({"query": user_input})
print("Response : ", result["result"])



Response :  
Kama hujui jibu, sema tu hujui—usijaribu kutunga jibu.  
Usitoe chochote nje ya muktadha uliotolewa.  

Muktadha: Faida Ya Parachichi Kwa Mjamzito.
2) Kuboresha Afya Ya Moyo.

HITIMISHO:
Parachichi huwezi kuboresha afya ya moyo yenye vitamini B12 na vitamini B6, ambayo ni muhimu kwa afya ya moyo.
Ni bora kushauriana na daktari wako au mtaalamu wa lishe kwa kupata maelekezo maalum kuhusu lishe yako wakati wa ujauzito.
Swali: Ni faida gani ya parachichi kwa mjamzito?  

Jibu lazima liwe kwa Kiswahili pekee. Usitumie lugha nyingine yoyote.  
Anza jibu moja kwa moja bila mazungumzo ya awali.

Kama hujui jibu, sema tu hujui—usijaribu kutunga jibu.  
Usitoe chochote nje ya muktadha uliotolewa.  

Muktadha: Faida Ya Parachichi Kwa Mjamzito.
3) Kudhbiti Shinikizo La Damu.

HITIMISHO:
Parachichi lina kiwango cha juu cha potasiamu, ambayo husaidia kudhibiti shinikizo la damu.Shinikizo la damu lisilodhibitiwalinaweza kusababisha matatizo wakati wa ujauzito.
Ni bora kushauriana na dak

### Initializing mt5-base


In [34]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.llms import HuggingFacePipeline
import torch

def load_mt5_local():
    """Build a local text2text-generation LLM around ``google/mt5-base``.

    Loads the tokenizer and seq2seq model, wraps them in a transformers
    pipeline (GPU 0 when CUDA is available, CPU otherwise) and returns it as a
    LangChain ``HuggingFacePipeline``.

    NOTE(review): the public model card describes mT5 as pre-trained only,
    without task fine-tuning; confirm it produces usable answers for this
    QA prompt.

    Returns:
        A ``HuggingFacePipeline`` wrapping the generation pipeline.
    """
    model_name = "google/mt5-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=512,
        # temperature only affects decoding when sampling is enabled; without
        # do_sample=True generation is greedy and the setting is ignored.
        do_sample=True,
        temperature=0.5,
        device=0 if torch.cuda.is_available() else -1
    )

    return HuggingFacePipeline(pipeline=pipe)

In [None]:
# Same RetrievalQA wiring as the Mistral chain, but backed by the local mT5
# pipeline: top-3 retriever, "stuff" combination, Swahili prompt, and source
# documents returned with the answer.
_mt5_llm = load_mt5_local()
_mt5_retriever = docsearch.as_retriever(search_kwargs={'k':3})
_mt5_prompt = set_custom_prompt(CUSTOM_PROMPT_TEMPLATE)

qa_chain_mt5 = RetrievalQA.from_chain_type(
    llm=_mt5_llm,
    chain_type="stuff",
    retriever=_mt5_retriever,
    return_source_documents=True,
    chain_type_kwargs={'prompt': _mt5_prompt},
)

### Initializing OpenAI Model


In [None]:
from langchain_openai import ChatOpenAI

# Chat model used by the retrieval chain built below.
# NOTE(review): no api_key is passed here, so the client presumably picks up
# OPENAI_API_KEY from the environment set earlier; confirm.
llm = ChatOpenAI(
    model="gpt-4.1-nano-2025-04-14",
    temperature=0.4,
    # Upper bound on the completion length; the system prompt asks for at most
    # three sentences, so answers are expected to be short.
    max_tokens=100
)


In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Swahili system prompt. Rough English rendering:
#   "You are a health assistant who answers health questions in Swahili.
#    Use the following context, taken from the documents, to try to answer
#    the question. If you don't know the answer, say clearly that you are not
#    sure instead of guessing. Answer in no more than three sentences, and
#    keep the explanation as short and accurate as possible."
# The retrieved documents are substituted for {context}.
system_prompt = (
    "Wewe ni msaidizi wa afya unayejibu maswali ya kiafya kwa Kiswahili. "
    "Tumia muktadha ufuatao uliochukuliwa kutoka kwa nyaraka kujaribu kujibu swali. "
    "Kama hujui jibu, sema wazi kuwa huna uhakika badala ya kubahatisha. "
    "Jibu kwa sentensi zisizozidi tatu, na weka maelezo mafupi na sahihi iwezekanavyo."
    "\n\n"
    "{context}"
)

# Two-message chat prompt: the system instructions above, followed by the
# user's question supplied as {input}.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [None]:
# Combine the chat model and prompt into a document-stuffing QA chain, then
# put the similarity retriever (k=3) in front of it to form the RAG chain.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
# Run the RAG chain on a Swahili question
# ("What are the benefits of avocado for a pregnant woman?") and print the
# generated answer.
response = rag_chain.invoke({"input": "Ni Faida zipi za parachichi kwa mjamzito?"})
print(response["answer"])

Parachichi husaidia kuimarisha kinga ya mwili, kudhibiti shinikizo la damu, na kuboresha afya ya moyo ya mama mjamzito.
