In [108]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_ollama import ChatOllama

In [109]:
from pypdf import PdfReader
import re

def extract_utf8_text(pdf_path):
    """Extract the text of every page of a PDF as one UTF-8-encodable string.

    Args:
        pdf_path: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        The text of each page that yielded any text, each followed by a
        newline, concatenated in page order. Pages whose ``extract_text()``
        result is empty or ``None`` are skipped. Returns ``""`` when no page
        yields text.
    """
    reader = PdfReader(pdf_path)
    page_texts = []

    for page in reader.pages:
        raw_text = page.extract_text()
        if not raw_text:
            continue
        # A Python str is already Unicode, so this round-trip does not
        # "convert" anything: it only drops code points that cannot be
        # encoded as UTF-8 (lone surrogates).
        page_texts.append(raw_text.encode("utf-8", errors="ignore").decode("utf-8"))

    # Build the result once instead of growing a string with += per page.
    return "".join(page_text + "\n" for page_text in page_texts)

# Pull the raw text out of the user-guide PDF; the path is relative, so the
# PDF must sit in the notebook's working directory.
text = extract_utf8_text("washing_machine_user_guide.pdf")    


def strip_non_text(text):
    """Return ``text`` reduced to printable ASCII plus newlines.

    Every character outside the range 0x20-0x7E is deleted, except the
    newline character. That includes tabs, carriage returns and all
    non-ASCII characters (accented letters, symbols, etc.); nothing is
    substituted in their place.

    Args:
        text: The string to clean.

    Returns:
        A new string containing only the kept characters, in original order.
    """
    disallowed = re.compile(r"[^\x20-\x7E\n]")
    return disallowed.sub("", text)

# Reduce the extracted text to printable ASCII and newlines, then save it to
# disk as a plain-text file.
safe_text = strip_non_text(text)

with open("user_guide.txt", mode="w", encoding="utf-8") as guide_file:
    guide_file.write(safe_text)

In [110]:
# Load the cleaned guide back from disk as LangChain documents. The encoding
# is given explicitly so the read matches the UTF-8 write that produced the
# file instead of depending on the platform's default encoding.
documents = TextLoader("user_guide.txt", encoding="utf-8").load()

In [113]:
# Inspect the type of the first loaded element.
type(documents[0])

langchain_core.documents.base.Document

In [114]:
# Split the loaded guide into overlapping chunks for embedding.
# NOTE(review): 300 and 20 are hard-coded (presumably measured in characters —
# confirm against the splitter's length function); consider named constants.
splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=20)
chunks = splitter.split_documents(documents)

In [72]:
# Embedding client pointed at an Ollama server on localhost:11434, using the
# "nomic-embed-text" model.
# NOTE(review): this cell's execution count (72) is out of sequence with the
# cells around it (114, 115); re-run the notebook top-to-bottom on a fresh
# kernel to confirm it still works in this position.
oembed = OllamaEmbeddings(base_url="http://localhost:11434", model="nomic-embed-text")

In [115]:
# Embed every chunk with oembed and build a Chroma vector store from them.
# No persist directory or collection name is passed.
# NOTE(review): re-running this cell in the same kernel may add the same
# chunks to an already-existing collection and produce duplicate search
# hits — confirm, and reset the collection before re-indexing if so.
db = Chroma.from_documents(chunks, embedding=oembed)

In [116]:
# Quick retrieval check: look up the chunks most similar to a sample phrase
# (presumably one that occurs in the guide).
# NOTE(review): `dc` is not read by any later cell visible here.
query = "appliance must be properly grounded"
dc = db.similarity_search(query)

In [120]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [121]:
# Prompt text instructing the model to answer from the supplied context only.
# It has two placeholders, {context} and {question}, which must be provided
# when the prompt is invoked.
template = """Answer the question based only on the following context:

{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

In [128]:
# Chat model used to generate the answers, with temperature fixed at 0.
# NOTE(review): unlike the embeddings client, no base_url is passed here —
# presumably the library default targets the same local Ollama server; confirm.
model = ChatOllama(
    model="llama3.1:8b",
    temperature=0
)

In [123]:
# Wrap the vector store as a retriever; no search options are passed, so the
# library defaults apply.
retriever = db.as_retriever()

In [124]:
def format_docs(docs):
    """Merge the ``page_content`` of several documents into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The contents in input order, separated by one blank line. An empty
        iterable gives an empty string.
    """
    separator = "\n\n"
    contents = (doc.page_content for doc in docs)
    return separator.join(contents)

In [130]:
# Retrieval-augmented pipeline. The input string is fanned out into the two
# prompt variables, then flows through the prompt, the chat model and the
# string output parser.
chain = (
    {
        # Retrieved documents, flattened to one string by format_docs.
        "context": retriever | format_docs,
        # The caller's input, passed through as the question.
        "question": RunnablePassthrough(),
    }
    | prompt
    | model
    | StrOutputParser()
)

In [138]:
# Example question run through the full pipeline; the cell's value is the
# model's answer as a string.
chain.invoke("what is 3C")

'According to the context, 3C refers to:\n\n"Check the motor for operation.\nTry restarting the cycle."'