### Loading the PDF and Removing `\n`

In [1]:
# Location of the student handbook PDF; the path is relative, so it resolves
# against the directory the notebook kernel is running in.
file_path = "../data/ug_student_handbook.pdf"

In [2]:
from langchain_community.document_loaders import PyPDFLoader
import pprint
# Loader bound to the handbook PDF defined in `file_path`.
loader = PyPDFLoader(file_path)


In [3]:
# Read the PDF; the result is iterated below as a sequence of page objects
# that each expose a `page_content` string.
pages_pdf = loader.load()

In [4]:
# Collapse every run of whitespace (newlines, tabs, repeated spaces) into a
# single space and trim both ends. Each page object is updated in place.
for pdf_page in pages_pdf:
    words = pdf_page.page_content.split()
    pdf_page.page_content = " ".join(words)

### Document Splitting with RecursiveCharacterTextSplitter

In [5]:
import uuid
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [6]:
# Chunking configuration: target chunk size of 500 with an overlap of 50
# shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

In [7]:
# Split the cleaned pages into chunks; each chunk is used below through its
# `page_content` and `metadata` attributes.
texts = text_splitter.split_documents(pages_pdf)

In [8]:
# One metadata entry per chunk, in the same order as `texts`.
metadata_list = [chunk.metadata for chunk in texts]

In [9]:
# One id per chunk, in the same order as `texts`.
#
# The ids are deterministic (uuid5 of the chunk's position and text) rather
# than random uuid4 values. The collection below lives in a persistent store
# and is fetched with get_or_create, so random ids would make every
# "Restart & Run All" insert a second, third, ... copy of the whole handbook.
# A stable id lets the store recognise a chunk it already holds.
# The index prefix keeps ids unique even when two chunks have identical text.
id_list = [
    str(uuid.uuid5(uuid.NAMESPACE_URL, f"{index}:{chunk.page_content}"))
    for index, chunk in enumerate(texts)
]


### Embedding

Converting the text chunks into embedding vectors.

In [11]:
from sentence_transformers import SentenceTransformer

In [12]:
# Sentence-embedding model; the same instance encodes both the handbook
# chunks and the search query further down.
embedding = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [13]:
# Plain-text body of every chunk, in the same order as `texts`.
page_contents = [chunk.page_content for chunk in texts]

In [14]:
# Encode all chunks in a single call so the model can batch them internally,
# instead of running one forward pass per chunk.
# `list(...)` unpacks the returned array row by row, so the result is still a
# Python list with one vector per chunk, in the same order as `page_contents`.
page_contents_vectors = list(embedding.encode(page_contents))

In [15]:
# Sanity check: vectors, ids and metadata are passed to the collection as
# parallel lists, so the three lengths shown here must be equal.
(len(page_contents_vectors),len(id_list), len(metadata_list))

(229, 229, 229)

### Chroma DB

In [16]:
import chromadb

# Persistent Chroma client backed by the local directory
# ./chroma_langchain_db (relative to the kernel's working directory).
chroma_client = chromadb.PersistentClient(path="./chroma_langchain_db")

In [17]:
# Reuse the handbook collection if the store already has it, otherwise create it.
collection = chroma_client.get_or_create_collection(name="handbook_embeddings_collections")

In [18]:
# Store every chunk with its id, vector, raw text and metadata.
# The four lists are index-aligned: position i in each describes chunk i.
collection.add(
    ids=id_list,
    embeddings=page_contents_vectors,
    documents=page_contents,
    metadatas=metadata_list,
)

### Retrieving Query

In [19]:
# Question used to search the handbook collection.
query_text = "School Uniform?"

In [20]:
# Embed the query with the same model that embedded the chunks.
query_vector = embedding.encode(query_text)

In [21]:
# Ask the collection for the top 3 matches for the query embedding.
result = collection.query(
    query_embeddings=query_vector,
    n_results=3
)

Validate the retrieved context.

In [None]:
# Show the matches for the (single) query, numbered from 1, with each
# document truncated to its first 1000 characters.
top_hits = result["documents"][0]
for rank, snippet in enumerate(top_hits, start=1):
    print(f"\n--- Result {rank} ---\n")
    print(snippet[:1000])


In [None]:
# Raw retrieved documents, untruncated. The cell above indexes this with [0],
# so it appears to be a nested list with one inner list per query.
result['documents']

### Ollama

In [27]:
from ollama import chat
from ollama import ChatResponse

In [None]:
# Retrieval-augmented prompt.
#
# The chunks retrieved for `query_text` are joined into one context string and
# sent to the model together with that same question. Previously the context
# was the literal text "{handbook}" (not an f-string, and no such variable
# existed), it was sent as an *assistant* turn, and the question asked was
# unrelated to the one used for retrieval, so the model never saw the handbook.
handbook = "\n\n".join(result["documents"][0])

response: ChatResponse = chat(
    model="mistral",
    messages=[
        {
            'role': 'system',
            'content': (
                "You are a chatbot for assisting university student using a handbook as your resource. "
                "Answer only from the handbook excerpts supplied by the user; "
                "if they do not contain the answer, say so."
            ),
        },
        {
            'role': 'user',
            # Context first, then the question the context was retrieved for.
            'content': f"Handbook excerpts:\n{handbook}\n\nQuestion: {query_text}",
        },
    ],
)

# `response['message']['content']` and `response.message.content` are the same
# text, so it is printed once.
print(response.message.content)