#### Loading the data

In [1]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader

# Load the PDFs under the "data" directory into a list of Document objects
# (the captured output shows per-page metadata: 'source' and 'page').
documents=PyPDFDirectoryLoader("data").load()

In [2]:
# checking the documents
documents

[Document(metadata={'source': 'data\\Atomic Habits James Clear 2.pdf', 'page': 0}, page_content='OIntroduction\nMy Story\nN THE FINAL day of my sophomore year of high school, I was hit in the\nface with a baseball bat. As my classmate took a full swing, the bat\nslipped out of his hands and came \x00ying toward me before striking me\ndirectly between the eyes. I have no memory of the moment of impact.\n\ue062e bat smashed into my face with such force that it crushed my nose\ninto a distorted U-shape. \ue062e collision sent the so\ue09d tissue of my brain\nslamming into the inside of my skull. Immediately, a wave of swelling surged\nthroughout my head. In a fraction of a second, I had a broken nose, multiple\nskull fractures, and two shattered eye sockets.\nWhen I opened my eyes, I saw people staring at me and running over to\nhelp. I looked down and noticed spots of red on my clothes. One of my\nclassmates took the shirt oﬀ his back and handed it to me. I used it to plug\nthe stream of

In [3]:
# GLOBAL VARIABLES
# Directory passed to Chroma as persist_directory (used for both indexing and querying).
CHROMA_PATH='chroma'
# Data directory name.
# NOTE(review): not referenced by the visible cells; the loader call hard-codes "data" — confirm intent.
DATA="data"

### Splitting each document (one page) into smaller chunks

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

def split_docs(documents:list[Document], chunk_size:int=400, chunk_overlap:int=80):
    """Split documents into smaller, overlapping chunks.

    Args:
        documents: Documents to split (here: the pages loaded from the PDFs).
        chunk_size: Maximum chunk length as measured by ``len`` (characters).
            Defaults to 400.
        chunk_overlap: Number of characters consecutive chunks may share.
            Defaults to 80.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter=RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False
    )

    return text_splitter.split_documents(documents)

In [5]:
chunks=split_docs(documents)

In [6]:
chunks

[Document(metadata={'source': 'data\\Atomic Habits James Clear 2.pdf', 'page': 0}, page_content='OIntroduction\nMy Story\nN THE FINAL day of my sophomore year of high school, I was hit in the\nface with a baseball bat. As my classmate took a full swing, the bat\nslipped out of his hands and came \x00ying toward me before striking me\ndirectly between the eyes. I have no memory of the moment of impact.\n\ue062e bat smashed into my face with such force that it crushed my nose'),
 Document(metadata={'source': 'data\\Atomic Habits James Clear 2.pdf', 'page': 0}, page_content='\ue062e bat smashed into my face with such force that it crushed my nose\ninto a distorted U-shape. \ue062e collision sent the so\ue09d tissue of my brain\nslamming into the inside of my skull. Immediately, a wave of swelling surged\nthroughout my head. In a fraction of a second, I had a broken nose, multiple\nskull fractures, and two shattered eye sockets.'),
 Document(metadata={'source': 'data\\Atomic Habits James C

In [7]:
def calculate_chunk_id(chunks):
    """Assign a ``chunk_id`` of the form ``"<source>:<page>:<index>"`` to every chunk.

    ``index`` is the number of chunks already seen for the same source/page
    pair, starting at 0. The counter is tracked per page instead of being
    reset whenever the page changes, so IDs stay unique even when chunks of
    one page are not adjacent in ``chunks``. For input that is grouped by
    page, the resulting IDs are the same as with a reset-on-page-change scheme.

    Args:
        chunks: Sequence of objects exposing a mutable ``metadata`` dict that
            may contain ``"source"`` and ``"page"`` entries.

    Returns:
        The same ``chunks`` object; each element's ``metadata["chunk_id"]`` is
        set in place.
    """
    # next free chunk index for every "<source>:<page>" key seen so far
    next_index_by_page={}

    for chunk in chunks:
        source=chunk.metadata.get("source")
        page=chunk.metadata.get("page")
        curr_page_id=f"{source}:{page}"

        # first chunk of a page gets index 0, later ones continue counting
        curr_chunk_index=next_index_by_page.get(curr_page_id,0)
        next_index_by_page[curr_page_id]=curr_chunk_index+1

        # updating metadata for chunks
        chunk.metadata["chunk_id"]=f"{curr_page_id}:{curr_chunk_index}"

    return chunks

In [8]:
# trying chunks id function just created
# Note: calculate_chunk_id writes "chunk_id" into each chunk's metadata in place
# and returns the list it was given, so trial_chunks and chunks are the same object.

trial_chunks=calculate_chunk_id(chunks)

In [9]:
# Show the ids of the first (up to) 20 chunks; slicing instead of indexing
# avoids an IndexError when fewer than 20 chunks exist.
for chunk in trial_chunks[:20]:
    print(chunk.metadata["chunk_id"])

data\Atomic Habits James Clear 2.pdf:0:0
data\Atomic Habits James Clear 2.pdf:0:1
data\Atomic Habits James Clear 2.pdf:0:2
data\Atomic Habits James Clear 2.pdf:0:3
data\Atomic Habits James Clear 2.pdf:0:4
data\Atomic Habits James Clear 2.pdf:1:0
data\Atomic Habits James Clear 2.pdf:1:1
data\Atomic Habits James Clear 2.pdf:1:2
data\Atomic Habits James Clear 2.pdf:1:3
data\Atomic Habits James Clear 2.pdf:1:4
data\Atomic Habits James Clear 2.pdf:1:5
data\Atomic Habits James Clear 2.pdf:1:6
data\Atomic Habits James Clear 2.pdf:1:7
data\Atomic Habits James Clear 2.pdf:2:0
data\Atomic Habits James Clear 2.pdf:2:1
data\Atomic Habits James Clear 2.pdf:2:2
data\Atomic Habits James Clear 2.pdf:2:3
data\Atomic Habits James Clear 2.pdf:2:4
data\Atomic Habits James Clear 2.pdf:2:5
data\Atomic Habits James Clear 2.pdf:2:6


In [10]:
# creating embeddings from the text

# from langchain_community.embeddings.bedrock import BedrockEmbeddings 
# to be replaced with Ollama embeddings (nomic-embed-text) later for local runs
from langchain_community.embeddings.ollama import OllamaEmbeddings
# embeddings = OllamaEmbeddings(model="nomic-embed-text")
# Prerequisite: run `ollama pull nomic-embed-text` first. TODO: work out how to provision the model in the deployment phase.
def get_embeddings():
    """Create the embedding function used with the Chroma store.

    Returns:
        An ``OllamaEmbeddings`` instance configured for the
        ``nomic-embed-text`` model.
    """
    return OllamaEmbeddings(model="nomic-embed-text")

#### adding to chroma DB

In [11]:
from langchain_chroma import Chroma

def emed_to_chromaDB(chunks: list[Document]):
    """Add to the Chroma store only the chunks whose ``chunk_id`` is not stored yet.

    The chunks are first tagged via ``calculate_chunk_id``; the resulting
    ``chunk_id`` values are used as the document ids in the store. Progress is
    reported with ``print``; nothing is returned.

    Args:
        chunks: Chunk documents to index.
    """
    store=Chroma(persist_directory=CHROMA_PATH, embedding_function=get_embeddings())

    # Tag every chunk with its chunk id.
    tagged_chunks=calculate_chunk_id(chunks)

    # Only the ids of what is already stored are needed for the comparison.
    known_ids=set(store.get(include=[])["ids"])
    print(f"existing no of docs in DB:{len(known_ids)}")

    # Keep just the chunks the store has not seen before.
    pending=[item for item in tagged_chunks if item.metadata['chunk_id'] not in known_ids]

    if not pending:
        print("✅ No new documents to add")
        return

    print(f"👉 Adding new documents: {len(pending)}")
    pending_ids=[item.metadata["chunk_id"] for item in pending]
    store.add_documents(pending, ids=pending_ids)


In [12]:
# trying to add new documents to embeddings

emed_to_chromaDB(chunks)

existing no of docs in DB:135
✅ No new documents to add


## Including the query in RAG

In [13]:
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

# Prompt template for the LLM: {context} is filled with the retrieved chunk text,
# {ques} with the user's question.
# NOTE(review): "folloeing" below is a typo for "following"; the text is sent to
# the model at runtime, so it is intentionally left untouched here.
prompt_temp=""" 
    Answer the question with the folloeing context only
    {context}

    ___
    answer the question carefully with the above context {ques}
 """

def query_process(query_text: str, k: int = 5):
    """Answer a question with the local LLM, using chunks retrieved from Chroma as context.

    The ``k`` chunks most similar to ``query_text`` are fetched from the store
    at ``CHROMA_PATH``, joined into a context block, inserted into
    ``prompt_temp`` and sent to the Ollama model.

    Args:
        query_text: The question to answer.
        k: Number of chunks to retrieve as context. Defaults to 5.

    Returns:
        The model response followed by the list of ``chunk_id`` values of the
        retrieved chunks. The same string is also printed.
    """
    # Build the embedding function once and hand that instance to the store.
    embedding_func=get_embeddings()
    db=Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=embedding_func
    )

    results=db.similarity_search_with_score(query_text,k=k)
    context_text="\n\n --- \n\n".join([doc.page_content for doc ,_score in results])
    prompt_temp_str=ChatPromptTemplate.from_template(prompt_temp)
    prompt=prompt_temp_str.format(context=context_text, ques=query_text)

    # generating the answer with the local Ollama model
    model = Ollama(model="llama3.1:8b")
    response_text = model.invoke(prompt)
    # chunk ids of the retrieved context, reported alongside the answer
    source_for_answer = [doc.metadata.get("chunk_id") for doc, _score in results]
    final_answer=f"{response_text}. I got the answers from the following sources {source_for_answer}"
    print(final_answer)
    return final_answer


In [14]:
# calling the query function for trials
query_process("what is this book all about")

Based on the provided context, it appears that this book is a practical guide to understanding human behavior and creating lasting habits for personal growth and development.

The book focuses on providing a step-by-step plan for building better habits that can be sustained over a lifetime, rather than just temporary fixes. It draws on various fields such as biology, neuroscience, philosophy, psychology, and more to provide wisdom and practical advice.

The author's approach emphasizes the importance of starting small, with a focus on making progress through tiny wins and breakthroughs. The book aims to help readers build habits that can improve their lives in areas such as health, relationships, productivity, and personal growth.

Overall, the main theme of this book seems to be about empowering individuals to take control of their behavior and create lasting change through the development of positive habits.. I got the answers from the following sources ['data\\Atomic Habits James Cl

"Based on the provided context, it appears that this book is a practical guide to understanding human behavior and creating lasting habits for personal growth and development.\n\nThe book focuses on providing a step-by-step plan for building better habits that can be sustained over a lifetime, rather than just temporary fixes. It draws on various fields such as biology, neuroscience, philosophy, psychology, and more to provide wisdom and practical advice.\n\nThe author's approach emphasizes the importance of starting small, with a focus on making progress through tiny wins and breakthroughs. The book aims to help readers build habits that can improve their lives in areas such as health, relationships, productivity, and personal growth.\n\nOverall, the main theme of this book seems to be about empowering individuals to take control of their behavior and create lasting change through the development of positive habits.. I got the answers from the following sources ['data\\\\Atomic Habits