# Data Ingestion

### Chunking

In [1]:
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader
def create_chunk(strategy:str) -> list[Document]:# the type annotations doesnot force to be followed
    """
    Chunking based on user preference
    Arguments:
        strategy: name of the chunking strategy. Supported strategies = "document","recursive"
    Outputs:
        list of documents/chunks
    """
    if strategy == "document":
        # document chunking method
        pdf_dir_loader = DirectoryLoader(
                                "../data/booking_files",
                                glob="**/*.pdf", # filename pattern 
                                loader_cls=PyMuPDFLoader, # loader class to use
                                )
        chunks = pdf_dir_loader.load()
        print(f"Chunks created: {len(chunks)}")
        return chunks

    elif strategy == "recursive":
        # recursive character chunking method
        pdf_dir_loader = DirectoryLoader(
                                "../data/booking_files",
                                glob="**/*.pdf", # filename pattern 
                                loader_cls=PyMuPDFLoader, # loader class to use
                                )
        docs = pdf_dir_loader.load()
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,        # characters per chunk
            chunk_overlap=200,      # overlap to preserve context
            separators=["\n\n", "\n", " ", ""]
        )

        chunks = text_splitter.split_documents(docs)
        print(f"Chunks created: {len(chunks)}")
        return chunks
   
    else:
        return f"chunking startegy not supported!!!!"
        # raise Exception(f"{strategy} chunking startegy not supported!!!!")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
chunks = create_chunk(strategy="recursive")

Chunks created: 4


In [3]:
chunks

[Document(metadata={'producer': 'Microsoft® Word LTSC', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2026-01-07T17:51:25+05:45', 'source': '..\\data\\booking_files\\bigyan_shrestha(data_science)_cv.pdf', 'file_path': '..\\data\\booking_files\\bigyan_shrestha(data_science)_cv.pdf', 'total_pages': 2, 'format': 'PDF 1.7', 'title': '', 'author': 'Bigyan Shrestha', 'subject': '', 'keywords': '', 'moddate': '2026-01-07T17:51:25+05:45', 'trapped': '', 'modDate': "D:20260107175125+05'45'", 'creationDate': "D:20260107175125+05'45'", 'page': 0}, page_content="BIGYAN SHRESTHA \nData Science Intern | BIT | KIST college & SS \nbigyans04@gmail.com | 9807904120 | Koteshwor, Kathmandu, Nepal \n \nINTRODUCTION \nData science candidate having foundation in python, data analysis, machine \nlearning and deep learning. Keen to apply my skills to extract insights from data \nand help organizations to make data-driven decision-making. \n \nEDUCATION \nKIST college & SS \nBIT, 2022-2026 \n \nCERTIFICAT

### Embedding

In [4]:
## Embedding
import numpy as np
from  sentence_transformers import SentenceTransformer # this is our embedding model
from typing import List

class EmbeddingManager: 
    """
    handles document embedding generation using sentence transformer model
    """
    def __init__(self,model_name:str="all-MiniLM-L6-v2"): 
        """
        Constructor to initialize the EmbeddingManager
        Arg = hugging face sentence transformer model name
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """
        Load the specified sentence transformer model
        """
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Loaded embedding model successfully. Embedding Dimensions = {self.model.get_sentence_embedding_dimension}")
        except Exception as  e:
            print(f"Error loading model{self.model_name} = {e}")

    def generate_embeddings(self,texts:List[str]) -> np.array:
        """
        Generates embeddings for list of text
        Args: List of texts to embed
        Return:numpy array with shape = (len(texts),embedding_dimensions)
        """
        if not self.model:
            raise ValueError("Model not loaded")
        print(f"Creating embeddings for {len(texts)} texts.")
        embeddings = self.model.encode(texts,show_progress_bar=True)
        print(f"Embeddings generated successfully with shape = {embeddings.shape}")
        return embeddings

embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Loaded embedding model successfully. Embedding Dimensions = <bound method SentenceTransformer.get_sentence_embedding_dimension of SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)>


<__main__.EmbeddingManager at 0x1ee32eb9e10>

In [5]:
texts = []
chunk_metadata = []
for chunk in chunks:
    texts.append(chunk.page_content)
    chunk_metadata.append(chunk.metadata)
print(len(texts))
print(texts)
chunk_metadata = [chunk.metadata  for chunk in chunks]
print(len(chunk_metadata))
print(chunk_metadata)
embeddings = embedding_manager.generate_embeddings(texts=texts)
embeddings

4
["BIGYAN SHRESTHA \nData Science Intern | BIT | KIST college & SS \nbigyans04@gmail.com | 9807904120 | Koteshwor, Kathmandu, Nepal \n \nINTRODUCTION \nData science candidate having foundation in python, data analysis, machine \nlearning and deep learning. Keen to apply my skills to extract insights from data \nand help organizations to make data-driven decision-making. \n \nEDUCATION \nKIST college & SS \nBIT, 2022-2026 \n \nCERTIFICATIONS \nBroadway Infosys \nData Science With Python, 2025 \n \nPROJECTS \nSales analytics dashboard \n• Built a sales and deliveries dashboard analyzing 10 years of tesla's sales \ndata.  \n• Implemented filters for in-depth analysis. Link: \nhttps://public.tableau.com/shared/CFZNQQ4SD?:display_count=n&:origin\n=viz_share_link \nExpenses-prediction \n• Built a complete ML pipeline for predicting personal expenses with \nautomated preprocessing, feature engineering, and hyperparameter-\ntuned regression models. \n• Evaluated multiple models using R² score

Batches: 100%|██████████| 1/1 [00:00<00:00,  3.66it/s]

Embeddings generated successfully with shape = (4, 384)





array([[-0.06830518,  0.0603593 ,  0.01853083, ..., -0.067569  ,
        -0.06738527,  0.04129963],
       [-0.0511873 ,  0.01309703, -0.03875072, ..., -0.04730863,
         0.01562451,  0.01251245],
       [ 0.03273609, -0.02490785, -0.03396541, ..., -0.05335611,
        -0.0343329 ,  0.09346751],
       [-0.01059269, -0.06126603, -0.05471329, ...,  0.10227432,
        -0.02142297,  0.07012384]], shape=(4, 384), dtype=float32)

### Metadata Store

In [6]:
import pymysql
import pymysql.cursors
from dotenv import load_dotenv
import os
load_dotenv() # Loads variables from .env into os.environ


class Metadata:
    def __init__(self):
        self.MYSQL_UID = os.getenv('MYSQL_UID')
        self.MYSQL_PWD = os.getenv('MYSQL_PWD')
        self.connection = None

    def _create_connection(self):
        self.connection = pymysql.connect(
                host='localhost',
                user=self.MYSQL_UID,
                password=self.MYSQL_PWD,
                database='rag_metadata',
                cursorclass=pymysql.cursors.DictCursor
            )
        return self.connection
    
    def write(self,metadata):
        connection = self._create_connection()
        with connection:
            with connection.cursor() as cursor:
                for each in metadata:
                    cols = list(each.keys())
                    values = tuple(each.values())
                    cols_q = ", ".join(cols)
                    placeholders = ", ".join(['%s']*len(cols))
                    # print(cols)
                    # print(values)
                    query = f"INSERT INTO booking_rag_metadata ({cols_q}) VALUES ({placeholders})"
                    # print(query)
                    cursor.execute(query,values)
            connection.commit()
        # print(metadata)

    def delete_all(self):
        connection = self._create_connection()
        with connection:
            with connection.cursor() as cursor:
                query = f"TRUNCATE TABLE booking_rag_metadata;"
                cursor.execute(query)
            connection.commit()  

    def write_booking_details(self,booking_details):
        connection = self._create_connection()
        with connection:
            with connection.cursor() as cursor:
                cols = list(booking_details.keys())
                values = tuple(booking_details.values())
                cols_q = ", ".join(cols)
                placeholders = ", ".join(['%s']*len(cols))
                # print(cols)
                # print(values)
                query = f"INSERT INTO booking_details ({cols_q}) VALUES ({placeholders})"
                # print(query)
                cursor.execute(query,values)
            connection.commit()
        # print(metadata)


In [7]:
# booking_details = {"name":"safaf","email":"bfhef","date":"dhfgd","time":"dsfweyf"}
# Metadata().write_booking_details(booking_details=booking_details)

### Vectore store

In [10]:
import pinecone
from dotenv import load_dotenv
import os
from datetime import datetime
load_dotenv() # Loads variables from .env into os.environ

class VectorStore:
    def __init__(self):
        self.PINECONE_API_KEY = os.getenv("PINECONE_API_KEY") # access variables
        self.PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
        self.PINECONE_HOST = os.getenv("PINECONE_HOST")
        pc = pinecone.Pinecone(
            api_key=self.PINECONE_API_KEY
        )
        self.pinecone_index = pc.Index(host=self.PINECONE_HOST)

    def store(self,embeddings,texts,chunk_metadata):
        """
        function to store embedding vectors in pinecone
        Argument:
            embeddings: embedding vectors
            texts: corresponding texts of embedding vectors
        """

        vectors = []
        metadata  = []
        for i, embedding in enumerate(embeddings): # enumerate() lets you loop over items and get their index at the same time.
            vectors.append(
               {
                "id": f"doc-{i}",
                "values": embedding.tolist(),
                "metadata": {"text": texts[i]}
                } 
            )
            metadata.append(
                {
                "id": f"doc-{i}",
                "uploaded_time": datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
                "source": chunk_metadata[i]['source'] 
                }
            )
            
        # writting the metadata in the database
        Metadata().delete_all() # delete metadata
        Metadata().write(metadata=metadata)
        # self.pinecone_index.delete(delete_all=True)
        self.pinecone_index.upsert(vectors=vectors)

    def empty_index(self):
        self.pinecone_index.delete(delete_all=True)
        print("PineCone index is emptied")

vectorstore = VectorStore()
vectorstore

<__main__.VectorStore at 0x1ee652e88e0>

In [11]:
# vectorstore.empty_index()
vectorstore.store(embeddings=embeddings,texts=texts,chunk_metadata=chunk_metadata)

# Conversational RAG Pipeline
## Retriever

In [None]:
from typing import List,Any,Dict,Tuple
class RAGRetriever:
    """
    Handles query based retrieval from vector store
    """
    def __init__(self,vector_store:VectorStore,embedding_manager:EmbeddingManager):
        """
        Initialize the retriever
        Args:
        vector_store: vector store containing document embeddings
        embedding_manager: manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query:str, top_k:int=3) -> Dict:
        """
        Retrieve relevant documents for a query
        Args:
            query: the search query
            top_k: no. of top results to return
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: {query}")

        # generate query embeddings
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        # search in vector store
        try:
            results = self.vector_store.pinecone_index.query(
                vector=query_embedding.tolist(), 
                top_k=top_k,
                include_metadata=True,
                include_values=False
            )
            return results
        except Exception as e:
            print(f"Error during retrieval : {e}")
            return []
        
rag_retriever = RAGRetriever(vector_store=vectorstore,embedding_manager=embedding_manager)
rag_retriever
        

In [None]:
# testing the retriever
results = rag_retriever.retrieve(query="what is deep learning")
results

## Augmentation and Generation

### Using redis for chat memory

In [None]:
import redis
import json
from dotenv import load_dotenv
import os

load_dotenv() # Loads variables from .env into os.environ
REDIS_DB_USERNAME = os.getenv("REDIS_DB_USERNAME") # access variables
REDIS_DB_PASSWORD = os.getenv("REDIS_DB_PASSWORD") # access variables

SESSION_TTL_SECONDS = 500 # in seconds
redis_client = redis.Redis(
    host='redis-12530.c257.us-east-1-3.ec2.cloud.redislabs.com',
    port=12530,
    decode_responses=True,
    username=REDIS_DB_USERNAME,
    password=REDIS_DB_PASSWORD,
)

def get_chat_history(session_id: str = 1) -> list:
    key = f"chat:{session_id}"
    redis_client.expire(f"chat:{session_id}", SESSION_TTL_SECONDS) # sets/refreshes time to live period of the specified key
    data = redis_client.lrange(key,0,-1)
    return data if data else []

def save_chat_history(history: dict, session_id: str = 1) -> None:
    key = f"chat:{session_id}"
    redis_client.expire(f"chat:{session_id}", SESSION_TTL_SECONDS) # sets/refreshes time to live period of the specified key
    redis_client.rpush(key, json.dumps(history)) # pushes the additional history at the end of the list


In [None]:
# aug and generation
import requests
import json
from dotenv import load_dotenv
import os

load_dotenv() # Loads variables from .env into os.environ
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") # access variables

def rag_simple(query,retriever,top_k=3):
    # retrieve the context
    results = retriever.retrieve(query=query,top_k=top_k)
    results = [each['metadata'] for each in results['matches']]
    context = "\n\n".join([each['text'] for each in results]) if results else "" # ternary operator not list comprehension with condition

    # using redis for chat memory
    history = get_chat_history()

    # if not (context and history):
    #     return "No relevant context found for the question"

    # prompt for the groq model
    booking_system_prompt = """
        Your job is to decide whether:
         1. the user wants to book an interview or,
         2. the user is asking a general question

        If the user is asking a general question:
        - Set route to "rag"
        - Answer using the provided context and conversation history
        - Put the answer in the "reply" field

        If the user wants to book an interview:
        - Set route to "booking"
        - Extract the following fields:
            - name
            - email
            - date
            - time

        Rules:
        - Respond ONLY in valid JSON
        - JSON fields must be exactly: route, booking, reply
        - If a booking field is missing or unclear, set that field to null. Do NOT guess missing booking fields
        - Convert dates to YYYY-MM-DD
        - Convert times to 24-hour HH:MM
        - If route is "booking", reply must be null
        - If route is "rag", booking must be null
        """

    prompt = f"""
            Conversation so far:
            {history}

            Context:
            {context}
            
            Query:
            {query}
            """
    # print(prompt)
    response = requests.post(
    url="https://openrouter.ai/api/v1/chat/completions",
    headers={
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        # "HTTP-Referer": "<YOUR_SITE_URL>", # Optional. Site URL for rankings on openrouter.ai.
        # "X-Title": "<YOUR_SITE_NAME>", # Optional. Site title for rankings on openrouter.ai.
    },
    data=json.dumps({
        "model": "stepfun/step-3.5-flash:free",
        # "model": "liquid/lfm-2.5-1.2b-thinking:free",
        "messages": [
        {"role": "system", "content": booking_system_prompt},
        {
            "role": "user",
            "content": prompt
        }
        ],
        "reasoning": {"enabled": True}
    })
    )
    print(response.json())
    # print(type(json.loads(response.json())))
    try:
        output  = json.loads(response.json()['choices'][0]['message']['content'])
        print(output)
    except json.JSONDecodeError as e:
        print(f"error: {e}")

    ## save_chat_history(history={"user":query,"assistance":output})
    hist = {"user":query,"assistance":None}
    if output['route'] == "booking":
        if output["booking"] is not None:
            missing_fields = [k for k,v in output["booking"].items() if v is None] # fields having none values
            if len(missing_fields) != 0:
                hist["assistance"] = f"Please provide the missing fields: {','.join(missing_fields)}"
                save_chat_history(history= hist)
                return hist
            else:
                print(output['booking'])
                Metadata().write_booking_details(output["booking"])
                hist["assistance"] = "Your interview is scheduled successfully"
                save_chat_history(history= hist)
                # save the booking details
                return hist
        else:
            all_fields = ["name","email","date","time"]
            hist['assistance'] = f"Please provide the missing fields: {','.join(all_fields)}"
            save_chat_history(history= hist)
            return hist
    elif output["route"] == "rag":
        hist["assistance"] = output["reply"]
        save_chat_history(history= hist)
        return hist


In [None]:
# I want to book an interview for the name of John and email is jhondoe45@gmail.com
# print(rag_simple(query="what is deep learning?",retriever=rag_retriever))
# print(rag_simple(query="I want to book an interview",retriever=rag_retriever))
print(rag_simple(query="Name = Bigyan Shrestha, email = bigyans04@gmail.com, date = jan 25, 2024, time = 4 pm",retriever=rag_retriever))

In [None]:
# redis_client.delete(f"chat:{session_id}")