In [2]:
# ---------- IMPORTS ----------
# Remove streamlit import
import io
import os
from getpass import getpass

import fitz  # PyMuPDF
import numpy as np
import pytesseract
from huggingface_hub import InferenceClient
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Location of the Tesseract executable. Set the TESSERACT_CMD environment
# variable to override the Windows default below on other machines.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    r'C:\Program Files\Tesseract-OCR\tesseract.exe',
)

In [4]:
# ---------- HUGGING FACE CONFIG ----------
# The token is taken from the HF_TOKEN environment variable, or prompted for
# once if it is unset, so no credential is stored in the notebook itself.
# NOTE: the token that was previously hardcoded in this cell has been exposed
# and should be revoked in the Hugging Face account settings.
HF_API_KEY = os.environ.get("HF_TOKEN") or getpass("Hugging Face API token: ")
os.environ["HF_TOKEN"] = HF_API_KEY

# Chat-completion client routed through the "novita" inference provider.
client = InferenceClient(
    provider="novita",
    api_key=HF_API_KEY,
)

# Chat model used by query_kimi and embedding model used to build the index.
MODEL_CHAT = "moonshotai/Kimi-K2-Instruct"
encoder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  encoder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


In [5]:
# ---------- OCR FUNCTION ----------
def ocr_bytes(img_bytes: bytes) -> str:
    """Run English Tesseract OCR on an encoded image and return its text.

    Args:
        img_bytes: Raw bytes of an image file in a format Pillow can open.

    Returns:
        The recognised text with leading/trailing whitespace stripped.
        The function never raises: on any error it returns the marker
        string ``"[OCR failed: <error>]"`` instead, so callers that index
        the result should be aware the marker can end up in their text.
    """
    try:
        # The context manager releases the decoded image as soon as OCR is done.
        with Image.open(io.BytesIO(img_bytes)) as img:
            return pytesseract.image_to_string(img, lang="eng").strip()
    except Exception as e:
        return f"[OCR failed: {e}]"

In [6]:
def ingest_files(file_paths):
    """Extract text from the given files and index it in a FAISS vector store.

    PDFs are opened with PyMuPDF: each page's text layer is collected and every
    embedded image narrower than 3000 px with fewer than 5 colour components
    is OCR'd via ``ocr_bytes``. Any other file is handed to ``ocr_bytes``
    directly, i.e. it is treated as an image.

    Args:
        file_paths: Paths of the files to ingest. Paths that are not existing
            files are reported and skipped.

    Returns:
        A FAISS vector store built from 800-character chunks (80 overlap) of
        the extracted text, embedded with the module-level ``encoder``.

    Raises:
        ValueError: If no file could be read, or the extracted text produced
            no chunks, because an index cannot be built from nothing.
    """
    docs = []
    total_files = len(file_paths)
    print(f"Processing {total_files} file(s)...")

    for file_idx, path in enumerate(file_paths):
        if not os.path.isfile(path):
            print(f"File not found: {path}")
            continue
        name = os.path.basename(path)
        ext = os.path.splitext(path)[1].lower()
        # Read through a context manager so the file handle is always closed.
        with open(path, "rb") as fh:
            file_bytes = fh.read()

        if ext == ".pdf":
            parts = []
            # The document is closed when the block exits, even on error.
            with fitz.open(stream=file_bytes, filetype="pdf") as pdf:
                for page in pdf:
                    parts.append(page.get_text())
                    for img in page.get_images(full=True):
                        xref = img[0]
                        try:
                            pix = fitz.Pixmap(pdf, xref)
                            # Skip very wide images and pixmaps with 5+ components.
                            if pix.width < 3000 and pix.n < 5:
                                parts.append("\n" + ocr_bytes(pix.tobytes("png")))
                        except Exception as e:
                            # Best effort: one bad image must not abort the file,
                            # but report it instead of dropping it silently.
                            print(f"Skipped image xref {xref} in {name}: {e}")
            text = "".join(parts)
        else:
            text = ocr_bytes(file_bytes)

        docs.append(Document(page_content=text, metadata={"source": name}))
        print(f"Processed {file_idx + 1}/{total_files}: {name}")

    if not docs:
        raise ValueError("No input file could be read; nothing to index.")

    chunks = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=80).split_documents(docs)
    print(f"Total chunks created: {len(chunks)}")
    if not chunks:
        raise ValueError("No text could be extracted from the input files; nothing to index.")

    vectorstore = FAISS.from_documents(chunks, encoder)
    return vectorstore

In [9]:
def query_kimi(messages, context):
    """Ask the chat model to answer from the retrieved context only.

    Args:
        messages: List of chat turns (``{"role": ..., "content": ...}`` dicts)
            to send after the system prompt. The list is not modified.
        context: Plain text of the retrieved chunks, embedded in the system
            prompt.

    Returns:
        The content of the first completion choice, or a warning string
        naming ``MODEL_CHAT`` and the error if the request raises.
    """
    instructions = "".join(
        [
            "Answer strictly from the context below in detail. ",
            "If not found, reply exactly: \"I couldn’t find that information in the provided documents.\"\n\n",
            f"Context:\n{context}",
        ]
    )
    system_turn = {"role": "system", "content": instructions}
    conversation = [system_turn] + messages

    try:
        response = client.chat.completions.create(
            model=MODEL_CHAT,
            messages=conversation,
        )
        reply = response.choices[0].message.content
    except Exception as err:
        return f"⚠️ {MODEL_CHAT} is currently unavailable. Error: {err}"
    return reply

In [10]:
# Demo run: index one document, ask one question, print the model's answer.
# This relies on the single-return ingest_files defined above; a later cell
# redefines ingest_files to return (chunks, vectorstore) instead.
if __name__ == "__main__":
    hardcoded_file_list = [r"C:\Users\APL01615\Downloads\Apollo-Offer_letter.pdf"]
    vectorstore = ingest_files(hardcoded_file_list)

    user_q = "What is the offer date?"  # <-- Replace with your question
    messages = [{"role": "user", "content": user_q}]

    # Retrieve the 5 most similar chunks and join their text into one context.
    docs = vectorstore.similarity_search(user_q, k=5)
    context = "\n\n".join([d.page_content for d in docs])

    answer = query_kimi(messages, context)
    messages.append({"role": "assistant", "content": answer})
    print(f"AI: {answer}\n")

Processing 1 file(s)...
Processed 1/1: Apollo-Offer_letter.pdf
Total chunks created: 9
AI: The offer date is 20-Jul-2024.



In [11]:
# Chat history so far: the user question followed by the assistant reply.
messages

[{'role': 'user', 'content': 'What is the offer date?'},
 {'role': 'assistant', 'content': 'The offer date is 20-Jul-2024.'}]

In [12]:
# The chunks returned by similarity_search for the question (k=5).
docs 

[Document(id='879f69f4-5be9-4f71-bb4a-e4336a74368a', metadata={'source': 'Apollo-Offer_letter.pdf'}, page_content="Apollo \nMr.AKASH SG \nAPL94855 \nTPHARMACY \nBasic \nComponents \nYour total emoluments will be as follows: \n1. Welcome to the family of APOLLO PHARMACIES LIMITED. With reference to your application and the subsequent interview \nyou had with us, we are pleased to appoint you as 'Assistant-iT with effect from 20-Jul-2024. \nFxed Dearness Allowance \nHouse Rent Allowance \nConveyance Allowance \nOther Allowance \nTotal (A) -Gross \nPF Employer Contribution \nESIC Employer Contribution \nGratuity \nApollo Pharmacies Limited \nStatutory Bonus \nSandya Ele Narsing Naningada See R Fanci \nc PH¡b S00 0N2 \nTelangara Iadix Tel 91402348 1000 Emal h bdapoopharaa \nCost To The Company \nAPPOINTMENT ORDER \nMonthly \n8500 \n1000 \n7600 \nS50 \n950 \n19000 \n1368 \n618 \n457 \n583 \n22026"),
 Document(id='7ca2247d-663e-4070-81d1-75d978a6f56a', metadata={'source': 'Apollo-Offer_lette

In [13]:
# The FAISS store returned by ingest_files.
vectorstore 

<langchain_community.vectorstores.faiss.FAISS at 0x2070560f770>

In [14]:
# Retrieved chunk texts joined by blank lines; this is the text passed to query_kimi.
context

'Apollo \nMr.AKASH SG \nAPL94855 \nTPHARMACY \nBasic \nComponents \nYour total emoluments will be as follows: \n1. Welcome to the family of APOLLO PHARMACIES LIMITED. With reference to your application and the subsequent interview \nyou had with us, we are pleased to appoint you as \'Assistant-iT with effect from 20-Jul-2024. \nFxed Dearness Allowance \nHouse Rent Allowance \nConveyance Allowance \nOther Allowance \nTotal (A) -Gross \nPF Employer Contribution \nESIC Employer Contribution \nGratuity \nApollo Pharmacies Limited \nStatutory Bonus \nSandya Ele Narsing Naningada See R Fanci \nc PH¡b S00 0N2 \nTelangara Iadix Tel 91402348 1000 Emal h bdapoopharaa \nCost To The Company \nAPPOINTMENT ORDER \nMonthly \n8500 \n1000 \n7600 \nS50 \n950 \n19000 \n1368 \n618 \n457 \n583 \n22026\n\nMonthly \n8500 \n1000 \n7600 \nS50 \n950 \n19000 \n1368 \n618 \n457 \n583 \n22026 \n**Statutory Bonus will be paid as per the Payment of Bonus Act 1965. \n**Gratuity will be paid as per the Payment of Grat

In [None]:
# The string returned by query_kimi for the question.
answer

'The offer date is 20-Jul-2024.'

In [16]:
def ingest_files(file_paths):
    """Extract text from the given files, chunk it, and index it in FAISS.

    PDFs are opened with PyMuPDF: each page's text layer is collected and every
    embedded image narrower than 3000 px with fewer than 5 colour components
    is OCR'd via ``ocr_bytes``. Any other file is handed to ``ocr_bytes``
    directly, i.e. it is treated as an image.

    Args:
        file_paths: Paths of the files to ingest. Paths that are not existing
            files are reported and skipped.

    Returns:
        A ``(chunks, vectorstore)`` tuple: the list of 800-character chunks
        (80 overlap) and the FAISS store built from them with the module-level
        ``encoder``.

    Raises:
        ValueError: If no file could be read, or the extracted text produced
            no chunks, because an index cannot be built from nothing.
    """
    docs = []
    total_files = len(file_paths)
    print(f"Processing {total_files} file(s)...")

    for file_idx, path in enumerate(file_paths):
        if not os.path.isfile(path):
            print(f"File not found: {path}")
            continue
        name = os.path.basename(path)
        ext = os.path.splitext(path)[1].lower()
        # Read through a context manager so the file handle is always closed.
        with open(path, "rb") as fh:
            file_bytes = fh.read()

        if ext == ".pdf":
            parts = []
            # The document is closed when the block exits, even on error.
            with fitz.open(stream=file_bytes, filetype="pdf") as pdf:
                for page in pdf:
                    parts.append(page.get_text())
                    for img in page.get_images(full=True):
                        xref = img[0]
                        try:
                            pix = fitz.Pixmap(pdf, xref)
                            # Skip very wide images and pixmaps with 5+ components.
                            if pix.width < 3000 and pix.n < 5:
                                parts.append("\n" + ocr_bytes(pix.tobytes("png")))
                        except Exception as e:
                            # Best effort: one bad image must not abort the file,
                            # but report it instead of dropping it silently.
                            print(f"Skipped image xref {xref} in {name}: {e}")
            text = "".join(parts)
        else:
            text = ocr_bytes(file_bytes)

        docs.append(Document(page_content=text, metadata={"source": name}))
        print(f"Processed {file_idx + 1}/{total_files}: {name}")

    if not docs:
        raise ValueError("No input file could be read; nothing to index.")

    chunks = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=80).split_documents(docs)
    print(f"Total chunks created: {len(chunks)}")
    if not chunks:
        raise ValueError("No text could be extracted from the input files; nothing to index.")

    vectorstore = FAISS.from_documents(chunks, encoder)
    return chunks, vectorstore

In [17]:
# Run ingestion with the ingest_files variant that returns the chunk list
# together with the FAISS store, so both can be inspected in later cells.
hardcoded_file_list = [
    r"C:\Users\APL01615\Downloads\Apollo-Offer_letter.pdf",
]
chunks, vectorstore = ingest_files(hardcoded_file_list)

Processing 1 file(s)...
Processed 1/1: Apollo-Offer_letter.pdf
Total chunks created: 9


In [18]:
# Dump every chunk's text, separated by a 40-dash rule.
divider = "-" * 40
for i, chunk in enumerate(chunks):
    body = chunk.page_content
    print(f"Chunk {i}:\n{body}\n{divider}")

Chunk 0:
Apollo 
Mr.AKASH SG 
APL94855 
TPHARMACY 
Basic 
Components 
Your total emoluments will be as follows: 
1. Welcome to the family of APOLLO PHARMACIES LIMITED. With reference to your application and the subsequent interview 
you had with us, we are pleased to appoint you as 'Assistant-iT with effect from 20-Jul-2024. 
Fxed Dearness Allowance 
House Rent Allowance 
Conveyance Allowance 
Other Allowance 
Total (A) -Gross 
PF Employer Contribution 
ESIC Employer Contribution 
Gratuity 
Apollo Pharmacies Limited 
Statutory Bonus 
Sandya Ele Narsing Naningada See R Fanci 
c PH¡b S00 0N2 
Telangara Iadix Tel 91402348 1000 Emal h bdapoopharaa 
Cost To The Company 
APPOINTMENT ORDER 
Monthly 
8500 
1000 
7600 
S50 
950 
19000 
1368 
618 
457 
583 
22026
----------------------------------------
Chunk 1:
Monthly 
8500 
1000 
7600 
S50 
950 
19000 
1368 
618 
457 
583 
22026 
**Statutory Bonus will be paid as per the Payment of Bonus Act 1965. 
**Gratuity will be paid as per the Payment o

In [19]:
# Pull every stored embedding back out of the FAISS index as one array
faiss_index = vectorstore.index
vectors = faiss_index.reconstruct_n(0, faiss_index.ntotal)
print(vectors.shape)  # (num_chunks, embedding_dim)

(9, 384)


In [30]:
# Display the array of embeddings reconstructed from the FAISS index.
vectors

array([[-0.05967538, -0.01463658,  0.01250011, ...,  0.00342519,
        -0.02861695, -0.06042347],
       [-0.02299916,  0.07121049, -0.06003917, ..., -0.05372573,
        -0.0669684 , -0.08709749],
       [-0.04377664,  0.00890749, -0.05430579, ..., -0.01191773,
        -0.00987066, -0.04182865],
       ...,
       [-0.01858059,  0.07115753, -0.04918792, ..., -0.00386528,
         0.00830338, -0.07280252],
       [-0.01583445, -0.01499954, -0.01744444, ..., -0.01365258,
        -0.00553316,  0.0088584 ],
       [-0.03201348,  0.01019677, -0.01097743, ..., -0.07771742,
        -0.05359341, -0.04757938]], shape=(9, 384), dtype=float32)

# 🧠 What’s happening:

- When printing `vectors`, NumPy shows only a truncated preview because the array holds 9 × 384 = 3,456 values, which exceeds NumPy's default print threshold of 1,000 elements; the skipped rows and columns are replaced with an ellipsis (`...`).
- When you print `vectors[0]`, you are looking at a single 384-value vector, which is below that threshold, so NumPy prints every value. It also switches to scientific notation for this output, so the same number appears in a different form (`-5.96753843e-02 = -0.0596753843`).

# ✅ TL;DR:
There’s no mismatch—it’s just:
- `vectors`: a 2D array preview
- `vectors[0]`: a 1D array with full precision and scientific notation

If you want to see all vectors in full, you can disable truncation using:

```
import numpy as np
np.set_printoptions(threshold=np.inf)
print(vectors)

```


In [50]:
# Print the first embedding without truncation. np.printoptions limits the
# "never summarise" setting to this block instead of changing it kernel-wide.
with np.printoptions(threshold=np.inf):
    print(vectors[0])


[-5.96753843e-02 -1.46365808e-02  1.25001101e-02 -8.31744447e-02
 -6.95094913e-02 -2.27528587e-02  9.57817063e-02  1.08857438e-01
 -1.16643542e-02  1.89864002e-02  5.36847860e-03 -7.81863704e-02
 -4.01677303e-02 -1.09478245e-02 -5.55164553e-02  1.15711764e-02
  6.93162978e-02 -1.11063421e-01  2.88079511e-02 -8.25239345e-02
  2.09994819e-02  5.53049706e-02  1.69067774e-02  1.59063917e-02
 -1.68571575e-03  4.85432111e-02 -4.96478863e-02  3.07872798e-02
  2.00468842e-02 -5.33335544e-02  1.82922594e-02 -8.07864219e-03
  8.42312630e-03 -4.08536531e-02  2.12059133e-02  1.89320631e-02
 -5.86566329e-02  1.40019665e-02 -2.32889690e-02 -7.22377468e-03
 -1.70342661e-02 -4.69816290e-02 -1.24275096e-01 -2.66569536e-02
 -2.39547491e-02 -4.75363880e-02  4.63521853e-03 -2.22894456e-03
  2.95839701e-02  1.01663612e-01  5.90783022e-02 -2.77847927e-02
 -6.23793639e-02 -8.74352921e-03 -7.32409805e-02 -1.22984564e-02
 -5.51576689e-02 -2.75444780e-02 -7.10233115e-03  1.54501060e-02
 -1.08891673e-01  4.92868

In [48]:
# First row of `vectors`, i.e. the first reconstructed embedding.
vectors[0]

array([-5.96753843e-02, -1.46365808e-02,  1.25001101e-02, -8.31744447e-02,
       -6.95094913e-02, -2.27528587e-02,  9.57817063e-02,  1.08857438e-01,
       -1.16643542e-02,  1.89864002e-02,  5.36847860e-03, -7.81863704e-02,
       -4.01677303e-02, -1.09478245e-02, -5.55164553e-02,  1.15711764e-02,
        6.93162978e-02, -1.11063421e-01,  2.88079511e-02, -8.25239345e-02,
        2.09994819e-02,  5.53049706e-02,  1.69067774e-02,  1.59063917e-02,
       -1.68571575e-03,  4.85432111e-02, -4.96478863e-02,  3.07872798e-02,
        2.00468842e-02, -5.33335544e-02,  1.82922594e-02, -8.07864219e-03,
        8.42312630e-03, -4.08536531e-02,  2.12059133e-02,  1.89320631e-02,
       -5.86566329e-02,  1.40019665e-02, -2.32889690e-02, -7.22377468e-03,
       -1.70342661e-02, -4.69816290e-02, -1.24275096e-01, -2.66569536e-02,
       -2.39547491e-02, -4.75363880e-02,  4.63521853e-03, -2.22894456e-03,
        2.95839701e-02,  1.01663612e-01,  5.90783022e-02, -2.77847927e-02,
       -6.23793639e-02, -

In [36]:
# Shape of a single embedding vector.
vectors[0].shape

(384,)

In [28]:
# Number of values in a single embedding vector.
vectors[0].size

384

In [27]:
# Report how many values the first embedding vector holds.
print(f"Number of values in the array: {vectors[0].size}")

Number of values in the array: 384


In [22]:
# Preview the first 10 dimensions of up to three stored embeddings.
# (numpy is imported once in the imports cell at the top of the notebook.)
preview_count = min(3, vectorstore.index.ntotal)
for i in range(preview_count):
    vec = np.array(vectorstore.index.reconstruct(i))
    print(f"Vector {i} (first 10 dims): {vec[:10]}")

Vector 0 (first 10 dims): [-0.05967538 -0.01463658  0.01250011 -0.08317444 -0.06950949 -0.02275286
  0.09578171  0.10885744 -0.01166435  0.0189864 ]
Vector 1 (first 10 dims): [-0.02299916  0.07121049 -0.06003917  0.00018828 -0.00394256  0.10561497
  0.03507214  0.02356591 -0.0598201  -0.01997691]
Vector 2 (first 10 dims): [-0.04377664  0.00890749 -0.05430579 -0.04004178 -0.0003233   0.04786632
  0.00635159 -0.00507341 -0.07039154 -0.10327   ]


In [23]:
# Show all chunk texts
for i, chunk in enumerate(chunks):
    print(f"Chunk {i}:\n{chunk.page_content}\n{'-'*40}")

# Show the first vector
# (numpy is imported once in the imports cell at the top of the notebook.)
vec = np.array(vectorstore.index.reconstruct(0))
print(f"First vector shape: {vec.shape}")
print(f"First 10 dimensions: {vec[:10]}")

Chunk 0:
Apollo 
Mr.AKASH SG 
APL94855 
TPHARMACY 
Basic 
Components 
Your total emoluments will be as follows: 
1. Welcome to the family of APOLLO PHARMACIES LIMITED. With reference to your application and the subsequent interview 
you had with us, we are pleased to appoint you as 'Assistant-iT with effect from 20-Jul-2024. 
Fxed Dearness Allowance 
House Rent Allowance 
Conveyance Allowance 
Other Allowance 
Total (A) -Gross 
PF Employer Contribution 
ESIC Employer Contribution 
Gratuity 
Apollo Pharmacies Limited 
Statutory Bonus 
Sandya Ele Narsing Naningada See R Fanci 
c PH¡b S00 0N2 
Telangara Iadix Tel 91402348 1000 Emal h bdapoopharaa 
Cost To The Company 
APPOINTMENT ORDER 
Monthly 
8500 
1000 
7600 
S50 
950 
19000 
1368 
618 
457 
583 
22026
----------------------------------------
Chunk 1:
Monthly 
8500 
1000 
7600 
S50 
950 
19000 
1368 
618 
457 
583 
22026 
**Statutory Bonus will be paid as per the Payment of Bonus Act 1965. 
**Gratuity will be paid as per the Payment o

In [51]:
# Retrieve the 5 chunks closest to the question and join their text; this
# plain-text context (not the vectors) is what query_kimi sends to the model.
docs = vectorstore.similarity_search(user_q, k=5)
context = "\n\n".join(d.page_content for d in docs)
# Ask with a fresh single-turn history: `messages` already ends with the
# assistant's earlier reply, so re-sending it would pose no new user question.
answer = query_kimi([{"role": "user", "content": user_q}], context)

In [54]:
# The chunks returned by the similarity search for the question.
docs

[Document(id='69624fea-b4a5-4790-b51f-b2c1e13d92a2', metadata={'source': 'Apollo-Offer_letter.pdf'}, page_content="Apollo \nMr.AKASH SG \nAPL94855 \nTPHARMACY \nBasic \nComponents \nYour total emoluments will be as follows: \n1. Welcome to the family of APOLLO PHARMACIES LIMITED. With reference to your application and the subsequent interview \nyou had with us, we are pleased to appoint you as 'Assistant-iT with effect from 20-Jul-2024. \nFxed Dearness Allowance \nHouse Rent Allowance \nConveyance Allowance \nOther Allowance \nTotal (A) -Gross \nPF Employer Contribution \nESIC Employer Contribution \nGratuity \nApollo Pharmacies Limited \nStatutory Bonus \nSandya Ele Narsing Naningada See R Fanci \nc PH¡b S00 0N2 \nTelangara Iadix Tel 91402348 1000 Emal h bdapoopharaa \nCost To The Company \nAPPOINTMENT ORDER \nMonthly \n8500 \n1000 \n7600 \nS50 \n950 \n19000 \n1368 \n618 \n457 \n583 \n22026"),
 Document(id='71afb3fc-0d38-41ba-a39c-29d20ac62871', metadata={'source': 'Apollo-Offer_lette

In [52]:
# The retrieved chunk texts joined by blank lines.
context

'Apollo \nMr.AKASH SG \nAPL94855 \nTPHARMACY \nBasic \nComponents \nYour total emoluments will be as follows: \n1. Welcome to the family of APOLLO PHARMACIES LIMITED. With reference to your application and the subsequent interview \nyou had with us, we are pleased to appoint you as \'Assistant-iT with effect from 20-Jul-2024. \nFxed Dearness Allowance \nHouse Rent Allowance \nConveyance Allowance \nOther Allowance \nTotal (A) -Gross \nPF Employer Contribution \nESIC Employer Contribution \nGratuity \nApollo Pharmacies Limited \nStatutory Bonus \nSandya Ele Narsing Naningada See R Fanci \nc PH¡b S00 0N2 \nTelangara Iadix Tel 91402348 1000 Emal h bdapoopharaa \nCost To The Company \nAPPOINTMENT ORDER \nMonthly \n8500 \n1000 \n7600 \nS50 \n950 \n19000 \n1368 \n618 \n457 \n583 \n22026\n\nMonthly \n8500 \n1000 \n7600 \nS50 \n950 \n19000 \n1368 \n618 \n457 \n583 \n22026 \n**Statutory Bonus will be paid as per the Payment of Bonus Act 1965. \n**Gratuity will be paid as per the Payment of Grat

- `context` is a string of the most relevant chunk texts.
- `context` is what the LLM sees.

In [53]:
# The string returned by query_kimi for the question.
answer 

'The offer date is 20-Jul-2024.'

# Which type of data is sent to the LLM: chunk data or vector data?

In a Retrieval-Augmented Generation (RAG) pipeline like yours:

- **The vector data (embeddings)** is used only for searching and retrieving relevant chunks.
- **The chunk data (text)** is what is actually sent to the LLM (language model) as context.

# How it works in your code:

1. User asks a question.
2. The question is embedded (converted to a vector).
3. The vectorstore (FAISS) is used to find the most similar document chunks (using vector similarity).
4. The text of those chunks is combined into a context string.
5. This context string (plain text) is sent to the LLM (e.g., Kimi-K2-Instruct) along with the user’s question.

# In summary:

- **Vector data:** Used for retrieval/search only (never sent to the LLM).
- **Chunk data (text):** Sent to the LLM as context for answering the question.