In [19]:
from pymilvus import MilvusClient, connections, FieldSchema, CollectionSchema, DataType, Collection, utility
from sentence_transformers import SentenceTransformer
import fitz  # PyMuPDF
import os
from collections import defaultdict

# Address of the Milvus server used by every cell in this notebook.
MILVUS_URI = "http://localhost:19530"

# The ORM-style helpers used in later cells (`utility`, `Collection`) resolve
# their connection by alias, so register the "default" alias explicitly here
# instead of relying on a connection left over from an earlier kernel session.
connections.connect(alias="default", uri=MILVUS_URI)

milvus_client = MilvusClient(uri=MILVUS_URI)


In [20]:
# collection_name="files"
# utility.drop_collection(collection_name)

In [21]:
# `utility` is already imported in the first cell; the import is repeated here
# only so this cell can be run on its own.
from pymilvus import utility

collection_name = "files"
dim = 384  # width of the "embedding" vector field

# One row per stored item: auto-generated integer key, a name, and its vector.
id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True)
name_field = FieldSchema(name="doc_name", dtype=DataType.VARCHAR, max_length=200)
vector_field = FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim)
fields = [id_field, name_field, vector_field]

# Start from a clean slate: an existing collection with this name is dropped
# (together with its data) before being recreated below.
if not utility.has_collection(collection_name):
    print(f"Collection '{collection_name}' does not exist.")
else:
    print(f"Collection '{collection_name}' exists. Dropping it...")
    utility.drop_collection(collection_name)

schema = CollectionSchema(fields, description="Whole-PDF embeddings")
collection = Collection(collection_name, schema)

# IVF_FLAT index over the vectors, scored by inner product ("IP"); the query
# cell searches with the same metric type.
index_params = {
    "index_type": "IVF_FLAT",
    "metric_type": "IP",
    "params": {"nlist": 128},
}
collection.create_index(field_name="embedding", index_params=index_params)


Collection 'files' exists. Dropping it...


Status(code=0, message=)

In [22]:
# Sentence-embedding model shared by the indexing and query cells. Every vector
# it produces is inserted into / searched against the collection's "embedding"
# field, so its output size has to equal that field's `dim`.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def pdf_to_text(path):
    """Return the text of every page of a PDF as a single string.

    Args:
        path: Location of the PDF file to open.

    Returns:
        The plain text of all pages, joined with a single space, with leading
        and trailing whitespace removed.
    """
    # The context manager closes the document even if text extraction raises,
    # so file handles are not leaked when this is called in a loop.
    with fitz.open(path) as doc:
        text = " ".join(page.get_text("text") for page in doc)
    return text.strip()


dataset_folder = "/home/natcha/rag_document/dataset"

# Sorted so pages are inserted in a reproducible order (os.listdir order is
# arbitrary); the extension check is case-insensitive so "*.PDF" is included.
pdf_files = sorted(f for f in os.listdir(dataset_folder) if f.lower().endswith(".pdf"))

for pdf_file in pdf_files:
    pdf_path = os.path.join(dataset_folder, pdf_file)

    # Collect the text of every non-empty page first; the context manager
    # closes the PDF as soon as its text has been read.
    doc_names = []
    page_texts = []
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            page_text = page.get_text("text").strip()
            if not page_text:
                continue  # nothing to embed on this page

            # The "_page_<index>" suffix records which page a vector came from.
            doc_names.append(f"{pdf_file}_page_{i}")
            page_texts.append(page_text)

    if not page_texts:
        continue  # PDF without any extractable text

    # Encode and insert one batch per PDF instead of one call per page.
    # Column order follows the schema's non-auto fields: doc_name, embedding.
    embeddings = model.encode(page_texts).tolist()
    collection.insert([doc_names, embeddings])

collection.load()
print("✅ All PDFs stored in Milvus!")


✅ All PDFs stored in Milvus!


In [18]:
def query_with_pdf(pdf_path, top_k=5):
    """Search the collection once per non-empty page of a query PDF.

    Each page's text is embedded with the notebook-level `model` and used as a
    query vector against the "embedding" field of the notebook-level
    `collection`.

    Args:
        pdf_path: Location of the PDF whose pages are used as queries.
        top_k: Maximum number of hits requested per page.

    Returns:
        A list of ``(page_num, results)`` tuples, one per page that contained
        text, where ``page_num`` is the zero-based page index and ``results``
        is the object returned by ``collection.search`` for that page (hits
        carry the "doc_name" output field).
    """
    # Same for every page, so built once; "IP" matches the metric the index
    # was created with.
    search_params = {"metric_type": "IP", "params": {"nprobe": 10}}
    results_all = []

    # The context manager closes the query document even if encoding or the
    # search raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            page_text = page.get_text("text").strip()
            if not page_text:
                continue  # pages without extractable text are not queried

            query_vec = model.encode([page_text])[0].tolist()
            results = collection.search(
                [query_vec],
                "embedding",
                param=search_params,
                limit=top_k,
                output_fields=["doc_name"],
            )
            results_all.append((page_num, results))

    return results_all

# PDF whose pages are compared against the indexed dataset. The recorded run of
# this cell stopped with FileNotFoundError for this path, so check that the
# file exists before executing; the cells below read `results_all`.
query_pdf_filepath = "/home/natcha/rag_document/1-s2.0-S0957417425026181-main.pdf"
results_all = query_with_pdf(query_pdf_filepath, top_k=5)


FileNotFoundError: no such file: '/home/natcha/rag_document/1-s2.0-S0957417425026181-main.pdf'

In [None]:
# Keep only the first hit of each query page and report the mean of those
# scores, expressed as percentages.
first_similarities = []

for page_num, hits in results_all:
    page_hits = hits[0]
    if not page_hits:
        continue  # this page returned no matches

    first_hit = page_hits[0]
    similarity_percentage = first_hit.score * 100
    first_similarities.append(
        {
            "Query Page": page_num,
            "Matched PDF": first_hit.entity.get('doc_name'),
            "Similarity (%)": f"{similarity_percentage:.2f}",
        }
    )

# The average is taken over the two-decimal strings stored above; 0 is reported
# when no page produced a hit.
if first_similarities:
    overall_avg = sum(float(row["Similarity (%)"]) for row in first_similarities) / len(first_similarities)
else:
    overall_avg = 0

print(f"\nOverall Average Similarity\n{overall_avg:.2f}%")


Overall Average Similarity
53.66%


In [None]:
# List every hit returned for every query page, with its score as a percentage.
for page_num, hits in results_all:
    header = f"--- Query Page {page_num} ---"
    print(header)

    for hit in hits[0]:
        similarity_percentage = hit.score * 100
        matched_name = hit.entity.get('doc_name')
        print(f"Matched PDF: {matched_name} | Score: {similarity_percentage:.2f}%")


--- Query Page 0 ---
Matched PDF: 1-s2.0-S2949719125000275-main.pdf_page_0 | Score: 57.86%
Matched PDF: 1-s2.0-S2949719125000275-main.pdf_page_9 | Score: 51.06%
Matched PDF: 1-s2.0-S2949719125000275-main.pdf_page_2 | Score: 50.42%
Matched PDF: 1-s2.0-S0749596X25000695-main.pdf_page_17 | Score: 49.57%
Matched PDF: 1-s2.0-S2949719125000275-main.pdf_page_1 | Score: 48.21%
--- Query Page 1 ---
Matched PDF: 1-s2.0-S2949719125000275-main.pdf_page_3 | Score: 53.91%
Matched PDF: 1-s2.0-S0957417425022948-main.pdf_page_12 | Score: 53.27%
Matched PDF: 1-s2.0-S2949719125000275-main.pdf_page_0 | Score: 49.08%
Matched PDF: 1-s2.0-S074756322500216X-main.pdf_page_1 | Score: 48.92%
Matched PDF: 1-s2.0-S2772485925000481-main.pdf_page_1 | Score: 47.65%
--- Query Page 2 ---
Matched PDF: 1-s2.0-S2949719125000275-main.pdf_page_3 | Score: 54.33%
Matched PDF: 1-s2.0-S0957417425022948-main.pdf_page_12 | Score: 53.25%
Matched PDF: 1-s2.0-S2949719125000275-main.pdf_page_1 | Score: 50.36%
Matched PDF: 1-s2.0-S074

In [None]:
import ollama
import fitz  # PyMuPDF

def extract_page_text(pdf_path, page_number):
    """Return the stripped text of one page of a PDF.

    Args:
        pdf_path: Location of the PDF file to open.
        page_number: Index of the page to read.

    Returns:
        The page's plain text with surrounding whitespace removed, or an empty
        string when ``page_number`` is not below the document's page count.
    """
    # The context manager closes the document on every path, including when
    # page lookup or text extraction raises.
    with fitz.open(pdf_path) as doc:
        if page_number < len(doc):
            text = doc[page_number].get_text("text")
        else:
            text = ""
    return text.strip()

# Texts of the two pages being compared: page index 1 of the query PDF and
# page index 3 of a PDF from the dataset folder.
query_pdf = extract_page_text("/home/natcha/rag_document/1-s2.0-S0957417425026181-main.pdf", 1)
matched_pdf = extract_page_text("/home/natcha/rag_document/dataset/1-s2.0-S2949719125000275-main.pdf", 3)

# Score quoted in the prompt; it is typed in by hand here rather than read
# from `results_all`.
similarity_percentage = 53.91

# The prompt is assembled up front so it can be inspected before it is sent.
comparison_prompt = f"""
You are analyzing two pieces of text from different PDF pages.
Compare the following two PDF page texts and justify why their similarity score is {similarity_percentage}%.  

Here is the query page text:
---
{query_pdf}
---

Here is the matched database page text:
---
{matched_pdf}
---

Tasks:
1. Highlight sentences or paragraphs from the query page that have similar information or main idea.  
2. Show the corresponding sentences or paragraphs from the matched PDF.  
3. Explain briefly why these parts are considered similar in clear, natural language. 
"""

# Send the prompt to the qwen3:0.6b-q4_K_M model through Ollama.
response = ollama.generate(model="qwen3:0.6b-q4_K_M", prompt=comparison_prompt)

print(response["response"])


<think>
Okay, let's tackle this. The user wants me to highlight and explain similarities between the query and matched PDF texts. First, task 1 is to identify key sentences or paragraphs that have similar ideas. Then, show them from both texts and explain why they're similar.

Starting with the query page. I need to look for the main ideas. The first paragraph talks about LLM-generated text issues like bias, information accuracy, and potential problems. The matched PDF has similar content about the same topics. So that's a good start.

Next, the second paragraph in the query mentions paraphrasing attacks. The matched PDF also has similar content on paraphrasing attacks, which are vulnerabilities in current detectors. Then, the third paragraph in the query discusses detection methods like zero-shot and watermarking, which are similar to the matched PDF's approaches.

Now, the matched PDF has specific details: for example, in the zero-shot detectors, they use RoBERTa and GROVER, which ar

In [None]:
import re

# Raw text generated by the model (may start with a <think>...</think> block).
response_text = response["response"]

# Capture everything that follows the first closing </think> tag.
match = re.search(r"</think>\s*(.*)", response_text, re.DOTALL)

if match:
    extracted_text = match.group(1).strip()
elif "<think>" in response_text:
    # A reasoning block was opened but never closed, so there is no answer
    # part to show.
    extracted_text = ""
else:
    # No reasoning block at all: the whole response is the answer. Previously
    # this case was reported as empty text.
    extracted_text = response_text.strip()

print("Extracted text:\n")
print(extracted_text)

Extracted text:

**1. Highlighted Sentences/Paragraphs:**  
- **Query Page:**  
  - "LLM-generated text issues like bias, information accuracy, and potential problems with automatic text generation" (first paragraph).  
  - "Paraphrasing attacks in LLM-generated text detection" (second paragraph).  
  - "Detection methods like zero-shot and watermarking" (third paragraph).  

- **Matched PDF:**  
  - "LLM-generated text issues such as bias and information accuracy" (second paragraph).  
  - "Paraphrasing attacks in LLM-generated text detection" (third paragraph).  
  - "Detection methods including zero-shot and watermarking" (third paragraph).  

**2. Explanation:**  
- The query’s first paragraph highlights the same core problems as the matched PDF’s second paragraph, focusing on biases, accuracy, and detection vulnerabilities.  
- The matched PDF’s second paragraph directly repeats the issue with paraphrasing attacks, which aligns with the query’s third paragraph on detection vulnera