In [1]:
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
from sentence_transformers import SentenceTransformer
import fitz  # PyMuPDF
import os
from collections import defaultdict

# Register a pymilvus connection under the alias "default", pointing at a
# Milvus server on localhost:19530.
# NOTE(review): the Collection/utility calls in later cells presumably rely on
# this "default" alias — confirm before changing the alias name.
connections.connect("default", host="localhost", port="19530")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# collection_name="files"
# utility.drop_collection(collection_name)

In [3]:
# Length of the vectors stored in the "embedding" field.
embedding_dim = 384

# Columns: auto-generated integer primary key, a label string, and the vector.
fields = [
    FieldSchema("id", DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema("doc_name", DataType.VARCHAR, max_length=200),
    FieldSchema("embedding", DataType.FLOAT_VECTOR, dim=embedding_dim),
]

collection_name = "files"

# Always rebuild from scratch: an existing collection with this name is dropped.
if not utility.has_collection(collection_name):
    print(f"Collection '{collection_name}' does not exist.")
else:
    print(f"Collection '{collection_name}' exists. Dropping it...")
    utility.drop_collection(collection_name)

schema = CollectionSchema(fields=fields, description="Whole-PDF embeddings")
collection = Collection(name=collection_name, schema=schema)

# IVF_FLAT index over the vector field, scored by inner product.
index_params = dict(index_type="IVF_FLAT", metric_type="IP", params={"nlist": 128})
collection.create_index("embedding", index_params=index_params)


Collection 'files' exists. Dropping it...


Status(code=0, message=)

In [4]:
# Sentence-embedding model shared by the ingestion and query cells (both call
# `model.encode`). Its output vectors are inserted into the "embedding" field,
# so their length has to match the `embedding_dim` declared for the schema.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def pdf_to_text(path):
    """Return the text of every page of a PDF joined into one string.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The plain text of all pages, separated by single spaces, with
        leading and trailing whitespace removed.
    """
    # The context manager closes the document even if text extraction fails;
    # previously the handle was never closed.
    with fitz.open(path) as doc:
        text = " ".join(page.get_text("text") for page in doc)
    return text.strip()


# Ingest every PDF in `dataset_folder`: one Milvus row per non-empty page,
# labelled "<file name>_page_<zero-based page index>".
# NOTE(review): absolute, machine-specific path — consider making it configurable.
dataset_folder = "/home/natcha/rag_document/dataset"

# os.listdir returns entries in arbitrary order, so sort for a reproducible
# ingestion order; match the extension case-insensitively so "X.PDF" is not
# silently skipped.
pdf_files = sorted(f for f in os.listdir(dataset_folder) if f.lower().endswith(".pdf"))

for pdf_file in pdf_files:
    pdf_path = os.path.join(dataset_folder, pdf_file)

    # Gather all non-empty pages first so the document handle is released
    # promptly (it was previously never closed) and the whole PDF can be
    # embedded and inserted as a single batch instead of one call per page.
    doc_names, page_texts = [], []
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            page_text = page.get_text("text").strip()
            if not page_text:
                continue
            doc_names.append(f"{pdf_file}_page_{i}")
            page_texts.append(page_text)

    if not page_texts:
        # Nothing extractable in this PDF (e.g. every page was empty).
        continue

    # NOTE(review): the embedding model may truncate long page text to its
    # maximum sequence length — confirm whole pages are actually represented.
    embeddings = model.encode(page_texts).tolist()

    # Column-ordered insert matching the non-auto schema fields:
    # doc_name values first, then the embedding vectors.
    collection.insert([doc_names, embeddings])

# Flush pending inserts before loading the collection for search.
collection.flush()
collection.load()
print("✅ All PDFs stored in Milvus!")


✅ All PDFs stored in Milvus!


In [5]:
def query_with_pdf(pdf_path, top_k=5):
    """Search the Milvus collection once per non-empty page of a query PDF.

    Each page's text is embedded with the module-level `model` and used as a
    single query vector against the "embedding" field of the module-level
    `collection`.

    Args:
        pdf_path: Filesystem path of the PDF whose pages are used as queries.
        top_k: Maximum number of hits requested per page.

    Returns:
        A list of ``(page_num, results)`` tuples, one per non-empty page, where
        ``page_num`` is the zero-based page index and ``results`` is the value
        returned by ``collection.search`` (the page's hits are ``results[0]``).
    """
    results_all = []

    # The context manager guarantees the document is closed, including when
    # encoding or the search call raises; previously it was never closed.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            page_text = page.get_text("text").strip()
            if not page_text:
                # Pages without extractable text cannot be embedded meaningfully.
                continue

            query_vec = model.encode([page_text])[0].tolist()
            search_params = {"metric_type": "IP", "params": {"nprobe": 10}}
            results = collection.search(
                [query_vec],
                "embedding",
                param=search_params,
                limit=top_k,
                output_fields=["doc_name"],
            )
            results_all.append((page_num, results))

    # The per-PDF score lists and top-1 list that used to be accumulated here
    # were never returned or read, so they have been removed.
    return results_all

# PDF whose pages are used as queries against the indexed collection.
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
query_pdf_filepath = "/home/natcha/rag_document/1-s2.0-S0957417425026181-main.pdf"
# One (page_num, search results) tuple per non-empty query page, up to 5 hits each.
results_all = query_with_pdf(query_pdf_filepath, top_k=5)


In [6]:
# Summarise the first returned hit of every query page and report the mean
# of those top-1 similarities as a percentage.
first_similarities = []
top1_percentages = []  # unrounded values, used only for the average

for page_num, hits in results_all:
    if not hits[0]:
        # No hit came back for this page; leave it out of the summary.
        continue

    first_hit = hits[0][0]
    similarity_percentage = first_hit.score * 100
    top1_percentages.append(similarity_percentage)
    first_similarities.append({
        "Query Page": page_num,
        "Matched PDF": first_hit.entity.get('doc_name'),
        "Similarity (%)": f"{similarity_percentage:.2f}"
    })

# Average the raw scores rather than the 2-decimal display strings, so the
# result is not distorted by rounding each value before summing.
overall_avg = sum(top1_percentages) / len(top1_percentages) if top1_percentages else 0
print(f"\nOverall Average Similarity\n{overall_avg:.2f}%")


Overall Average Similarity
53.66%


In [7]:
# Page-level report: a header per query page, then one line per returned hit.
for page_num, hits in results_all:
    print(f"--- Query Page {page_num} ---")
    page_hits = hits[0]  # each search was issued with a single query vector
    for hit in page_hits:
        similarity_percentage = 100 * hit.score
        print(f"Matched PDF: {hit.entity.get('doc_name')} | Score: {similarity_percentage:.2f}%")


--- Query Page 0 ---
Matched PDF: 1-s2.0-S2949719125000275-main.pdf_page_0 | Score: 57.86%
Matched PDF: 1-s2.0-S2949719125000275-main.pdf_page_9 | Score: 51.06%
Matched PDF: 1-s2.0-S2949719125000275-main.pdf_page_2 | Score: 50.42%
Matched PDF: 1-s2.0-S0749596X25000695-main.pdf_page_17 | Score: 49.57%
Matched PDF: 1-s2.0-S2949719125000275-main.pdf_page_1 | Score: 48.21%
--- Query Page 1 ---
Matched PDF: 1-s2.0-S2949719125000275-main.pdf_page_3 | Score: 53.91%
Matched PDF: 1-s2.0-S0957417425022948-main.pdf_page_12 | Score: 53.27%
Matched PDF: 1-s2.0-S2949719125000275-main.pdf_page_0 | Score: 49.08%
Matched PDF: 1-s2.0-S074756322500216X-main.pdf_page_1 | Score: 48.92%
Matched PDF: 1-s2.0-S2772485925000481-main.pdf_page_1 | Score: 47.65%
--- Query Page 2 ---
Matched PDF: 1-s2.0-S2949719125000275-main.pdf_page_3 | Score: 54.33%
Matched PDF: 1-s2.0-S0957417425022948-main.pdf_page_12 | Score: 53.25%
Matched PDF: 1-s2.0-S2949719125000275-main.pdf_page_1 | Score: 50.36%
Matched PDF: 1-s2.0-S074

In [11]:
import fitz  # PyMuPDF
from collections import defaultdict
import ollama
import re

# ---- PDF Helper to extract text ----
def extract_page_text(pdf_path, page_number):
    """Return the stripped plain text of one page of a PDF.

    Args:
        pdf_path: Filesystem path of the PDF to open.
        page_number: Zero-based index of the page to read.

    Returns:
        The page text with surrounding whitespace removed, or ``""`` when
        ``page_number`` is negative or past the last page.
    """
    # The context manager closes the document on every path, including when
    # get_text raises; the previous explicit close() was skipped in that case.
    with fitz.open(pdf_path) as doc:
        if not 0 <= page_number < len(doc):
            return ""
        return doc[page_number].get_text("text").strip()

# ---- Milvus Search Function ----
def query_with_pdf(pdf_path, top_k=5):
    """Search the Milvus collection once per non-empty page of a query PDF.

    Each page's text is embedded with the module-level `model` and used as a
    single query vector against the "embedding" field of the module-level
    `collection`.

    Args:
        pdf_path: Filesystem path of the PDF whose pages are used as queries.
        top_k: Maximum number of hits requested per page.

    Returns:
        A list of ``(page_num, results)`` tuples, one per non-empty page, where
        ``page_num`` is the zero-based page index and ``results`` is the value
        returned by ``collection.search`` (the page's hits are ``results[0]``).
    """
    results_all = []

    # The context manager closes the document on every path; the previous
    # trailing close() was skipped whenever encoding or the search raised.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            page_text = page.get_text("text").strip()
            if not page_text:
                # Pages without extractable text cannot be embedded meaningfully.
                continue

            # Encode page text
            query_vec = model.encode([page_text])[0].tolist()
            search_params = {"metric_type": "IP", "params": {"nprobe": 10}}
            results = collection.search(
                [query_vec],
                "embedding",
                param=search_params,
                limit=top_k,
                output_fields=["doc_name"],
            )
            results_all.append((page_num, results))

    # The per-PDF similarity lists that used to be accumulated here were never
    # returned or read, so that dead bookkeeping has been removed.
    return results_all

# ---- LLM Comparison Function ----
def _strip_thinking(response_text):
    """Return the LLM answer without a leading ``...</think>`` reasoning section."""
    match = re.search(r"</think>\s*(.*)", response_text, re.DOTALL)
    # A response with no closing think tag is treated as the answer itself,
    # instead of being replaced by an empty string.
    return (match.group(1) if match else response_text).strip()


def _build_comparison_prompt(query_pdf_path, page_num, query_text,
                             match_pdf_file, match_page, matched_text,
                             similarity_percentage):
    """Build the prompt asking the LLM to justify one page-to-page similarity score."""
    # `match_pdf_file` already carries its ".pdf" extension, so none is appended.
    return f"""
You are analyzing two pieces of text from different PDF pages.
Compare the following two PDF page texts and justify why their similarity score is {similarity_percentage:.2f}%:

Query file: {query_pdf_path}, Page {page_num}  
Matched file: {match_pdf_file}, Page {match_page}  

---
Query Page Text:
{query_text}

---
Matched Page Text:
{matched_text}

Tasks:
1. Highlight sentences or paragraphs from the query page that have similar information or main idea.  
2. Show the corresponding sentences or paragraphs from the matched PDF.  
3. Explain briefly why these parts are considered similar in clear, natural language.
"""


def compare_with_llm(query_pdf_path, results_all,
                     dataset_folder="/home/natcha/rag_document/dataset",
                     llm_model="qwen3:0.6b-q4_K_M"):
    """Ask an LLM to explain every page-level match and print its answers.

    For each query page in `results_all`, the page text is re-extracted from
    `query_pdf_path`; for each hit, the matched page text is extracted from the
    corresponding file in `dataset_folder`, and both are sent to the LLM along
    with the similarity score.

    Args:
        query_pdf_path: Path of the PDF that produced `results_all`.
        results_all: List of ``(page_num, results)`` tuples as returned by
            ``query_with_pdf``; the hits of a page are read from ``results[0]``.
        dataset_folder: Directory containing the indexed PDFs. Defaults to the
            path that was previously hard-coded.
        llm_model: Ollama model name. Defaults to the model that was
            previously hard-coded.

    Returns:
        None. Output is printed.
    """
    for page_num, hits in results_all:
        query_text = extract_page_text(query_pdf_path, page_num)
        if not query_text:
            continue

        print(f"\n=== Query Page {page_num} ===")

        for hit in hits[0]:
            match_name = hit.entity.get("doc_name")
            similarity_percentage = hit.score * 100

            # Stored names look like "<file name>_page_<index>". Split on the
            # LAST "_page_" so file names that contain that substring still parse.
            match_pdf_file, match_page_str = match_name.rsplit("_page_", 1)
            match_page = int(match_page_str)

            matched_pdf_path = os.path.join(dataset_folder, match_pdf_file)
            matched_text = extract_page_text(matched_pdf_path, match_page)

            # Skip if empty
            if not matched_text:
                continue

            response = ollama.generate(
                model=llm_model,
                prompt=_build_comparison_prompt(
                    query_pdf_path, page_num, query_text,
                    match_pdf_file, match_page, matched_text,
                    similarity_percentage,
                ),
            )

            print(f"\nMatched PDF: {match_name} | Score: {similarity_percentage:.2f}%")
            print(_strip_thinking(response["response"]))

# ---- Run Everything ----
# PDF whose pages are searched against the collection and then explained.
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
query_pdf_filepath = "/home/natcha/rag_document/1-s2.0-S0957417425026181-main.pdf"

# Step 1: Search — one (page_num, results) tuple per non-empty query page, up to 5 hits each
results_all = query_with_pdf(query_pdf_filepath, top_k=5)

# Step 2: Compare with LLM — prints an explanation for every hit; returns nothing
compare_with_llm(query_pdf_filepath, results_all)



=== Query Page 0 ===

Matched PDF: 1-s2.0-S2949719125000275-main.pdf_page_0 | Score: 57.86%
The similarity score of 57.86% between the query and matched pages is due to their shared focus on methodologies for distinguishing between LLM-generated and human-written texts. Here's the breakdown:

**Similarity Highlights:**
1. **Dataset Focus:**  
   Both texts discuss the same dataset: human-written term summaries, LLM-generated texts, and paraphrased content. The query emphasizes the use of this dataset for stylometry-based detection, while the matched text highlights its role in LLM-generated text detection.

2. **Methodology and Techniques:**  
   The query integrates machine learning features (e.g., StyloMetrix and n-gram-based features) to classify text types, mirroring the matched text's approach to identify stylistic patterns. Both papers address how LLMs distinguish between models, emphasizing the role of model attribution and ethical AI use.

**Justification:**  
The similarity s

Extracted text:

**1. Highlighted Sentences/Paragraphs:**  
- **Query Page:**  
  - "LLM-generated text issues like bias, information accuracy, and potential problems with automatic text generation" (first paragraph).  
  - "Paraphrasing attacks in LLM-generated text detection" (second paragraph).  
  - "Detection methods like zero-shot and watermarking" (third paragraph).  

- **Matched PDF:**  
  - "LLM-generated text issues such as bias and information accuracy" (second paragraph).  
  - "Paraphrasing attacks in LLM-generated text detection" (third paragraph).  
  - "Detection methods including zero-shot and watermarking" (third paragraph).  

**2. Explanation:**  
- The query’s first paragraph highlights the same core problems as the matched PDF’s second paragraph, focusing on biases, accuracy, and detection vulnerabilities.  
- The matched PDF’s second paragraph directly repeats the issue with paraphrasing attacks, which aligns with the query’s third paragraph on detection vulnera