# Milestone 3
## Ingestion of raw data and storage into a repository
## Data preprocessing and feature engineering
## Data validation/verification
###  Connect to Qdrant Cluster

In [1]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.core.node_parser import SentenceSplitter
from qdrant_client import QdrantClient
from qdrant_client.http import models
import re
import json
import pandas as pd
from collections import Counter


# 🔹 Qdrant connection details (modify if needed)
QDRANT_HOST = "https://40003a30-70d7-4886-9de5-45e25681c36e.europe-west3-0.gcp.cloud.qdrant.io"  # Example: "localhost" or Qdrant cloud URL
COLLECTION_NAME = "AUIChatVectoreCol"

# 🔹 Connect to Qdrant
from qdrant_client import QdrantClient

qdrant_client = QdrantClient(
    host="40003a30-70d7-4886-9de5-45e25681c36e.europe-west3-0.gcp.cloud.qdrant.io",
    api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.uea3Q5G9lcLfqCwxzTpRKWcMh5XM0pvPB2RaeOaDPxM",
    https=True
)
# Check if collection exists, if not create it
"""try:
    collection_info = qdrant_client.get_collection(COLLECTION_NAME)
    print(f"Collection '{COLLECTION_NAME}' exists with {collection_info.points_count} vectors")
except Exception:
    print(f"Creating new collection '{COLLECTION_NAME}'")
    # Create collection with appropriate configuration
    qdrant_client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=models.VectorParams(
            size=768,  # msmarco-distilbert-base-v4 embedding size
            distance=models.Distance.COSINE
        )
    )
"""

# 🔹 Store raw documents in Qdrant (instead of local storage)
#document_store = QdrantDocumentStore(qdrant_client, collection_name=COLLECTION_NAME)

# 🔹 Load msmarco-distilbert-base-v4 as embedding model
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/msmarco-distilbert-base-v4")

# Initialize the Qdrant vector store with full data stored in Qdrant
vector_store = QdrantVectorStore(
    client=qdrant_client, 
    collection_name=COLLECTION_NAME,
    # These key settings ensure we store everything in Qdrant
    text_key="text",  # Store actual text directly in Qdrant
    metadata_key="metadata",  # Store metadata directly in Qdrant
    content_key="content",  # Store full node content in Qdrant
    embed_dim=768,  # Must match embedding dimension
    # Critical: Store document content directly in Qdrant
    stores_text=True,  # Tell LlamaIndex we're storing text in Qdrant
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from llama_index.core import StorageContext

# 🔹 Load university documents from PDFs
#documents = SimpleDirectoryReader("resources/").load_data()



* Cleaning and better chuncking applied before embeding to enhance teh score

In [3]:
# Text cleaning function
def clean_text(text):
    """Cleans and normalizes text before embedding."""
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'[^a-zA-Z0-9,.!?;:\'\"()\[\]\s]', '', text)
    return text

# Load and process documents
def process_documents(directory="resources/"):
    # Load documents
    print(f"Loading documents from {directory}")
    documents = SimpleDirectoryReader(directory).load_data()
    print(f"Loaded {len(documents)} documents")
    
    # Apply smart chunking
    text_splitter = SentenceSplitter(chunk_size=400, chunk_overlap=50)
    nodes = text_splitter.get_nodes_from_documents(documents)
    print(f"Created {len(nodes)} chunks")
    
    # Apply cleaning to chunks and set metadata
    for node in nodes:
        node.text = clean_text(node.text)
        # Include full text in metadata for easy retrieval
        node.metadata = {
            "file_name": node.metadata.get("file_name", "Unknown"),
            "page_label": node.metadata.get("page_label", "Unknown"),
            "text": node.text,  # Store text directly in metadata
            "chunk_id": str(node.id_)  # Add ID for reference
        }
    
    # Verify chunk sizes
    chunk_sizes = [len(node.text.split()) for node in nodes]
    print(f"Min Chunk Size: {min(chunk_sizes)} words")
    print(f"Max Chunk Size: {max(chunk_sizes)} words")
    print(f"Average Chunk Size: {sum(chunk_sizes)/len(chunk_sizes):.2f} words")
    
    return nodes

In [4]:
# Create index and store everything in Qdrant (no local storage)
def create_and_store_index(nodes):
    # Use storage context with just the vector store (no local persistence)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    
    # Create an index with all data stored in Qdrant
    index = VectorStoreIndex(
        nodes,
        embed_model=embed_model,
        storage_context=storage_context,
        store_nodes_override=True  # Force storing node content in vector DB
    )
    
    print("All data successfully stored in Qdrant!")
    
    return index

In [5]:
# Query function to retrieve from Qdrant
def query_qdrant(query_text, limit=5):
    # Generate embedding for the query
    query_vector = embed_model.get_text_embedding(query_text)
    
    # Run similarity search in Qdrant
    search_results = qdrant_client.search(
        collection_name=COLLECTION_NAME,
        query_vector=query_vector,
        limit=limit,
        # Request complete payload with all fields
        with_payload=True
    )
    
    print(f"Found {len(search_results)} results for query: '{query_text}'")
    
    # Print results
    for i, result in enumerate(search_results):
        print(f"\nResult {i+1} - Score: {result.score}")
        print(f"Document ID: {result.id}")
        
        # Access text - try different possible locations
        text = None
        
        # Try direct text field
        if 'text' in result.payload:
            text = result.payload.get('text')
            file_name = result.payload.get('metadata', {}).get('file_name', 
                      result.payload.get('file_name', 'Unknown'))
        
        # Try metadata.text
        elif 'metadata' in result.payload and 'text' in result.payload['metadata']:
            text = result.payload['metadata']['text']
            file_name = result.payload['metadata'].get('file_name', 'Unknown')
        
        # Try node_content
        elif '_node_content' in result.payload:
            node_content = json.loads(result.payload.get('_node_content', '{}'))
            text = node_content.get("text", "No text found")
            file_name = node_content.get('metadata', {}).get('file_name', 'Unknown')
        
        # Try content
        elif 'content' in result.payload:
            content = result.payload['content']
            if isinstance(content, str):
                try:
                    content_data = json.loads(content)
                    text = content_data.get('text', content)
                except:
                    text = content
            else:
                text = str(content)
            file_name = result.payload.get('file_name', 'Unknown')
        
        if text:
            print(f"File: {file_name}")
            print(f"Text: {text[:500]}...")  # Print first 500 characters
        else:
            print("No text found in payload")
            print(f"Available payload keys: {list(result.payload.keys())}")
        
        print("-" * 50)


In [12]:
from llama_index.core.llms import MockLLM
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI


llm = HuggingFaceInferenceAPI(
    model_name="mistralai/Mistral-7B-Instruct-v0.3", token='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'
)

# Create a query engine for more advanced querying
def create_query_engine(query_text):
    """Create a query engine for proper RAG queries"""
    from llama_index.core import Settings
    
    # Initialize settings for the query
    Settings.embed_model = embed_model
    
    # Create vector store using the same parameters as before
    temp_vector_store = QdrantVectorStore(
        client=qdrant_client,
        collection_name=COLLECTION_NAME,
        text_key="text",
        metadata_key="metadata",
        content_key="content",
        embed_dim=768,
        stores_text=True
    )
    
    # Create an empty index with the vector store
    index = VectorStoreIndex.from_vector_store(temp_vector_store)
    
    # Create a query engine
    query_engine = index.as_query_engine(llm=llm)
    
    # Execute query
    response = query_engine.query(query_text)
    
    print(f"\nRAG Response for query: '{query_text}'")
    print("-" * 50)
    print(response)
    print("-" * 50)
    
    return response


In [13]:
def validate_qdrant_storage():
    collection_info = qdrant_client.get_collection(COLLECTION_NAME)
    print(f"Total vectors in '{COLLECTION_NAME}': {collection_info.points_count}")

    search_results, _ = qdrant_client.scroll(
        collection_name=COLLECTION_NAME,
        limit=5,
        with_payload=True,
        with_vectors=True
    )

    if not search_results:
        print("🚨 No records retrieved from Qdrant!")
        return False

    print(f"Retrieved {len(search_results)} records for inspection")

    for result in search_results:
        print(f"\nRecord ID: {result.id}")
        print(f"Payload keys: {list(result.payload.keys())}")

        if hasattr(result, "vector") and result.vector is not None:
            print(f"✅ Vector dimensions: {len(result.vector)}")
        else:
            print("🚨 WARNING: Vector is None")

    print("✅ Storage validation successful!")
    return True  # Ensure validation correctly reports success

# Run validation
if validate_qdrant_storage():
    print("\n--- Running Example Direct Query ---")
    query_qdrant("What are the requirements for the PiP program?")
    
    print("\n--- Running Example RAG Query Engine ---")
    create_query_engine("What are the requirements for the PiP program?")
else:
    print("🚨 Storage validation failed - check Qdrant configuration")


Total vectors in 'AUIChatVectoreCol': 28
Retrieved 5 records for inspection

Record ID: 02d03289-2960-41ad-8808-db7165391a04
Payload keys: ['file_name', 'page_label', 'text', 'chunk_id', '_node_content', '_node_type', 'document_id', 'doc_id', 'ref_doc_id']
✅ Vector dimensions: 768

Record ID: 0658fd25-e537-4a5b-8141-abadde357a84
Payload keys: ['file_name', 'page_label', 'text', 'chunk_id', '_node_content', '_node_type', 'document_id', 'doc_id', 'ref_doc_id']
✅ Vector dimensions: 768

Record ID: 0986a7a4-a56e-4984-9240-d6784266cdd1
Payload keys: ['file_name', 'page_label', 'text', 'chunk_id', '_node_content', '_node_type', 'document_id', 'doc_id', 'ref_doc_id']
✅ Vector dimensions: 768

Record ID: 1a2de18c-f519-464f-93b9-8b02387c4f1a
Payload keys: ['file_name', 'page_label', 'text', 'chunk_id', '_node_content', '_node_type', 'document_id', 'doc_id', 'ref_doc_id']
✅ Vector dimensions: 768

Record ID: 3885c336-7360-4f45-94c7-ee4698c263e6
Payload keys: ['file_name', 'page_label', 'text', '

  search_results = qdrant_client.search(


Found 5 results for query: 'What are the requirements for the PiP program?'

Result 1 - Score: 0.34414944
Document ID: 43f016c7-cdf2-42c5-b141-e08956076602
File: PiP 24-25 Program Requirements.pdf
Text: 1 public requirements 20242025 eligibility requirements:  applicants must be recent graduates of, and be familiar with, the american style liberal arts model.  applicants must speak english fluently. proficiency in french or arabic is encouraged and appreciated, but not necessary.  applicants must have recently graduated with an undergraduate degree within the last two academic years (fallwinter 202223 or later). please note:  aui welcomes pip applications from candidates from all nationalities ...
--------------------------------------------------

Result 2 - Score: 0.19244635
Document ID: cb149f6b-1cbd-4502-9050-034d6eeadc33
File: Undergraduate Admission Freshmen Non-Degree Seeking.pdf
Text: freshmen nondegreeseeking students are not required to follow a specic sequence in courses or 

In [14]:
# Process documents
nodes = process_documents()
    
    # Create and store index (everything in Qdrant)
index = create_and_store_index(nodes)
    
    # Validate storage
print("\n--- Validating Qdrant Storage ---")

storage_valid = validate_qdrant_storage()
    
if storage_valid:
    print("\n--- Running Example Direct Query ---")
    query_qdrant("What are the requirements for the PiP program?")
        
    print("\n--- Running Example RAG Query Engine ---")
    create_query_engine("What are the requirements for the PiP program?")
else:
    print("Storage validation failed - please check your Qdrant configuration")


Loading documents from resources/
Loaded 13 documents
Created 28 chunks
Min Chunk Size: 58 words
Max Chunk Size: 268 words
Average Chunk Size: 201.14 words
All data successfully stored in Qdrant!

--- Validating Qdrant Storage ---
Total vectors in 'AUIChatVectoreCol': 56
Retrieved 5 records for inspection

Record ID: 02d03289-2960-41ad-8808-db7165391a04
Payload keys: ['file_name', 'page_label', 'text', 'chunk_id', '_node_content', '_node_type', 'document_id', 'doc_id', 'ref_doc_id']
✅ Vector dimensions: 768

Record ID: 041ec998-4182-4daf-a389-0ffe25e01a81
Payload keys: ['file_name', 'page_label', 'text', 'chunk_id', '_node_content', '_node_type', 'document_id', 'doc_id', 'ref_doc_id']
✅ Vector dimensions: 768

Record ID: 0658fd25-e537-4a5b-8141-abadde357a84
Payload keys: ['file_name', 'page_label', 'text', 'chunk_id', '_node_content', '_node_type', 'document_id', 'doc_id', 'ref_doc_id']
✅ Vector dimensions: 768

Record ID: 0986a7a4-a56e-4984-9240-d6784266cdd1
Payload keys: ['file_name'

  search_results = qdrant_client.search(


Found 5 results for query: 'What are the requirements for the PiP program?'

Result 1 - Score: 0.349983
Document ID: 896c5b4e-950f-4c44-a330-1ac7a99815f8
File: PiP 24-25 Program Requirements.pdf
Text: 1 public requirements 20242025 eligibility requirements:  applicants must be recent graduates of, and be familiar with, the american style liberal arts model.  applicants must speak english fluently. proficiency in french or arabic is encouraged and appreciated, but not necessary.  applicants must have recently graduated with an undergraduate degree within the last two academic years (fallwinter 202223 or later). please note:  aui welcomes pip applications from candidates from all nationalities ...
--------------------------------------------------

Result 2 - Score: 0.34414944
Document ID: 43f016c7-cdf2-42c5-b141-e08956076602
File: PiP 24-25 Program Requirements.pdf
Text: 1 public requirements 20242025 eligibility requirements:  applicants must be recent graduates of, and be familiar wit

### step 3: Query Vector Database

In [None]:
query_text = "What are the requirements for the PiP program?"

# 🔹 Generate embedding for the query
query_vector = embed_model.get_text_embedding(query_text)

# 🔹 Run similarity search in Qdrant
search_results = qdrant_client.search(
    collection_name=COLLECTION_NAME,
    query_vector=query_vector,
    limit=5,  # Retrieve top 5 most relevant documents
)

# 🔹 Print results
for result in search_results:
    print(f"Score: {result.score}")
    print(f"Document ID: {result.id}")
    print(f"File: {result.payload.get('file_name', 'Unknown')}")
    
    # Extract stored text
    node_content = json.loads(result.payload.get('_node_content', '{}'))  # Decode JSON
    document_text = node_content.get("text", "No text found")

    print(f"Text: {document_text[:500]}...")  # Print first 500 characters for readability
    print("-" * 50)


In [None]:
collection_info = qdrant_client.get_collection(COLLECTION_NAME)
print(f"Total vectors in '{COLLECTION_NAME}': {collection_info.points_count}")

# Fetch some vectors for inspection
search_results = qdrant_client.scroll(
    collection_name=COLLECTION_NAME,
    limit=5
)[0]  # scroll returns a tuple, we want the first element


for result in search_results:
    print(f"ID: {result.id}")
    print(f"Payload keys: {result.payload.keys()}")
    # Check if _node_content is in the payload
    if '_node_content' in result.payload:
        node_content = json.loads(result.payload['_node_content'])
        print(f"Node content keys: {node_content.keys()}")
    print("-" * 50)

print("✅ Qdrant now stores both vectors & text!")



### Sample Chunks to Confirm Cleaning 
  * lowercase ✅
  * free of unnecessary symbols.✅
  * No excessive spaces or newlines✅
  * Chunks should end logically (not cut off mid-sentence)✅

In [None]:
# 🔹 Print the first 5 cleaned chunks
print("\n🔍 Sample of Cleaned & Chunked Text:")
for i, node in enumerate(nodes[:5]):  # Show first 5 chunks
    print(f"Chunk {i+1}:")
    print(node.text)  # Display chunked and cleaned text
    print("-" * 50)


In [None]:
import numpy as np
import pandas as pd

# 🔹 Compute chunk statistics
chunk_sizes = [len(node.text.split()) for node in nodes]
df = pd.DataFrame({'Chunk Size': chunk_sizes})

# 🔹 Print key statistics
print(df.describe())  # Show min, max, mean, std deviation

# 🔹 Detect anomalies (extreme values)
threshold = 500  # Adjust based on expected range
large_chunks = df[df["Chunk Size"] > threshold]
small_chunks = df[df["Chunk Size"] < 50]  # Too small to be useful

print(f"🚨 Large Chunks (>{threshold} words): {len(large_chunks)}")
print(f"🚨 Small Chunks (<50 words): {len(small_chunks)}")


### 🚨 Problem of accessing embedings in QDRANT =under research=

In [None]:
# 🔹 Retrieve first 10 stored vectors in Qdrant
search_results = qdrant_client.scroll(
    collection_name=COLLECTION_NAME,
    limit=10
)

# 🔹 Validate embedding integrity
for result in search_results[0]:
    embedding = result.vector  # ✅ Access embeddings correctly
    text = result.payload.get("text", "No text found")

    if embedding is None or len(embedding) == 0:  # Ensure embedding is not empty
        print(f"🚨 Missing embedding for chunk: {result.id}")
    if len(text.split()) < 10:  # Check if chunk is too small
        print(f"⚠️ Small chunk detected: {text}")

print("✅ Embeddings validation completed.")


* checking for dups 

In [None]:
from collections import Counter

# 🔹 Count occurrences of each chunk
chunk_texts = [node.text for node in nodes]
duplicates = [text for text, count in Counter(chunk_texts).items() if count > 1]

print(f"🚨 Duplicate Chunks Found: {len(duplicates)}")
if duplicates:
    print(duplicates[:5])  # Show first few duplicates


* Schema Consistency
  * each chunk follows expected metadata format

In [None]:
# 🔹 Retrieve a few stored chunks
search_results = qdrant_client.scroll(
    collection_name=COLLECTION_NAME,
    limit=5
)

for result in search_results[0]:
    metadata = result.payload  # Metadata dictionary

    # Expected keys
    required_keys = ["file_name", "text", "page_label"]
    missing_keys = [key for key in required_keys if key not in metadata]

    if missing_keys:
        print(f"🚨 Metadata issue in chunk {result.id}: Missing {missing_keys}")

print("✅ Schema validation completed.")


## data validation using llama index 
* Validate Schema & Metadata in LlamaIndex
   * Ensures metadata keys are properly stored for each chunk.
   * Helps prevent retrieval errors due to missing metadata.

In [None]:
# ✅ Load storage context from Qdrant
storage_context = StorageContext.from_defaults(
    persist_dir="./storage",  # Ensure it's the correct path
    vector_store=vector_store,  # ✅ Load stored vectors
    docstore=document_store  # ✅ Load stored text documents
)

# ✅ Reload the index from Qdrant
index = load_index_from_storage(storage_context, embed_model=embed_model)

print("✅ Index successfully loaded from Qdrant.")

In [None]:
# 🔹 Retrieve all stored nodes (documents) from Qdrant
stored_nodes = index.docstore.get_all_nodes()
print(f"🔍 Total Nodes in Qdrant Document Store: {len(stored_nodes)}")

# ✅ Validate Schema & Metadata for Each Stored Node
expected_metadata_keys = ["file_name", "page_label", "text"]

for node_id, node in list(stored_nodes.items())[:5]:  # Limit to first 5 for debugging
    print(f"🔹 Node ID: {node_id}")
    print(f"🔹 Metadata: {node.metadata}")

    # Check if metadata contains expected keys
    missing_keys = [key for key in expected_metadata_keys if key not in node.metadata]
    if missing_keys:
        print(f"🚨 Schema Issue: Node {node_id} is missing {missing_keys}")

    print(f"🔹 Chunk Text Preview: {node.text[:200]}...")  # Show first 200 characters
    print("-" * 50)

print("✅ Schema validation completed.")


In [None]:
# ✅ Validate Vectors in Qdrant (Check if vectors exist for each document)
collection_info = qdrant_client.get_collection(COLLECTION_NAME)
print(f"🔍 Total Vectors in '{COLLECTION_NAME}': {collection_info.points_count}")

# ✅ Fetch some vectors to check if they match the stored documents
search_results = qdrant_client.scroll(collection_name=COLLECTION_NAME, limit=5)[0]  # Get first 5 results

for result in search_results:
    print(f"🆔 Vector ID: {result.id}")
    print(f"🔹 Stored Metadata Keys: {result.payload.keys()}")

    # ✅ Ensure text content is stored correctly
    if "text" in result.payload:
        print(f"✅ Text Found: {result.payload['text'][:200]}...")  # Show first 200 chars
    else:
        print("🚨 Missing Text Content in Qdrant!")

    print("-" * 50)

print("✅ Qdrant vector-text validation completed.")

# faill switch

In [3]:
"""from qdrant_client.http import models

# ⚠️ WARNING: This will DELETE all stored vectors in the collection
qdrant_client.delete_collection(collection_name=COLLECTION_NAME)

# ✅ Recreate the collection with the correct vector configuration
qdrant_client.recreate_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=models.VectorParams(
        size=768,  # Adjust based on your embedding model (768 for msmarco-distilbert-base-v4)
        distance=models.Distance.COSINE  # Use COSINE for similarity-based retrieval
    )
)

print("✅ Collection reset successfully!")"""


  qdrant_client.recreate_collection(


✅ Collection reset successfully!


Collecting llama_index.llms.huggingface_api
  Downloading llama_index_llms_huggingface_api-0.4.1-py3-none-any.whl (7.2 kB)
Installing collected packages: llama-index.llms.huggingface-api
Successfully installed llama-index.llms.huggingface-api
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\users\otman\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.
