In [1]:
# Cell 1: Setup and Imports
import os
from dotenv import load_dotenv
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery, QueryType
from azure.core.credentials import AzureKeyCredential
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores import AzureSearch
from langchain.schema import Document
import pandas as pd

load_dotenv()

# Load credentials
service_name = os.getenv("AZURE_SEARCH_SERVICE_NAME")
admin_key = os.getenv("AZURE_SEARCH_ADMIN_KEY")
index_name = "aios-search-index"
endpoint = f"https://{service_name}.search.windows.net"

print(f"Connecting to: {endpoint}")
print(f"Index: {index_name}")

Connecting to: https://aios-ai-search.search.windows.net
Index: aios-search-index


In [2]:
# Cell 2: Initialize Search Clients
# Direct Azure Search Client
search_client = SearchClient(
    endpoint=endpoint,
    index_name=index_name,
    credential=AzureKeyCredential(admin_key)
)

# Count documents
results = search_client.search(search_text="*", include_total_count=True)
print(f"Total documents in index: {results.get_count()}")

Total documents in index: 100


In [3]:
# Cell 3: Test Keyword Search
def keyword_search(query, top=5):
    """Test basic keyword/BM25 search"""
    results = search_client.search(
        search_text=query,
        top=top,
        include_total_count=True
    )
    
    print(f"Query: '{query}'")
    print(f"Total matches: {results.get_count()}\n")
    
    docs = []
    for i, result in enumerate(results, 1):
        print(f"{i}. [{result['source']}] {result['title'][:100]}...")
        print(f"   Score: {result['@search.score']:.4f}")
        print(f"   Date: {result.get('published_at', 'N/A')}")
        print()
        docs.append(result)
    
    return docs

# Test keyword searches
keyword_results = keyword_search("DARPA autonomous systems")

Query: 'DARPA autonomous systems'
Total matches: 84

1. [Defense News] Joby, L3Harris developing autonomous aircraft for defense missions...
   Score: 6.6209
   Date: 2025-08-01 15:44:34

2. [UK Defence Journal] U.S. drone completes three-day unmanned flight trial...
   Score: 4.1382
   Date: 2025-08-03 10:11:07

3. [Defense News] Pentagon officials tout rapid experimentation at courtyard showcase...
   Score: 4.1127
   Date: 2025-07-17 13:56:08

4. [UK Defence Journal] Bell to build new VTOL X-plane with new rotor breakthrough...
   Score: 4.0542
   Date: 2025-07-11 00:51:50

5. [Defense News] Germany buys aircraft self-defense systems from Elbit for $260 million...
   Score: 3.6781
   Date: 2025-07-28 11:29:57



In [10]:
# Cell 4: CORRECT LangChain Azure OpenAI Embeddings Setup
from langchain_openai import AzureOpenAIEmbeddings
from azure.search.documents.models import VectorizedQuery

# CORRECT initialization for LangChain with Azure OpenAI
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint="https://aios-openai-model.openai.azure.com",
    azure_deployment="text-embedding-3-small",  # Use azure_deployment, not deployment_name
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-02-01"
)

# Test the embeddings
try:
    test_embedding = embeddings.embed_query("test")
    print(f"✅ LangChain Embeddings working! Vector dimension: {len(test_embedding)}")
except Exception as e:
    print(f"❌ Error: {e}")

def vector_search(query, top=5):
    """Test pure vector/semantic search"""
    # Generate query embedding
    query_vector = embeddings.embed_query(query)
    
    # Create vector query
    vector_query = VectorizedQuery(
        vector=query_vector,
        k_nearest_neighbors=top,
        fields="content_vector"
    )
    
    results = search_client.search(
        search_text=None,  # No keyword search
        vector_queries=[vector_query],
        top=top
    )
    
    print(f"Vector Query: '{query}'")
    print(f"Top {top} semantically similar documents:\n")
    
    docs = []
    for i, result in enumerate(results, 1):
        print(f"{i}. [{result['source']}] {result['title'][:100]}...")
        print(f"   Score: {result['@search.score']:.4f}")
        print()
        docs.append(result)
    
    return docs

# Test semantic search
vector_results = vector_search("cutting-edge military robotics and AI integration")

✅ LangChain Embeddings working! Vector dimension: 1536
Vector Query: 'cutting-edge military robotics and AI integration'
Top 5 semantically similar documents:

1. [Defense News] Air Force experiments with using AI to seek combat targets...
   Score: 0.7108

2. [Military Times] Pentagon taps four commercial tech firms to expand military use of AI...
   Score: 0.7062

3. [Defense News] Pentagon picks finalists for Replicator 2 counter-drone demo...
   Score: 0.6760

4. [UK Defence Journal] British Army unveils lethal ASGARD targeting system...
   Score: 0.6751

5. [UK Defence Journal] Airborne autonomy is the next frontier in naval power...
   Score: 0.6662



In [11]:
# Cell 5: Test Hybrid Search (Keyword + Vector)
def hybrid_search(query, top=5):
    """Test hybrid search combining keyword and vector"""
    # Generate query embedding
    query_vector = embeddings.embed_query(query)
    
    # Create vector query
    vector_query = VectorizedQuery(
        vector=query_vector,
        k_nearest_neighbors=top,
        fields="content_vector"
    )
    
    # Hybrid search - Azure AI Search automatically combines and re-ranks
    results = search_client.search(
        search_text=query,  # Keyword search
        vector_queries=[vector_query],  # Vector search
        top=top
    )
    
    print(f"Hybrid Query: '{query}'")
    print(f"Combined keyword + semantic results:\n")
    
    docs = []
    for i, result in enumerate(results, 1):
        print(f"{i}. [{result['source']}] {result['title'][:100]}...")
        print(f"   Score: {result['@search.score']:.4f}")
        print(f"   Date: {result.get('published_at', 'N/A')[:10] if result.get('published_at') else 'N/A'}")
        print()
        docs.append(result)
    
    return docs

# Test hybrid search
hybrid_results = hybrid_search("automated maintenance repair inspection technologies")

Hybrid Query: 'automated maintenance repair inspection technologies'
Combined keyword + semantic results:

1. [UK Defence Journal] MOD awards £12m vehicle repair contract to RBSL...
   Score: 0.0331
   Date: 2025-07-31

2. [Defense News] Pentagon picks finalists for Replicator 2 counter-drone demo...
   Score: 0.0320
   Date: 2025-07-24

3. [Defense News] US Army envisions a common launcher to fit allies' weapons...
   Score: 0.0262
   Date: 2025-07-24

4. [UK Defence Journal] U.S. approves $600m to sustain Ukraine's war effort...
   Score: 0.0167
   Date: 2025-08-06

5. [SpaceNews] Frontgrade Launches Compact, Intelligent Power Supply for VNX+ Systems in Aerospace, Defense, and Sp...
   Score: 0.0161
   Date: 2025-07-31



In [12]:
# Cell 6: Compare Search Methods
# Test the same query with all three methods
test_query = "space defense systems"

print("="*60)
print("COMPARISON: Same query, different search methods")
print("="*60)

print("\n1. KEYWORD SEARCH:")
print("-"*40)
keyword_docs = keyword_search(test_query, top=3)

print("\n2. VECTOR SEARCH:")
print("-"*40)
vector_docs = vector_search(test_query, top=3)

print("\n3. HYBRID SEARCH:")
print("-"*40)
hybrid_docs = hybrid_search(test_query, top=3)

# Compare results
print("\n" + "="*60)
print("ANALYSIS:")
print("-"*40)

keyword_titles = [doc['title'][:50] for doc in keyword_docs]
vector_titles = [doc['title'][:50] for doc in vector_docs]
hybrid_titles = [doc['title'][:50] for doc in hybrid_docs]

print(f"Keyword found: {keyword_titles[0] if keyword_titles else 'None'}")
print(f"Vector found:  {vector_titles[0] if vector_titles else 'None'}")
print(f"Hybrid found:  {hybrid_titles[0] if hybrid_titles else 'None'}")

COMPARISON: Same query, different search methods

1. KEYWORD SEARCH:
----------------------------------------
Query: 'space defense systems'
Total matches: 97

1. [Defense News] New 'Vulcan' rocket to fly first military mission next week...
   Score: 9.7502
   Date: 2025-08-06 19:32:30

2. [Defense News] US defense industry vulnerable to China, government watchdog warns...
   Score: 8.7440
   Date: 2025-08-01 20:07:43

3. [Defense News] Army to grow air defense force by 30%...
   Score: 8.5357
   Date: 2025-08-05 17:05:46


2. VECTOR SEARCH:
----------------------------------------
Vector Query: 'space defense systems'
Top 3 semantically similar documents:

1. [Defense News] US Army readies to release new missile defense strategy soon...
   Score: 0.6686

2. [Defense News] Denmark to complete $3.4 billion of air defense purchases by year-end...
   Score: 0.6632

3. [SpaceNews] Golden Dome requires non-traditional thinking and an agile approach...
   Score: 0.6558


3. HYBRID SEARCH:
--

In [17]:
# Cell 7: LangChain Integration for RAG (FIXED)
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain.schema import Document

# Create LangChain-compatible vector store
vector_store = AzureSearch(
    azure_search_endpoint=f"https://{service_name}.search.windows.net",
    azure_search_key=admin_key,
    index_name=index_name,
    embedding_function=embeddings.embed_query,
    search_type="hybrid"  # Enable hybrid search
)

# Test retriever functionality
def test_langchain_retriever(query, k=5):
    """Test LangChain retriever for RAG pipeline"""
    # Create retriever without k in search_kwargs to avoid duplicate
    retriever = vector_store.as_retriever()
    
    # Use invoke instead of get_relevant_documents (deprecated)
    docs = retriever.invoke(query)[:k]  # Limit results after retrieval
    
    print(f"LangChain Retriever Query: '{query}'")
    print(f"Retrieved {len(docs)} documents:\n")
    
    for i, doc in enumerate(docs, 1):
        # Extract title from content or metadata
        content_preview = doc.page_content[:150].replace('\n', ' ')
        print(f"{i}. {content_preview}...")
        if doc.metadata:
            print(f"   Source: {doc.metadata.get('source', 'Unknown')}")
            print(f"   Score: {doc.metadata.get('@search.score', 'N/A')}")
        print()
    
    return docs

# Alternative: Direct similarity search (more control)
def test_direct_similarity_search(query, k=5):
    """Test direct similarity search through vector store"""
    
    # Use similarity_search directly
    docs = vector_store.similarity_search(
        query=query,
        k=k,
        search_type="hybrid"
    )
    
    print(f"Direct Similarity Search: '{query}'")
    print(f"Found {len(docs)} documents:\n")
    
    for i, doc in enumerate(docs, 1):
        content_preview = doc.page_content[:150].replace('\n', ' ')
        print(f"{i}. {content_preview}...")
        if hasattr(doc, 'metadata') and doc.metadata:
            print(f"   Source: {doc.metadata.get('source', 'Unknown')}")
        print()
    
    return docs

# Test both methods
query = "What are the latest developments in autonomous naval systems?"

print("Method 1: Using Retriever")
print("-"*40)
langchain_docs = test_langchain_retriever(query, k=5)

print("\n\nMethod 2: Direct Similarity Search")
print("-"*40)
direct_docs = test_direct_similarity_search(query, k=5)

Method 1: Using Retriever
----------------------------------------
LangChain Retriever Query: 'What are the latest developments in autonomous naval systems?'
Retrieved 4 documents:

1. Recently, the Royal Navy's Carrier Strike Group deployed alongside the Italian aircraft carrier Cavour, cutting a commanding silhouette through the Me...
   Source: UK Defence Journal
   Score: 0.03306011110544205

2. A Seahawk medium displacement unmanned surface vessel participates in U.S. Pacific Fleet's Unmanned Systems Integrated Battle Problem in 2022. (Chief ...
   Source: Defense News
   Score: 0.02982456237077713

3. Tested during recent NATO exercises in Estonia, ASGARD marks a significant shift in the Army's lethality and digital transformation strategy, as outli...
   Source: UK Defence Journal
   Score: 0.02736726962029934

4. The U.S. military services are experimenting with various counter-drone technology, including the AN/MPQ-64 Sentinel radar system, tested during a rec...
   Source: De

In [14]:
# Cell 8: Test Filtering
def search_with_filters(query, source_filter=None, topic_filter=None, days_back=None):
    """Test search with metadata filters"""
    filters = []
    
    if source_filter:
        # Handle source names with special characters
        source_filter_escaped = source_filter.replace("'", "''")
        filters.append(f"source eq '{source_filter_escaped}'")
    
    if topic_filter:
        filters.append(f"topic_name eq '{topic_filter}'")
    
    if days_back:
        from datetime import datetime, timedelta
        cutoff_date = (datetime.utcnow() - timedelta(days=days_back)).isoformat()
        filters.append(f"published_at ge '{cutoff_date}'")
    
    filter_str = " and ".join(filters) if filters else None
    
    # Generate embedding for vector search
    query_vector = embeddings.embed_query(query) if query else None
    
    # Prepare vector query if we have a query
    vector_queries = []
    if query_vector:
        vector_query = VectorizedQuery(
            vector=query_vector,
            k_nearest_neighbors=5,
            fields="content_vector"
        )
        vector_queries.append(vector_query)
    
    # Perform search
    results = search_client.search(
        search_text=query if query else "*",  # Use * for all documents if no query
        vector_queries=vector_queries if query_vector else None,
        filter=filter_str,
        top=5
    )
    
    print(f"Query: '{query if query else 'All documents'}'")
    if filter_str:
        print(f"Filters: {filter_str}")
    print()
    
    docs = []
    for i, result in enumerate(results, 1):
        print(f"{i}. [{result.get('source', 'Unknown')}] {result.get('title', 'No title')[:80]}...")
        print(f"   Date: {result.get('published_at', 'N/A')[:10] if result.get('published_at') else 'N/A'}")
        print(f"   Topic: {result.get('topic_name', 'N/A')}")
        docs.append(result)
    
    return docs

# Test with different filters
print("Test 1: Filter by source")
print("-"*40)
filtered_results = search_with_filters(
    "defense technology",
    source_filter="Defense News"
)

print("\n\nTest 2: Recent articles only (last 3 days)")
print("-"*40)
recent_results = search_with_filters(
    "military AI",
    days_back=3
)

Test 1: Filter by source
----------------------------------------
Query: 'defense technology'
Filters: source eq 'Defense News'

1. [Defense News] Pentagon picks finalists for Replicator 2 counter-drone demo...
   Date: 2025-07-24
   Topic: DefenseNews
2. [Defense News] US Army readies to release new missile defense strategy soon...
   Date: 2025-08-05
   Topic: DefenseNews
3. [Defense News] New missile defense radar lands in Guam to be put to the test...
   Date: 2025-08-01
   Topic: DefenseNews
4. [Defense News] Lockheed launches hub to prototype Golden Dome command systems...
   Date: 2025-08-05
   Topic: DefenseNews
5. [Defense News] Army readies to launch 2026 competition for counter-drone laser weapon...
   Date: 2025-08-06
   Topic: DefenseNews


Test 2: Recent articles only (last 3 days)
----------------------------------------
Query: 'military AI'
Filters: published_at ge '2025-08-05T19:53:31.353851'

1. [Defense News] New 'Vulcan' rocket to fly first military mission next wee

In [15]:
# Cell 9: Analyze Search Quality
def analyze_search_quality(query):
    """Compare result relevance across search types"""
    
    print(f"Analyzing query: '{query}'")
    print("="*60)
    
    # Get results from each method
    kw_results = keyword_search(query, top=10)
    vec_results = vector_search(query, top=10)
    hyb_results = hybrid_search(query, top=10)
    
    # Extract doc IDs and titles
    def get_info(results):
        return [(r.get('doc_id', 'unknown'), r.get('title', 'No title')[:50]) for r in results]
    
    kw_info = get_info(kw_results)
    vec_info = get_info(vec_results)
    hyb_info = get_info(hyb_results)
    
    kw_ids = [info[0] for info in kw_info]
    vec_ids = [info[0] for info in vec_info]
    hyb_ids = [info[0] for info in hyb_info]
    
    # Calculate overlap
    kw_vec_overlap = len(set(kw_ids) & set(vec_ids))
    kw_hyb_overlap = len(set(kw_ids) & set(hyb_ids))
    vec_hyb_overlap = len(set(vec_ids) & set(hyb_ids))
    all_three_overlap = len(set(kw_ids) & set(vec_ids) & set(hyb_ids))
    
    print(f"\nResult Overlap (out of 10):")
    print(f"  Keyword ∩ Vector: {kw_vec_overlap}")
    print(f"  Keyword ∩ Hybrid: {kw_hyb_overlap}")
    print(f"  Vector ∩ Hybrid: {vec_hyb_overlap}")
    print(f"  All three: {all_three_overlap}")
    
    # Show unique results from each
    unique_kw = set(kw_ids) - set(vec_ids) - set(hyb_ids)
    unique_vec = set(vec_ids) - set(kw_ids) - set(hyb_ids)
    unique_hyb = set(hyb_ids) - set(kw_ids) - set(vec_ids)
    
    print(f"\nUnique to Keyword Search: {len(unique_kw)}")
    if unique_kw:
        for doc_id in list(unique_kw)[:2]:
            idx = kw_ids.index(doc_id)
            print(f"  - {kw_info[idx][1]}")
    
    print(f"\nUnique to Vector Search: {len(unique_vec)}")
    if unique_vec:
        for doc_id in list(unique_vec)[:2]:
            idx = vec_ids.index(doc_id)
            print(f"  - {vec_info[idx][1]}")
    
    print(f"\nUnique to Hybrid Search: {len(unique_hyb)}")
    if unique_hyb:
        for doc_id in list(unique_hyb)[:2]:
            idx = hyb_ids.index(doc_id)
            print(f"  - {hyb_info[idx][1]}")
    
    # Score distribution
    print(f"\n\nScore Ranges:")
    print(f"  Keyword: {min([r['@search.score'] for r in kw_results]):.3f} - {max([r['@search.score'] for r in kw_results]):.3f}")
    print(f"  Vector:  {min([r['@search.score'] for r in vec_results]):.3f} - {max([r['@search.score'] for r in vec_results]):.3f}")
    print(f"  Hybrid:  {min([r['@search.score'] for r in hyb_results]):.3f} - {max([r['@search.score'] for r in hyb_results]):.3f}")
    
    return {
        'keyword': kw_results,
        'vector': vec_results,
        'hybrid': hyb_results
    }

# Analyze different query types
analysis = analyze_search_quality("autonomous underwater vehicles")

Analyzing query: 'autonomous underwater vehicles'
Query: 'autonomous underwater vehicles'
Total matches: 20

1. [Defense News] Leonardo's buy of Iveco Defence Vehicles secures Italian armor stable...
   Score: 7.0285
   Date: 2025-07-31 08:24:38

2. [Defense News] Joby, L3Harris developing autonomous aircraft for defense missions...
   Score: 6.6209
   Date: 2025-08-01 15:44:34

3. [UK Defence Journal] MOD awards £12m vehicle repair contract to RBSL...
   Score: 4.9118
   Date: 2025-07-31 14:02:44

4. [UK Defence Journal] U.S. drone completes three-day unmanned flight trial...
   Score: 4.0148
   Date: 2025-08-03 10:11:07

5. [Defense News] Pentagon officials tout rapid experimentation at courtyard showcase...
   Score: 3.9094
   Date: 2025-07-17 13:56:08

6. [UK Defence Journal] L3Harris unveils 'wolf pack' swarm munitions...
   Score: 3.5948
   Date: 2025-07-18 21:04:20

7. [UK Defence Journal] First British-made Boxer delivered to British Army...
   Score: 3.5473
   Date: 2025-08-07

In [16]:
# Cell 10: Test Edge Cases and Query Types
print("Testing Edge Cases and Different Query Types")
print("="*60)

# Test 1: Very specific technical/acronym query
print("\n1. Technical/Acronym Query (should favor keyword):")
print("-"*40)
results = hybrid_search("DARPA SBIR Phase II", top=3)

# Test 2: Conceptual/semantic query (should favor vector)
print("\n2. Conceptual Query (should favor vector):")
print("-"*40)
results = vector_search("innovative approaches to battlefield intelligence", top=3)

# Test 3: Entity-specific query
print("\n3. Entity Search (should work well with both):")
print("-"*40)
results = hybrid_search("Lockheed Martin space systems", top=3)

# Test 4: Empty/minimal results query
print("\n4. Rare Term Query (testing empty results handling):")
print("-"*40)
try:
    results = hybrid_search("quantum blockchain defense", top=3)
    if not results:
        print("   No results found (as expected for rare terms)")
except Exception as e:
    print(f"   Handled gracefully: {e}")

# Test 5: Multi-concept query
print("\n5. Complex Multi-Concept Query:")
print("-"*40)
results = hybrid_search("artificial intelligence maritime surveillance counter-drone", top=3)

# Summary statistics
print("\n" + "="*60)
print("SEARCH QUALITY SUMMARY")
print("-"*40)

# Get total documents
total_results = search_client.search(search_text="*", include_total_count=True, top=1)
total_count = total_results.get_count()

print(f"Total documents indexed: {total_count}")
print(f"Index name: {index_name}")
print(f"Vector dimensions: 1536")
print(f"Search types available: Keyword (BM25), Vector (Cosine), Hybrid")
print("\nRecommendations:")
print("  - Use keyword search for: Acronyms, specific terms, exact matches")
print("  - Use vector search for: Conceptual queries, similarity, themes")
print("  - Use hybrid search for: General queries, best overall results")

Testing Edge Cases and Different Query Types

1. Technical/Acronym Query (should favor keyword):
----------------------------------------
Hybrid Query: 'DARPA SBIR Phase II'
Combined keyword + semantic results:

1. [Defense News] Pentagon picks finalists for Replicator 2 counter-drone demo...
   Score: 0.0298
   Date: 2025-07-24

2. [UK Defence Journal] Bell to build new VTOL X-plane with new rotor breakthrough...
   Score: 0.0167
   Date: 2025-07-11

3. [Defense News] Anduril wins $100M deal to build US Army's next-gen C2 ecosystem...
   Score: 0.0164
   Date: 2025-07-21


2. Conceptual Query (should favor vector):
----------------------------------------
Vector Query: 'innovative approaches to battlefield intelligence'
Top 3 semantically similar documents:

1. [Defense News] Air Force experiments with using AI to seek combat targets...
   Score: 0.6661

2. [Defense News] Pentagon picks finalists for Replicator 2 counter-drone demo...
   Score: 0.6452

3. [Defense News] US Army readie