In [94]:
from IPython.core.magic import register_cell_magic

@register_cell_magic
def skip(line, cell):
    return

In [95]:
from azure.cosmos import CosmosClient, exceptions, PartitionKey
import json
from azure.identity import DefaultAzureCredential
import os
import openai_helper 
import nest_asyncio
import asyncio

# Define your Cosmos DB account information
endpoint = "https://anildwacosmoswestus.documents.azure.com:443/"


# Initialize the Cosmos client
client = CosmosClient(endpoint, credential=DefaultAzureCredential())

In [96]:
database_name = 'booksdb'
container_name = 'books'

client.create_database_if_not_exists(id=database_name)
# Connect to the database and container
database = client.get_database_client(database_name)

In [97]:
vector_embedding_policy = {
    "vectorEmbeddings": [
        {
            "path":"/textVector",
            "dataType":"float32",
            "distanceFunction":"cosine",
            "dimensions":1536
        }
    ]
}


vector_indexing_policy = {
    
    "indexingMode": "consistent",
    "automatic": True,
    "includedPaths": [
        {
            "path": "/*"
        }
    ],
    "excludedPaths": [
        {
            "path": "/_etag/?"
        },
        {
            "path": "/textVector/*"
        }
        
    ],
    "vectorIndexes": [
        {
            "path": "/textVector",
            "type": "quantizedFlat"
        }
    ]
}

full_text_paths_policy = {
   "defaultLanguage": "en-US",
   "fullTextPaths": [
       {
           "path": "/fileName",
           "language": "en-US"
       },
       {
           "path": "/text",
           "language": "en-US"
       }
   ]
}




vector_indexing_policy_diskANN = {
    
    "indexingMode": "consistent",
    "automatic": True,
    "includedPaths": [
        {
            "path": "/*"
        }
    ],
    "excludedPaths": [
        {
            "path": "/_etag/?"
        },
        {
            "path": "/textVector/*"
        }
    ],
    "fullTextIndexes": [
        {
            "path": "/text"
        }
    ],
    "vectorIndexes": [
        {
            "path": "/textVector",
            "type": "diskANN"
        }
    ]
}

In [98]:
for db in client.list_databases():
    print(db)

{'id': 'booksdb', '_rid': 'adFqAA==', '_self': 'dbs/adFqAA==/', '_etag': '"00008a00-0000-0700-0000-680936200000"', '_colls': 'colls/', '_users': 'users/', '_ts': 1745434144}


In [99]:
#database.delete_container(container=container_name)

In [100]:
container_name = "books002"

In [101]:
container_name = "books_small"

In [102]:
container = None
try:
    container = database.create_container(id=container_name, partition_key=PartitionKey(path="/id"), 
                          vector_embedding_policy=vector_embedding_policy,
                          indexing_policy=vector_indexing_policy_diskANN,
                          full_text_policy=full_text_paths_policy,
                          offer_throughput=10000) 
except exceptions.CosmosResourceExistsError:
    print(f"Container {container_name} already exists. Using existing container.")
    container = database.get_container_client(container_name)

Container books_small already exists. Using existing container.


In [103]:
%%skip

# read csv files 
import os
import pandas as pd
import uuid
import tiktoken

encoding = tiktoken.encoding_for_model("text-embedding-ada-002")

MAX_TOKENS = 8192

count = 0
total=len(os.listdir('q1_files'))
data = []
for file in os.listdir('q1_files'):
    df = pd.read_csv('q1_files/' + file)
    text = df['text'].iloc[0]
    #embedding_result = await openai_helper.generate_embeddings([df['text'].iloc[0]])
    tokens = encoding.encode(text)
    token_length = len(tokens)

    if len(tokens) > MAX_TOKENS:
        tokens = tokens[:MAX_TOKENS]
        text = encoding.decode(tokens)

    print(f"Trimmed token length: {len(tokens)}")
    print(f"Trimmed text: {text[:300]}...")  # Show sample
    book_item = {
        "id": str(uuid.uuid4()),
        "fileName": file,
        "text": df['text'].iloc[0],
        "textVector": embedding_result[0].embedding,
    }
    print(token_length)
    break
    #data.append(book_item)
    #res = container.upsert_item(book_item)
    


#res = await container.upsert_item(book_item)
#count += 1
#print(f"processing {count} of {total}")
    

In [104]:
%%skip
with open('q1.txt', 'r') as file:
    q1_lines = [line.strip() for line in file if line.strip()]

# Flatten multi-line entries (like Janet_My_Mother_and_Me.csv)
from itertools import groupby

q1_files = []
temp = []
for line in q1_lines:
    temp.append(line)
    if line.endswith('.csv'):
        q1_files.append('_'.join(temp))
        temp = []

## Vector Search + Full Text Search


In [105]:
questions = ["Give me a list of books published in the year 2000",
"Give me a list of book of travel category",
"Give me a list of books from author Agatha Christie",
"Give me some haunted incidents from california state",
"Give me some wines found in Italy",
"Give me wines tasted by Roger Voss",
"Give me some wines in the variety of Red Blend",
"Give me some business category news",
"Give me a list of students graduated in the year 2025"]

In [106]:
%%skip
items = container.query_items( 
query="""SELECT top 20 c.fileName FROM c WHERE c.fileName = 'book_Snow_Crash.csv'
""", 
parameters=[ 
  #{"name": "@embedding", "value": search_query_embedded} 
 ], 
 enable_cross_partition_query=True)


for item in items:
    print(item)

## Methods

In [107]:
import asyncio

async def get_vector_search_results(container_name, embedding_model, search_query, top_k=5, threshold=0.7):
    embedding_result = await openai_helper.generate_embeddings([search_query], model=embedding_model)
    search_query_embedded = embedding_result[0].embedding
    def run_query():
        
        container = database.get_container_client(container_name)    
        items = container.query_items( 
            query="""
            SELECT top @top_k c.fileName, VectorDistance(c.textVector, @embedding) AS textSimilarityScore 
            FROM c
            WHERE VectorDistance(c.textVector, @embedding) > @threshold
            ORDER BY VectorDistance(c.textVector, @embedding) 
            """, 
            parameters=[
                {"name": "@embedding", "value": search_query_embedded},
                {"name": "@top_k", "value": top_k},
                {"name": "@threshold", "value": threshold}
            ], 
            enable_cross_partition_query=True)
        return [item for item in items]

    return await asyncio.to_thread(run_query)


async def get_fulltext_search_results(container_name, search_query, top_k=5):
    def run_query():
        container = database.get_container_client(container_name)
        search_query_arr = search_query.split(" ")
        #print(search_query_arr)
        query_string = f"""
        SELECT TOP @top_k c.fileName
        FROM c
        ORDER BY RANK FullTextScore(c.text, {search_query_arr})
        """

        items = container.query_items(
            query=query_string,
            parameters=[
                {"name": "@top_k", "value": top_k},
            ],
            enable_cross_partition_query=True
        )

        item_files = [item for item in items]
        return item_files

    return await asyncio.to_thread(run_query)

async def get_fulltext_all_search_results(container, search_query, top_k=5):
    search_query_arr = search_query.split(" ")
    #print(search_query_arr)
    query_string = f"""
    SELECT TOP @top_k c.fileName
    FROM c
    WHERE FullTextContainsAll(c.text, {search_query_arr})
    """

    items = container.query_items(
        query=query_string,
        parameters=[
            {"name": "@top_k", "value": top_k},
        ],
        enable_cross_partition_query=True
    )

    item_files = [item for item in items]
    return item_files

def combine_search_results(vector_results, fulltext_results):
    file_names_vector = {item['fileName'] for item in vector_results}
    file_names_fts = {item['fileName'] for item in fulltext_results}

    # Union of the two sets
    union_file_names = file_names_vector.union(file_names_fts)

    # Convert to sorted list of dicts with only fileName field
    union_items = [{'fileName': name} for name in sorted(union_file_names)]

    return union_items
    # Print result
    #print(f"Total unique items in union: {len(union_items)}")
    #for item in union_items:
    #    print(item)
    
async def run_evaluation(question_index, container_name, embedding_model):
    #question_index = 4
    print(f"Question: {questions[question_index]}")
    with open(os.path.join("questions", f'question{question_index + 1}.txt'), 'r') as file:
        q1_lines = [line.strip() for line in file if line.strip()]

    # Flatten multi-line entries (like Janet_My_Mother_and_Me.csv)
    from itertools import groupby

    q1_files = []
    temp = []
    for line in q1_lines:
        temp.append(line.replace("txt", "csv"))
        if line.endswith('.txt'):
            q1_files.append('_'.join(temp))
            temp = []
    #search_query = "teenaged computer hacker"
    search_query = questions[question_index]
    #embedding_result = await openai_helper.generate_embeddings([search_query], model="text-embedding-ada-002")
    #search_query_embedded = embedding_result[0].embedding
    #search_query_arr = search_query.split(" ")

    #items_vector = get_vector_search_results(container, search_query_embedded, top_k=155, threshold=0.8)
    #items_fts = get_fulltext_search_results(container, search_query, top_k=155)

    vector_task = get_vector_search_results(container_name, embedding_model, search_query, top_k=400, threshold=0.8)
    fts_task = get_fulltext_search_results(container_name, search_query, top_k=400)
    items_vector, items_fts = await asyncio.gather(vector_task, fts_task)

    combined_results = combine_search_results(items_vector, items_fts)
    return combined_results, q1_files


def check_recall(q_files, combined_results):
    q1_set = set(q_files)
    item_files_set = {item['fileName'] for item in combined_results}

    missing_in_items = q1_set - item_files_set
    extra_in_items = item_files_set - q1_set

    print("Count of files in golden_answers but missing in search:", len(missing_in_items))

    # correct recall calculation
    recall = (len(q1_set) - len(missing_in_items)) / len(q1_set) * 100
    print(f"Recall: {recall:.2f}%")



    true_positives = q1_set & item_files_set
    false_positives = item_files_set - q1_set

    precision = len(true_positives) / len(item_files_set) * 100 if item_files_set else 0
    print(f"Precision: {precision:.2f}%")

    if precision + recall > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
        print(f"F1 Score: {f1_score:.2f}%")
    else:
        print("F1 Score: Undefined (no precision or recall)")

    #print("Files in q1.txt but missing in search:")
    #for f in sorted(missing_in_items):
    #    print(f)


## using text-embedding-ada-002 1536 dimensions

In [108]:
container_name = "books002" # text-embedding-ada-002
embedding_model = "text-embedding-ada-002" # text-embedding-ada-002

async def run_all_evaluations():
    tasks = [run_evaluation(i, container_name, embedding_model) for i in range(9)]
    results = await asyncio.gather(*tasks)

    for i, (combined_results, q1_files) in enumerate(results):
        print(f"\n=== Evaluation for Question {i} ===")
        print(f"Question: {questions[i]}")
        check_recall(q1_files, combined_results)



# Needed only once in a Jupyter session
nest_asyncio.apply()

await run_all_evaluations()

    

Question: Give me a list of books published in the year 2000
Question: Give me a list of book of travel category
Question: Give me a list of books from author Agatha Christie
Question: Give me some haunted incidents from california state
Question: Give me some wines found in Italy
Question: Give me wines tasted by Roger Voss
Question: Give me some wines in the variety of Red Blend
Question: Give me some business category news
Question: Give me a list of students graduated in the year 2025

=== Evaluation for Question 0 ===
Question: Give me a list of books published in the year 2000
Count of files in golden_answers but missing in search: 56
Recall: 82.82%
Precision: 59.47%
F1 Score: 69.23%

=== Evaluation for Question 1 ===
Question: Give me a list of book of travel category
Count of files in golden_answers but missing in search: 5
Recall: 87.80%
Precision: 8.72%
F1 Score: 15.86%

=== Evaluation for Question 2 ===
Question: Give me a list of books from author Agatha Christie
Count of f

## using text-embedding-3-small 1536 dimensions


In [109]:
container_name = "books_small" # text-embedding-ada-002
embedding_model = "text-embedding-3-small" # text-embedding-ada-002

async def run_all_evaluations():
    tasks = [run_evaluation(i, container_name, embedding_model) for i in range(9)]
    results = await asyncio.gather(*tasks)

    for i, (combined_results, q1_files) in enumerate(results):
        print(f"\n=== Evaluation for Question {i} ===")
        print(f"Question: {questions[i]}")
        check_recall(q1_files, combined_results)



# Needed only once in a Jupyter session
nest_asyncio.apply()

await run_all_evaluations()

Question: Give me a list of books published in the year 2000
Question: Give me a list of book of travel category
Question: Give me a list of books from author Agatha Christie
Question: Give me some haunted incidents from california state
Question: Give me some wines found in Italy
Question: Give me wines tasted by Roger Voss
Question: Give me some wines in the variety of Red Blend
Question: Give me some business category news
Question: Give me a list of students graduated in the year 2025

=== Evaluation for Question 0 ===
Question: Give me a list of books published in the year 2000
Count of files in golden_answers but missing in search: 57
Recall: 82.52%
Precision: 67.25%
F1 Score: 74.10%

=== Evaluation for Question 1 ===
Question: Give me a list of book of travel category
Count of files in golden_answers but missing in search: 5
Recall: 87.80%
Precision: 9.00%
F1 Score: 16.33%

=== Evaluation for Question 2 ===
Question: Give me a list of books from author Agatha Christie
Count of f

In [392]:
combined_results, q1_files =  await run_evaluation(1)
check_recall(q1_files, combined_results)

Question: Give me a list of book of travel category
embedding model: text-embedding-ada-002
['Give', 'me', 'a', 'list', 'of', 'book', 'of', 'travel', 'category']
Count of files in q1.txt but missing in search: 5
Recall: 87.80%
Files in q1.txt but missing in search:
book_1_000_Places_to_See_Before_You_Die.csv
book_Cuba_:_Brendan_Sainsbury.csv
book_Lady_on_the_Hill.csv
book_Neither_Here_Nor_There:.csv
book_Notes_from_a_Small_Island.csv


In [376]:
combined_results, q1_files =  await run_evaluation(8)
check_recall(q1_files, combined_results)

Question: Give me a list of students graduated in the year 2025
embedding model: text-embedding-ada-002
['Give', 'me', 'a', 'list', 'of', 'students', 'graduated', 'in', 'the', 'year', '2025']
Count of files in q1.txt but missing in search: 0
Recall: 100.00%
Files in q1.txt but missing in search:


In [372]:
combined_results, q1_files =  await run_evaluation(5)
check_recall(q1_files, combined_results)

Question: Give me wines tasted by Roger Voss
embedding model: text-embedding-ada-002
['Give', 'me', 'wines', 'tasted', 'by', 'Roger', 'Voss']
Count of files in q1.txt but missing in search: 81
Recall: 17.35%
Files in q1.txt but missing in search:
wine_A._Margaine_NV_Le_Brut_Premier_Cru__(Champagne).csv
wine_Alphonse_Mellot_2014_Génération_Dix-Neuf__(Sancerre).csv
wine_Armand_de_Brignac_NV_Blanc_de_Blancs_Brut_Chardonnay_(Champagne).csv
wine_Cartuxa_2011_Pêra-Manca_Red_(Alentejo).csv
wine_Casa_Ferreirinha_2004_Barca_Velha_Red_(Douro).csv
wine_Casa_Ferreirinha_2007_Reserva_Especial_Red_(Douro).csv
wine_Casa_Ferreirinha_2008_Barca-Velha_Red_(Douro).csv
wine_Castello_Banfi_1997__Brunello_di_Montalcino.csv
wine_Château_Beauséjour_2012__Puisseguin_Saint-Émilion.csv
wine_Château_Bellevue_la_Forêt_2012_Red_(Fronton).csv
wine_Château_Beychevelle_2014_Amiral_de_Beychevelle__(Saint-Julien).csv
wine_Château_Bouscaut_2013__Pessac-Léognan.csv
wine_Château_Bréhat_2013__Castillon_Côtes_d

In [115]:
embedding_result = await openai_helper.generate_embeddings([questions[0]], model=embedding_model)
search_query_embedded = embedding_result[0].embedding
container_name = "books002" 
embedding_model = "text-embedding-ada-002" # text-embedding-ada-002
items_vector = await get_vector_search_results(container_name,embedding_model, questions[0], top_k=400, threshold=0.6)
print(f"Total items found: {len(items_vector)}")
for item in items_vector:
    print(item)

Total items found: 400
{'fileName': 'book_New_York_2000.csv', 'textSimilarityScore': 0.8291248783696955}
{'fileName': 'book_The_Best_American_Mystery_Stories_2004.csv', 'textSimilarityScore': 0.8281981356883056}
{'fileName': 'book_The_Best_American_Travel_Writing_2006.csv', 'textSimilarityScore': 0.8236891674129334}
{'fileName': 'book_The_Best_American_Science_Writing_2002.csv', 'textSimilarityScore': 0.823666388900886}
{'fileName': 'book_My_Century.csv', 'textSimilarityScore': 0.8235646759203652}
{'fileName': 'book_The_Science_Book.csv', 'textSimilarityScore': 0.8223925593132668}
{'fileName': 'book_Ten_Great_Works_of_Philosophy.csv', 'textSimilarityScore': 0.8214092618962769}
{'fileName': 'book_The_Best_American_Sports_Writing_2003.csv', 'textSimilarityScore': 0.8202521211021323}
{'fileName': 'book_The_Best_American_Crime_Writing_2006.csv', 'textSimilarityScore': 0.8193680698975054}
{'fileName': 'book_The_Best_American_Sports_Writing_2006.csv', 'textSimilarityScore': 0.818735283585479

In [114]:
items_fts = await get_fulltext_search_results(container, questions[0], top_k=400)
print(f"Total items found: {len(items_fts)}")
for item in items_fts:
    print(item)


Total items found: 400
{'fileName': "book_CliffsNotes_on_Dstoevsky's_Crime_and_Punishment.csv"}
{'fileName': 'news_Nasdaq_planning_$100m_share_sale.csv'}
{'fileName': 'news_Nasdaq_planning_$100m-share_sale.csv'}
{'fileName': 'book_Pink_Box.csv'}
{'fileName': 'book_The_Outlandish_Companion.csv'}
{'fileName': 'book_Allusions_in_Ulysses.csv'}
{'fileName': "book_Don_DeLillo's_White_Noise.csv"}
{'fileName': 'news_Little_Britain_two_top_comic_list.csv'}
{'fileName': 'book_Nathaniel_Hawthorne,_The_Scarlet_Letter.csv'}
{'fileName': 'book_Crucible_of_War.csv'}
{'fileName': 'book_Juiced.csv'}
{'fileName': "book_Anita_Diamant's_The_Red_Tent.csv"}
{'fileName': 'book_Michael_W._Smith.csv'}
{'fileName': 'book_Edgar_Allan_Poe,_A_to_Z.csv'}
{'fileName': 'book_Dark_Rivers_of_the_Heart_Intensity_Sole_Survivor.csv'}
{'fileName': 'book_The_Eclogues_of_Virgil.csv'}
{'fileName': 'news_UKIP_could_sue_Veritas_defectors.csv'}
{'fileName': 'book_The_Pirate_Dictionary.csv'}
{'fileName': 'book_Plato,_Not_Prozac!.

In [391]:
combined_results = combine_search_results(items_vector, items_fts)

print(f"Total items found: {len(combined_results)}")
for item in combined_results:
    print(item)

Total items found: 800
{'fileName': 'book_..._The_Circulation.csv'}
{'fileName': 'book_1,000_Places_to_See_Before_You_Die.csv'}
{'fileName': 'book_20,000_Leagues_Under_the_Sea.csv'}
{'fileName': 'book_2061.csv'}
{'fileName': 'book_A_Dangerous_Fortune.csv'}
{'fileName': 'book_A_First_Course_in_String_Theory.csv'}
{'fileName': 'book_A_Journal.csv'}
{'fileName': 'book_A_Picture_Book_of_Thomas_Jefferson.csv'}
{'fileName': 'book_A_Slipping-down_Life.csv'}
{'fileName': 'book_A_guided_tour_of_five_works_by_Plato.csv'}
{'fileName': 'book_About_a_Boy.csv'}
{'fileName': "book_About_the_B'nai_Bagels.csv"}
{'fileName': 'book_Aeneid.csv'}
{'fileName': 'book_Agile_Web_Development_with_Rails.csv'}
{'fileName': 'book_Alfred_Hitchcock.csv'}
{'fileName': 'book_Allusions_in_Ulysses.csv'}
{'fileName': 'book_Among_the_Dolls.csv'}
{'fileName': 'book_An_American_Plague.csv'}
{'fileName': 'book_An_Autobiography.csv'}
{'fileName': 'book_Babar_the_King.csv'}
{'fileName': "book_Baby's_Animal_Friends.csv"}
{'file

  union_items = [{'fileName': name} for name in sorted(union_file_names)]
  union_items = [{'fileName': name} for name in sorted(union_file_names)]


In [366]:
question_index = 5
print(f"Question: {questions[question_index]}")
with open(os.path.join("questions", f'question{question_index + 1}.txt'), 'r') as file:
    q1_lines = [line.strip() for line in file if line.strip()]

# Flatten multi-line entries (like Janet_My_Mother_and_Me.csv)
from itertools import groupby

q1_files = []
temp = []
for line in q1_lines:
    temp.append(line.replace("txt", "csv"))
    if line.endswith('.txt'):
        q1_files.append('_'.join(temp))
        temp = []
#search_query = "teenaged computer hacker"
search_query = questions[question_index]
embedding_result = await openai_helper.generate_embeddings([search_query], model="text-embedding-ada-002")
search_query_embedded = embedding_result[0].embedding
search_query_arr = search_query.split(" ")

Question: Give me wines tasted by Roger Voss
embedding model: text-embedding-ada-002


In [367]:
r1 = get_vector_search_results(container, search_query_embedded, top_k=155, threshold=0.8)
for item in r1:
    print(item)

{'fileName': 'wine_Louis_Roederer_2008_Cristal_Vintage_Brut__(Champagne).csv', 'textSimilarityScore': 0.84304347680781}
{'fileName': 'wine_Rosenhof_2014_Trockenbeerenauslese_Grüner_Veltliner_(Burgenland).csv', 'textSimilarityScore': 0.8361816666014475}
{'fileName': 'wine_Louis_Roederer_2009_Cristal_Brut__(Champagne).csv', 'textSimilarityScore': 0.8324224944158412}
{'fileName': 'wine_Vadio_Wines_2005_Vadio_Baga_(Bairrada).csv', 'textSimilarityScore': 0.831802814964231}
{'fileName': 'wine_DeMorgenzon_2016_Cape_Winemakers_Guild_Roussanne_(Stellenbosch).csv', 'textSimilarityScore': 0.8313968263748}
{'fileName': 'wine_Le_Cadeau_2015_Rocheux_Pinot_Noir_(Willamette_Valley).csv', 'textSimilarityScore': 0.8311952977837614}
{'fileName': 'wine_Vadio_Wines_2011_Vadio_Baga_(Bairrada).csv', 'textSimilarityScore': 0.8309239207051953}
{'fileName': 'wine_Roessler_2004_Wiley_Vineyard_Riesling_(Anderson_Valley).csv', 'textSimilarityScore': 0.8305074577201986}
{'fileName': 'wine_Rust_en_Vrede_2013_Cape_W

In [368]:
r2 = get_fulltext_search_results(container, search_query, top_k=155)
for item in r2:
    print(item)

['Give', 'me', 'wines', 'tasted', 'by', 'Roger', 'Voss']
{'fileName': 'student_Edward_Rogers.csv'}
{'fileName': 'folk_tales_The_Old_Woman_and_the_Wine_Jar.csv'}
{'fileName': 'folk_tales_The_Man_and_the_Wood.csv'}
{'fileName': 'folk_tales_The_Little_Omelet.csv'}
{'fileName': 'folk_tales_The_Serpent_and_the_File.csv'}
{'fileName': 'folk_tales_The_Boy_Bathing.csv'}
{'fileName': 'folk_tales_The_Fawn_and_His_Mother.csv'}
{'fileName': 'folk_tales_The_Sun_and_the_Moon.csv'}
{'fileName': 'folk_tales_The_Dolphin,_the_Whales_and_the_Sprat.csv'}
{'fileName': 'folk_tales_The_Eagle_and_the_Arrow.csv'}
{'fileName': 'folk_tales_Jupiter_and_the_Monkey.csv'}
{'fileName': 'folk_tales_The_Dog_in_the_Manger.csv'}
{'fileName': 'folk_tales_The_Man_Bitten_by_a_Dog.csv'}
{'fileName': 'folk_tales_The_Wolf_and_the_Horse.csv'}
{'fileName': 'book_Rita_Hayworth_and_Shawshank_Redemption.csv'}
{'fileName': 'book_Goodbye_Forever.csv'}
{'fileName': 'folk_tales_The_Ape.csv'}
{'fileName': 'news_Blunkett_hints_at_electio

In [369]:
combined_results = combine_search_results(r1, r2)
print(f"Total items found: {len(combined_results)}")
for item in combined_results:
    print(item)


Total items found: 310
{'fileName': 'book_..._The_Circulation.csv'}
{'fileName': 'book_2061.csv'}
{'fileName': 'book_A_Journal.csv'}
{'fileName': 'book_About_a_Boy.csv'}
{'fileName': 'book_An_Autobiography.csv'}
{'fileName': 'book_Babysitters_Club.csv'}
{'fileName': 'book_Best_Science_Fiction_Stories_of_Clifford_D._Simak.csv'}
{'fileName': 'book_Billy_Budd.csv'}
{'fileName': 'book_Black_Sugar.csv'}
{'fileName': 'book_Cast_in_Dark_Waters.csv'}
{'fileName': 'book_Collected_Stories.csv'}
{'fileName': "book_Drina's_Dancing_Year.csv"}
{'fileName': 'book_Eric.csv'}
{'fileName': 'book_Exodus.csv'}
{'fileName': 'book_Fantastic_Four.csv'}
{'fileName': 'book_Frankenstein__City_of_Night__A_Novel.csv'}
{'fileName': 'book_Goodbye_Forever.csv'}
{'fileName': "book_Gravity's_Rainbow.csv"}
{'fileName': 'book_Her_Little_Secret.csv'}
{'fileName': 'book_I,_Claudius_and_Claudius_the_God.csv'}
{'fileName': 'book_In_the_Country_of_Last_Things.csv'}
{'fileName': 'book_Lawful_Possession.csv'}
{'fileName': 'boo

In [370]:
q1_set = set(q1_files)
item_files_set = {item['fileName'] for item in combined_results}

missing_in_items = q1_set - item_files_set
extra_in_items = item_files_set - q1_set

print("Count of files in q1.txt but missing in search:", len(missing_in_items))

# correct recall calculation
recall = (len(q1_set) - len(missing_in_items)) / len(q1_set) * 100
print(f"Recall: {recall:.2f}%")


Count of files in q1.txt but missing in search: 81
Recall: 17.35%


In [320]:
#%%skip

items = container.query_items( 
query="""SELECT top 200 c.fileName, VectorDistance(c.textVector, @embedding) AS textSimilarityScore 
FROM c
WHERE VectorDistance(c.textVector, @embedding) > 0.7
ORDER BY VectorDistance(c.textVector, @embedding) 
""", 
parameters=[ 
  {"name": "@embedding", "value": search_query_embedded} 
 ], 
 enable_cross_partition_query=True)


for item in items:
    print(item)

{'fileName': 'wine_Il_Colle_2010__Brunello_di_Montalcino.csv', 'textSimilarityScore': 0.8451014622349144}
{'fileName': 'wine_Agricoltori_del_Chianti_Geografico_2005_Riserva_Montegiachi__(Chianti_Classico).csv', 'textSimilarityScore': 0.8427534398998414}
{'fileName': 'wine_Biondi_Santi_2011__Brunello_di_Montalcino.csv', 'textSimilarityScore': 0.8393854530631428}
{'fileName': "wine_Masciarelli_2005_Valori__(Montepulciano_d'Abruzzo).csv", 'textSimilarityScore': 0.8382929831546151}
{'fileName': 'wine_Armilla_2012__Brunello_di_Montalcino.csv', 'textSimilarityScore': 0.8382018668116449}
{'fileName': "wine_Mascarello_Giuseppe_e_Figlio_2008_Cà_d'Morissio_Riserva__(Barolo).csv", 'textSimilarityScore': 0.8381985831778174}
{'fileName': 'wine_Antonio_Caggiano_2004_Vigna_Macchia_dei_Goti__(Taurasi).csv', 'textSimilarityScore': 0.837984985119633}
{'fileName': 'wine_Casina_di_Cornia_2011__Chianti_Classico.csv', 'textSimilarityScore': 0.8378947473214258}
{'fileName': 'wine_Tassi_2012__Brunello_di_Mon

In [296]:


questions = ["Give me a list of books published in the year 2000",
"Give me a list of book of travel category",
"Give me a list of books from author Agatha Christie",
"Give me some haunted incidents from california state",
"Give me some wines found in Italy",
"Give me wines tasted by Roger Voss",
"Give me some wines in the variety of Red Blend",
"Give me some business category news",
"Give me a list of students graduated in the year 2025"]

In [297]:
%%skip
import os
question_index = 1

print(f"Question: {questions[question_index]}")
with open(os.path.join("questions", f'question{question_index + 1}.txt'), 'r') as file:
    q1_lines = [line.strip() for line in file if line.strip()]

# Flatten multi-line entries (like Janet_My_Mother_and_Me.csv)
from itertools import groupby

q1_files = []
temp = []
for line in q1_lines:
    temp.append(line.replace("txt", "csv"))
    if line.endswith('.txt'):
        q1_files.append('_'.join(temp))
        temp = []
search_query = questions[question_index]
embedding_result = await openai_helper.generate_embeddings([search_query], model="text-embedding-ada-002")
search_query_embedded = embedding_result[0].embedding
search_query_arr = search_query.split(" ")
print(search_query_arr)

In [298]:
%%skip

print(search_query_arr)
query_string = f"""
SELECT TOP 400 c.fileName
FROM c
ORDER BY RANK FullTextScore(c.text, {search_query_arr})
"""

items = container.query_items(
    query=query_string,
    parameters=[
        #{"name": "@searchTerms", "value": search_query_arr}
    ],
    enable_cross_partition_query=True
)

item_files = [item for item in items]

for item in item_files:
    print(item)


In [299]:
print(len(q1_files))

181


### Hybrid Search

In [None]:
%%skip
print(f"query:{search_query}")
items = container.query_items( 
query=f"""SELECT top 50 c.fileName
FROM c

ORDER BY RANK RRF(VectorDistance(c.textVector, {search_query_embedded}),  FullTextScore(c.text, {search_query_arr}))
""", 
parameters=[ 
  #{"name": "@embedding", "value": search_query_embedded},
  #{"name": "@search_query_arr", "value": search_query_arr} 
 ], 
 enable_cross_partition_query=True)

item_files = [item for item in items]

print(f"Number of files in database: {len(item_files)}")


## Evaluation

In [301]:
q1_set = set(q1_files)
item_files_set = {item['fileName'] for item in combined_results}

missing_in_items = q1_set - item_files_set
extra_in_items = item_files_set - q1_set

print("Count of files in q1.txt but missing in search:", len(missing_in_items))

# recall percentage
recall = len(q1_set) / (len(q1_set) + len(missing_in_items)) * 100
print(f"Recall: {recall:.2f}%")
print("Files in q1.txt but missing in search:")
for f in sorted(missing_in_items):
    print(f)





Count of files in q1.txt but missing in search: 181
Recall: 50.00%
Files in q1.txt but missing in search:
wine_Agricoltori del Chianti Geografico 2005 Riserva Montegiachi  (Chianti Classico).csv
wine_Aldegheri 1995  Amarone della Valpolicella.csv
wine_Antichi Vigneti di Cantalupo 2006 Collis Breclemae  (Ghemme).csv
wine_Antinori 1999 Guado Al Tasso  (Bolgheri).csv
wine_Antonio Caggiano 2004 Vigna Macchia dei Goti  (Taurasi).csv
wine_Armilla 2012  Brunello di Montalcino.csv
wine_Arnaldo Caprai 2007 25 Anni  (Sagrantino di Montefalco).csv
wine_Arpepe 2007 Sassella Vigna Regina Riserva  (Valtellina Superiore).csv
wine_Attilio Ghisolfi 2011 Bussia Bricco Visette  (Barolo).csv
wine_Avignonesi 1995 Occhio di Pernice  (Vin Santo di Montepulciano).csv
wine_Baricci 2010 Nello Riserva  (Brunello di Montalcino).csv
wine_Basilisco 2012 Storico  (Aglianico del Vulture).csv
wine_Benanti 2014 Rosso  (Etna).csv
wine_Beni di Batasiolo 2000 Vigneto Boscareto  (Barolo).csv
wine_Biondi Santi 2007 Riserva 

In [302]:
%%skip
print("Count of files in search but missing in q1.txt:", len(extra_in_items))
print("Files in search but missing in q1.txt:")
for f in sorted(extra_in_items):
    print(f)