In [9]:
import os
from typing import List
from pydantic import BaseModel
from pymongo import MongoClient
import requests

NEXT_JS_URL = "https://nextjs.org"

class SearchRequest(BaseModel):
    """Parameters describing a search request."""
    # NOTE(review): this model is not referenced by the other code visible in
    # this cell; the field meanings below are inferred from their names and
    # should be confirmed against the service that consumes it.
    query: str  # the search text
    index_name: str  # presumably the name of the index to search
    top_k: int  # presumably the maximum number of results to return
    
class SearchResponseItem(BaseModel):
    """A single hit carried inside a ``SearchResponse``."""
    id: str  # identifier of the matched document (get_next_js_results stores the vector id here)
    url: str  # full address of the matched page
    title: str  # title of the hit
    text: str  # text content of the hit
    score: float  # score attached to the hit (get_next_js_results fills in 0.0 as a placeholder)
    
class SearchResponse(BaseModel):
    """The hits produced for one query, together with the query itself."""
    query: str  # the query string the results were produced for
    results: List[SearchResponseItem]  # the individual hits
    vector_model: str  # label of the search backend (get_next_js_results sets "NEXT_JS_SEARCH")

# Allows the Next.js search results to be mapped to their vector ids.
def get_document_uid_map() -> dict:
    """Build a lookup from crawled page path to vector id.

    Connects to the MongoDB deployment named by the ``MONGO_DB`` environment
    variable and reads every document of ``web_crawler.crawl_data``.  When a
    document's URL contains ``NEXT_JS_URL`` the key is the part of the URL
    that follows its first occurrence; any other URL is used as the key
    unchanged.

    Returns:
        A dict mapping each path (or full URL) to that document's
        ``vector_id``.  If several documents yield the same key, the last
        one read wins.

    Raises:
        KeyError: If a document has no ``url`` or no ``vector_id`` field.
    """
    # The context manager closes the client's connections as soon as the map
    # has been built instead of leaving them open for the process lifetime.
    with MongoClient(os.getenv("MONGO_DB")) as mongo_client:
        crawl_collection = mongo_client["web_crawler"]["crawl_data"]
        # Fetch only the two fields used below and stream the cursor rather
        # than copying the whole collection into a list first.
        cursor = crawl_collection.find({}, {"_id": 0, "url": 1, "vector_id": 1})

        vector_id_path_map = {}
        for document in cursor:
            url = document["url"]
            vector_id = document["vector_id"]

            # partition() keeps everything after the first occurrence of the
            # site root; split()[1] would cut the path short if the root
            # showed up again later in the URL (e.g. inside a query string).
            _, found, remainder = url.partition(NEXT_JS_URL)
            path = remainder if found else url

            vector_id_path_map[path] = vector_id

    return vector_id_path_map

def get_next_js_results(query: str, path_map: dict, timeout: float = 10.0) -> SearchResponse:
    """Search the Next.js docs through Algolia and attach vector ids to the hits.

    Args:
        query: Text to search for.
        path_map: Mapping from a hit's ``path`` value to the vector id stored
            for that page (for example the dict built by
            ``get_document_uid_map``).
        timeout: Seconds to wait for Algolia before giving up.

    Returns:
        A ``SearchResponse`` with one item per hit whose path is present in
        ``path_map``, in the order Algolia returned them.  Hits with an
        unknown path are reported on stdout and skipped.  Every item gets a
        score of 0.0 and the response is labelled ``"NEXT_JS_SEARCH"``.

    Raises:
        requests.HTTPError: If Algolia answers with a non-success status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
    """
    base_url = "https://nntahqi9c5-dsn.algolia.net/1/indexes/*/queries"
    parameters = {
        "x-algolia-agent": "Algolia for JavaScript (4.10.5); Browser (lite)",
        "x-algolia-api-key": "948b42d1edd177a55c6d6ae8dab24621",
        "x-algolia-application-id": "NNTAHQI9C5",
    }
    headers = {
        "Content-Type": "application/json",
    }
    payload = {
        "requests": [
            {
                "indexName": "nextjs_docs_canary",
                "query": query,
                "params": "filters=isApp%3Atrue",
            }
        ]
    }

    # Without a timeout a stalled connection would block forever.
    http_response = requests.post(
        base_url,
        headers=headers,
        params=parameters,
        json=payload,
        timeout=timeout,
    )
    # Fail with the HTTP error itself rather than a KeyError on 'results'
    # further down when Algolia rejects the request.
    http_response.raise_for_status()

    hits = http_response.json()["results"][0]["hits"]
    search_response_items = []

    for hit in hits:
        path = hit["path"]
        vector_id = path_map.get(path)
        if vector_id is None:
            print(f"Path not found in path_map: {path}")
            continue
        search_response_items.append(SearchResponseItem(
            id=vector_id,
            url=NEXT_JS_URL + path,
            title=hit["title"],
            text=hit["content"],
            score=0.0,  # Placeholder for score
        ))

    return SearchResponse(
        query=query,
        results=search_response_items,
        vector_model="NEXT_JS_SEARCH",
    )

# Build the path -> vector id lookup; this reads the MONGO_DB environment
# variable and queries MongoDB.
path_map = get_document_uid_map()
# Look at the known paths.  NOTE(review): the recorded cell output does not
# show this value, so the expression appears to have no visible effect here.
path_map.keys()

# Run a sample query against the Next.js docs search; as the last expression
# of the cell, the returned SearchResponse is what gets displayed.
query = "cookies"
get_next_js_results(query, path_map)

Path not found in path_map: /docs/app/guides
Path not found in path_map: /docs/app/guides
Path not found in path_map: /docs/app/guides


SearchResponse(query='cookies', results=[SearchResponseItem(id='22978d4b-c735-482b-8385-a7c3f47cc721', url='https://nextjs.org/docs/app/api-reference/functions/cookies', title='cookies', text='cookies is an async function that allows you to read the HTTP incoming request cookies in Server Components, and read/write outgoing request cookies in Server Actions or Route Handlers.', score=0.0), SearchResponseItem(id='22978d4b-c735-482b-8385-a7c3f47cc721', url='https://nextjs.org/docs/app/api-reference/functions/cookies', title='cookies', text='Reference', score=0.0), SearchResponseItem(id='22978d4b-c735-482b-8385-a7c3f47cc721', url='https://nextjs.org/docs/app/api-reference/functions/cookies', title='cookies', text='Methods', score=0.0), SearchResponseItem(id='22978d4b-c735-482b-8385-a7c3f47cc721', url='https://nextjs.org/docs/app/api-reference/functions/cookies', title='cookies', text='The following methods are available:', score=0.0), SearchResponseItem(id='611fcffe-2b9d-4f56-9c8b-125b418