In [None]:
import os
from typing import List
from pydantic import BaseModel
from pymongo import MongoClient
import requests

class SearchRequest(BaseModel):
    query: str
    index_name: str
    top_k: int
    
class SearchResponseItem(BaseModel):
    id: str
    url: str
    title: str
    text: str
    score: float
    
class SearchResponse(BaseModel):
    query: str
    results: List[SearchResponseItem]
    vector_model: str

# allow you to map the next js results to the vector ids
def get_document_uid_map():
    mongo_db_secret = os.getenv("MONGO_DB")
    mongo_client = MongoClient(mongo_db_secret)
    db = mongo_client["web_crawler"]
    crawl_collection = db["crawl_data_angular"]
    
    documents = list(crawl_collection.find({}))
    vector_id_path_map = {}
    for document in documents:
        url = document["url"]
        vector_id = document["vector_id"]
        vector_id_path_map[url] = vector_id
        
    return vector_id_path_map

def get_next_js_results(query: str, path_map: dict) -> SearchResponse:
    base_url = "https://l1xwt2uj7f-dsn.algolia.net/1/indexes/*/queries"
    parameters = {
        "x-algolia-agent": "Algolia for JavaScript (4.10.5); Browser (lite)",
        "x-algolia-api-key": "dfca7ed184db27927a512e5c6668b968",
        "x-algolia-application-id": "L1XWT2UJ7F"
    }

    headers = {
        "Content-Type": "application/json"
    }

    payload = {
        "requests":[
            {
                "indexName":"angular_v17",
                "type": "default",
                "query": query,
            }
        ]
    }

    response = requests.post(base_url, headers=headers, params=parameters, json=payload).json()
    
    hits = response['results'][0]['hits']
    print(hits)
    search_response_items = []
    

    for hit in hits:
        vector_id = path_map.get(hit["url"].split("#")[0], None)
        if vector_id is None:
            print(f"Path not found in path_map: {hit['url']}")
            continue
        
        content = hit.get("content", "failed to get content")
        content = str(content)
        search_response_items.append(SearchResponseItem(
            id=vector_id,
            url=hit['url'],
            title=content,
            text=content,
            score=0.0 # Placeholder for score
        ))
    
    search_response = SearchResponse(
        query=query,
        results=search_response_items,
        vector_model="ANGULAR_JS_SEARCH"
    )
    
    return search_response


In [None]:
path_map = get_document_uid_map()

dict_keys(['https://angular.dev', 'https://angular.dev/license', 'https://angular.dev/', 'https://angular.dev/tutorials', 'https://angular.dev/playground', 'https://angular.dev/tutorials/learn-angular', 'https://angular.dev/press-kit', 'https://angular.dev/roadmap', 'https://angular.dev/overview', 'https://angular.dev/api', 'https://angular.dev/tools/language-service', 'https://angular.dev/guide/testing', 'https://angular.dev/best-practices/security', 'https://angular.dev/tools/cli/build', 'https://angular.dev/tools/cli', 'https://angular.dev/guide/ssr', 'https://angular.dev/tools/libraries', 'https://angular.dev/guide/forms/typed-forms', 'https://angular.dev/reference/errors', 'https://angular.dev/guide/templates/binding', 'https://angular.dev/guide/directives/directive-composition-api', 'https://angular.dev/tools/devtools', 'https://angular.dev/guide/image-optimization', 'https://angular.dev/guide/i18n', 'https://angular.dev/guide/forms', 'https://angular.dev/reference/extended-diagn

In [32]:
query = "cookies"
get_next_js_results(query, path_map).results

[{'version': '', 'tags': [], 'url': 'https://angular.dev/api/common/http/HttpRequest#withCredentials', 'url_without_variables': 'https://angular.dev/api/common/http/HttpRequest#withCredentials', 'url_without_anchor': 'https://angular.dev/api/common/http/HttpRequest', 'anchor': 'withCredentials', 'content': 'Whether this request should be sent with outgoing credentials (cookies).', 'content_camel': 'Whether this request should be sent with outgoing credentials (cookies).', 'lang': 'en', 'language': 'en', 'type': 'content', 'no_variables': False, 'pathname': '/api/common/http/HttpRequest', 'weight': {'pageRank': 90, 'level': 0, 'position': 15}, 'hierarchy': {'lvl0': 'Reference', 'lvl1': 'HttpRequest', 'lvl2': 'API', 'lvl3': 'withCredentials', 'lvl4': None, 'lvl5': None, 'lvl6': None}, 'recordVersion': 'v3', 'objectID': '15-https://angular.dev/api/common/http/HttpRequest', '_highlightResult': {'content': {'value': 'Whether this request should be sent with outgoing credentials (<em>cookies

[SearchResponseItem(id='e1dea90d-bb3c-4144-9a25-694f768c2ac3', url='https://angular.dev/api/common/http/HttpRequest#withCredentials', title='Whether this request should be sent with outgoing credentials (cookies).', text='Whether this request should be sent with outgoing credentials (cookies).', score=0.0),
 SearchResponseItem(id='ec2aa10e-f3a8-4fcc-9631-4c5e1c0e400b', url='https://angular.dev/api/common/http/HttpResourceRequest#withCredentials', title='Specifies whether the withCredentials flag should be set on the outgoing request.\r\nThis flag causes the browser to send cookies and other authentication information along with\nthe request.', text='Specifies whether the withCredentials flag should be set on the outgoing request.\r\nThis flag causes the browser to send cookies and other authentication information along with\nthe request.', score=0.0),
 SearchResponseItem(id='67c4758f-c0a3-4ddd-8703-0500fbd734d7', url='https://angular.dev/guide/hybrid-rendering#accessing-request-and-res

In [None]:
from pymongo import MongoClient
import uuid

client = MongoClient('mongodb+srv://bxrodgers1:CS4675@cluster0.6u3n5.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0')
db = client['web_crawler']
crawl_collection = db['crawl_data_angular']
test_collection = db['generated_qa_angular']

test_documents = list(test_collection.find({}))

## update the qa documents to match the test data format
for test_doc in test_documents:
    src_doc = crawl_collection.find_one({"_id": test_doc['source_id']})
    
    pointer_vector_id = src_doc['vector_id']
    new_test_id = str(uuid.uuid4())
    query = test_doc['question']
    answers = [pointer_vector_id]
    
    test_collection.update_one(
        {"_id" : test_doc['_id'] },
        { "$set" : {
            "answers": answers,
            "testId": new_test_id,
            "query": query
        }}
    )