In [1]:
!pip install sentence_transformers
!pip install pymongo==4.11.2
!pip install numpy
!pip install einops



In [4]:
from sentence_transformers import SentenceTransformer
from pymongo import MongoClient
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os

# The MongoDB connection string embeds a username and password, so it must not
# live in the notebook source. It is read from the MONGODB_URI environment
# variable instead, and the cell fails fast with a clear message when it is unset.
mongo_uri = os.environ.get("MONGODB_URI")
if not mongo_uri:
    raise RuntimeError(
        "MONGODB_URI is not set; export the MongoDB connection string before running this notebook."
    )

# Shared handles used by every helper defined in this notebook.
client = MongoClient(mongo_uri)
db = client['web_crawler']
collection = db['crawl_data_angular']

def generate_embedding(text, model_name, length, model):
    """Encode ``text`` with ``model`` and return the vector as a plain list.

    Args:
        text: Input passed straight to ``model.encode``.
        model_name: Name of the embedding model. For
            ``"nomic-ai/nomic-embed-text-v2-moe"`` the call additionally passes
            ``prompt_name="passage"``.
        length: Expected ``len()`` of the encoded result.
        model: Object exposing ``encode``; its result must support ``len()``
            and ``.tolist()``.

    Returns:
        The result of ``.tolist()`` on the encoded value.

    Raises:
        ValueError: If the encoded result's length differs from ``length``.
    """
    # Only the nomic model is called with an explicit prompt name.
    encode_kwargs = (
        {"prompt_name": "passage"}
        if model_name == "nomic-ai/nomic-embed-text-v2-moe"
        else {}
    )
    embedding = model.encode(text, **encode_kwargs)

    if len(embedding) == length:
        return embedding.tolist()

    raise ValueError(f"Embedding length {len(embedding)} does not match expected length {length}.")

def upsert_embedding_to_mongo(asset_id, index_name, embedding):
    """Store ``embedding`` under ``vectors.<index_name>`` on one document.

    The target document is selected by ``_id == asset_id`` in the module-level
    ``collection`` and the field is assigned with ``$set``. No ``upsert`` flag
    is passed to ``update_one``.

    Args:
        asset_id: ``_id`` of the document to update.
        index_name: Key under ``vectors`` that receives the embedding.
        embedding: Value written to that key.
    """
    selector = {"_id": asset_id}
    change = {"$set": {f"vectors.{index_name}": embedding}}
    collection.update_one(selector, change)
    
def process_and_store_embedding(asset_id, text, model_name, index_name, length, model):
    """Embed ``text`` and write the vector to the document ``asset_id``.

    Failures are reported on stdout instead of being raised, so one bad
    document does not stop a batch. The outcome is returned so callers can
    tell a stored embedding from a reported failure.

    Args:
        asset_id: ``_id`` of the document that receives the embedding.
        text: Text to embed.
        model_name: Embedding model name, forwarded to ``generate_embedding``.
        index_name: Key under ``vectors`` where the embedding is stored.
        length: Expected embedding length, forwarded to ``generate_embedding``.
        model: Model object, forwarded to ``generate_embedding``.

    Returns:
        bool: ``True`` when the embedding was generated and written,
        ``False`` when an error was caught and printed.
    """
    try:
        embedding = generate_embedding(text, model_name, length, model)
        upsert_embedding_to_mongo(asset_id, index_name, embedding)
    except ValueError as e:
        # Raised by generate_embedding on an embedding-length mismatch.
        print(f"Error processing asset_id {asset_id}: {e}")
        return False
    except Exception as e:
        # Anything else (encoding or database errors) is reported the same way.
        print(f"Unexpected error for asset_id {asset_id}: {e}")
        return False
    return True
        
def add_new_index_to_mongo(model_name, index_name, embedding_length, model):
    """Compute and store an embedding for every document in the collection.

    The text that gets embedded depends on ``index_name``: names starting with
    ``"summary"`` embed the document's ``summary`` field, every other name
    embeds its ``html`` field. Documents lacking that field are skipped with a
    message instead of aborting the run.

    Args:
        model_name: Embedding model name, forwarded to
            ``process_and_store_embedding``.
        index_name: Key under ``vectors`` where each embedding is stored.
        embedding_length: Expected embedding length for this model.
        model: Model object used to encode the text.
    """
    # The source field depends only on the index name, so pick it once.
    text_field = 'summary' if index_name.startswith("summary") else 'html'

    # Fetch only the field that will be embedded (``_id`` is returned by
    # default). Documents may also carry previously stored vectors and other
    # large fields that are not read here. The result is materialised up front
    # because the same documents are updated while looping.
    documents = list(collection.find({}, {text_field: 1}))

    progress = 0
    total_documents = len(documents)

    for doc in documents:
        asset_id = doc['_id']

        text = doc.get(text_field)
        if text is None:
            print(f"Skipping asset_id {asset_id} due to missing text.")
            continue

        # process_and_store_embedding catches and prints its own errors, so no
        # exception handling is needed here. An explicit False return marks a
        # failed document; any other return value (including None) counts as
        # completed.
        stored = process_and_store_embedding(asset_id, text, model_name, index_name, embedding_length, model)
        if stored is not False:
            progress += 1
            print(f"Completed: {progress}/{total_documents}", end='\r')
    print("Finished Processing All Documetns for Index:", index_name)
            
def delete_index_from_mongo(index_name):
    """Remove the ``vectors.<index_name>`` field from every document.

    Issues a single ``update_many`` with an empty filter against the
    module-level ``collection``, applying ``$unset`` to the field.

    Args:
        index_name: Key under ``vectors`` to remove.
    """
    every_document = {}
    removal = {"$unset": {f"vectors.{index_name}": ""}}
    collection.update_many(every_document, removal)

In [None]:
import requests
from pymongo import MongoClient

import os

# The MongoDB connection string embeds a username and password, so it must not
# live in the notebook source. It is read from the MONGODB_URI environment
# variable instead, and the cell fails fast with a clear message when it is unset.
mongo_uri = os.environ.get("MONGODB_URI")
if not mongo_uri:
    raise RuntimeError(
        "MONGODB_URI is not set; export the MongoDB connection string before running this notebook."
    )

# Handles used by the summary-generation code in this cell.
client = MongoClient(mongo_uri)
db = client['web_crawler']
collection = db['crawl_data_angular']

def generate_summary(doc):
    """Ask the local LLM endpoint for a summary of ``doc`` and store it.

    The document's ``html`` field is sent, together with the summarisation
    instructions, to ``http://localhost:11434/api/generate`` using the
    ``llama3.2`` model. The returned text is written to the document's
    ``summary`` field in the module-level ``collection``.

    If the request is unsuccessful or the model returns no text, nothing is
    written and a message is printed, so the document still has no summary
    and can be retried later.

    Args:
        doc: Document mapping; must contain ``_id`` and may contain ``html``.
    """
    # These instructions must be part of the prompt itself. As a bare string at
    # the top of the function they would only be a docstring and never reach
    # the model.
    instructions = (
        "You are a technical assistant for Angular.js documentation. "
        "Based on the following content extracted from the Angular.js website, "
        "please generate a clear, concise summary of the provided page from the "
        "Angular Documentation. This summary should consist of around 150 words. "
        "It should detail all of the important features on the page, the function "
        "of each feature and all of the specific references that are detailed in "
        "depth in the page.\n\n"
    )

    html_content = doc.get('html', '')

    prompt = f"{instructions}Documentation:\n{html_content}\n"

    response = requests.post("http://localhost:11434/api/generate", json={
        "model": "llama3.2",
        "prompt": prompt,
        "stream": False
    })

    if not response.ok:
        # Do not store anything for a failed request; an empty summary would
        # look like a finished document.
        print(f"Summary request failed for document {doc['_id']} (HTTP {response.status_code}).")
        return

    markdown_text = response.json().get("response", "")
    if not (markdown_text or "").strip():
        print(f"No summary text returned for document {doc['_id']}; leaving it without a summary.")
        return

    collection.update_one(
        {"_id": doc['_id']},
        {"$set": {"summary": markdown_text}}
    )
    
# Snapshot of the collection used to drive the loop below.
# NOTE(review): only `_id` is read from these entries because each document is
# re-fetched inside the loop; a `{"_id": 1}` projection would cut memory use if
# no later cell relies on `documents` holding full documents.
documents = list(collection.find())

print("Generating summaries for documents...")
total_documents = len(documents)
progress = 0
for listed_doc in documents:
    # Count first so the display runs from 1/N to N/N.
    progress += 1
    print(f"Processing document {progress}/{total_documents}...", end='\r')

    # Re-read the current state of the document (presumably so a summary
    # written after the snapshot was taken is not generated a second time).
    doc = collection.find_one({"_id": listed_doc['_id']})
    if doc is None:
        # The document no longer exists; nothing to summarise.
        continue

    if doc.get('summary') is None:
        generate_summary(doc)

print("Finished generating summaries for all documents.")
    

Generating summaries for documents...
Finished generating summaries for all documents.


In [None]:
# One entry per embedding model: (model_name, index_name, embedding_length).
# Add a row here to index the collection with another model.
EMBEDDING_MODELS = [
    ('nomic-ai/nomic-embed-text-v2-moe', "nomic-embed-text-v2", 768),
    ('all-MiniLM-L6-v2', "all-MiniLM-L6-v2", 384),
    ('paraphrase-MiniLM-L6-v2', "paraphrase-MiniLM-L6-v2", 384),
    ('all-distilroberta-v1', "all-distilroberta-v1", 768),
]

for model_name, index_name, embedding_length in EMBEDDING_MODELS:
    # The summary index shares the model's index name with a "summary-" prefix;
    # add_new_index_to_mongo uses that prefix to pick the summary text.
    summary_index_name = f"summary-{index_name}"

    # NOTE(review): trust_remote_code=True is passed for every model, as
    # before; confirm which models actually need it.
    model = SentenceTransformer(model_name, trust_remote_code=True)

    # Embed the raw page content first, then the generated summary.
    add_new_index_to_mongo(model_name, index_name, embedding_length, model)
    add_new_index_to_mongo(model_name, summary_index_name, embedding_length, model)

Finished Processing All Documetns for Index: summary-nomic-embed-text-v2
Finished Processing All Documetns for Index: summary-all-MiniLM-L6-v2
Finished Processing All Documetns for Index: summary-paraphrase-MiniLM-L6-v2
Finished Processing All Documetns for Index: summary-all-distilroberta-v1
