In [None]:
!pip install sentence_transformers
!pip install pymongo==4.11.2
!pip install numpy
!pip install einops

In [1]:
from sentence_transformers import SentenceTransformer
from pymongo import MongoClient
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# The MongoDB connection string (which contains the username and password) is
# read from the MONGODB_URI environment variable so that credentials are never
# stored in the notebook. Export MONGODB_URI before running this cell.
# NOTE(review): the password that used to be embedded here has been exposed in
# the notebook history and should be rotated.
import os

client = MongoClient(os.environ["MONGODB_URI"])
db = client['web_crawler']
collection = db['crawl_data_angular']

def generate_embedding(text, model_name, length, model):
    """Encode ``text`` with ``model`` and return the vector as a plain list.

    Args:
        text: The text to embed.
        model_name: Name of the embedding model. When it is
            "nomic-ai/nomic-embed-text-v2-moe" the text is encoded with
            ``prompt_name="passage"``; every other name uses the model's
            default ``encode`` call.
        length: Expected number of entries in the embedding.
        model: Object exposing ``encode``; its result must support ``len`` and
            ``tolist``.

    Returns:
        The embedding converted with ``tolist()``.

    Raises:
        ValueError: If the embedding does not have ``length`` entries.
    """
    uses_passage_prompt = model_name == "nomic-ai/nomic-embed-text-v2-moe"
    encode_options = {"prompt_name": "passage"} if uses_passage_prompt else {}
    vector = model.encode(text, **encode_options)

    actual_length = len(vector)
    if actual_length == length:
        return vector.tolist()

    raise ValueError(f"Embedding length {actual_length} does not match expected length {length}.")

def upsert_embedding_to_mongo(asset_id, index_name, embedding):
    """Store ``embedding`` on one document under ``vectors.<index_name>``.

    Args:
        asset_id: ``_id`` of the document to update.
        index_name: Key inside the document's ``vectors`` sub-document.
        embedding: The value to store (a list of floats in this notebook).
    """
    selector = {"_id": asset_id}
    change = {"$set": {f"vectors.{index_name}": embedding}}
    collection.update_one(selector, change)
    
def process_and_store_embedding(asset_id, text, model_name, index_name, length, model):
    """Embed ``text`` and save the vector on the document, reporting failures.

    This is a best-effort step: any ``Exception`` raised while generating or
    storing the embedding is printed and swallowed, so the function always
    returns ``None`` and never propagates the error to its caller.

    Args:
        asset_id: ``_id`` of the document the embedding belongs to.
        text: Text to embed.
        model_name: Name of the embedding model (forwarded to ``generate_embedding``).
        index_name: Key under ``vectors`` where the embedding is stored.
        length: Expected embedding length (forwarded to ``generate_embedding``).
        model: The loaded embedding model.
    """
    try:
        vector = generate_embedding(text, model_name, length, model)
        upsert_embedding_to_mongo(asset_id, index_name, vector)
    except Exception as err:
        # ValueError gets its own message; everything else is reported as unexpected.
        if isinstance(err, ValueError):
            print(f"Error processing asset_id {asset_id}: {err}")
        else:
            print(f"Unexpected error for asset_id {asset_id}: {err}")
        
def add_new_index_to_mongo(model_name, index_name, embedding_length, model):
    """Embed every document in the collection and store it under ``vectors.<index_name>``.

    Index names starting with "summary" embed each document's ``summary``
    field; every other index embeds the ``html`` field. Documents that lack
    the relevant field are reported and skipped instead of aborting the run.

    Args:
        model_name: Name of the embedding model (forwarded to the embedding step).
        index_name: Key under ``vectors`` where each embedding is stored.
        embedding_length: Expected number of entries in each embedding.
        model: The loaded embedding model.

    The progress counter counts documents that were attempted, including ones
    whose embedding failed: ``process_and_store_embedding`` catches and prints
    any ``Exception`` itself, so no additional error handling is needed here.
    """
    source_field = 'summary' if index_name.startswith("summary") else 'html'

    # Only the field being embedded is read, so project it (``_id`` is always
    # returned) instead of pulling whole documents into memory.
    documents = list(collection.find({}, {source_field: 1}))
    total_documents = len(documents)
    progress = 0

    for doc in documents:
        asset_id = doc['_id']

        # .get() so that a missing field is skipped rather than raising KeyError.
        text = doc.get(source_field)
        if text is None:
            print(f"Skipping asset_id {asset_id} due to missing text.")
            continue

        process_and_store_embedding(asset_id, text, model_name, index_name, embedding_length, model)
        progress += 1
        print(f"Completed: {progress}/{total_documents}", end='\r')

    print("Finished Processing All Documetns for Index:", index_name)
            
def delete_index_from_mongo(index_name):
    """Remove the ``vectors.<index_name>`` field from every document in the collection.

    Args:
        index_name: Key under ``vectors`` to unset.
    """
    field_path = f"vectors.{index_name}"
    every_document = {}
    collection.update_many(every_document, {"$unset": {field_path: ""}})

In [3]:
import requests
from pymongo import MongoClient
import tiktoken

# The MongoDB connection string (which contains the username and password) is
# read from the MONGODB_URI environment variable so that credentials are never
# stored in the notebook. Export MONGODB_URI before running this cell.
import os

client = MongoClient(os.environ["MONGODB_URI"])
db = client['web_crawler']
collection = db['crawl_data_angular']

# model is still a proxy for llama3
def count_tokens(text: str, model: str = "gpt-4") -> int:
    """Return the number of tokens ``text`` occupies under ``model``'s tiktoken encoding.

    Args:
        text: The text to tokenize.
        model: Model name passed to ``tiktoken.encoding_for_model``.

    Returns:
        The length of the encoded token sequence.
    """
    encoding = tiktoken.encoding_for_model(model)
    # By default encode() raises ValueError when the text contains a
    # special-token literal such as "<|endoftext|>". Crawled pages are
    # arbitrary text, so count such literals as ordinary text instead.
    return len(encoding.encode(text, disallowed_special=()))


def generate_summary(doc):
    """Summarise one crawled page with the local Ollama model and store the result.

    The summarisation instructions and the page HTML (``doc['html']``, or an
    empty string when the field is absent) are sent as a single prompt to the
    ``llama3.2`` model at ``http://localhost:11434/api/generate``. The generated
    text is written to the document's ``summary`` field and returned.

    Args:
        doc: A document from the collection; must contain ``_id``.

    Returns:
        The generated text, or "" if the reply has no ``response`` key.

    Raises:
        requests.HTTPError: If the server answers with an error status. Nothing
            is written to the database in that case, so the page is still
            treated as lacking a summary.
    """
    # The instructions have to be part of the prompt itself: text placed in the
    # docstring is never seen by the model.
    instructions = (
        "You are processing Angular.js documentation to create a summary search index.\n"
        "Based on the following content extracted from the Angular.js website, please generate "
        "a clear, concise summary of the provided page from the Angular Documentation.\n"
        "Write a one to two sentence summary of the overall content of the page, "
        "then include key learnings and concepts.\n"
        "\n"
        "DO NOT INCLUDE ANY CODE SNIPPETS IN THE SUMMARY\n"
        "\n"
        "The summary must be 50 to 100 words.\n\n"
    )

    html_content = doc.get('html', '')
    prompt = f"{instructions}Documentation:\n{html_content}\n"

    response = requests.post("http://localhost:11434/api/generate", json={
        "model": "llama3.2",
        "prompt": prompt,
        "stream": False,
        "options": {
            "num_predict": 500
        }
    })
    # Fail loudly on an error reply instead of storing an empty summary.
    response.raise_for_status()

    markdown_text = response.json().get("response", "")

    collection.update_one(
        {"_id": doc['_id']},
        {"$set": {"summary": markdown_text}}
    )

    return markdown_text

    


In [4]:
import time

# Generate a summary for every document that does not have one yet and report
# the generation throughput while doing so.
documents = list(collection.find())

# Order the documents by the token count of their HTML, smallest first.
documents = sorted(documents, key=lambda doc: count_tokens(doc.get('html', '')))

print("Generating summaries for documents...")
total_documents = len(documents)
progress = 0

total_tokens_generated_ps = 0  # running sum of per-document tokens/s; averaged in the progress line
need_summary_count = 0
for doc in documents:
    progress += 1

    # Documents that already carry a summary are left untouched.
    if doc.get('summary', None) is not None:
        continue

    need_summary_count += 1
    # perf_counter is monotonic, so the measurement cannot go backwards if the
    # wall clock is adjusted; the max() keeps the division below safe.
    start_time = time.perf_counter()
    output = generate_summary(doc)
    elapsed_time = max(time.perf_counter() - start_time, 1e-9)
    token_count = count_tokens(output)
    total_tokens_generated_ps += token_count / elapsed_time
    print(f"{progress}/{total_documents} LAST RUN {token_count/elapsed_time:.2f} tokens/s - AVG: {total_tokens_generated_ps/need_summary_count:.2f} tokens/s", end='\r')

print(f"Total documents needing summary: {need_summary_count}")
print("Finished generating summaries for all documents.")

KeyboardInterrupt: 

In [5]:
# One row per embedding model:
# (model_name, index_name, embedding_length, trust_remote_code).
# trust_remote_code lets the model repository run its own Python code while
# loading, so it is enabled only for the nomic model, which relies on custom
# modelling code; the stock sentence-transformers models load without it.
EMBEDDING_MODELS = [
    ('nomic-ai/nomic-embed-text-v2-moe', "nomic-embed-text-v2", 768, True),
    ('all-MiniLM-L6-v2', "all-MiniLM-L6-v2", 384, False),
    ('paraphrase-MiniLM-L6-v2', "paraphrase-MiniLM-L6-v2", 384, False),
    ('all-distilroberta-v1', "all-distilroberta-v1", 768, False),
]

for model_name, index_name, embedding_length, trust_remote_code in EMBEDDING_MODELS:
    # Each model feeds two indexes: one over the page HTML and one, prefixed
    # with "summary-", over the generated summary.
    summary_index_name = f"summary-{index_name}"

    model = SentenceTransformer(model_name, trust_remote_code=trust_remote_code)
    add_new_index_to_mongo(model_name, index_name, embedding_length, model)
    add_new_index_to_mongo(model_name, summary_index_name, embedding_length, model)

!!!!!!!!!!!!megablocks not available, using torch.matmul instead


Finished Processing All Documetns for Index: nomic-embed-text-v2
Finished Processing All Documetns for Index: summary-nomic-embed-text-v2
Finished Processing All Documetns for Index: all-MiniLM-L6-v2
Finished Processing All Documetns for Index: summary-all-MiniLM-L6-v2
Finished Processing All Documetns for Index: paraphrase-MiniLM-L6-v2
Finished Processing All Documetns for Index: summary-paraphrase-MiniLM-L6-v2
Finished Processing All Documetns for Index: all-distilroberta-v1
Finished Processing All Documetns for Index: summary-all-distilroberta-v1


In [None]:
# Load every document of the collection into an in-memory list.
# NOTE(review): presumably re-queried here so that `documents` includes the
# summaries written by the earlier cells — confirm.
documents = list(collection.find())

In [None]:


# Report, for every document, how many tokens its HTML and its summary take up
# and the ratio between the two, ordered by HTML token count (smallest first).
documents = sorted(documents, key=lambda doc: count_tokens(doc.get('html', '')))

for document in documents:
    text = document.get('html', '')
    summary = document.get('summary', '')

    text_tokens = count_tokens(text)
    summary_tokens = count_tokens(summary)

    # A document with empty or missing HTML has zero tokens; show "n/a"
    # instead of dividing by zero.
    token_ratio = f"{summary_tokens / text_tokens:.4f}" if text_tokens else "n/a"

    print(f"Document ID: {document['_id']} - Text Tokens: {text_tokens} - Summary Tokens: {summary_tokens} - Token Ratio: {token_ratio}")