- Install `sentence-transformers`.
- Vectorize the HTML content of each database entry.
- Store the resulting embeddings (three per document, one per model) in the database.

**TODO: add `sentence-transformers` to the project requirements.**

## Imports

In [40]:
import os

import numpy as np
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer

## Functions to create embeddings

In [41]:
def create_allMiniLML6v2_embedding(text):
    """Embed ``text`` with the ``all-MiniLM-L6-v2`` sentence-transformers model.

    The model is instantiated on the first call and kept on the function
    object, so repeated calls (for example once per database document) reuse
    it instead of constructing a new ``SentenceTransformer`` every time.

    Args:
        text: Input passed unchanged to ``model.encode`` (a string, or a list
            of strings for a batch).

    Returns:
        The value returned by ``model.encode(text)``.

    Raises:
        ValueError: If the size of the embedding's last axis is not 384.
    """
    # Reuse the model created by an earlier call, if any.
    model = getattr(create_allMiniLML6v2_embedding, "_model", None)
    if model is None:
        model = SentenceTransformer("all-MiniLM-L6-v2")
        create_allMiniLML6v2_embedding._model = model

    embedding = model.encode(text)

    # Inspect the last axis rather than len(): for a single vector both agree,
    # while for a batch (2-D result) len() would be the number of inputs.
    embedding_dim = np.shape(embedding)[-1]
    if embedding_dim != 384:
        raise ValueError(f"Unexpected embedding length: {embedding_dim}. Expected length: 384.")

    return embedding

In [42]:
def create_paraphraseMiniLML6v2_embedding(text):
    """Embed ``text`` with the ``paraphrase-MiniLM-L6-v2`` sentence-transformers model.

    The model is instantiated on the first call and kept on the function
    object, so repeated calls (for example once per database document) reuse
    it instead of constructing a new ``SentenceTransformer`` every time.

    Args:
        text: Input passed unchanged to ``model.encode`` (a string, or a list
            of strings for a batch).

    Returns:
        The value returned by ``model.encode(text)``.

    Raises:
        ValueError: If the size of the embedding's last axis is not 384.
    """
    # Reuse the model created by an earlier call, if any.
    model = getattr(create_paraphraseMiniLML6v2_embedding, "_model", None)
    if model is None:
        model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
        create_paraphraseMiniLML6v2_embedding._model = model

    embedding = model.encode(text)

    # Inspect the last axis rather than len(): for a single vector both agree,
    # while for a batch (2-D result) len() would be the number of inputs.
    embedding_dim = np.shape(embedding)[-1]
    if embedding_dim != 384:
        raise ValueError(f"Unexpected embedding length: {embedding_dim}. Expected length: 384.")

    return embedding

In [43]:
def create_distilrobertav1_embedding(text):
    """Embed ``text`` with the ``all-distilroberta-v1`` sentence-transformers model.

    The model is instantiated on the first call and kept on the function
    object, so repeated calls (for example once per database document) reuse
    it instead of constructing a new ``SentenceTransformer`` every time.

    Args:
        text: Input passed unchanged to ``model.encode`` (a string, or a list
            of strings for a batch).

    Returns:
        The value returned by ``model.encode(text)``.

    Raises:
        ValueError: If the size of the embedding's last axis is not 768.
    """
    # Reuse the model created by an earlier call, if any.
    model = getattr(create_distilrobertav1_embedding, "_model", None)
    if model is None:
        model = SentenceTransformer("all-distilroberta-v1")
        create_distilrobertav1_embedding._model = model

    embedding = model.encode(text)

    # Inspect the last axis rather than len(): for a single vector both agree,
    # while for a batch (2-D result) len() would be the number of inputs.
    embedding_dim = np.shape(embedding)[-1]
    if embedding_dim != 768:
        raise ValueError(f"Unexpected embedding length: {embedding_dim}. Expected length: 768.")

    return embedding

## Sample Embeddings

In [44]:
# Sample input for a smoke test of the three embedding helpers.
# NOTE(review): looks like scraped page text from the Next.js docs — confirm the source.
text = "App Router: Getting Started | Next.js Menu Using App Router Features available in /app Using Latest Version 15.2.1 Introduction App Router Getting Started Getting Started Installation Create a new Next.js application with the `create-next-app` CLI, and set up TypeScript, ESLint, and Module Path Aliases. Project Structure An overview of the folder and file conventions in Next.js, and how to organize your project. Layouts and Pages Create your first pages and layouts, and link between them. Images and Fonts Learn how to optimize images and fonts. CSS Learn about the different ways to add CSS to your application, including CSS Modules, Global CSS, Tailwind CSS, and more. Fetching Data Start fetching data and streaming content in your application. Updating Data Learn how to update data in your Next.js application. Error Handling Learn how to display expected errors and handle uncaught exceptions."

# Encode the same text with each model; every helper validates its own output size.
allMiniLML6v2_embedding = create_allMiniLML6v2_embedding(text)
paraphraseMiniLML6v2_embedding = create_paraphraseMiniLML6v2_embedding(text)
distilrobertav1_embedding = create_distilrobertav1_embedding(text)

# Print the three vectors one after another, each starting on its own line.
print(
    allMiniLML6v2_embedding,
    paraphraseMiniLML6v2_embedding,
    distilrobertav1_embedding,
    sep="\n",
)

[-1.02844499e-01  2.42814678e-03 -3.84111283e-03 -2.87044160e-02
  3.19099240e-02 -4.15024236e-02 -6.73262253e-02  8.46763849e-02
 -3.02750859e-02  4.73282114e-02  3.04304827e-02  8.37238207e-02
 -2.66534388e-02 -1.43460175e-02  3.77480425e-02  3.07756942e-02
  2.88599189e-02 -2.60412954e-02  6.78726733e-02 -1.18468650e-01
  2.27381773e-02  8.49326607e-03  8.82087499e-02 -4.03044969e-02
 -3.68568860e-02  1.77282747e-02  9.18359868e-03  4.34074551e-03
  3.39694358e-02 -5.35990261e-02  3.91442813e-02 -4.31473181e-02
 -9.21235234e-02 -2.43820827e-02 -1.47609040e-01 -2.49317586e-02
  5.26476912e-02 -4.97355498e-02 -3.45150731e-03 -2.96590105e-02
 -1.08424928e-02  1.49061494e-02 -5.27123064e-02 -1.34549029e-02
  1.40801221e-02 -1.09747335e-01 -1.02106355e-01 -7.75186811e-03
 -6.41974562e-04  2.90727057e-02 -3.38266343e-02 -8.43236074e-02
 -1.96955577e-02 -7.05996305e-02 -2.66178101e-02  7.03375414e-02
 -5.51368408e-02  6.59551695e-02  1.99820474e-02  3.81508283e-02
  8.48994702e-02 -6.78708

## Append Embeddings to Docs in DB

In [46]:
# Backfill the three embedding fields on every crawl document that lacks one.

# Read the connection string from the environment so that no credentials are
# stored in the notebook. The password that used to be hard-coded here has been
# exposed in the notebook source and should be rotated.
mongodb_uri = os.environ.get("MONGODB_URI")
if not mongodb_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string."
    )

client = MongoClient(mongodb_uri)
db = client['web_crawler']
collection = db['crawl_data']

# Names of the document fields that hold one embedding each.
EMBEDDING_FIELDS = ["allMiniLML6v2", "paraphraseMiniLML6v2", "distilrobertav1"]

# Let the server select only documents that are missing at least one embedding
# field, and return just `_id` and `html`, instead of downloading every
# document (including already-stored vectors) and skipping it client-side.
documents = collection.find(
    {"$or": [{field: {"$exists": False}} for field in EMBEDDING_FIELDS]},
    {"html": 1},
)

for document in documents:
    # Treat a missing or null `html` field as empty text.
    html_text = document.get("html") or ""

    # .tolist() converts each embedding to plain Python lists before it is
    # passed to update_one.
    allMiniLML6v2_embedding = create_allMiniLML6v2_embedding(html_text).tolist()
    paraphraseMiniLML6v2_embedding = create_paraphraseMiniLML6v2_embedding(html_text).tolist()
    distilrobertav1_embedding = create_distilrobertav1_embedding(html_text).tolist()

    collection.update_one(
        {"_id": document["_id"]},
        {"$set": {
            "allMiniLML6v2": allMiniLML6v2_embedding,
            "paraphraseMiniLML6v2": paraphraseMiniLML6v2_embedding,
            "distilrobertav1": distilrobertav1_embedding
        }}
    )

    print(f"Updated document with _id: {document['_id']}")