## Testing Embedding Models

### Set up DB First

In [6]:
from sqlalchemy import create_engine, Column, Integer, String, ARRAY, Float, text, TIMESTAMP, ForeignKey
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.dialects.postgresql import UUID as DB_UUID
import uuid
import os
from sqlalchemy.orm import sessionmaker, declarative_base, Session
from dotenv import load_dotenv
from pydantic import BaseModel
from typing import Optional
from uuid import UUID
import psycopg2
from pgvector.sqlalchemy import Vector
from sqlalchemy import select

# Create the SQLAlchemy engine and verify that the database is reachable

load_dotenv()

DATABASE_URL = os.getenv("DATABASE_URL")
if not DATABASE_URL:
    # Fail fast with a clear message instead of a confusing error further down.
    raise RuntimeError("DATABASE_URL is not set; define it in the environment or in a .env file")

# create_engine() is lazy: it only builds the engine object and does not open a connection.
engine = create_engine(DATABASE_URL)

# The URL embeds the database password, so print it with the password masked;
# the raw value would otherwise be saved into the notebook output.
print("DB URL: ", engine.url.render_as_string(hide_password=True))

try:
    # Open a real connection so that "Connected" is only printed when the DB is reachable.
    with engine.connect() as connection:
        connection.execute(text("SELECT 1"))
    print("Connected")
except Exception as e:
    print("Connection failed: ", e)
    # Nothing below can work without a database, so stop here.
    raise

# Session factory:
# - autocommit=False: transactions are not committed automatically; call session.commit()
# - autoflush=False: pending changes are not written before each query; call session.flush()
# - bind=engine: sessions use the engine created above
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# Base class for declarative models
# Any class that inherits from this will be recognized by SQLAlchemy as a database table
Base = declarative_base()

# SQLAlchemy Data models

# Size of the vectors stored in ContentAI.embedding. It must match the output size of
# the embedding model in use: sentence-transformers/all-MiniLM-L6-v2 produces
# 384-dimensional dense vectors (https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2).
# Update this value together with the model if a different embedding model is chosen.
EMBEDDING_DIM = 384


class ContentAI(Base):
    """AI-derived data for one Content row: a text summary and its embedding vector.

    The primary key is also a foreign key to ``content.content_id``, so there is
    at most one ContentAI row per Content row.
    """
    __tablename__ = "content_ai"

    content_id = Column(DB_UUID, ForeignKey("content.content_id"), primary_key=True)
    ai_summary = Column(String, nullable=True)
    # pgvector column; the dimension is fixed when the table is created.
    embedding = Column(Vector(dim=EMBEDDING_DIM), nullable=True)
    
class ContentItem(Base):
    """Row recording that a user saved a piece of content.

    The composite primary key (user_id, content_id) allows one row per
    user/content pair.
    """
    __tablename__ = "content_item"

    user_id = Column(DB_UUID, ForeignKey("users.id"), primary_key=True)
    content_id = Column(DB_UUID, ForeignKey("content.content_id"), primary_key=True)
    # text() makes SQLAlchemy emit DEFAULT NOW() (a function call). A plain string is
    # rendered as the quoted literal 'NOW()', which is not evaluated per insert.
    # Note: create_all() only creates missing tables, so an existing table keeps
    # its old default until it is altered or recreated.
    saved_at = Column(TIMESTAMP, server_default=text("NOW()"))
    notes = Column(String, nullable=True)

class Content(Base):
    """A saved piece of content, identified by a unique URL."""
    __tablename__ = "content"

    # Client-side default: a fresh UUID4 is generated for each new row.
    content_id = Column(DB_UUID, primary_key=True, default=uuid.uuid4)
    user_id = Column(DB_UUID, ForeignKey("users.id"))
    url = Column(String, unique=True, nullable=False)
    title = Column(String, nullable=True)
    source = Column(String, nullable=True)
    # text() makes SQLAlchemy emit DEFAULT NOW() (a function call). A plain string is
    # rendered as the quoted literal 'NOW()', which is not evaluated per insert.
    # Note: create_all() only creates missing tables, so an existing table keeps
    # its old default until it is altered or recreated.
    first_saved_at = Column(TIMESTAMP, server_default=text("NOW()"))

class User(Base):
    """An application user, identified by a unique email address."""
    __tablename__ = "users"

    # The default must be the callable uuid.uuid4 (invoked once per insert).
    # Passing the `uuid` module itself is not a usable column default.
    id = Column(DB_UUID, primary_key=True, default=uuid.uuid4)
    email = Column(String, unique=True, nullable=False)
    # text() makes SQLAlchemy emit DEFAULT NOW() (a function call). A plain string is
    # rendered as the quoted literal 'NOW()', which is not evaluated per insert.
    # Note: create_all() only creates missing tables, so an existing table keeps
    # its old default until it is altered or recreated.
    created_at = Column(TIMESTAMP, server_default=text("NOW()"))

# Pydantic schemas for Content: used for data validation and serialization

# Input schema for creating content: `url` is required; `title` and `source`
# are optional and default to None.
class ContentCreate(BaseModel):
    # Required field (no default)
    url: str
    # Optional fields
    title: Optional[str] = None
    source: Optional[str] = None


# Output schema for content: the ContentCreate fields plus the content_id.
class ContentRead(ContentCreate):
    content_id: UUID

    # Pydantic v2 configuration (the `from_attributes` option only exists in v2).
    # It lets the schema be populated from object attributes rather than only from dicts.
    # `model_config` replaces the nested `class Config`, which is deprecated in v2
    # and emits a deprecation warning; a plain dict is accepted here.
    model_config = {"from_attributes": True}

# Issue CREATE TABLE for every model registered on Base.
# Only tables that do not exist yet are created; existing ones are left as they are.

Base.metadata.create_all(engine)
print("All tables created")

# Database.py

# Session provider in the style of a FastAPI dependency: one fresh session per request.
def get_db():
    """Yield a new session from SessionLocal and close it when the generator finishes.

    The session is closed when the generator is resumed, closed, or has an
    exception thrown into it after the ``yield``.
    """
    session = SessionLocal()
    try:
        # Hand the session to the caller; control returns here when they are done.
        yield session
    finally:
        session.close()

# Test a raw psycopg2 connection.
# The connection settings are taken from the engine's URL (i.e. DATABASE_URL) instead of
# being hard-coded, so no credentials live in the notebook and this check always
# targets the same database as the SQLAlchemy engine.

try:
    connection_url = engine.url
    conn = psycopg2.connect(
        dbname=connection_url.database,
        user=connection_url.username,
        password=connection_url.password,
        host=connection_url.host,
        port=connection_url.port,
    )
    print("Connection successful")
    conn.close()
except Exception as e:
    print("Connection failed:", e)


DB URL:  postgresql://postgres:***@localhost:5432/mydb
Connected
All tables created
Connection successful


## Testing all-mini sentence transformer

In [7]:
from sentence_transformers import SentenceTransformer

# Candidate sentence-transformers models
# (https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2):
#
# - all-MiniLM-L6-v2: a very popular model that provides a good balance of speed,
#   size (relatively small), and accuracy for general-purpose semantic search.
# - all-mpnet-base-v2: generally offers higher accuracy than all-MiniLM-L6-v2 but is
#   larger and slightly slower.
# - multi-qa-mpnet-base-dot-v1 / multi-qa-MiniLM-L6-cos-v1: specifically trained for
#   question-answering and retrieval tasks; can perform well for semantic search where
#   the queries are question-like.
# - paraphrase-multilingual-mpnet-base-v2 / paraphrase-multilingual-MiniLM-L12-v2:
#   excellent choices if you need to support multiple languages.

# Load the model used for every embedding in this notebook.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [8]:
# AI-generated sample sentences. Each one is embedded and stored (it stands in for the
# AI summary of a document), and the similarity search below runs against them.
sentences = [
    "This is a site about dogs.",
    "Youtube video of cute puppies playing in the park.",
    "A website providing information on sustainable living practices.",
    "Blog with vegan dessert recipes.",
    "Official online store for handcrafted leather goods.",
    "News and articles on artificial intelligence.",
    "Community forum for vintage motorcycle enthusiasts.",
    "Portfolio showcasing freelance web development work.",
    "Learn about the history of ancient Egypt.",
    "Platform for booking guided tours.",
    "Resources for learning the Spanish language.",
    "Reviews of the best hiking gear.",
    "Online tool for converting image formats.",
    "Site to create and share online surveys.",
    "Platform connecting pet owners with local sitters.",
    "Download free stock photos and videos here.",
    "Register for the upcoming tech conference.",
    "Online marketplace for used books.",
    "Track your fitness progress and set goals.",
    "Service for generating creative writing prompts.",
    "Real-time updates on cryptocurrency prices.",
    "Platform for collaborating on software projects.",
    "Blog where developers share coding tips.",
    "Online shop for ethically sourced coffee beans.",
    "Forum for gardeners to discuss plant care.",
    "Website offering online data science courses.",
    "Platform for artists to showcase and sell artwork."
]

In [9]:
# Loosely defined for now and will be cleaned up; the LLM summarization step is still a TODO.

def create_content_with_embedding(db, content_data, text):
    """Insert a Content row and its ContentAI (summary + embedding) row atomically.

    Both rows are written in a single transaction. A Content row is therefore
    never left behind without its embedding, which matters because the
    URL-exists check below would skip such a row on every later run.

    Args:
        db: SQLAlchemy session used for the lookup and the inserts.
        content_data: Mapping of Content column values (e.g. "url", "title").
        text: Text to embed. Until LLM summarization is implemented it also
            stands in for the AI summary. Inside this function the name
            shadows the module-level sqlalchemy ``text`` import.

    Returns:
        ``(content, content_ai)`` for newly created rows, or
        ``(existing_content, None)`` when a Content row with the same URL
        already exists (nothing is inserted in that case).

    Raises:
        Exception: Errors from the embedding model propagate before anything
            is written. Errors during the database writes are re-raised after
            the session has been rolled back.
    """
    # Skip the insert when this URL is already stored.
    url_to_check = content_data.get("url")
    if url_to_check:
        existing_content = db.scalar(select(Content).where(Content.url == url_to_check))
        if existing_content:
            print(f"Content with URL '{url_to_check}' already exists. Skipping insertion.")
            return existing_content, None

    # TODO: use an LLM to summarize the page (from its url/title) and embed that
    # summary. For now the provided text is used as the summary.
    ai_summary = text

    # Embed before touching the database so a model failure leaves nothing behind.
    embedding_vector = model.encode(ai_summary)

    try:
        content = Content(**content_data)
        db.add(content)
        # Flush (without committing) so the Content row is inserted first and its
        # content_id is available for the ContentAI foreign key.
        db.flush()

        content_ai = ContentAI(
            content_id=content.content_id,
            ai_summary=ai_summary,
            embedding=embedding_vector,
        )
        db.add(content_ai)
        # Single commit: both rows are persisted together or not at all.
        db.commit()
    except Exception:
        db.rollback()
        raise

    # Reload both objects after the commit.
    db.refresh(content)
    db.refresh(content_ai)

    print(f"Created Content ID: {content.content_id},\n Content AI ID: {content_ai.content_id},\n Embedding (first 10): {content_ai.embedding[:10]}\n\n")

    return content, content_ai


# Take one session from the get_db generator (fresh session per request, FastAPI style).
# Closing the generator in `finally` runs get_db's cleanup, so the session is
# closed even when one of the inserts raises.
db_gen = get_db()
db = next(db_gen)

try:
    for i, sentence in enumerate(sentences):
        content_data = {"url": f"http://example.com/{i}", "title": f"Document {i}"}
        create_content_with_embedding(db, content_data, sentence)
finally:
    db_gen.close()

Content with URL 'http://example.com/0' already exists. Skipping insertion.
Content with URL 'http://example.com/1' already exists. Skipping insertion.
Content with URL 'http://example.com/2' already exists. Skipping insertion.
Content with URL 'http://example.com/3' already exists. Skipping insertion.
Content with URL 'http://example.com/4' already exists. Skipping insertion.
Content with URL 'http://example.com/5' already exists. Skipping insertion.
Content with URL 'http://example.com/6' already exists. Skipping insertion.
Content with URL 'http://example.com/7' already exists. Skipping insertion.
Content with URL 'http://example.com/8' already exists. Skipping insertion.
Content with URL 'http://example.com/9' already exists. Skipping insertion.
Content with URL 'http://example.com/10' already exists. Skipping insertion.
Content with URL 'http://example.com/11' already exists. Skipping insertion.
Content with URL 'http://example.com/12' already exists. Skipping insertion.
Content w

In [10]:
# pgvector's SQLAlchemy Vector type (source of the l2_distance comparator):
# https://github.com/pgvector/pgvector-python/blob/master/pgvector/sqlalchemy/vector.py#L43

def find_similar_content(db, query_text, limit=2):
    """Return the stored content closest to ``query_text`` by L2 distance.

    Args:
        db: SQLAlchemy session to query with.
        query_text: Free text to embed and search for.
        limit: Maximum number of results to return.

    Returns:
        The ``.all()`` result of the query: (ContentAI, Content) pairs ordered
        by ascending L2 distance between the stored embedding and the query
        embedding.
    """
    query_embedding = model.encode(query_text)
    print(f"This query embedding (First 10) = {query_embedding[:10]}")

    # Pair every ContentAI row with its Content row.
    joined = db.query(ContentAI, Content).join(
        Content, ContentAI.content_id == Content.content_id
    )

    # Order by distance to the query vector and keep the closest `limit` rows.
    distance = ContentAI.embedding.l2_distance(query_embedding)
    nearest_first = joined.order_by(distance).limit(limit)

    return nearest_first.all()



# Similarity search demo.
# Take one session from the get_db generator. Closing the generator in `finally`
# runs get_db's cleanup, so the session is closed even when the search raises.
db_gen = get_db()
db = next(db_gen)

try:
    query = "work out"
    print(f"I am querying: {query}\n")
    similar_results = find_similar_content(db, query)

    print("\nSimilar Content --------------------\n")
    for content_ai, content in similar_results:
        print(
            f"AI SUMMARY: {content_ai.ai_summary},\n "
            f"Content ID: {content.content_id},\n Title: {content.title},\n "
            f"Embedding (first 10): {content_ai.embedding[:10]}\n\n"
        )
finally:
    db_gen.close()


I am querying: work out

This query embedding (First 10) = [-0.06827506  0.05004624 -0.01302164  0.05232676 -0.05089933  0.01155781
  0.07433058 -0.00416497  0.01187012 -0.02499063]

Similar Content --------------------

AI SUMMARY: Track your fitness progress and set goals.,
 Content ID: 804c4a6a-505d-4477-a32b-22f043a865ed,
 Title: Document 18,
 Embedding (first 10): [ 0.00915601  0.04066574 -0.02963619  0.0440957   0.0350326   0.05192318
 -0.01645158 -0.05470086 -0.05029801 -0.07216614]


AI SUMMARY: Reviews of the best hiking gear.,
 Content ID: e1a87727-7c4d-4bfb-b361-981b8ec80d65,
 Title: Document 11,
 Embedding (first 10): [-0.0903054   0.02331327  0.05094973  0.06554957  0.00227929 -0.03645795
  0.05474317  0.00542936 -0.09139992  0.08928707]


