In [1]:
from sqlalchemy import create_engine, Column, Integer, String, ARRAY, Float, text, TIMESTAMP, ForeignKey
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.dialects.postgresql import UUID as DB_UUID
import uuid
import os
from sqlalchemy.orm import sessionmaker, declarative_base, Session
from dotenv import load_dotenv
from pydantic import BaseModel
from typing import Optional
from uuid import UUID
import psycopg2
from pgvector.sqlalchemy import Vector
from sqlalchemy import select

# Create the SQLAlchemy engine and connect to our db

load_dotenv()

DATABASE_URL = os.getenv("DATABASE_URL")
if not DATABASE_URL:
    # Fail here with a clear message instead of a confusing NameError on `engine` further down
    raise RuntimeError("DATABASE_URL is not set; define it in the environment or in the .env file")

try:
    engine = create_engine(DATABASE_URL)
    # create_engine() is lazy and does not open a connection by itself,
    # so run a trivial query to prove the database is actually reachable.
    with engine.connect() as connection:
        connection.execute(text("SELECT 1"))
    # Never print the raw URL: it embeds the database password
    print("DB URL: ", engine.url.render_as_string(hide_password=True))
    print("Connected")
except Exception as e:
    print("Connection failed: ", e)
    # Everything below needs a working engine, so stop instead of continuing half-configured
    raise

# Create a session factory
# autocommit=False: transactions are not committed automatically, call session.commit()
# autoflush=False: pending changes are not written to the db before every query, call session.flush()
# bind: the db engine created above
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# Base class for declarative models
# Any class that inherits from this will be recognized by SQLAlchemy as a database table
Base = declarative_base()

# SQLAlchemy Data models

class ContentAI(Base):
    '''
    AI-derived data for one content row: a text summary and its vector embedding.
    The primary key is also a foreign key to content.content_id.
    '''
    __tablename__ = "content_ai"

    content_id = Column(DB_UUID, ForeignKey("content.content_id"), primary_key=True)
    ai_summary = Column(String, nullable=True)
    embedding = Column(Vector(dim=384), nullable=True)

    # The vector dimension must match the embedding model that produces the vectors.
    # https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
    # That transformer outputs a 384 dim dense vector, hence dim=384 above.
    # Update dim if the embedding model is replaced.
    
class ContentItem(Base):
    '''
    A piece of content saved by a user, with optional notes.
    The primary key is the (user_id, content_id) pair.
    '''
    __tablename__ = "content_item"

    user_id = Column(DB_UUID, ForeignKey("users.id"), primary_key=True)
    content_id = Column(DB_UUID, ForeignKey("content.content_id"), primary_key=True)
    # text() renders the default as the SQL function call NOW();
    # a plain string would be emitted as the quoted literal 'NOW()'.
    # create_all() skips existing tables, so this only applies to newly created ones.
    saved_at = Column(TIMESTAMP, server_default=text("NOW()"))
    notes = Column(String, nullable=True)

class Content(Base):
    '''
    A saved piece of content identified by a unique URL, with optional title and source.
    '''
    __tablename__ = "content"

    # uuid.uuid4 is passed as a callable, so a new id is generated per inserted row
    content_id = Column(DB_UUID, primary_key=True, default=uuid.uuid4)
    user_id = Column(DB_UUID, ForeignKey("users.id"))
    url = Column(String, unique=True, nullable=False)
    title = Column(String, nullable=True)
    source = Column(String, nullable=True)
    # text() renders the default as the SQL function call NOW();
    # a plain string would be emitted as the quoted literal 'NOW()'.
    # create_all() skips existing tables, so this only applies to newly created ones.
    first_saved_at = Column(TIMESTAMP, server_default=text("NOW()"))

class User(Base):
    '''
    An application user, identified by a UUID and a unique email address.
    '''
    __tablename__ = "users"

    # The default must be the callable uuid.uuid4 (as on Content.content_id);
    # passing the uuid module itself is not a usable column default.
    id = Column(DB_UUID, primary_key=True, default=uuid.uuid4)
    email = Column(String, unique=True, nullable=False)
    # text() renders the default as the SQL function call NOW();
    # a plain string would be emitted as the quoted literal 'NOW()'.
    # create_all() skips existing tables, so this only applies to newly created ones.
    created_at = Column(TIMESTAMP, server_default=text("NOW()"))

# Content schemas: Pydantic models used for data validation and serialization

class ContentCreate(BaseModel):
    # Input schema for creating content: url is required, title and source are optional.
    url: str
    title: Optional[str] = None
    source: Optional[str] = None


class ContentRead(ContentCreate):
    # Read schema: every ContentCreate field plus the content_id of the stored row.
    content_id: UUID

    # `from_attributes` is a Pydantic v2 setting, and v2 reads configuration from
    # `model_config`; the nested `class Config` form is deprecated there.
    # It allows the model to be populated from object attributes (e.g. an ORM row).
    model_config = {"from_attributes": True}

# Create tables for the sqlalchemy models defined above, only creates tables that do not exist

# content_ai.embedding uses the pgvector `vector` type, which only exists once the
# extension is enabled. Enable it first so create_all() also works on a fresh database.
with engine.begin() as connection:
    connection.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))

Base.metadata.create_all(bind=engine)
print("All tables created")

# Database.py

# Yields a fresh session per request (FastAPI dependency); the caller uses the session and it is closed once the caller is done with it
def get_db():
    '''Yield one session created from SessionLocal and close it when the generator finishes.'''
    # Leaving the `with` block closes the session, whether the consumer
    # finished normally or raised.
    with SessionLocal() as db:
        yield db

# test connection

# Connect with the same DATABASE_URL the engine uses instead of hardcoded credentials,
# so no password lives in the notebook and the check targets the configured database.
# NOTE(review): assumes DATABASE_URL is a plain libpq URI (postgresql://...);
# a driver-qualified SQLAlchemy URL (postgresql+psycopg2://...) is not accepted by psycopg2.
try:
    conn = psycopg2.connect(DATABASE_URL)
    try:
        print("Connection successful")
    finally:
        # Always release the connection, even if reporting fails
        conn.close()
except Exception as e:
    print("Connection failed:", e)


DB URL:  postgresql://postgres:cunytechprep@localhost:5432/mydb
Connected
All tables created
Connection successful


In [2]:
from sentence_transformers import SentenceTransformer
# from app.data_models.content_ai import ContentAI
# from app.data_models.content import Content
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy import select


class ContentEmbeddingManager:
    '''
    Manages content embeddings, database interactions, and similarity queries

    Holds a database session and a SentenceTransformer model; content rows and their
    embeddings are inserted through the session and searched by vector distance.
    '''

    def __init__(self, db, model_name='sentence-transformers/all-MiniLM-L6-v2'):
        '''
        db:         session used for every query, flush, commit and rollback
        model_name: name handed to SentenceTransformer to load the embedding model
        '''
        self.db = db
        self.model = SentenceTransformer(model_name)


    ###############################################################################
    # METHODS
    ###############################################################################

    def query_similar_content(self, query, limit=3):
        '''
        Generates a query embedding and searches the db for related content

        Returns at most `limit` (ContentAI, Content) rows, ordered by the L2 distance
        between the stored embedding and the query embedding (closest first).
        '''
        query_embedding = self.model.encode(query)

        results = (
            self.db.query(ContentAI, Content)
            .join(Content, ContentAI.content_id == Content.content_id)
            .order_by(ContentAI.embedding.l2_distance(query_embedding))
            .limit(limit)
            .all()
        )

        return results


    def insert_embedded_content(self, content_data, placeholder_sent):
        '''
        Inserts content into the database if it doesn't exist, summarizes it, and embeds the summary
        If any exceptions occur, the transaction will be rolled back

        content_data:     dict of Content column values (the "url" key is used for the duplicate check)
        placeholder_sent: text that is currently summarized in place of the real content

        Returns (content, content_ai) on success.
        Returns (None, None) when the URL already exists or when any step fails.
        '''
        try:
            if self._url_exists(content_data.get("url")):
                return None, None

            # Add content data to the db
            content = self._insert_db(Content, content_data)
            if content is None:
                raise Exception("Failed to insert content into the database")

            # Use an LLM to summarize the content. If this fails, default to the title for the summary
            ai_summary = self._summarize_content(placeholder_sent) # REPLACE W/ content
            # Fall back to the title attribute (previously misspelled, so the fallback always raised)
            summary = ai_summary or content.title
            # An empty summary is as unusable as a missing one
            if not summary:
                raise Exception("Failed to summarize content and/or there is no title")

            # Embed the summary associated with the content ORM
            embedding = self.generate_embedding(summary)
            if embedding is None:
                raise Exception("Failed to generate embedding")

            # Insert the embedding data into the db
            content_ai_data = {
                "content_id": content.content_id,
                "ai_summary": summary,
                "embedding": embedding
            }
            content_ai = self._insert_db(ContentAI, content_ai_data)
            if content_ai is None:
                raise Exception("Failed to insert embedding data")

            # If all steps succeed, then commit transaction to db
            self.db.commit()

            print(
                f"Created Content ID: {content.content_id},\n"
                f"Content AI ID: {content_ai.content_id},\n"
                f"Embedding (first 10): {content_ai.embedding[:10]}\n\n"
            )

            return content, content_ai

        # SQLAlchemyError is a subclass of Exception, so one clause covers both
        except Exception as e:
            self.db.rollback()
            print(f"Error occured in the insert_embedded_content function. Nothing commited to database: {e}")
            return None, None


    def generate_embedding(self, text):
        '''
        Generates an embedding for a piece of text using a Sentence Transformer embedding model

        Returns whatever the model's encode() returns, or None if encoding raised.
        '''
        try:
            return self.model.encode(text)
        except Exception as e:
            print(f"An unexpected error occurred during embedding: {e}")
            return None


    ###############################################################################
    # HELPER METHODS
    ###############################################################################

    def _insert_db(self, Data_Model, data):
        '''
        Takes a data model ORM and inserts data into that table
        Returns that db object data, or None if SQLAlchemy raised (the session is rolled back)

        The row is only flushed, not committed; the caller decides when to commit.
        '''
        try:
            db_data = Data_Model(**data)
            self.db.add(db_data)
            self.db.flush()     # Flush for content_ai insertion
            return db_data
        except SQLAlchemyError as e:
            self.db.rollback()
            print(f"Error Inserting into {Data_Model.__tablename__}: {e}")
            return None


    def _url_exists(self, url):
        '''
        Checks if a URL already exists in the database

        Returns the existing Content row (truthy) when one matches, otherwise False.
        A missing or empty url is reported as not existing.
        '''
        if url:
            existing_content = self.db.scalar(select(Content).where(Content.url == url))
            if existing_content:
                print(f"Content with URL '{url}' already exists. Skipping insertion.")
                return existing_content
        return False


    # TODO
    def _summarize_content(self, content):
        ''' Placeholder summarizer: returns the given content unchanged '''
        return content # For now

In [3]:
# AI-generated sample sentences. Each one is embedded and stored as the summary of a
# content row by the insertion cell, and they form the corpus for the similarity search.
sentences = [
    "This is a site about dogs.",
    "Youtube video of cute puppies playing in the park.",
    "A website providing information on sustainable living practices.",
    "Blog with vegan dessert recipes.",
    "Official online store for handcrafted leather goods.",
    "News and articles on artificial intelligence.",
    "Community forum for vintage motorcycle enthusiasts.",
    "Portfolio showcasing freelance web development work.",
    "Learn about the history of ancient Egypt.",
    "Platform for booking guided tours.",
    "Resources for learning the Spanish language.",
    "Reviews of the best hiking gear.",
    "Online tool for converting image formats.",
    "Site to create and share online surveys.",
    "Platform connecting pet owners with local sitters.",
    "Download free stock photos and videos here.",
    "Register for the upcoming tech conference.",
    "Online marketplace for used books.",
    "Track your fitness progress and set goals.",
    "Service for generating creative writing prompts.",
    "Real-time updates on cryptocurrency prices.",
    "Platform for collaborating on software projects.",
    "Blog where developers share coding tips.",
    "Online shop for ethically sourced coffee beans.",
    "Forum for gardeners to discuss plant care.",
    "Website offering online data science courses.",
    "Platform for artists to showcase and sell artwork."
]

In [4]:
# Seed the database: one content row plus embedding per sample sentence.
# The session is created directly from SessionLocal: `next(get_db())` discards the
# generator right away, so its `finally: db.close()` is not tied to the end of this work.
db = SessionLocal()
manager = ContentEmbeddingManager(db)

try:
    for i, sentence in enumerate(sentences):
        content_data = {"url": f"http://example.com/{i}", "title": f"Document {i}"}
        manager.insert_embedded_content(content_data, sentence)
finally:
    # Every insert has already been committed or rolled back by the manager.
    # Release the connection; the session starts a new transaction if it is used again.
    db.close()

Created Content ID: 5ba661f7-88b8-406f-9dac-5e36f8d6d63c,
Content AI ID: 5ba661f7-88b8-406f-9dac-5e36f8d6d63c,
Embedding (first 10): [-0.01026562 -0.03155861  0.03697931  0.07983022 -0.07506996  0.00247868
 -0.0256738  -0.00067373  0.00070128  0.02674934]


Created Content ID: 262863d0-e3c2-43e9-8caf-5cc54cf06829,
Content AI ID: 262863d0-e3c2-43e9-8caf-5cc54cf06829,
Embedding (first 10): [-0.02276482 -0.03064476  0.0544138  -0.0121147   0.07638656  0.00855733
  0.03231319 -0.01104542  0.04446819  0.05932536]


Created Content ID: 95d5d8f4-0521-4c66-a80e-590b4dc6a412,
Content AI ID: 95d5d8f4-0521-4c66-a80e-590b4dc6a412,
Embedding (first 10): [ 0.03150964  0.08399283 -0.03196497  0.08412008  0.0204234  -0.02037003
 -0.09261682 -0.03375958 -0.08588447  0.01923372]


Created Content ID: 99bc4cbd-58eb-4c8d-8f78-5b841090f2fe,
Content AI ID: 99bc4cbd-58eb-4c8d-8f78-5b841090f2fe,
Embedding (first 10): [-0.03376704 -0.04490927  0.01887723  0.08904339 -0.01167508  0.01128747
 -0.02187717 -0.0499

In [6]:
# Similarity Search
# `manager` runs the query on the session it already holds. A new session opened here
# would never be used by the query, and closing it would release nothing.
query = "pets"
print(f"I am querying: {query}\n")

try:
    similar_results = manager.query_similar_content(query, limit=3)

    print("\nSimilar Content --------------------\n")
    for content_ai, content in similar_results:
        print(
            f"AI SUMMARY: {content_ai.ai_summary},\n "
            f"Content ID: {content.content_id},\n Title: {content.title},\n "
            f"Embedding (first 10): {content_ai.embedding[:10]}\n\n"
        )
finally:
    # Close the session that actually executed the query
    manager.db.close()

I am querying: pets


Similar Content --------------------

AI SUMMARY: This is a site about dogs.,
 Content ID: 5ba661f7-88b8-406f-9dac-5e36f8d6d63c,
 Title: Document 0,
 Embedding (first 10): [-0.01026562 -0.03155861  0.03697931  0.07983022 -0.07506996  0.00247868
 -0.0256738  -0.00067373  0.00070128  0.02674934]


AI SUMMARY: Platform connecting pet owners with local sitters.,
 Content ID: e6ae9f45-2073-4e42-8292-6813c018f6e0,
 Title: Document 14,
 Embedding (first 10): [ 0.03999429 -0.01985181  0.0063899  -0.03825036 -0.12702149 -0.01979247
 -0.01255366 -0.02646453 -0.050263    0.02477865]


AI SUMMARY: Youtube video of cute puppies playing in the park.,
 Content ID: 262863d0-e3c2-43e9-8caf-5cc54cf06829,
 Title: Document 1,
 Embedding (first 10): [-0.02276482 -0.03064476  0.0544138  -0.0121147   0.07638656  0.00855733
  0.03231319 -0.01104542  0.04446819  0.05932536]


