In [1]:
from sqlalchemy import create_engine, Column, Integer, String, ARRAY, Float, text, TIMESTAMP, ForeignKey
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.dialects.postgresql import UUID as DB_UUID
import uuid
import os
from sqlalchemy.orm import sessionmaker, declarative_base, Session
from dotenv import load_dotenv
from pydantic import BaseModel
from typing import Optional
from uuid import UUID
import psycopg2
from pgvector.sqlalchemy import Vector
from sqlalchemy import select
from sqlalchemy import Column, String, TIMESTAMP
from sqlalchemy.dialects.postgresql import UUID
from pydantic import BaseModel, EmailStr
from datetime import datetime

# Create the sqlachemy engine and connect to our db

load_dotenv()

DATABASE_URL = os.getenv("DATABASE_URL")
print("DB URL: ", DATABASE_URL)

try:
    engine = create_engine(DATABASE_URL)
    print("Connected")
except Exception as e:
    print("Connection falied: ", e)

# Create a session
# auto commmit, transactions are not auto commited, need to call session.commit()
# auto flush, changes are not automatically written to db before every query, need to call session.flush()
# bind to the db engine we created
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# Base class for declarative models
# Any class that inherits from this will be recognized by SQLAlchemy as a database table
Base = declarative_base()

# SQLAlchemy Data models

class ContentAI(Base):
    __tablename__ = "content_ai"

    content_id = Column(UUID(as_uuid=True), ForeignKey("content.content_id"), primary_key=True)
    ai_summary = Column(String, nullable=True)
    embedding = Column(Vector(dim=384), nullable=True)
    
class ContentItem(Base):
    __tablename__ = "content_item"

    user_id = Column(UUID(as_uuid=True), ForeignKey("users.id"), primary_key=True)
    content_id = Column(UUID(as_uuid=True), ForeignKey("content.content_id"), primary_key=True)
    saved_at = Column(TIMESTAMP, server_default="NOW()")
    notes = Column(String, nullable=True)

class Content(Base):
    __tablename__ = "content"

    content_id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    user_id = Column(UUID(as_uuid=True), ForeignKey("users.id"))
    url = Column(String, unique=True, nullable=False)   
    title = Column(String, nullable=True)
    source = Column(String, nullable=True)
    first_saved_at = Column(TIMESTAMP, server_default="NOW()")

class User(Base):
    __tablename__ = "users"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid)
    email = Column(String, unique=True, nullable=False)
    created_at = Column(TIMESTAMP, server_default="NOW()")
    username = Column(String,  nullable=False)
    password = Column(String, nullable=False)



class UserCreate(BaseModel):
    email: EmailStr  # email field is validated as a proper email format
    created_at: datetime = None  # Optional: you can default this to now on the server-side

    class Config:
        orm_mode = True

# Schemas content, these are pydantic schemas for data val and serialization

# class ContentCreate(BaseModel):
#     url: str
#     title: Optional[str] = None
#     source: Optional[str] = None


# class ContentRead(ContentCreate):
#     content_id: UUID

#     class Config:
#         from_attributes=True

# Create tables for the sqlalchemy models defined above, only creates tables that do not exist

Base.metadata.create_all(bind=engine)
print("All tables created")

# Database.py

# yield a fresh session per request (FASTAPI), caller to use the session and closes when done w/ session
def get_db():
    # session instance
    db = SessionLocal()  
    try:
        yield db         
    finally:
        db.close()

# test connection

try:
    conn = psycopg2.connect(
        dbname="mydb",
        user="postgres",
        password="cunytechprep",
        host="localhost",
        port="5432"
    )
    print("Connection successful")
    conn.close()    
except Exception as e:
    print("Connection failed:", e)


DB URL:  postgresql://postgres:cunytechprep@localhost:5432/mydb
Connected
All tables created
Connection successful


* 'orm_mode' has been renamed to 'from_attributes'


In [None]:
from sentence_transformers import SentenceTransformer
# from app.data_models.content_ai import ContentAI
# from app.data_models.content import Content
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.orm import Session
from transformers import pipeline
from readability import Document
from bs4 import BeautifulSoup
from sqlalchemy import select
from uuid import UUID
import requests


class ContentEmbeddingManager:
    '''
    Manages:
        - Generating vector embeddings for content summaries
        - Inserting and retrieving content and their embeddings from the db
        - Enriching raw HTML content for a summarization model
        - Performing similarity queries on content embeddings
        - Handling database interactions for both `Content` and `ContentAI` models
    '''

    def __init__(
            self, 
            db, 
            embedding_model_name='sentence-transformers/all-MiniLM-L6-v2', 
            summary_model_name='google-t5/t5-small' # We can always change the model
    ):
        self.db = db
        self.embedding_model = SentenceTransformer(embedding_model_name)
        self.summary_model = pipeline("summarization", model=summary_model_name)


    ###############################################################################
    # METHODS
    ###############################################################################

    def query_similar_content(self, query, limit=3):
        ''' Generates a query embedding and searches the db for related content '''
        
        query_embedding = self.embedding_model.encode(query) 

        results = (
            self.db.query(ContentAI, Content)
            .join(Content, ContentAI.content_id == Content.content_id)
            .order_by(ContentAI.embedding.l2_distance(query_embedding))
            .limit(limit)
            .all()
        )

        return results


    def insert_embedded_content(self, content_data):
        '''
        Inserts content into the database if it doesn't exist, summarizes it, and embeds the summary
        If any exceptions occur, the transaction will be rolled back
        '''
        try:

            # Check if the url exists in the db already
            url = content_data.get("url")
            if self._url_exists(url):
                return None, None
            
            # Add content data to the db
            content = self._insert_db(Content, content_data)
            if content is None: 
                raise Exception("Failed to insert content into the database")

            # Enrich the content by parsing the raw_html. If getting the html fails, default the summary_input to title
            summary_input = self._enrich_content(url, content.content_id, self.db)
            if summary_input is None:
                summary_input = content_data.get("title")

            # Use an LLM to summarize the content. If this fails, default to the title for the summary
            ai_summary = self._summarize_content(summary_input) 
            summary = ai_summary if ai_summary else summary_input
            if summary is None: 
                raise Exception("Failed to summarize content and/or there is no title")

            # Embed the summary associated with the content ORM
            embedding = self.generate_embedding(summary)
            if embedding is None: 
                raise Exception("Failed to generate embedding") 

            # Insert the embedding data into the db
            content_ai_data = {
                "content_id": content.content_id, 
                "ai_summary": summary, 
                "embedding": embedding
            }
            content_ai = self._insert_db(ContentAI, content_ai_data)
            if content_ai is None: 
                raise Exception("Failed to insert embedding data") 
            
            # If all steps succeed, then commit transaction to db
            self.db.commit()

            print(
                f"Created Content ID: {content.content_id},\n"
                f"Content AI ID: {content_ai.content_id},\n"
                f"Embedding (first 10): {content_ai.embedding[:10]},\n"
                f"Summary that was embedded {summary}\n\n"
            )

            return content, content_ai
        
        except (SQLAlchemyError, Exception) as e:
            self.db.rollback()
            print(f"Error occured in the insert_embedded_content function. Nothing commited to database: {e}")
            return None, None


    def generate_embedding(self, text):
        ''' Generates an embedding for a piece of text using a Sentence Transformer embedding model '''

        try:
            return self.embedding_model.encode(text)
        except Exception as e: 
            print(f"An unexpected error occurred during embedding: {e}")
            return None


    ###############################################################################
    # HELPER METHODS
    ###############################################################################


    def _enrich_content(self, url: str, content_id: UUID, db: Session):
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            print(f"Error: {response.status_code}: failed get request for {url}, defaulting to title for summarization input")
            return None
        
        raw_html = response.text
        metadata = self._extract_metadata_and_body(raw_html)
        summary_input = self._build_summary_input(metadata)

        print(f"THE SUMMARY INPUT AFTER ENRICHING IS = {summary_input}")

        return summary_input


    def _extract_metadata_and_body(self, html: str) -> dict:
        soup = BeautifulSoup(html, "html.parser")
        
        title = soup.title.string.strip() if soup.title else ""
        description = ""
        tags = []

        for meta in soup.find_all("meta"):
            if meta.get("name") == "description":
                description = meta.get("content", "")
            if meta.get("property") == "og:description":
                description = meta.get("content", "") or description
            if meta.get("name") == "keywords":
                tags = [tag.strip() for tag in meta.get("content", "").split(",")]

        readable_doc = Document(html)
        # html snippet of main content body with boilerplate (nav bars, ads, footers) removed
        body_html = readable_doc.summary() 
        body_text = BeautifulSoup(body_html, "html.parser").get_text()

        print(f"The title from soup is: {title}")

        return {
            "title": title,
            "description": description,
            "tags": tags,
            "body_text": body_text.strip()
        }


    def _build_summary_input(self, metadata: dict) -> str:
        input_parts = []

        if metadata["title"]:
            input_parts.append(f"Title: {metadata["title"]}")
        if metadata["description"]:
            input_parts.append(f"Description: {metadata["description"]}")
        if metadata["tags"]:
            input_parts.append(f"Tags: {", ".join(metadata["tags"])}")
        
        '''
        Content snippet seems to be messing up the summarizer
        The content may not be relavant 
        Example:
            for https://www.lancasterpuppies.com/puppy-search/state/NY?sortBy=prod_all_listings
            Content snippet is copyright info
            and the summary that gets embeeded is: site logo, Web Layout, and all pictures and text are copyright 2014-2024 by PMG US, LLC.
        Commenting put the content snippet seems to help
        '''
        # snippet = metadata["body_text"][:500]
        # input_parts.append(f"Content Snippet: {snippet}")

        return "\n".join(input_parts)


    def _insert_db(self, Data_Model, data):
        '''
        Takes a data model ORM and inserts data into that table
        Returns that db object data
        '''
        try:
            db_data = Data_Model(**data)
            self.db.add(db_data)
            self.db.flush()     # Flush for content_ai insertion
            return db_data
        except SQLAlchemyError as e:
            self.db.rollback()
            print(f"Error Inserting into {Data_Model.__tablename__}: {e}")
            return None


    def _url_exists(self, url):
        ''' Checks if a URL already exists in the database '''
        if url:
            existing_content = self.db.scalar(select(Content).where(Content.url == url))
            if existing_content:
                print(f"Content with URL '{url}' already exists. Skipping insertion.")
                return existing_content  
        return False
    
    
    def _summarize_content(self, summary_input):
        ''' Uses a summary model to get a more detailed summary for the content embeddings '''
        
        # Debug (TO REMOVE)
        print(f"The summary input being passed to summary model is: {summary_input}")

        # Check if there is input first
        if summary_input is None:
            return None

        try:
            input_length = len(self.embedding_model.tokenizer.encode(summary_input)) # Get actual token length
            max_length = int(input_length * 0.6)  # Set the max length to about 60 % of input (we can change)
            max_length = max(30, min(max_length, 150)) # Ensure the max length is within a reasonable range
            summary = self.summary_model(summary_input, max_length=max_length, min_length=15, do_sample=False)[0]['summary_text']
            return summary
        except Exception as e:
            print(f"An error occurred during summarization: {e}")
            return None


In [3]:
db = next(get_db())

manager = ContentEmbeddingManager(db)

# These are fake websites so will get a 404 error
# for i, sentence in enumerate(sentences):
#     content_data = {"url": f"http://example.com/{i}", "title": f"Document {i}"}
#     manager.insert_embedded_content(content_data)

# Lets try some real websites
content_data = {"url": f"https://www.cnn.com/", "title": "Breaking News, Latest News and Videos | CNN"}
manager.insert_embedded_content(content_data)

content_data = {"url": f"https://www.lancasterpuppies.com/puppy-search/state/NY?sortBy=prod_all_listings", "title": "Puppies for sale in New York | Lancaster Puppies"}
manager.insert_embedded_content(content_data)

content_data = {"url": f"https://www.lafitness.com/Pages/default.aspx", "title": "LA Fitness | Gym and Fitness Club | Join Today"}
manager.insert_embedded_content(content_data)

content_data = {"url": f"https://www.barnesandnoble.com/", "title": "Online Bookstore: Books, NOOK ebooks, Music, Movies & Toys | Barnes & Noble®"}
manager.insert_embedded_content(content_data)

# test reddit
content_data = {"url": f"https://www.reddit.com/r/PlanetFitnessMembers/comments/1cpi6r9/where_do_you_get_your_workout_clothes/", "title": "Where Do You Get Your Workout Clothes? : r/PlanetFitnessMembers"}
manager.insert_embedded_content(content_data)


Device set to use mps:0


The title from soup is: Breaking News, Latest News and Videos | CNN
THE SUMMARY INPUT AFTER ENRICHING IS = Title: Breaking News, Latest News and Videos | CNN
Description: View the latest news and breaking news today for U.S., world, weather, entertainment, politics and health at CNN.com.
Tags: cnn news, daily news, breaking news, news today, current events
The summary input being passed to summary model is: Title: Breaking News, Latest News and Videos | CNN
Description: View the latest news and breaking news today for U.S., world, weather, entertainment, politics and health at CNN.com.
Tags: cnn news, daily news, breaking news, news today, current events
Created Content ID: bf401fd7-eec9-456d-a265-277e28adae99,
Content AI ID: bf401fd7-eec9-456d-a265-277e28adae99,
Embedding (first 10): [-0.00127274 -0.02549458  0.03402197  0.02090326  0.11594051 -0.00539603
 -0.03234625 -0.02586317 -0.03936379 -0.02048217],
Summary that was embedded Tags: cnn news, breaking news, news today, current eve

Your max_length is set to 30, but your input_length is only 29. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)


Created Content ID: 71b07909-64ed-46fd-99fb-383f55477606,
Content AI ID: 71b07909-64ed-46fd-99fb-383f55477606,
Embedding (first 10): [-0.004375   -0.05000497 -0.02920699  0.00618206 -0.03755658 -0.03490396
  0.01428863 -0.12507975 -0.02801919  0.01826911],
Summary that was embedded fitness club, fitness guest pass, indoor cycling, group fitness classes, spin classes, indoor swimming pool, racquetball, gym with basketball, personal training, gym trainer, gyms with classes .


Error: 403: failed get request for https://www.barnesandnoble.com/, defaulting to title for summarization input
The summary input being passed to summary model is: Online Bookstore: Books, NOOK ebooks, Music, Movies & Toys | Barnes & Noble®
Created Content ID: 9fd4a34b-ac60-40fe-b814-0c6ba572f781,
Content AI ID: 9fd4a34b-ac60-40fe-b814-0c6ba572f781,
Embedding (first 10): [-0.00601284 -0.0259262   0.02008257  0.02900521 -0.07970294  0.05311825
 -0.09713804  0.00011612  0.10316836  0.05342844],
Summary that was embed

Your max_length is set to 30, but your input_length is only 15. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)


The title from soup is: Reddit - The heart of the internet
THE SUMMARY INPUT AFTER ENRICHING IS = Title: Reddit - The heart of the internet
The summary input being passed to summary model is: Title: Reddit - The heart of the internet
Created Content ID: b0508818-72f1-4276-98da-008c47c676c9,
Content AI ID: b0508818-72f1-4276-98da-008c47c676c9,
Embedding (first 10): [-0.05809776  0.02277812  0.03312013 -0.03189891  0.0085295  -0.03829026
  0.05035569 -0.06179682  0.09096923 -0.08921135],
Summary that was embedded title: reddit . The heart of the internet .




(<__main__.Content at 0x36c074f80>, <__main__.ContentAI at 0x3e7080d70>)

In [4]:
db = next(get_db())

# Similarity Search
query = "pets"
print(f"I am querying: {query}\n")
similar_results = manager.query_similar_content(query, limit=3)

print("\nSimilar Content --------------------\n")
for content_ai, content in similar_results:
    print(f"AI SUMMARY: {content_ai.ai_summary},\n "f"Content ID: {content.content_id},\n Title: {content.title},\n Embedding (first 10): {content_ai.embedding[:10]}\n\n")

db.close()

I am querying: pets


Similar Content --------------------

AI SUMMARY: Lancaster Puppies is the #1 online marketplace to buy and sell puppies in new york .,
 Content ID: a1ef71cc-7813-4d1f-b09e-05bb5a5d5c10,
 Title: Puppies for sale in New York | Lancaster Puppies,
 Embedding (first 10): [-0.01824263 -0.08501491  0.06489807 -0.01231944 -0.05576228  0.03060605
 -0.06183546 -0.02284733 -0.04623424 -0.017433  ]


AI SUMMARY: Tags: cnn news, breaking news, news today, current events, news and breaking news .,
 Content ID: bf401fd7-eec9-456d-a265-277e28adae99,
 Title: Breaking News, Latest News and Videos | CNN,
 Embedding (first 10): [-0.00127274 -0.02549458  0.03402197  0.02090326  0.11594051 -0.00539603
 -0.03234625 -0.02586317 -0.03936379 -0.02048217]


AI SUMMARY: title: reddit . The heart of the internet .,
 Content ID: b0508818-72f1-4276-98da-008c47c676c9,
 Title: Where Do You Get Your Workout Clothes? : r/PlanetFitnessMembers,
 Embedding (first 10): [-0.05809776  0.02277812  0.0331