In [None]:
import psycopg2
from psycopg2 import sql
import os
from dotenv import load_dotenv
import markdown_parser
import embedding_strings
import numpy as np

In [27]:
# Load settings from a .env file into the process environment before reading them.
load_dotenv()
print(os.getenv('POSTGRES_HOST'))

# All connection settings are read from the environment (POSTGRES_* variables);
# nothing needs to be edited in this cell.
conn = psycopg2.connect(
    dbname=os.getenv('POSTGRES_DB'),
    user=os.getenv('POSTGRES_USER'),
    password=os.getenv('POSTGRES_PASSWORD'),
    host=os.getenv('POSTGRES_HOST'),
    # Prefer POSTGRES_PORT so the port follows the same naming as the other
    # settings; fall back to the generic PORT variable used previously, then to
    # the PostgreSQL default port.
    port=os.getenv('POSTGRES_PORT') or os.getenv('PORT', '5432'),
)


127.0.0.1


In [2]:
import sys
# Put ../llm (resolved against the current working directory) on the import path
# before the `service` import below.
sys.path.append(os.path.abspath("../llm"))
from service import get_text_embeddings_llama

# Embedding backend used by the ingestion loop. Use exactly "llama" or
# "SentenceTransformer" (capital S) - these are the strings the loop checks for.
EMBEDDING = "llama" # or "SentenceTransformer"


In [51]:
# Root directory of the markdown notes to ingest. Edit the relative location below
# to point at a different data set; it is resolved against the working directory.
_relative_notes_dir = "../../../resources/sample-notes-markdown/GeneratedTestData"
path = os.path.abspath(os.path.join(os.getcwd(), _relative_notes_dir))


In [52]:
# Schema for the notes table. IF NOT EXISTS makes re-running this cell harmless
# when the table is already there.
note_table_ddl = """
CREATE TABLE IF NOT EXISTS note_data (
    id SERIAL PRIMARY KEY,  
    title VARCHAR(255) NOT NULL,  
    content TEXT,  
    created_at DATE DEFAULT CURRENT_DATE,  
    updated_at DATE DEFAULT CURRENT_DATE,
    embeddings DOUBLE PRECISION[]
);
"""

# Cursor shared by the cells that follow (insert_row uses it as well).
cur = conn.cursor()

# Run the DDL and make it permanent.
cur.execute(note_table_ddl)
conn.commit()

# Insert helper: one committed row per call.
def insert_row(title, content, embeddings):
    """Insert one note into ``note_data`` and commit it.

    Uses the module-level ``cur`` and ``conn``. The statement is parameterized:
    the values are handed to the driver separately from the SQL text.

    Args:
        title: Value stored in the ``title`` column.
        content: Value stored in the ``content`` column.
        embeddings: Value stored in the ``embeddings`` column.

    Returns:
        None. A failure is not re-raised: the transaction is rolled back and a
        message naming the title is printed, so the caller can carry on with
        the next row.
    """
    insert_query = """
    INSERT INTO note_data (title, content, embeddings)
    VALUES (%s, %s, %s)
    """
    try:
        cur.execute(insert_query, (title, content, embeddings))
        conn.commit()
    except Exception as e:
        # Roll back so the connection is not left in a failed transaction.
        conn.rollback()
        # \u2014 is an em dash; written as an escape so the message cannot be
        # garbled by a file-encoding round trip.
        print(f"Insert failed for title: {title} \u2014 {e}")


# Ingest every file under `path`: parse it, embed it, and store one row per file.
# EMBEDDING is matched case-insensitively, and an unrecognised value fails up
# front instead of walking the whole tree and inserting nothing.
embedding_mode = EMBEDDING.lower()
if embedding_mode not in ("llama", "sentencetransformer"):
    raise ValueError(
        f"Unknown EMBEDDING {EMBEDDING!r}; expected 'llama' or 'SentenceTransformer'"
    )

for dirpath, _dirnames, filenames in os.walk(path):
    for filename in filenames:
        # os.path.join uses the right separator on every platform, so no manual
        # "/" vs "\\" edit is needed.
        file_path = os.path.join(dirpath, filename)
        parsed = markdown_parser.markdown_plaintext(file_path)

        # Strip only a trailing ".md" extension; str.replace would also remove
        # ".md" occurring in the middle of a file name.
        stem, extension = os.path.splitext(filename)
        title = stem if extension == ".md" else filename
        content = " ".join(parsed)

        if embedding_mode == "llama":
            embeddings = get_text_embeddings_llama(content)
            insert_row(title, content, embeddings)
        else:
            # NOTE(review): the file is parsed a second time here, as before.
            # Reusing `parsed` looks possible but first confirm that
            # markdown_plaintext returns a re-iterable sequence.
            embeddings = embedding_strings.word_embeddings(
                markdown_parser.markdown_plaintext(file_path)
            )
            if len(embeddings) == 0:
                # The mean of zero vectors is NaN rather than a vector, so there
                # is nothing meaningful to store for this file.
                print(f"Skipping {title}: no embeddings produced")
                continue
            # Collapse the per-entry vectors into one averaged vector and
            # convert it to a plain list before storing it.
            avg_query_embedding = np.mean(list(embeddings.values()), axis=0)
            insert_row(title, content, avg_query_embedding.tolist())


In [23]:
# Release the database handles: the cursor first, then the connection.
for db_handle in (cur, conn):
    db_handle.close()

In [24]:
# NOTE(review): `an` is not assigned in any cell visible here, so this cell raises
# NameError on a fresh kernel (Restart & Run All) unless it is defined elsewhere.
# Presumably a leftover check of one embedding vector's length - confirm, then
# either define `an` explicitly above or delete this cell.
len(an)

3072