In [1]:
import numpy as np
from Embedders import get_embedder
from database.database import Database
from time import time

# Create the project database handle (no explicit connection parameters are passed here)
# and check that the connection works.
db = Database()
db.test_connection()  # prints the connection details and server version shown in the output below

Database         User             Host                             Port            
citeline_db      bbasseri         localhost                        5432            
Database version: ('PostgreSQL 17.3 (Homebrew) on x86_64-apple-darwin23.6.0, compiled by Apple clang version 16.0.0 (clang-1600.0.26.6), 64-bit',)


In [2]:
# Build the BAAI/bge-small-en embedder on the "mps" device.
embedder = get_embedder(
    model_name="BAAI/bge-small-en",
    device="mps",
)

# Embed one probe sentence and report the shape of the returned array.
vector = embedder(["Hello world!"])
print(vector.shape)

(1, 384)


In [13]:
# Time a similarity query against the `bge` column of `lib` with the index disabled,
# asking for up to one million rows.
start = time()
results = db.query_vector_column(
    vector[0],
    target_column='bge',
    table_name='lib',
    use_index=False,
    top_k=1000000,
)
elapsed = time() - start
print(f"Time take: {elapsed:.2f} seconds")
print(f"Results: {len(results)}")

  Query execution time: 21.48 seconds
  Found 1000000 results
top_k: 1000000
Time take: 25.22 seconds
Results: 1000000


In [9]:
# Raw-SQL version of the brute-force search: rank rows of `lib` by the `<=>` distance
# (pgvector's cosine-distance operator) between the `bge` column and the query vector.
with db.conn.cursor() as cursor:  # context manager closes the cursor when the block exits
    # Steer the planner away from index scans for this measurement.
    cursor.execute('SET enable_indexscan = off')
    cursor.execute('''
SELECT id, doi, title, abstract, chunk, bge <=> %s AS distance
FROM lib
ORDER BY distance 
LIMIT 1000000
''', (vector[0],))
    results = cursor.fetchall()
    # A plain SET is session-scoped, so without this every later query on the shared
    # `db.conn` would silently run with index scans disabled as well.
    cursor.execute('RESET enable_indexscan')
print(len(results))

1000000


In [None]:
# Rows of `research` whose body text is shorter than 1,000 characters.
body_lengths = research['body'].str.len()
small_bodies = research.loc[body_lengths < 1000]
small_bodies.head()

In [None]:
# Rows of `research` whose body text is at least 1,000 but fewer than 5,000 characters.
body_lengths = research['body'].str.len()
is_mid_length = (body_lengths >= 1000) & (body_lengths < 5000)
mid_bodies = research[is_mid_length]
mid_bodies.head()

In [None]:
# Inspect the raw publication date of the row at position 10138.
research['pubdate'].iloc[10138]

In [None]:
# Select bodies of 1,000-4,999 characters. `.copy()` makes `mid_bodies` an independent
# frame, so the column assignment below does not write into a slice of `research`
# (which triggers SettingWithCopyWarning and is not guaranteed to take effect).
mid_bodies = research[
    (research['body'].str.len() >= 1000) & (research['body'].str.len() < 5000)
].copy()

# Some pubdates contain "-00" components, which `to_datetime` cannot parse
# (presumably a placeholder for an unknown month/day - TODO confirm).
# Map them to "-01" first; anything still unparseable becomes NaT via errors='coerce'.
mid_bodies['pubdate'] = pd.to_datetime(
    mid_bodies['pubdate'].str.replace(r'-00', '-01', regex=True),
    format='%Y-%m-%d',
    errors='coerce',
)
mid_bodies['pubdate'].tolist()

In [None]:
# Format the publication dates as "YYYY-MM-DD" strings and list them for inspection.
# The `.dt` accessor only works once `mid_bodies['pubdate']` holds datetime values.
mid_bodies['pubdate'].dt.strftime('%Y-%m-%d').tolist()

In [None]:
# Collect every review record whose DOI also appears in `research`.
# Build the lookup set once: testing membership against `research.doi.values` inside
# the loop rescans the whole array for every record (O(len(reviews) * len(research))).
# Missing DOIs are dropped so that a record without a DOI is never reported as a duplicate.
research_dois = set(research['doi'].dropna())
dupes = [
    record
    for record in reviews.to_dict(orient='records')
    if record['doi'] in research_dois
]
print("Done")
print(len(dupes))

In [None]:
# Instantiate a bert-base-uncased encoder embedder with normalization turned off,
# then display the underlying model's configuration.
bert = EncoderEmbedder(
    model_name='bert-base-uncased',
    device='mps',
    normalize=False,
)
bert.model.config

In [None]:
import os

from dotenv import load_dotenv
from database.database import Database

# Read connection settings from the local .env file, letting its values override
# anything already present in the process environment.
load_dotenv('.env', override=True)

# Database setup: map each connection parameter to the environment variable that holds it.
env_var_for_param = {
    'dbname': 'DB_NAME',
    'user': 'DB_USER',
    'password': 'DB_PASSWORD',
    'host': 'DB_HOST',
    'port': 'DB_PORT',
}
db_params = {
    param: os.getenv(env_var) for param, env_var in env_var_for_param.items()
}
db = Database(db_params)
db.test_connection()

In [None]:
from database.database import Database
from dotenv import load_dotenv
import os

# Load connection settings from the environment (.env) and open the database handle.
load_dotenv()
db_params = {
    'dbname': os.getenv('DB_NAME'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'host': os.getenv('DB_HOST'),
    'port': os.getenv('DB_PORT'),
}
db = Database(db_params)

db.test_connection()

# Show the effective parameters, but never write the password into the notebook
# output: saved outputs are committed and shared together with the code.
redacted_params = {
    key: ('********' if key == 'password' else value)
    for key, value in db.db_params.items()
}
print(redacted_params)

In [None]:
import psycopg2
from time import time
# Benchmark embedding speed: pull up to 30 batches of 1,024 chunk texts from the
# `chunks` table and time the embedder on each batch.
embedding_times = []
conn = psycopg2.connect(**db.db_params)
try:
    with conn.cursor() as cursor:  # cursor is closed when the block exits
        cursor.execute('SELECT text FROM chunks;')
        for i in range(30):
            rows = [row[0] for row in cursor.fetchmany(1024)]
            if not rows:
                # The table ran out before 30 batches; do not hand the embedder an empty list.
                print(f'No more rows after {i} batches')
                break
            start = time()
            embeddings = embedder(rows)
            end = time()
            embedding_times.append(end - start)
            print(f'Batch {i+1} took {end - start:.2f} seconds. Shape: {embeddings.shape}')
finally:
    # Always release the connection, including when the query or the embedder raises.
    conn.close()

# Guard against an empty table, where no batch was timed at all.
if embedding_times:
    print(f'Average time: {sum(embedding_times) / len(embedding_times):.2f} seconds')

In [None]:
import psycopg2
from time import time

# Measure how per-chunk embedding time scales with batch size by doubling the batch
# (1, 2, 4, ...) until the 2.5M cap is reached, the table is exhausted, or an error occurs.
averages = []
batch_size = 1
while batch_size < 2_500_000:
    try:
        # Get chunks from the database; the connection is closed even if the query fails.
        conn = psycopg2.connect(**db.db_params)
        try:
            with conn.cursor() as cursor:
                # Pass the limit as a bound parameter instead of formatting it into the SQL.
                cursor.execute("SELECT text FROM chunks LIMIT %s", (batch_size,))
                rows = cursor.fetchall()
        finally:
            conn.close()
        chunks = [row[0] for row in rows]
        print(f"Got {len(chunks)} chunks")
        if not chunks:
            # Empty table: nothing to embed and nothing to average.
            break

        # Embed the chunks
        start = time()
        result = embedder(chunks)
        duration = time() - start
        print(f"Result shape: {result.shape}")
        # Divide by the number of chunks actually embedded; the table may hold fewer
        # rows than the requested batch size, which would understate the per-chunk time.
        per_chunk = duration / len(chunks)
        averages.append(per_chunk)
        print(f"Batch size {batch_size} took {duration} seconds ({per_chunk} per chunk)")
        if len(chunks) < batch_size:
            # Table exhausted: larger batches would only re-embed the same rows.
            break
        batch_size *= 2
    except Exception as e:
        # Best-effort benchmark: report the failure and stop scaling, keeping the
        # measurements collected so far.
        print(e)
        break


In [None]:
# Spot-check one chunk's text; `chunks` must hold at least 235 items for this index to exist.
print(chunks[234])

In [None]:
# Top-5 matches for the first embedding in the `bge` table using the cosine operator class.
results = db.query_vector_table(
    'bge',
    query_vector=embeddings[0],
    metric='vector_cosine_ops',
    top_k=5,
)
for hit in results:
    print(hit.similarity)

In [None]:
# Same top-5 lookup as a comparison point, this time with the inner-product operator class.
ip_results = db.query_vector_table(
    'bge',
    query_vector=embeddings[0],
    metric='vector_ip_ops',
    top_k=5,
)
for hit in ip_results:
    print(hit.similarity)

In [None]:
import matplotlib.pyplot as plt
from random import random

# Layout test with placeholder data: 100 random x values plotted against their negation.
# Uses the explicit Figure/Axes interface instead of the pyplot state machine.
fig, ax = plt.subplots()
x = [random() for _ in range(100)]
y = [-x_i for x_i in x]
ax.plot(x, y, marker='o', label='Average Score')
ax.set_xlabel('Distance (n = 123)')
ax.set_ylabel('Average Score')
ax.grid(True)
ax.legend()  # without a legend the `label=` given to plot() is never displayed
# Anchor the annotation in axes-fraction coordinates so it sits in the lower-right
# corner of the plot. In data coordinates (0.95, 0.05) lies above the y range,
# because every y value here is between -1 and 0.
ax.text(
    0.95,
    0.05,
    "n = 123",
    horizontalalignment='right',
    verticalalignment='bottom',
    transform=ax.transAxes,
)
plt.show()