In [24]:
from database.database import DatabaseProcessor
from Embedders import get_embedder
import torch
from transformers import AutoTokenizer, AutoModel
from dotenv import load_dotenv
import os

In [4]:
load_dotenv('.env', override=True)
db_params = {
    'dbname': 'test',
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'host': os.getenv('DB_HOST'),
    'port': os.getenv('DB_PORT')
}
db = DatabaseProcessor(db_params)
db.test_connection()

Database         User             Host                             Port            
test             bbasseri         localhost                        5432            
Database version: ('PostgreSQL 17.3 (Homebrew) on x86_64-apple-darwin23.6.0, compiled by Apple clang version 16.0.0 (clang-1600.0.26.6), 64-bit',)


In [8]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
model.to('mps')
model.eval()


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [23]:
ins = tokenizer(['hello', 'world you look nice'], return_tensors='pt', padding=True, truncation=True).to('mps')
outs = model(**ins)
embeddings = outs.last_hidden_state[:, 0, :]
print(embeddings)

tensor([[-0.3061,  0.2622, -0.1896,  ..., -0.1651,  0.1014,  0.4119],
        [ 0.3192,  0.1219,  0.2656,  ..., -0.4724,  0.2757, -0.0384]],
       device='mps:0', grad_fn=<SliceBackward0>)


In [None]:
embedder = get_embedder('bert-base-uncased', device='mps', normalize=False)
db.create_vector_table('test_tensor', dim=768, embedder=embedder)

BertConfig {
  "_name_or_path": "BAAI/bge-small-en",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.46.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [25]:
embeddings.shape

torch.Size([2, 768])

In [31]:
import psycopg2
from pgvector.psycopg2 import register_vector
from psycopg2.extras import execute_values

In [None]:
conn = psycopg2.connect(**db.db_params)
register_vector(conn)
cursor = conn.cursor()
cursor.execute('SELECT count(*) FROM test_tensor_input;')
results = cursor.fetchall()
print(results)

In [33]:
query = f'INSERT INTO test_tensor_input ({embeddings.detach().numpy()}) VALUES %s;'

execute_values(cursor, query, embeddings.cpu().numpy())
conn.commit()
cursor.execute('SELECT count(*) FROM test_tensor_input;')
results = cursor.fetchall()
print(results)


TypeError: can't convert mps:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.