In [29]:
from pprint import pprint
import glob
from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.docstore.document import Document
from langchain.vectorstores.pgvector import PGVector

In [30]:
documents_path = "/home/ubuntu/genai_learning/llm/data"

pdf_files = glob.glob(f"{documents_path}/*.pdf")

pprint(pdf_files)

['/home/ubuntu/genai_learning/llm/data/MSL_Notes_15.pdf',
 '/home/ubuntu/genai_learning/llm/data/MSL_Notes_19.pdf',
 '/home/ubuntu/genai_learning/llm/data/MSL_Notes_7.pdf',
 '/home/ubuntu/genai_learning/llm/data/MSL_Notes_17.pdf',
 '/home/ubuntu/genai_learning/llm/data/MSL_Notes_32.pdf',
 '/home/ubuntu/genai_learning/llm/data/MSL_Notes_34.pdf',
 '/home/ubuntu/genai_learning/llm/data/MSL_Notes_6.pdf',
 '/home/ubuntu/genai_learning/llm/data/MSL_Notes_35.pdf',
 '/home/ubuntu/genai_learning/llm/data/MSL_Notes_2.pdf',
 '/home/ubuntu/genai_learning/llm/data/MSL_Notes_12.pdf',
 '/home/ubuntu/genai_learning/llm/data/MSL_Notes_13.pdf',
 '/home/ubuntu/genai_learning/llm/data/MSL_Notes_14.pdf',
 '/home/ubuntu/genai_learning/llm/data/MSL_Notes_23.pdf',
 '/home/ubuntu/genai_learning/llm/data/MSL_Notes_22.pdf',
 '/home/ubuntu/genai_learning/llm/data/MSL_Notes_29.pdf',
 '/home/ubuntu/genai_learning/llm/data/MSL_Notes_1.pdf',
 '/home/ubuntu/genai_learning/llm/data/MSL_Notes_30.pdf',
 '/home/ubuntu/gen

In [9]:
# load document (pdf)

loader = PyPDFLoader("/home/ubuntu/genai_learning/llm/data/MSL_Notes_15.pdf")
documents = loader.load()

print(type(documents))
print(len(documents))
print(type(documents[0]))
print(documents[0])

# pprint(documents)

<class 'list'>
2
<class 'langchain_core.documents.base.Document'>
page_content="Subject:  Medical Science Liaison (MSL) Notes - Critical Discussion on Vitaligen in Cardiovascular Health  \nDate:  June 8, 2024  \nProvider:  Dr. Rachel Foster  \nTitle:  Interventional Cardiologist  \nInstitution:  Advanced Cardiovascular Institute  \nSummary of Key Discussion Points:  \n1. Introduction:  \n• Introduced Vitaligen as a potential therapy targeting arterial stiffness and cardiovascular \nhealth . \n• Dr. Foster expressed skepticism about the need for another medication in an already \ncrowded cardiovascular treatment landscape.  \n2. Provider's Current Patient Cases:  \n• Dr. Foster shared reservations about the practical impact of reducing arterial stiffness in \nher patient population.  \n• Discussed cases where patients struggled with more pressing cardiovascular issues such \nas coronary artery disease.  \n3. Efficacy and Clinical Data:  \n• Presented recent clinical data showcasing Vita

In [10]:
# split text 
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=80
)

chunks = text_splitter.split_documents(documents)

print(len(chunks))
print(type(chunks[0]))
print()
pprint(chunks)

11
<class 'langchain_core.documents.base.Document'>

[Document(page_content='Subject:  Medical Science Liaison (MSL) Notes - Critical Discussion on Vitaligen in Cardiovascular Health  \nDate:  June 8, 2024  \nProvider:  Dr. Rachel Foster  \nTitle:  Interventional Cardiologist  \nInstitution:  Advanced Cardiovascular Institute  \nSummary of Key Discussion Points:  \n1. Introduction:  \n• Introduced Vitaligen as a potential therapy targeting arterial stiffness and cardiovascular', metadata={'source': '/home/ubuntu/genai_learning/llm/data/MSL_Notes_15.pdf', 'page': 0}),
 Document(page_content="health . \n• Dr. Foster expressed skepticism about the need for another medication in an already \ncrowded cardiovascular treatment landscape.  \n2. Provider's Current Patient Cases:  \n• Dr. Foster shared reservations about the practical impact of reducing arterial stiffness in \nher patient population.  \n• Discussed cases where patients struggled with more pressing cardiovascular issues such", me

In [11]:
# embedding model

embedding_model = SentenceTransformerEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs = {'device': 'cuda'},
    encode_kwargs = {'normalize_embeddings': True}
)

In [18]:
embeddings = embedding_model.embed_documents(chunks[0].page_content)

# print(embeddings)

In [26]:
# The connection to the database
CONNECTION_STRING = PGVector.connection_string_from_db_params(
    driver= "psycopg2",
    host = "localhost",
    port = "5432",
    database = "vectordb",
    user= "username",
    password="password"
)

print(CONNECTION_STRING)

postgresql+psycopg2://username:password@localhost:5432/vectordb


In [27]:
# # Creates the database connection to our existing DB
# db = PGVector(
#     connection_string = CONNECTION_STRING,
#     collection_name = "my_vector_embedding_tab",
#     embedding_function = embedding_model
# )

In [28]:
# create a collection named "my_vector_embedding_tab" and stores the embeddings
# for the document chunks 


db = PGVector.from_documents(
    chunks[:5],
    connection_string = CONNECTION_STRING,
    collection_name = "my_vector_embedding_tab",
    embedding = embedding_model,
    # pre_delete_collection = True, # uncomment this to delete existing database first
)

In [None]:
```sh

(base) ubuntu@ip-10-0-0-123:~/genai_learning$
(base) ubuntu@ip-10-0-0-123:~/genai_learning$
(base) ubuntu@ip-10-0-0-123:~/genai_learning$ psql --host=localhost --port=5432  --dbname=vectordb --username=username --password password
psql: warning: extra command-line argument "password" ignored
Password:
psql (12.18 (Ubuntu 12.18-0ubuntu0.20.04.1), server 15.4 (Debian 15.4-2.pgdg120+1))
WARNING: psql major version 12, server major version 15.
         Some psql features might not work.
Type "help" for help.

vectordb=# \l
                                 List of databases
   Name    |  Owner   | Encoding |  Collate   |   Ctype    |   Access privileges
-----------+----------+----------+------------+------------+-----------------------
 postgres  | username | UTF8     | en_US.utf8 | en_US.utf8 |
 template0 | username | UTF8     | en_US.utf8 | en_US.utf8 | =c/username          +
           |          |          |            |            | username=CTc/username
 template1 | username | UTF8     | en_US.utf8 | en_US.utf8 | =c/username          +
           |          |          |            |            | username=CTc/username
 vectordb  | username | UTF8     | en_US.utf8 | en_US.utf8 |
(4 rows)

vectordb=#
vectordb=# \dt
                  List of relations
 Schema |          Name           | Type  |  Owner
--------+-------------------------+-------+----------
 public | langchain_pg_collection | table | username
 public | langchain_pg_embedding  | table | username
(2 rows)

vectordb=#
vectordb=# \d+ public.langchain_pg_collection
                                 Table "public.langchain_pg_collection"
  Column   |       Type        | Collation | Nullable | Default | Storage  | Stats target | Description
-----------+-------------------+-----------+----------+---------+----------+--------------+-------------
 name      | character varying |           |          |         | extended |              |
 cmetadata | json              |           |          |         | extended |              |
 uuid      | uuid              |           | not null |         | plain    |              |
Indexes:
    "langchain_pg_collection_pkey" PRIMARY KEY, btree (uuid)
Referenced by:
    TABLE "langchain_pg_embedding" CONSTRAINT "langchain_pg_embedding_collection_id_fkey" FOREIGN KEY (collection_id) REFERENCES langchain_pg_collection(uuid) ON DELETE CASCADE
Access method: heap

vectordb=#
vectordb=#
vectordb=# \d+ public.langchain_pg_embedding
                                   Table "public.langchain_pg_embedding"
    Column     |       Type        | Collation | Nullable | Default | Storage  | Stats target | Description
---------------+-------------------+-----------+----------+---------+----------+--------------+-------------
 collection_id | uuid              |           |          |         | plain    |              |
 embedding     | vector            |           |          |         | extended |              |
 document      | character varying |           |          |         | extended |              |
 cmetadata     | json              |           |          |         | extended |              |
 custom_id     | character varying |           |          |         | extended |              |
 uuid          | uuid              |           | not null |         | plain    |              |
Indexes:
    "langchain_pg_embedding_pkey" PRIMARY KEY, btree (uuid)
Foreign-key constraints:
    "langchain_pg_embedding_collection_id_fkey" FOREIGN KEY (collection_id) REFERENCES langchain_pg_collection(uuid) ON DELETE CASCADE
Access method: heap

vectordb=#
vectordb=#
vectordb=# select * from public.langchain_pg_embedding where collection_id=(select uuid from public.langchain_pg_collection where name='my_vector_embedding_tab') LIMIT 5;


```