# Lesson 1 - Getting started with Embeddings
In this lesson we will use the B.C. law titles as our data, a sentence transformer as our embedding model, and Postgres with PGVector to store the embeddings. We will then use the embeddings to find similar sentences in the database.

In [None]:
#Do the migration
import os
import warnings
from urllib.parse import quote

from langchain.docstore.document import Document
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.pgvector import PGVector
from langchain_text_splitters import CharacterTextSplitter

from getlawtitles import downloadlawtitles

In [None]:
# Database settings are read from the environment so that no credentials are
# stored in the notebook itself.
POSTGRES_USER = os.getenv('POSTGRES_USER')
POSTGRES_PASSWORD = os.getenv('POSTGRES_PASSWORD')
POSTGRES_DB = os.getenv('POSTGRES_DB')
POSTGRES_PORT = os.getenv('POSTGRES_PORT')
POSTGRES_HOST = os.getenv('POSTGRES_HOST')

# os.getenv returns None for an unset variable, which ends up in the URL as
# the literal text "None". Name the unset variables up front so the cause is
# visible before a connection is attempted.
_unset = [
    name
    for name in (
        'POSTGRES_USER',
        'POSTGRES_PASSWORD',
        'POSTGRES_DB',
        'POSTGRES_PORT',
        'POSTGRES_HOST',
    )
    if not os.getenv(name)
]
if _unset:
    warnings.warn('Unset database environment variables: ' + ', '.join(_unset))


def build_connection_string(user, password, host, port, database):
    """Build the SQLAlchemy URL for a Postgres database via psycopg2.

    The user name and password are percent-encoded so that characters with a
    meaning inside a URL (for example '@', ':' or '/') cannot be mistaken for
    URL delimiters.

    Args:
        user: Database user name.
        password: Database password.
        host: Database host name or address.
        port: Database port.
        database: Database name.

    Returns:
        A string of the form
        'postgresql+psycopg2://<user>:<password>@<host>:<port>/<database>'.
    """
    # safe='' so that '/' is encoded too; quote() leaves it alone by default.
    user_part = quote(str(user), safe='')
    password_part = quote(str(password), safe='')
    return f'postgresql+psycopg2://{user_part}:{password_part}@{host}:{port}/{database}'


CONNECTION_STRING = build_connection_string(
    POSTGRES_USER,
    POSTGRES_PASSWORD,
    POSTGRES_HOST,
    POSTGRES_PORT,
    POSTGRES_DB,
)

In [None]:
# ------------------------- Example embeddings -------------------------
# Create the embedding model. This same `embeddings` object is handed to the
# vector store in the cells further down.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

text = "This is a test document."
sample_documents = [text, "This is not a test document."]

# Embed one string as a query, then embed a small batch as documents.
query_result = embeddings.embed_query(text)
doc_result = embeddings.embed_documents(sample_documents)

##### Download the law titles from the S3 bucket. You will need to set the following S3 access environment variables to download the data.
```bash
      - S3_ACCESS_KEY
      - S3_SECRET_ACCESS_KEY
      - S3_ENDPOINT_URL
```

In [None]:
# Fetch the raw law titles using the project helper imported from getlawtitles.
# Per the notes above, the data comes from an S3 bucket and the S3_* environment
# variables must be set first.
# NOTE(review): presumably this writes DB/RawBCLaws/all_act_titles.txt, which
# the loading cell reads - confirm against getlawtitles.
downloadlawtitles()

We have set the chunk size to 1 because the CharacterTextSplitter never cuts inside the text between two separators: a chunk is at least as long as one separator-delimited piece, even when that piece is longer than chunk_size. Each sentence has a variable length, and we won't know how long a title is until we read it. Since the titles are short and each law title starts on a new line, setting the chunk size to 1 with a newline separator gives one chunk per title, as long as each title is more than 1 character long.

In [None]:
# The titles file is loaded as text and then cut at every newline.
loader = TextLoader("DB/RawBCLaws/all_act_titles.txt")

# chunk_size=1 with a newline separator: see the explanation above for why
# this yields one chunk per line of the file.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1,
    chunk_overlap=0,
)

documents = loader.load()
docs = text_splitter.split_documents(documents)

In [None]:
COLLECTION_NAME = "bc_law_titles"

# Embed the split titles with `embeddings` and store them in Postgres under
# COLLECTION_NAME. pre_delete_collection=True: if a collection with this name
# already exists, it is deleted before the new documents are written.
db = PGVector.from_documents(
    connection_string=CONNECTION_STRING,
    collection_name=COLLECTION_NAME,
    pre_delete_collection=True,
    embedding=embeddings,
    documents=docs,
)


In [None]:
# Ask a question that should match one of the stored law titles.
query = "Is there a law on tenancy act?"
print(f"\n{query}\n")

docs_with_score = db.similarity_search_with_score(query)

# Print each hit between two divider lines.
# NOTE(review): for PGVector the "score" is presumably a distance (lower means
# closer) - confirm against the PGVector documentation.
divider = "-" * 80
for doc, score in docs_with_score:
    print(divider)
    print("Score: ", score)
    print(doc.page_content)
    print(divider)

In [None]:
# Open the existing collection directly instead of rebuilding it from
# documents, then search it with an unrelated sentence.
store = PGVector(
    embedding_function=embeddings,
    connection_string=CONNECTION_STRING,
    collection_name=COLLECTION_NAME,
)

query = "Laptop is a device."
print(f"\n{query}\n")

docs_with_score = store.similarity_search_with_score(query)

# Print each hit between two divider lines.
divider = "-" * 80
for doc, score in docs_with_score:
    print(divider)
    print("Score: ", score)
    print(doc.page_content)
    print(divider)
