# Notebook to Generate the Collections

In [1]:
import os

import chromadb
import pandas as pd
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [3]:
# Read the OpenAI key from the environment so the secret never has to be
# typed into (and saved with) the notebook. Falls back to an empty string,
# the same value the placeholder used, when OPENAI_API_KEY is not set.
OPENAI_APIKEY = os.environ.get("OPENAI_API_KEY", "")
EMBEDDING_MODEL = "text-embedding-3-large"

# Load the dataset (one row per speech) and preview the first rows.
df = pd.read_csv("data/sonas.csv")
df.head()

Unnamed: 0,url,title,date,president,sona_no,datetime,title.cleaned,speech
0,https://www.officialgazette.gov.ph/2023/07/24/...,"Ferdinand R. Marcos Jr., Second State of the N...","July 24, 2023",Ferdinand R. Marcos Jr.,Second,2023-07-24,(2023) Second State of the Nation Address of F...,STATE OF THE NATION ADDRESS\nOF\nHIS EXCELLENC...
1,https://www.officialgazette.gov.ph/2021/07/26/...,"Ferdinand R. Marcos Jr., First State of the Na...","July 25, 2022",Ferdinand R. Marcos Jr.,First,2022-07-25,(2022) First State of the Nation Address of Fe...,STATE OF THE NATION ADDRESS\nOF\nHIS EXCELLENC...
2,https://www.officialgazette.gov.ph/2021/07/26/...,"Rodrigo Roa Duterte, Sixth State of the Nation...","July 26, 2021",Rodrigo Roa Duterte,Sixth,2021-07-26,(2021) Sixth State of the Nation Address of Ro...,STATE OF THE NATION ADDRESS OF\nRODRIGO ROA DU...
3,https://www.officialgazette.gov.ph/2020/07/27/...,"Rodrigo Roa Duterte, Fifth State of the Nation...","July 27, 2020",Rodrigo Roa Duterte,Fifth,2020-07-27,(2020) Fifth State of the Nation Address of Ro...,5TH STATE OF THE NATION ADDRESS OF\nRODRIGO RO...
4,https://www.officialgazette.gov.ph/2019/07/22/...,"Rodrigo Roa Duterte, Fourth State of the Natio...","July 22, 2019",Rodrigo Roa Duterte,Fourth,2019-07-22,(2019) Fourth State of the Nation Address of R...,STATE OF THE NATION ADDRESS OF\nRODRIGO ROA DU...


In [5]:
def get_splitted_documents(chunk_size=2000, chunk_overlap=300):
    """Split every speech in the notebook-level ``df`` into overlapping chunks.

    Args:
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The documents built by ``create_documents`` from ``df['speech']``,
        with one metadata dict (``url``, ``title``, ``president``, ``date``,
        ``sona_no``) supplied per speech.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    # Source columns and the metadata keys they are stored under, in matching order.
    source_columns = ['url', 'title.cleaned', 'president', 'datetime', "sona_no"]
    metadata_keys = ("url", "title", "president", "date", "sona_no")

    # One metadata dict per row, positionally aligned with df['speech'].
    per_speech_metadata = [
        dict(zip(metadata_keys, row_values))
        for row_values in df[source_columns].values
    ]

    return splitter.create_documents(texts=df['speech'], metadatas=per_speech_metadata)


def init_chroma_db(collection_name, db_path='sonas.db'):
    """Open the persistent Chroma store at ``db_path`` and return a collection.

    Args:
        collection_name: Name of the collection to fetch, or to create if it
            does not exist yet.
        db_path: Path handed to ``chromadb.PersistentClient``.

    Returns:
        The collection, configured with an OpenAI embedding function built
        from the notebook-level ``OPENAI_APIKEY`` and ``EMBEDDING_MODEL``.
    """
    # Persistent client first, then the embedder, then the collection that ties them together.
    client = chromadb.PersistentClient(path=db_path)
    openai_embedder = OpenAIEmbeddingFunction(api_key=OPENAI_APIKEY, model_name=EMBEDDING_MODEL)
    return client.get_or_create_collection(name=collection_name, embedding_function=openai_embedder)


# Notebook-wide handle to the 'sonas' collection stored under 'sonas.db'.
collection = init_chroma_db(db_path='sonas.db', collection_name='sonas')

In [6]:
def upsert_documents_to_collection(collection, batch_size=256):
    """Chunk the speeches and upsert them into ``collection`` in batches.

    Writing in batches keeps each ``upsert`` request bounded regardless of how
    many chunks the corpus produces, and an empty chunk list results in no
    request at all.

    Args:
        collection: Collection to write to; must provide ``upsert``.
        batch_size: Maximum number of chunks sent per ``upsert`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Get the documents
    documents = get_splitted_documents()

    # Ids are positional ("id_0", "id_1", ...) over the full document list, so
    # batching produces exactly the same ids as a single call would.
    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]

        # Update/insert this slice of text documents to the db collection
        collection.upsert(
            ids=[f"id_{start + offset}" for offset in range(len(batch))],
            documents=[doc.page_content for doc in batch],
            metadatas=[doc.metadata for doc in batch],
        )


# Populate the collection with the chunked speeches.
upsert_documents_to_collection(collection=collection)

In [7]:
# Query the collection
def specific_semantic_search(Q, k=3, collection=collection, metadata_key = "", meta_val = ""):
    """Semantic search, optionally restricted to one metadata key/value pair.

    Args:
        Q: Query text; sent as the single entry of ``query_texts``.
        k: Number of results to return.
        collection: Collection to query. Defaults to the notebook-level
            ``collection`` as it was bound when this function was defined.
        metadata_key: Metadata field to filter on. When empty, no ``where``
            filter is sent and the whole collection is searched.
        meta_val: Value the field must equal; formatted to a string first.

    Returns:
        The result of ``collection.query``.
    """
    query_kwargs = {
        "query_texts": [Q],  # Chroma will embed this for you
        "n_results": k,      # how many results to return
    }

    # Only filter when a key was actually supplied; the default empty key
    # would otherwise be sent as the filter {"": ""}.
    if metadata_key:
        query_kwargs["where"] = {f"{metadata_key}": f"{meta_val}"}  # specific data only

    return collection.query(**query_kwargs)

def general_semantic_search(Q, k=3, collection=collection):
    """Semantic search over the whole collection, with no metadata filter.

    Args:
        Q: Query text; sent as the single entry of ``query_texts``.
        k: Number of results to return.
        collection: Collection to query. Defaults to the notebook-level
            ``collection`` as it was bound when this function was defined.

    Returns:
        The result of ``collection.query``.
    """
    # Chroma embeds the query text itself; k caps the number of hits.
    return collection.query(query_texts=[Q], n_results=k)

In [8]:
# Filtered search: only chunks whose metadata matches the given key/value.
results = specific_semantic_search(
    'war',
    k=5,
    collection=collection,
    metadata_key="president",
    meta_val="Gloria Macapagal-Arroyo",
)

# Unfiltered alternative that searches the whole collection:
# results = general_semantic_search('war', k=5, collection=collection)

In [9]:
# Each field of `results` holds one list per query text; a single query was
# issued, so index [0] selects that query's hits.
hit_metadatas = results['metadatas'][0]

data_dict = {
    'ids': results['ids'][0],
    'distances': results['distances'][0],
    'documents': results['documents'][0],
    # The metadata entries are already mappings, so read the fields directly
    # rather than re-parsing their string form with eval().
    'title': [m['title'] for m in hit_metadatas],
    'url': [m['url'] for m in hit_metadatas],
    'president': [m['president'] for m in hit_metadatas],
    'date': [m['date'] for m in hit_metadatas],
    'sona_no': [m['sona_no'] for m in hit_metadatas],
    'metadata': hit_metadatas
}

pd.DataFrame(data_dict)

Unnamed: 0,ids,distances,documents,title,url,president,date,sona_no,metadata
0,id_590,1.45167,"Si Sonny Ayao, umayaw sa giyera at naging comm...",(2003) Third State of the Nation Address of Gl...,https://www.officialgazette.gov.ph/2003/07/28/...,Gloria Macapagal-Arroyo,2003-07-28,Third,"{'date': '2003-07-28', 'president': 'Gloria Ma..."
1,id_591,1.453824,Nasa giyera tayo. giyera laban sa terorismo. g...,(2003) Third State of the Nation Address of Gl...,https://www.officialgazette.gov.ph/2003/07/28/...,Gloria Macapagal-Arroyo,2003-07-28,Third,"{'date': '2003-07-28', 'president': 'Gloria Ma..."
2,id_570,1.461759,Wars are for combatants. As I speak soldiers a...,(2004) Fourth State of the Nation Address of G...,https://www.officialgazette.gov.ph/2004/07/26/...,Gloria Macapagal-Arroyo,2004-07-26,Fourth,"{'date': '2004-07-26', 'president': 'Gloria Ma..."
3,id_603,1.468408,I am happy to let you know that yesterday we c...,(2002) Second State of the Nation Address of G...,https://www.officialgazette.gov.ph/2002/07/22/...,Gloria Macapagal-Arroyo,2002-07-22,Second,"{'date': '2002-07-22', 'president': 'Gloria Ma..."
4,id_581,1.542495,"For the practical purposes of most people, gov...",(2003) Third State of the Nation Address of Gl...,https://www.officialgazette.gov.ph/2003/07/28/...,Gloria Macapagal-Arroyo,2003-07-28,Third,"{'date': '2003-07-28', 'president': 'Gloria Ma..."


## Scratch: Chunking Sanity Check (Safe to Ignore)

In [46]:
presi = "Ferdinand R. Marcos Jr."
title = "(2023) Second State of the Nation Address of Ferdinand R. Marcos Jr."

# Top-level keys of an unfiltered get() result.
col_keys = list(collection.get().keys())

# Text of every stored chunk whose 'title' metadata equals `title`.
basta = collection.get(
    where={"title": title},
    include=['documents', 'metadatas'],
)['documents']

In [58]:
# Original speech text for `title`, with newlines flattened to spaces.
# .iloc[0] takes the first matching row by position; a label lookup with [0]
# only works when the matching row happens to carry index label 0.
actual = df.loc[df['title.cleaned'] == title, 'speech'].iloc[0].replace("\n", " ")

# The stored chunks for the same speech, joined with spaces and flattened the same way.
splitted_one = ' '.join(basta).replace("\n", " ")

In [67]:
# Compare the length of the original speech with the length of its re-joined chunks.
# They are not the same: the chunks were split with an overlap, so overlapping
# text appears more than once in the joined string.
print(len(actual), len(splitted_one))

53320 57847
