# Indexing Process

This notebooks is to index the documents to the vector database

In [30]:
from dotenv import load_dotenv 

load_dotenv() 

True

In [31]:
import chromadb

chroma_client = chromadb.PersistentClient(path="chromadb")

In [32]:
import os
from supabase import create_client, Client

supabase_url = os.getenv("SUPABASE_URL")
supabase_key = os.getenv("SUPABASE_KEY")
supabase_client = create_client(supabase_url, supabase_key)

## Article

In [None]:
import pandas as pd

articles = supabase_client.table("article").select("*").execute().model_dump()['data']
df_article = pd.DataFrame(articles)
df_article

In [18]:
# Indexing to ChromaDB

try:
    article_collection = chroma_client.get_collection("article")
except:
    article_collection = chroma_client.create_collection("article")

In [19]:
def create_article_sentence(article):
    return f"{article['title']}. {article['short_desc']}"

In [20]:
article_collection.add(
    ids=[str(article['article_id']) for article in articles],
    documents=[create_article_sentence(article) for article in articles],
    metadatas=[
        {
            "category": article['category'],
            "created_date": article['created_date'],
        } for article in articles        
    ]
)

## Mentor

In [None]:
import pandas as pd

mentors = supabase_client.table("mentor").select("*").execute().model_dump()['data']
df_mentor = pd.DataFrame(mentors)
df_mentor

In [34]:
# Indexing to ChromaDB

try:
    mentor_collection = chroma_client.get_collection("mentor")
except:
    mentor_collection = chroma_client.create_collection("mentor")

In [35]:
def create_mentor_sentence(mentor):
    return f"A mentor from {mentor['department']} department, working as a {mentor['position']} " \
           f"specializing in {mentor['specialization']}. {mentor['short_desc']}"

In [36]:
create_mentor_sentence(mentors[0])

'A mentor from Finance department, working as a Financial Analyst specializing in Investment Analysis. Analyzes financial data and provides investment guidance.'

In [37]:
mentor_collection.add(
    ids=[str(mentor['mentor_id']) for mentor in mentors],
    documents=[create_mentor_sentence(mentor) for mentor in mentors],
    metadatas=[
        {
            "name": mentor['name'],
            "department": mentor['department'],
            "position": mentor['position'],
            "short_desc": mentor['short_desc'],
            "email": mentor['email'],
        } for mentor in mentors        
    ]
)

Insert of existing embedding ID: 1
Insert of existing embedding ID: 2
Insert of existing embedding ID: 3
Insert of existing embedding ID: 4
Insert of existing embedding ID: 5
Insert of existing embedding ID: 6
Insert of existing embedding ID: 7
Insert of existing embedding ID: 8
Insert of existing embedding ID: 9
Insert of existing embedding ID: 10
Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Add of existing embedding ID: 4
Add of existing embedding ID: 5
Add of existing embedding ID: 6
Add of existing embedding ID: 7
Add of existing embedding ID: 8
Add of existing embedding ID: 9
Add of existing embedding ID: 10


In [38]:
result = mentor_collection.query(
            query_texts=create_mentor_sentence(mentors[0]),
            n_results=1,
        )
result

{'ids': [['1']],
 'distances': [[0.0]],
 'metadatas': [[{'department': 'Finance',
    'email': 'john.doe@bank.com',
    'name': 'John Doe',
    'position': 'Financial Analyst',
    'short_desc': 'Analyzes financial data and provides investment guidance.'}]],
 'embeddings': None,
 'documents': [['A mentor from Finance department, working as a Financial Analyst specializing in Investment Analysis. Analyzes financial data and provides investment guidance.']],
 'uris': None,
 'data': None}

In [39]:
result['ids'][0][0]

'1'

## Buddy

In [None]:
import pandas as pd

buddies = supabase_client.table("buddy").select("*").execute().model_dump()['data']
df_buddy = pd.DataFrame(buddies)
df_buddy

In [41]:
# Indexing to ChromaDB

try:
    buddy_collection = chroma_client.get_collection("buddy")
except:
    buddy_collection = chroma_client.create_collection("buddy")

In [42]:
def create_buddy_sentence(buddy):
    return f"A buddy from {buddy['role']}, with interest in {buddy['professional_interest']} " \
           f"likes {', '.join(eval(buddy['hobby']))}. proficient in { ', '.join(eval(buddy['language']))} " \
           f"with a goal in {buddy['buddy_program_goal']}"

In [43]:
create_buddy_sentence(buddies[0])

'A buddy from Management, with interest in Leadership skills likes Travel, Cooking, Sports. proficient in English with a goal in Networking'

In [None]:
buddy_collection.add(
    ids=[str(buddy['buddy_id']) for buddy in buddies],
    documents=[create_buddy_sentence(buddy) for buddy in buddies],
    metadatas=[
        {
            "name": buddy['name'],
            "short_desc": buddy['short_desc'],
            "email": buddy['email'],
            "linkedin": buddy['linkedin'],
            "instagram": buddy['instagram'],
            "years_of_experience": buddy['years_of_experience'],
            "role": buddy['role'],
            "professional_interest": buddy['professional_interest'],
            "interaction_frequency": buddy['interaction_frequency'],
            "hobby": buddy['hobby'],
            "language": buddy['language'],
            "meeting_preference": buddy['meeting_preference'],
            "buddy_program_goal": buddy['buddy_program_goal'] 
        } for buddy in buddies        
    ]
)

In [49]:
result = buddy_collection.get('21')
result

{'ids': ['21'],
 'embeddings': None,
 'metadatas': [{'buddy_program_goal': 'Cultural Integration',
   'email': 'jonathan@ubs.com',
   'hobby': "['Sports & Working Out', 'Arts and Crafts']",
   'instagram': 'jonathan_park',
   'interaction_frequency': 'Weekly',
   'language': "['Chinese', 'Cantonese']",
   'linkedin': 'jonathan-park',
   'meeting_preference': 'Combination of both',
   'name': 'Jonathan',
   'professional_interest': 'Leadership Skills',
   'role': 'Operations',
   'short_desc': 'Manages frontend development and compliance',
   'years_of_experience': 2}],
 'documents': ['A buddy from Operations, with interest in Leadership Skills likes Sports & Working Out, Arts and Crafts. proficient in Chinese, Cantonese with a goal in Cultural Integration'],
 'uris': None,
 'data': None}

In [46]:
result['documents'][0]

'A buddy from Operations, with interest in Leadership Skills likes Sports & Working Out, Arts and Crafts. proficient in Chinese, Cantonese with a goal in Cultural Integration'

In [47]:
query_sentence = "A buddy from Operations, with interest in Leadership Skills likes Sports & Working Out, Arts and Crafts. proficient in Chinese, Cantonese with a goal in Cultural Integration"

result = buddy_collection.query(
        query_texts=query_sentence,
        n_results=1,
        where={
            "$and" : [
                {"years_of_experience" : {"$gte" : 0}},      
                {"years_of_experience" : {"$lte" : 4}},

                {"interaction_frequency" : "Weekly"},

                {"meeting_preference" :  "Combination of both"},

                {"email" :  {"$ne" : "jonathan@ubs.com"}},
            ]
        }
    )
print(result)

{'ids': [['13']], 'distances': [[0.3957486788025282]], 'metadatas': [[{'buddy_program_goal': 'Providing mentorship', 'email': 'nora.quinn@ubs.com', 'hobby': '["Sports", "Technology/Gadgets"]', 'instagram': 'noraquinn_bizdev', 'interaction_frequency': 'Weekly', 'language': '["English"]', 'linkedin': 'nora-quinn-bizdev', 'meeting_preference': 'Combination of both', 'name': 'Nora Quinn', 'professional_interest': 'Leadership skills', 'role': 'Human Resources', 'short_desc': 'Explores new business opportunities.', 'years_of_experience': 1}]], 'embeddings': None, 'documents': [['A buddy from Human Resources, with interest in Leadership skills likes Sports, Technology/Gadgets. proficient in English with a goal in Providing mentorship']], 'uris': None, 'data': None}
