Merge pull request #214 from polywrap/dev
merge dev
dOrgJelli committed Feb 21, 2024
2 parents b73a493 + 2436e84 commit e02c67f
Showing 11 changed files with 1,797 additions and 1,906 deletions.

.env.template (2 additions, 1 deletion)

@@ -8,4 +8,5 @@ NEXT_PUBLIC_SUPABASE_ANON_KEY=
 SUPABASE_SERVICE_ROLE_KEY=
 SUPABASE_JWT_SECRET=
 OPENAI_API_KEY=
-PINECONE_API_KEY=
+PINECONE_INDEX_NAME=
+PINECONE_API_KEY=

.github/workflows/cd.prod.yaml (2 additions, 1 deletion)

@@ -61,7 +61,8 @@ jobs:
       DEV_NEXT_PUBLIC_SUPABASE_ANON_KEY: ${{ secrets.PROD_NEXT_PUBLIC_SUPABASE_ANON_KEY }}
       DEV_SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.PROD_SUPABASE_SERVICE_ROLE_KEY }}
       DEV_OPENAI_API_KEY: ${{ secrets.DEV_OPENAI_API_KEY}}
-      DEV_PINECONE_API_KEY: ${{ secrets.DEV_PINECONE_API_KEY }}
+      DEV_PINECONE_API_KEY: ${{ secrets.PROD_PINECONE_API_KEY }}
+      DEV_PINECONE_INDEX_NAME: ${{ secrets.PROD_PINECONE_INDEX_NAME }}
 
   Deploy-DB:
     runs-on: ubuntu-latest

package.json (1 addition)

@@ -23,6 +23,7 @@
     "workers:env": "if [ \"$CICD\" != \"true\" ]; then cp .env ./workers/.env; fi",
     "workers:test": "cd workers && poetry run pytest -v -s",
     "workers:types": "cd ops && poetry run generate-types",
+    "workers:create_embeddings": "cd workers && poetry run create-embeddings",
     "events:dev": "npx inngest-cli dev",
     "ops:install": "cd ops && poetry install",
     "ops:codegen": "cd ops && poetry run generate-types"
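
The new workers:create_embeddings target shells out to Poetry. The create-embeddings console command presumably maps to the run() entry point of the script added later in this diff (workers/fund_public_goods/scripts/create_embeddings.py) through a Poetry script entry in workers/pyproject.toml, which this diff does not show. The job can also be invoked directly; a minimal sketch, assuming the required env vars are exported:

    # Hypothetical direct invocation; the module path comes from the file
    # added later in this diff. load_env() runs at import time, so all
    # Supabase and Pinecone variables must already be set.
    from fund_public_goods.scripts.create_embeddings import run

    run()  # re-chunks every project description and refreshes the Pinecone index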

workers/fund_public_goods/db/app_db.py (13 additions, 1 deletion)

@@ -11,28 +11,40 @@
 URL_ENV = "NEXT_PUBLIC_SUPABASE_URL"
 ANON_KEY_ENV = "NEXT_PUBLIC_SUPABASE_ANON_KEY"
 SERV_KEY_ENV = "SUPABASE_SERVICE_ROLE_KEY"
+PINECONE_API_KEY_ENV = "PINECONE_API_KEY"
+PINECONE_INDEX_NAME_ENV = "PINECONE_INDEX_NAME"
 
 class Env(BaseModel):
     url: str
     anon_key: str
     serv_key: str
+    pinecone_key: str
+    pinecone_index: str
 
 def load_env() -> Env:
     url: str | None = os.environ.get(URL_ENV)
     anon_key: str | None = os.environ.get(ANON_KEY_ENV)
     serv_key: str | None = os.environ.get(SERV_KEY_ENV)
+    pinecone_key: str | None = os.environ.get(PINECONE_API_KEY_ENV)
+    pinecone_index: str | None = os.environ.get(PINECONE_INDEX_NAME_ENV)
 
     if url is None:
         raise Exception(f"{URL_ENV} is not set")
     if anon_key is None:
         raise Exception(f"{ANON_KEY_ENV} is not set")
     if serv_key is None:
         raise Exception(f"{SERV_KEY_ENV} is not set")
+    if pinecone_key is None:
+        raise Exception(f"{PINECONE_API_KEY_ENV} is not set")
+    if pinecone_index is None:
+        raise Exception(f"{PINECONE_INDEX_NAME_ENV} is not set")
 
     return Env(
         url=url,
         anon_key=anon_key,
-        serv_key=serv_key
+        serv_key=serv_key,
+        pinecone_key=pinecone_key,
+        pinecone_index=pinecone_index
     )
 
 def create(options: ClientOptions = ClientOptions(postgrest_client_timeout=15)) -> Client:
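
With the added checks, load_env() now fails fast whenever either Pinecone variable is missing, instead of surfacing a confusing error deeper in the vector-store code. A minimal sketch of that behavior (hypothetical caller; assumes the Supabase variables are set):

    import os
    from fund_public_goods.db.app_db import load_env

    os.environ.pop("PINECONE_INDEX_NAME", None)  # simulate a missing variable

    try:
        env = load_env()
    except Exception as e:
        print(e)  # "PINECONE_INDEX_NAME is not set"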

workers/fund_public_goods/lib/strategy/utils/fetch_matching_projects.py

@@ -1,18 +1,13 @@
 from fund_public_goods.db.entities import Projects
-from fund_public_goods.db.tables.projects import get_all_projects_lightweight, get_projects_by_ids
+from fund_public_goods.db.tables.projects import get_projects_by_ids
 from fund_public_goods.lib.strategy.models.answer import Answer
 from fund_public_goods.lib.strategy.utils.get_top_matching_projects import get_top_matching_projects
-from fund_public_goods.lib.strategy.utils.utils import get_latest_project_per_website
 
 
 def fetch_matching_projects(prompt: str) -> list[tuple[Projects, list[Answer]]]:
-    projects_to_rank = get_all_projects_lightweight()
-
-    deduplicated_projects = get_latest_project_per_website(projects_to_rank)
-    matching_projects = get_top_matching_projects(prompt, deduplicated_projects)[:10]
-
+    matching_projects = get_top_matching_projects(prompt)[:10]
     matched_ids = [p.id for p in matching_projects]
 
     matching_projects_with_answers = get_projects_by_ids(matched_ids)
 
     return matching_projects_with_answers
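
Net effect: the per-request index build and website-level deduplication are gone from the request path, and the handler simply queries the prebuilt Pinecone index. A usage sketch (hypothetical prompt; the module path is an assumption inferred from the sibling imports above):

    from fund_public_goods.lib.strategy.utils.fetch_matching_projects import fetch_matching_projects

    results = fetch_matching_projects("open-source climate data tooling")
    for project, answers in results:  # at most 10 (project, answers) pairs
        print(project.id, project.title, len(answers))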

workers/fund_public_goods/lib/strategy/utils/get_top_matching_projects.py

@@ -1,11 +1,11 @@
 import json
-from langchain.text_splitter import CharacterTextSplitter
+from fund_public_goods.db.app_db import load_env
+from fund_public_goods.db.tables.projects import get_projects_by_ids
+from fund_public_goods.lib.strategy.models.answer import Answer
 
-from chromadb import EphemeralClient
+from langchain.text_splitter import CharacterTextSplitter
 from fund_public_goods.db.entities import Projects
 from langchain_openai import OpenAIEmbeddings
-from langchain.vectorstores.chroma import Chroma
+from langchain_pinecone import Pinecone
 import openai

@@ -26,13 +26,6 @@
 }}
 '''
-Example response:
-'''
-{{
-    "project_ids": [32, 25, 4, 8]
-}}
-'''
 In your assessment, it is essential to discern the genuine alignment of each project with the user's specific requirements.
 Be mindful that the project descriptions may include buzzwords or jargon intended to exaggerate their relevance.
 Your judgment should penetrate beyond superficial claims to identify projects that truly resonate with the user's prompt.

@@ -79,79 +72,35 @@ def rerank_top_projects(prompt: str, projects: list[Projects]) -> list[Projects]
 
     return reranked_projects
 
-def get_top_n_unique_ids(data: dict[str, list[str]], n: int) -> list[str]:
-    unique_ids = set()
-    result_ids: list[str] = []
-    query_order = list(data.keys())
-    max_length = max(len(ids) for ids in data.values())
-
-    for i in range(max_length):
-        for query in query_order:
-            if len(result_ids) >= n:
-                break
-            ids = data[query]
-            if i < len(ids) and ids[i] not in unique_ids:
-                unique_ids.add(ids[i])
-                result_ids.append(ids[i])
-
-        if len(result_ids) >= n:
-            break
-
-    return result_ids
+def remove_duplicates_and_preserve_order(lst: list[str]) -> list[str]:
+    seen = set()
+    result = []
+    for item in lst:
+        if item not in seen:
+            seen.add(item)
+            result.append(item)
+    return result
 
 
-def create_embeddings_collection(projects: list[Projects]):
-    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
-        chunk_size=200,
-        chunk_overlap=10,
-        separator=" ",
-        keep_separator=True
-    )
-
-    texts: list[str] = []
-    metadatas: list[dict] = []
-
-    for project in projects:
-        description_chunks = text_splitter.split_text(project.description)
-
-        for description_chunk in description_chunks:
-            texts.append(description_chunk)
-            metadatas.append({ "id": project.id, "title": project.title })
-
-    db_client = EphemeralClient()
-    collection = Chroma.from_texts(
-        texts=texts,
-        metadatas=metadatas,
+def get_top_matching_projects(prompt: str) -> list[Projects]:
+    env = load_env()
+    vectorstore = Pinecone(
+        index_name=env.pinecone_index,
         embedding=OpenAIEmbeddings(),
-        client=db_client,
-        collection_name="projects"
+        pinecone_api_key=env.pinecone_key
     )
 
-    return collection
-
-
-def get_top_matching_projects(prompt: str, projects: list[Projects]) -> list[Projects]:
-    projects_by_id = {project.id: project for project in projects}
-    all_projects_collection = create_embeddings_collection(projects)
-
-    queries = [prompt]
+    target_unique_ids = 35
+    total_unique_ids: list[str] = []
 
-    query_to_matched_project_ids: dict[str, list[str]] = {}
-
-    for query in queries:
-        matches = all_projects_collection.similarity_search(query, k=2000)
-        query_to_matched_project_ids[query] = [match.metadata["id"] for match in matches]
-
-    unique_ids = get_top_n_unique_ids(query_to_matched_project_ids, 30)
+    while (len(total_unique_ids) < target_unique_ids):
+        matches = vectorstore.similarity_search(query=prompt, k=300, filter={"id": { "$nin": total_unique_ids }})
+        query_to_matched_project_ids = [match.metadata["id"] for match in matches]
 
+        total_unique_ids += remove_duplicates_and_preserve_order(query_to_matched_project_ids)
 
-    matched_projects: list[Projects] = []
-
-    # TODO: this is a patch for an error seen in prod, should look at why
-    # some of these IDs don't exist...
-    for id in unique_ids:
-        if projects_by_id.get(id):
-            matched_projects.append(projects_by_id[id])
+    matched_projects: list[tuple[Projects, list[Answer]]] = get_projects_by_ids(total_unique_ids[:target_unique_ids])
 
-    reranked_projects = rerank_top_projects(prompt=prompt, projects=matched_projects)
+    reranked_projects = rerank_top_projects(prompt=prompt, projects=[p for (p, _) in matched_projects])
 
     return reranked_projects
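
Two details carry the weight in the rewritten retrieval loop: each pass asks Pinecone for the 300 nearest chunks whose metadata id is not already collected (the "$nin" filter), and remove_duplicates_and_preserve_order collapses the many chunks of a single project into one id while keeping similarity order, so the loop converges on 35 unique project ids. A quick illustrative check of the helper (hypothetical ids; assumes the function above is in scope):

    ids = ["a", "b", "a", "c", "b", "a"]
    assert remove_duplicates_and_preserve_order(ids) == ["a", "b", "c"]
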
File renamed without changes.

workers/fund_public_goods/scripts/create_embeddings.py (new file, 46 additions)

@@ -0,0 +1,46 @@
+from fund_public_goods.db.app_db import load_env
+from fund_public_goods.lib.strategy.utils.utils import get_latest_project_per_website
+from langchain.text_splitter import CharacterTextSplitter
+from fund_public_goods.db.tables.projects import get_all_projects_lightweight
+from langchain_openai import OpenAIEmbeddings
+from langchain_pinecone import Pinecone
+from langchain.schema import Document
+
+
+env = load_env()
+
+def run():
+    projects = get_all_projects_lightweight()
+    deduplicated_projects = get_latest_project_per_website(projects)
+
+    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
+        chunk_size=200,
+        chunk_overlap=10,
+        separator=" ",
+        keep_separator=True
+    )
+
+    documents: list[Document] = []
+
+    for project in deduplicated_projects:
+        description_chunks = text_splitter.split_text(project.description)
+
+        for description_chunk in description_chunks:
+            doc = Document(page_content=description_chunk, metadata={
+                "id": project.id,
+            })
+
+            documents.append(doc)
+
+    vectorstore = Pinecone(
+        index_name=env.pinecone_index,
+        embedding=OpenAIEmbeddings(),
+        pinecone_api_key=env.pinecone_key
+    )
+
+    try:
+        vectorstore.delete(delete_all=True)
+    except:
+        print("Pinecone index empty. Skipping deletion.")
+
+    vectorstore.add_documents(documents)
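
For context, chunk_size and chunk_overlap above are measured in tiktoken tokens, not characters, since the splitter is built with from_tiktoken_encoder. A small sketch of what it does to one long description (hypothetical input; same parameters as the script):

    from langchain.text_splitter import CharacterTextSplitter

    splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=200,       # max tokens per chunk
        chunk_overlap=10,     # tokens shared between adjacent chunks
        separator=" ",
        keep_separator=True
    )

    chunks = splitter.split_text("A long public-goods project description " * 150)
    print(len(chunks))  # several chunks of at most ~200 tokens each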