Skip to content

Commit

Permalink
Merge pull request #209 from polywrap/namesty/re-categorization
Browse files Browse the repository at this point in the history
Remove Category Filtering
  • Loading branch information
dOrgJelli committed Feb 21, 2024
2 parents 498c0d9 + 7da5846 commit 6ae520d
Show file tree
Hide file tree
Showing 10 changed files with 106 additions and 263 deletions.
2 changes: 0 additions & 2 deletions workers/fund_public_goods/db/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,6 @@ class Projects(BaseModel):
website: str
logo: Optional[str] = None
twitter: Optional[str] = None
keywords: list[str] = []
categories: list[str] = []
impact_funding_report: Optional[str] = Field(..., alias="impactFundingReport")
impact: Optional[float] = None
funding_needed: Optional[float] = Field(..., alias="fundingNeeded")
Expand Down
83 changes: 54 additions & 29 deletions workers/fund_public_goods/db/tables/projects.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@ def upsert(
"website": row.website,
"twitter": row.twitter,
"short_description": row.short_description,
"keywords": row.keywords,
"categories": row.categories,
"logo": row.logo,
"funding_needed": row.funding_needed,
"impact_funding_report": row.impact_funding_report,
Expand All @@ -37,8 +35,6 @@ def upsert_multiple(
"website": row.website,
"twitter": row.twitter,
"short_description": row.short_description,
"keywords": row.keywords,
"categories": row.categories,
"logo": row.logo,
"funding_needed": row.funding_needed,
"impact_funding_report": row.impact_funding_report,
Expand Down Expand Up @@ -69,8 +65,6 @@ def sanitize_projects_information(projects: list[dict[str, Any]]) -> list[tuple[
website=project_data.get("website", ""),
twitter=project_data.get("twitter", ""),
logo=project_data.get("logo", ""),
keywords=project_data.get("keywords", []),
categories=project_data.get("categories", []),
short_description=project_data.get("short_description", None),
funding_needed=project_data.get("funding_needed", None),
impact=project_data.get("impact", None),
Expand All @@ -82,35 +76,66 @@ def sanitize_projects_information(projects: list[dict[str, Any]]) -> list[tuple[
return projects_with_answers


def get_unique_categories() -> list[str]:
def get_projects_lightweight(range_from: int, range_to: int) -> PostgrestAPIResponse[dict[str, Any]]:
db = create_admin()
response: PostgrestAPIResponse[list[dict[str, str]]] = (
db.table("unique_categories_views").select("*").execute()
return (
db.table("projects")
.select(
"id, title, website, updated_at, description"
)
.range(range_from, range_to)
.execute()
)
if not response.data:
return []

categories = []

for row in response.data:
categories.append(row["category"]) # type: ignore

return categories

def fetch_projects_by_category(categories: list[str]) -> list[tuple[Projects, list[Answer]]]:
results = get_projects_from_description(categories).data
sanitized_projects = sanitize_projects_information(results)
return sanitized_projects

def get_projects_from_description(categories: list[str]):

def get_projects_by_ids(ids: list[str]) -> list[tuple[Projects, list[Answer]]]:
db = create_admin()
request = (
results = (
db.table("projects")
.select(
"* applications(id, recipient, round, answers)"
"*, applications(id, recipient, round, answers)"
)
.ov("categories", categories)
.in_('id', ids)
.execute()
)

return sanitize_projects_information(results.data)


def get_all_projects_lightweight() -> list[Projects]:
    """Fetch every project row page by page and wrap each one in a ``Projects`` model.

    Pages are requested through ``get_projects_lightweight`` until a page comes
    back with fewer rows than requested, which marks the end of the table.

    Columns missing from a row (or returned as ``None``) fall back to the
    defaults given below: empty strings for the text identifiers and ``None``
    for the optional fields.

    Returns:
        One ``Projects`` instance per row returned by the paginated query.
    """
    # Row ranges are inclusive on both ends, so a window of `page_size` rows
    # starting at `current_from` ends at `current_from + page_size - 1`.
    # Using `current_from + page_size` as the upper bound would make
    # consecutive windows overlap by one row and yield duplicate projects.
    page_size = 1000
    all_results: list[dict[str, Any]] = []
    current_from = 0
    while True:
        current_to = current_from + page_size - 1
        # Treat a missing payload as an empty page so the loop terminates.
        results = get_projects_lightweight(current_from, current_to).data or []
        all_results.extend(results)

        # A short page means there is nothing left to fetch.
        if len(results) < page_size:
            break

        current_from += page_size

    projects: list[Projects] = []

    for item in all_results:
        # Drop None values so the defaults below apply instead of an explicit None.
        project_data = {k: v for k, v in item.items() if v is not None}

        project = Projects(
            id=project_data.get("id", ""),
            updated_at=project_data.get("updated_at", ""),
            title=project_data.get("title", ""),
            description=project_data.get("description", ""),
            website=project_data.get("website", ""),
            twitter=project_data.get("twitter", ""),
            logo=project_data.get("logo", ""),
            short_description=project_data.get("short_description"),
            funding_needed=project_data.get("funding_needed"),
            impact=project_data.get("impact"),
            impact_funding_report=project_data.get("impact_funding_report"),
        )

        projects.append(project)

    return projects

return request
47 changes: 0 additions & 47 deletions workers/fund_public_goods/lib/strategy/utils/categorize_project.py

This file was deleted.

53 changes: 0 additions & 53 deletions workers/fund_public_goods/lib/strategy/utils/categorize_prompt.py

This file was deleted.

102 changes: 0 additions & 102 deletions workers/fund_public_goods/lib/strategy/utils/constants.py

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,21 +1,18 @@
from fund_public_goods.db.entities import Projects
from fund_public_goods.db.tables.projects import fetch_projects_by_category, get_unique_categories
from fund_public_goods.db.tables.projects import get_all_projects_lightweight, get_projects_by_ids
from fund_public_goods.lib.strategy.models.answer import Answer
from fund_public_goods.lib.strategy.utils.categorize_prompt import categorize_prompt
from fund_public_goods.lib.strategy.utils.get_top_matching_projects import get_top_matching_projects
from fund_public_goods.lib.strategy.utils.utils import get_latest_project_per_website


def fetch_matching_projects(prompt: str) -> list[tuple[Projects, list[Answer]]]:
prompt_categories = categorize_prompt(prompt, get_unique_categories())
fetched_projects = fetch_projects_by_category(prompt_categories)

answers_by_id = { project.id: answers for (project, answers) in fetched_projects }
projects = [project for (project, _) in fetched_projects]
projects_to_rank = get_all_projects_lightweight()

deduplicated_projects = get_latest_project_per_website(projects)
deduplicated_projects = get_latest_project_per_website(projects_to_rank)
matching_projects = get_top_matching_projects(prompt, deduplicated_projects)[:10]

matching_projects_with_answers = [(project, answers_by_id[project.id]) for project in matching_projects]
matched_ids = [p.id for p in matching_projects]

matching_projects_with_answers = get_projects_by_ids(matched_ids)

return matching_projects_with_answers
33 changes: 33 additions & 0 deletions workers/fund_public_goods/lib/strategy/utils/generate_queries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import CommaSeparatedListOutputParser


# System prompt used by generate_queries. The {n} and {prompt} placeholders are
# filled from the dict passed to the chain's invoke() call.
queries_prompt_template = """
Your goal is to provide a list of queries that will be used to perform
an embeddings search over different project descriptions and get the ones
that best match the user's interests. All projects are public goods funding
projects in the crypto ecosystem.
Provide a maximum of {n} queries.
This is the user's interest: {prompt}
Respond strictly with a comma-separated list of queries, without quotes
"""


def generate_queries(prompt: str, n) -> list[str]:
    """Ask the chat model for search queries derived from the user's interest.

    Args:
        prompt: The user's stated interest, substituted into the system prompt.
        n: Value substituted for the maximum number of queries in the prompt.

    Returns:
        The model's reply as parsed by ``CommaSeparatedListOutputParser``.
    """
    system_messages = [("system", queries_prompt_template)]
    chat_template = ChatPromptTemplate.from_messages(system_messages)
    model = ChatOpenAI(model="gpt-4-1106-preview")  # type: ignore
    parser = CommaSeparatedListOutputParser()

    # Pipeline: fill the template, call the model, parse the reply.
    chain = chat_template | model | parser

    template_values = {"prompt": prompt, "n": n}
    return chain.invoke(template_values)
Loading

0 comments on commit 6ae520d

Please sign in to comment.