In [3]:
import firebase_admin
from firebase_admin import credentials, firestore

In [None]:
# Initialize Firebase Admin SDK
# Re-running this cell in the same kernel must be safe: initialize_app() raises
# ValueError when the default app already exists, so reuse the app if there is one.
try:
    firebase_admin.get_app()
except ValueError:
    # No default app yet — create it from the service-account key file.
    cred = credentials.Certificate("goblob-95e2a-6add9b68fd5d.json")  # Replace with your JSON key
    firebase_admin.initialize_app(cred)

# Get Firestore client
db = firestore.client()

# Option 3 - Flattened Collection

In [4]:
# Function to add a category (with description)
def add_category(category_id, category_name, description):
    """Store a category document and print a confirmation line.

    Args:
        category_id: Document ID used inside the ``categories`` collection.
        category_name: Value written to the document's ``name`` field.
        description: Value written to the document's ``description`` field.
    """
    fields = {"name": category_name, "description": description}
    category_doc = db.collection("categories").document(category_id)
    category_doc.set(fields)
    print(f"✅ Category '{category_name}' added!")

# Function to add a subcategory (with description)
def add_subcategory(subcategory_id, subcategory_name, category_id, description):
    """Store a subcategory document in the flat ``subcategories`` collection.

    The parent category is recorded as a plain ``category_id`` field on the
    document. A confirmation line is printed afterwards.

    Args:
        subcategory_id: Document ID used inside ``subcategories``.
        subcategory_name: Value written to the ``name`` field.
        category_id: ID of the parent category, written to ``category_id``.
        description: Value written to the ``description`` field.
    """
    fields = {
        "name": subcategory_name,
        "category_id": category_id,
        "description": description,
    }
    subcategory_doc = db.collection("subcategories").document(subcategory_id)
    subcategory_doc.set(fields)
    print(f"✅ Subcategory '{subcategory_name}' added under category '{category_id}'!")

# Function to get all subcategories of a given category
def get_subcategories_by_category(category_id):
    """Return the subcategory documents whose ``category_id`` field equals ``category_id``.

    Returns:
        A list of dicts, one per matching document: the document's fields plus
        the document ID under the ``"id"`` key (the ID wins if a field is also
        called ``"id"``).
    """
    query = db.collection("subcategories").where("category_id", "==", category_id)
    matches = []
    for snapshot in query.get():
        record = {**snapshot.to_dict(), "id": snapshot.id}  # Include ID for reference
        matches.append(record)
    return matches

# ---- DEMO ----

# 1️⃣ Add Categories (with descriptions)
for category_args in (
    ("electronics", "Electronics", "Devices and gadgets like phones, laptops, and more."),
    ("clothing", "Clothing", "Apparel and fashion for men, women, and kids."),
):
    add_category(*category_args)

# 2️⃣ Add Subcategories (with descriptions)
# Each row is (subcategory_id, subcategory_name, category_id, description).
for subcategory_args in (
    ("phones", "Phones", "electronics", "Smartphones and mobile devices."),
    ("laptops", "Laptops", "electronics", "Portable computers for work and gaming."),
    ("men", "Men", "clothing", "Men’s fashion including shirts, pants, and accessories."),
    ("women", "Women", "clothing", "Women’s fashion including dresses, shoes, and accessories."),
):
    add_subcategory(*subcategory_args)

# 3️⃣ Query Subcategories by Category
electronics_subcategories = get_subcategories_by_category("electronics")
clothing_subcategories = get_subcategories_by_category("clothing")

# Print results
for heading, subs in (
    ("\n📌 Electronics Subcategories:", electronics_subcategories),
    ("\n📌 Clothing Subcategories:", clothing_subcategories),
):
    print(heading)
    for sub in subs:
        print(f"  - {sub['name']}: {sub['description']}")


✅ Category 'Electronics' added!
✅ Category 'Clothing' added!
✅ Subcategory 'Phones' added under category 'electronics'!
✅ Subcategory 'Laptops' added under category 'electronics'!
✅ Subcategory 'Men' added under category 'clothing'!
✅ Subcategory 'Women' added under category 'clothing'!

📌 Electronics Subcategories:
  - Laptops: Portable computers for work and gaming.
  - Phones: Smartphones and mobile devices.

📌 Clothing Subcategories:
  - Men: Men’s fashion including shirts, pants, and accessories.
  - Women: Women’s fashion including dresses, shoes, and accessories.


# Option 2 (Nested Subcollection Approach)

In [5]:
# Function to add a category (with description)
def add_category(category_id, category_name, description):
    """Write one document to the ``categories`` collection, then print a confirmation.

    The document ID is ``category_id``; its fields are ``name`` (from
    ``category_name``) and ``description``.
    """
    categories = db.collection("categories")
    target = categories.document(category_id)
    target.set({"name": category_name, "description": description})
    print(f"✅ Category '{category_name}' added!")

# Function to add a subcategory inside a category (nested subcollection)
def add_subcategory(category_id, subcategory_id, subcategory_name, description):
    """Store a subcategory in the ``subcategories`` subcollection of a category document.

    The document is written at
    ``categories/{category_id}/subcategories/{subcategory_id}`` with ``name``
    and ``description`` fields, and a confirmation line is printed.

    Note the argument order: the parent ``category_id`` comes first here.
    """
    parent_doc = db.collection("categories").document(category_id)
    child_doc = parent_doc.collection("subcategories").document(subcategory_id)
    child_doc.set({"name": subcategory_name, "description": description})
    print(f"✅ Subcategory '{subcategory_name}' added under category '{category_id}'!")

# Function to get all subcategories for a given category
def get_subcategories_by_category(category_id):
    """Return every document in ``categories/{category_id}/subcategories``.

    Returns:
        A list of dicts, one per document: the document's fields plus the
        document ID under the ``"id"`` key.
    """
    category_doc = db.collection("categories").document(category_id)
    snapshots = category_doc.collection("subcategories").get()
    found = []
    for snapshot in snapshots:
        found.append({**snapshot.to_dict(), "id": snapshot.id})  # Include ID for reference
    return found

# ---- DEMO ----

# 1️⃣ Add Categories
demo_categories = [
    ("electronics", "Electronics", "Devices and gadgets like phones, laptops, and more."),
    ("clothing", "Clothing", "Apparel and fashion for men, women, and kids."),
]
for cat_id, cat_name, cat_description in demo_categories:
    add_category(cat_id, cat_name, cat_description)

# 2️⃣ Add Subcategories (inside their respective categories)
# Keyed by parent category ID; each entry is (subcategory_id, name, description).
demo_subcategories = {
    "electronics": [
        ("phones", "Phones", "Smartphones and mobile devices."),
        ("laptops", "Laptops", "Portable computers for work and gaming."),
    ],
    "clothing": [
        ("men", "Men", "Men’s fashion including shirts, pants, and accessories."),
        ("women", "Women", "Women’s fashion including dresses, shoes, and accessories."),
    ],
}
for parent_id, children in demo_subcategories.items():
    for sub_id, sub_name, sub_description in children:
        add_subcategory(parent_id, sub_id, sub_name, sub_description)

# 3️⃣ Query Subcategories by Category
electronics_subcategories = get_subcategories_by_category("electronics")
clothing_subcategories = get_subcategories_by_category("clothing")

# Print results
for heading, subs in [
    ("\n📌 Electronics Subcategories:", electronics_subcategories),
    ("\n📌 Clothing Subcategories:", clothing_subcategories),
]:
    print(heading)
    for sub in subs:
        print(f"  - {sub['name']}: {sub['description']}")


✅ Category 'Electronics' added!
✅ Category 'Clothing' added!
✅ Subcategory 'Phones' added under category 'electronics'!
✅ Subcategory 'Laptops' added under category 'electronics'!
✅ Subcategory 'Men' added under category 'clothing'!
✅ Subcategory 'Women' added under category 'clothing'!

📌 Electronics Subcategories:
  - Laptops: Portable computers for work and gaming.
  - Phones: Smartphones and mobile devices.

📌 Clothing Subcategories:
  - Men: Men’s fashion including shirts, pants, and accessories.
  - Women: Women’s fashion including dresses, shoes, and accessories.


# Now with profiles

## Flat Collections (Option 1)

In [6]:
# Function to add a category
def add_category(category_id, category_name, description):
    """Save a category under ``categories/{category_id}`` and print a confirmation.

    Args:
        category_id: Document ID for the category.
        category_name: Stored in the ``name`` field.
        description: Stored in the ``description`` field.
    """
    document = db.collection("categories").document(category_id)
    payload = dict(name=category_name, description=description)
    document.set(payload)
    print(f"✅ Category '{category_name}' added!")

# Function to add a subcategory
def add_subcategory(subcategory_id, subcategory_name, category_id, description):
    """Save a subcategory under ``subcategories/{subcategory_id}`` and print a confirmation.

    The link to the parent category is kept as the ``category_id`` field on
    the stored document, next to ``name`` and ``description``.
    """
    document = db.collection("subcategories").document(subcategory_id)
    payload = dict(
        name=subcategory_name,
        category_id=category_id,
        description=description,
    )
    document.set(payload)
    print(f"✅ Subcategory '{subcategory_name}' added under category '{category_id}'!")

# Function to add a service provider
def add_service_provider(sp_id, name, categories, subcategories, location, rating):
    """Save a service provider under ``service_providers/{sp_id}`` and print a confirmation.

    Args:
        sp_id: Document ID for the provider.
        name: Stored in the ``name`` field.
        categories: List of category IDs, stored in ``categories``.
        subcategories: List of subcategory IDs, stored in ``subcategories``.
        location: Stored in the ``location`` field.
        rating: Stored in the ``rating`` field.
    """
    profile = {
        "name": name,
        "categories": categories,
        "subcategories": subcategories,
        "location": location,
        "rating": rating,
    }
    provider_doc = db.collection("service_providers").document(sp_id)
    provider_doc.set(profile)
    print(f"✅ Service Provider '{name}' added!")

# Function to find service providers by subcategory
def get_providers_by_subcategory(subcategory_id):
    """Return the service providers whose ``subcategories`` array contains ``subcategory_id``.

    Returns:
        A list of dicts, one per matching provider document: the document's
        fields plus the document ID under the ``"id"`` key.
    """
    providers = db.collection("service_providers")
    query = providers.where("subcategories", "array_contains", subcategory_id)
    matches = []
    for snapshot in query.get():
        matches.append({**snapshot.to_dict(), "id": snapshot.id})
    return matches

# ---- DEMO ----

# 1️⃣ Add Categories
for category_args in (
    ("electronics", "Electronics", "Devices and gadgets like phones, laptops, and more."),
    ("clothing", "Clothing", "Apparel and fashion for men, women, and kids."),
):
    add_category(*category_args)

# 2️⃣ Add Subcategories
# Each row is (subcategory_id, subcategory_name, category_id, description).
for subcategory_args in (
    ("phones", "Phones", "electronics", "Smartphones and mobile devices."),
    ("laptops", "Laptops", "electronics", "Portable computers for work and gaming."),
    ("men", "Men", "clothing", "Men’s fashion including shirts, pants, and accessories."),
):
    add_subcategory(*subcategory_args)

# 3️⃣ Add Service Providers
# Each row is (sp_id, name, categories, subcategories, location, rating).
for provider_args in (
    ("sp1", "John's Phone Repair", ["electronics"], ["phones"], "New York, USA", 4.7),
    ("sp2", "Elite Laptops Service", ["electronics"], ["laptops"], "San Francisco, USA", 4.9),
    ("sp3", "Men's Fashion Tailor", ["clothing"], ["men"], "Miami, USA", 4.8),
):
    add_service_provider(*provider_args)

# 4️⃣ Query Providers by Subcategory
phone_repair_providers = get_providers_by_subcategory("phones")
print("\n📌 Phone Repair Providers:")
for sp in phone_repair_providers:
    print(f"  - {sp['name']} ({sp['location']}): {sp['rating']}⭐")


✅ Category 'Electronics' added!
✅ Category 'Clothing' added!
✅ Subcategory 'Phones' added under category 'electronics'!
✅ Subcategory 'Laptops' added under category 'electronics'!
✅ Subcategory 'Men' added under category 'clothing'!
✅ Service Provider 'John's Phone Repair' added!
✅ Service Provider 'Elite Laptops Service' added!
✅ Service Provider 'Men's Fashion Tailor' added!

📌 Phone Repair Providers:
  - John's Phone Repair (New York, USA): 4.7⭐


In [7]:
# Fetch every document in the "tags-taxonomy" collection.
taxonomy_collection = db.collection("tags-taxonomy")
taxonomy = taxonomy_collection.get()

In [9]:
# Print each taxonomy document's fields as a plain dict, one document per line.
for doc in taxonomy:
    fields = doc.to_dict()
    print(fields)

{'text': 'dog_walker', 'text_es': 'paseador_de_mascotas', 'weight': 10, 'isService': True, 'dateCreated': DatetimeWithNanoseconds(2025, 3, 10, 4, 32, 21, 543000, tzinfo=datetime.timezone.utc), 'dateUpdated': DatetimeWithNanoseconds(2025, 3, 10, 4, 32, 21, 543000, tzinfo=datetime.timezone.utc), 'usedBy': 0, 'parentSlug': 'pets', 'parentId': '', 'isSystemCreated': True, 'slug': 'dog_walker'}
{'text': 'elder_care', 'text_es': 'cuidado_de_ancianos', 'weight': 10, 'isService': True, 'dateCreated': DatetimeWithNanoseconds(2025, 3, 10, 4, 32, 21, 543000, tzinfo=datetime.timezone.utc), 'dateUpdated': DatetimeWithNanoseconds(2025, 3, 10, 4, 32, 21, 543000, tzinfo=datetime.timezone.utc), 'usedBy': 0, 'parentSlug': 'health', 'parentId': '', 'isSystemCreated': True, 'slug': 'elder_care'}
{'text': 'air_conditioning_installations', 'text_es': 'instalaciones_de_aire_acondicionado', 'weight': 10, 'isService': True, 'dateCreated': DatetimeWithNanoseconds(2025, 3, 10, 4, 32, 21, 543000, tzinfo=datetime.

In [18]:
import asyncio

import nest_asyncio
# Explicit import instead of `from crawl4ai import *`, so it is clear where the name comes from.
from crawl4ai import AsyncWebCrawler

# Lets asyncio.run() be used from a notebook cell, where an event loop is already running.
nest_asyncio.apply()


async def main():
    """Crawl the Airtasker services index and print the page as markdown.

    If the crawl did not succeed, the crawler's error message is printed
    instead of the markdown.
    """
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.airtasker.com/au/services/",
        )
        if result.success:
            print(result.markdown)
        else:
            print("Error:", result.error_message)

if __name__ == "__main__":
    asyncio.run(main())

[INIT].... → Crawl4AI 0.4.246
[FETCH]... ↓ https://www.airtasker.com/au/services/... | Status: True | Time: 0.07s
[COMPLETE] ● https://www.airtasker.com/au/services/... | Status: True | Total: 0.08s
[](https://www.airtasker.com/au/services/</au/>)[+](https://www.airtasker.com/au/services/</post-task/?origin=header_post_task-seo>)
[](https://www.airtasker.com/au/services/</au/>)
[Post a task](https://www.airtasker.com/au/services/</post-task/?origin=header_post_task-seo>)
[Categories](https://www.airtasker.com/au/services/</au/services/>)
What are you looking for?
Pick a type of task.
As a tasker
I'm looking for work in ...
As a poster
I’m looking to hire someone for ...
  * [Accountants](https://www.airtasker.com/au/services/</au/services/accounting/>)
  * [Admin](https://www.airtasker.com/au/services/</au/services/admin/>)
  * [Alterations](https://www.airtasker.com/au/services/</au/services/alteration/>)
  * [Appliances](https://www.airtasker.com/au/services/</au/services/appliance-s

In [1]:
from pydantic import BaseModel, Field
from typing import List
import asyncio

import nest_asyncio

nest_asyncio.apply()

import json

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai.async_configs import LLMConfig

In [2]:
# Shape of one extracted record: a category name paired with one of its subcategories.
# Its JSON schema (Taxonomy.model_json_schema()) is what main() passes to the LLM
# extraction strategy.
# NOTE(review): documented with comments rather than a docstring on purpose — a class
# docstring would presumably be emitted into the generated schema; confirm before adding one.
class Taxonomy(BaseModel):
    category: str  # category name as extracted from the page
    subcategory: str  # subcategory name belonging to `category`

async def main():
    """Crawl the Airtasker services page and extract taxonomy entries with an LLM.

    Builds an ``LLMExtractionStrategy`` driven by the ``Taxonomy`` JSON schema,
    runs a single cache-bypassing crawl of the page, then prints the parsed
    extraction result and the strategy's token usage. If the crawl fails, the
    crawler's error message is printed instead.

    The OpenAI API key is read from the ``OPENAI_API_KEY`` environment variable
    so that no secret is stored in the notebook.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is unset or empty.
    """
    # Imported locally so this cell does not depend on another cell's `import os`.
    import os

    # SECURITY: a literal API key used to be embedded in this cell. Treat that key as
    # leaked and revoke it; supply its replacement through the environment only.
    api_token = os.environ.get("OPENAI_API_KEY")
    if not api_token:
        raise RuntimeError(
            "OPENAI_API_KEY is not set; export it (or set os.environ['OPENAI_API_KEY']) "
            "before running this cell."
        )

    # 1. Define the LLM extraction strategy
    llm_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=api_token),
        schema=Taxonomy.model_json_schema(),
        extraction_type="schema",
        instruction="Extract all categories and subcategories from the web page",
        chunk_token_threshold=1000,
        overlap_rate=0.0,
        apply_chunking=True,
        input_format="markdown",   # or "html", "fit_markdown"
        extra_args={"temperature": 0.0, "max_tokens": 800}
    )

    # 2. Build the crawler config
    crawl_config = CrawlerRunConfig(
        extraction_strategy=llm_strategy,
        cache_mode=CacheMode.BYPASS
    )

    # 3. Create a browser config if needed
    browser_cfg = BrowserConfig(headless=True)

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        # 4. Let's say we want to crawl a single page
        result = await crawler.arun(
            url="https://www.airtasker.com/au/services/",
            config=crawl_config
        )

        if result.success:
            # 5. The extracted content is presumably JSON
            data = json.loads(result.extracted_content)
            print("Extracted items:", data)

            # A successful crawl can still contain items flagged with "error": True
            # (the recorded run of this cell returned only such items), so call them out.
            items = data if isinstance(data, list) else [data]
            failed = [item for item in items if isinstance(item, dict) and item.get("error")]
            if failed:
                print(f"Warning: {len(failed)} of {len(items)} extracted items are flagged as errors.")

            # 6. Show usage stats
            llm_strategy.show_usage()  # prints token usage
        else:
            print("Error:", result.error_message)

if __name__ == "__main__":
    asyncio.run(main())

[INIT].... → Crawl4AI 0.5.0.post8
[FETCH]... ↓ https://www.airtasker.com/au/services/... | Status: True | Time: 1.26s
[SCRAPE].. ◆ https://www.airtasker.com/au/services/... | Time: 0.224s
[EXTRACT]. ■ Completed for https://www.airtasker.com/au/services/... | Time: 48.53760895799496s
[COMPLETE] ● https://www.airtasker.com/au/services/... | Status: True | Total: 50.04s
Extracted items: [{'index': 1, 'error': True, 'tags': ['error'], 'content': "'str' object has no attribute 'choices'"}, {'index': 0, 'error': True, 'tags': ['error'], 'content': "'str' object has no attribute 'choices'"}, {'index': 2, 'error': True, 'tags': ['error'], 'content': "'str' object has no attribute 'choices'"}, {'index': 3, 'error': True, 'tags': ['error'], 'content': "'str' object has no attribute 'choices'"}, {'index': 5, 'error': True, 'tags': ['error'], 'content': "'str' object has no attribute 'choices'"}, {'index': 6, 'error': True, 'tags': ['error'], 'content': "'str' object has no attribute 'choices'"}, 

In [3]:
%pip install --quiet -U langchain-scrapegraph

Note: you may need to restart the kernel to use updated packages.


In [3]:
import getpass
import os

# Prompt for the ScrapeGraph AI key only when the environment does not already
# hold a non-empty SGAI_API_KEY; the entered value is stored back into os.environ.
if os.environ.get("SGAI_API_KEY", "") == "":
    os.environ["SGAI_API_KEY"] = getpass.getpass("ScrapeGraph AI API key:\n")

In [4]:
from langchain_scrapegraph.tools import (
    GetCreditsTool,
    MarkdownifyTool,
    SmartScraperTool,
)

# Create one instance of each ScrapeGraph tool for the cells below.
# NOTE(review): presumably these pick up SGAI_API_KEY from the environment — confirm.
smartscraper, markdownify, credits = (
    SmartScraperTool(),
    MarkdownifyTool(),
    GetCreditsTool(),
)

In [18]:
# SmartScraper
# Ask the tool for the categories and subcategories listed on the services index page.
scrape_request = {
    "user_prompt": "Extract categories and subcategories from the website",
    "website_url": "https://www.airtasker.com/au/services/",
}
result = smartscraper.invoke(scrape_request)
print("SmartScraper Result:", result)

SmartScraper Result: {'categories': ['Accountants', 'Admin', 'Alterations', 'Appliances', 'Assembly', 'Auto Electricians', 'Bakers', 'Barbers', 'Beauticians', 'Bicycle Service', 'Bricklaying', 'Building & Construction', 'Business', 'Car Body Work', 'Car Detailing', 'Car Repair', 'Car Service', 'Carpentry', 'Cat Care', 'Catering', 'Chef', 'Cladding', 'Cleaning', 'Computers & IT', 'Concreting', 'Decking', 'Delivery', 'Design', 'Dog Care', 'Draftsman', 'Driving', 'Electricians', 'Entertainment', 'Events', 'Fencing', 'Fitness', 'Flooring', 'Florist', 'Furniture Assembly', 'Furniture Repair', 'Gardening', 'Gate Installation', 'Gift Delivery', 'Glaziers', 'Glass Services', 'Grocery Delivery', 'Hair Removal', 'Hairdressers', 'Handyman', 'Health & Wellness', 'Heating & Cooling', 'Home & Lifestyle', 'Home Automation and Security', 'Home Theatre', 'House Cleaning', 'Interior Designer', 'Kitchen Renovation', 'Kitchen Installation', 'Landscaping', 'Laundry', 'Lawn Care', 'Legal Services', 'Lessons

In [19]:
# Display the raw SmartScraper response stored in `result`.
result

{'categories': ['Accountants',
  'Admin',
  'Alterations',
  'Appliances',
  'Assembly',
  'Auto Electricians',
  'Bakers',
  'Barbers',
  'Beauticians',
  'Bicycle Service',
  'Bricklaying',
  'Building & Construction',
  'Business',
  'Car Body Work',
  'Car Detailing',
  'Car Repair',
  'Car Service',
  'Carpentry',
  'Cat Care',
  'Catering',
  'Chef',
  'Cladding',
  'Cleaning',
  'Computers & IT',
  'Concreting',
  'Decking',
  'Delivery',
  'Design',
  'Dog Care',
  'Draftsman',
  'Driving',
  'Electricians',
  'Entertainment',
  'Events',
  'Fencing',
  'Fitness',
  'Flooring',
  'Florist',
  'Furniture Assembly',
  'Furniture Repair',
  'Gardening',
  'Gate Installation',
  'Gift Delivery',
  'Glaziers',
  'Glass Services',
  'Grocery Delivery',
  'Hair Removal',
  'Hairdressers',
  'Handyman',
  'Health & Wellness',
  'Heating & Cooling',
  'Home & Lifestyle',
  'Home Automation and Security',
  'Home Theatre',
  'House Cleaning',
  'Interior Designer',
  'Kitchen Renovation'

In [20]:
# Maps each category name to whatever the scraper returns under 'subcategories'.
taxo = {}

# NOTE(review): every request targets the same services index page, so the returned
# subcategories are not guaranteed to belong to the category asked for (the recorded
# output lists tutoring services under 'Accountants') — verify before relying on `taxo`.
# `or []` keeps the loop from raising TypeError when the response has no 'categories'.
for category in result.get("categories") or []:
    print(category)
    # SmartScraper
    result1 = smartscraper.invoke(
        {
            "user_prompt": f"Extract all the subcategories of the category: {category} from the website ",
            "website_url": "https://www.airtasker.com/au/services/",
        }
    )

    # Stays None when the response carries no 'subcategories' key.
    taxo[category] = result1.get('subcategories')

Accountants
Admin
Alterations
Appliances
Assembly
Auto Electricians
Bakers
Barbers
Beauticians
Bicycle Service
Bricklaying
Building & Construction
Business
Car Body Work
Car Detailing
Car Repair
Car Service
Carpentry
Cat Care
Catering
Chef
Cladding
Cleaning
Computers & IT
Concreting
Decking
Delivery
Design
Dog Care
Draftsman
Driving
Electricians
Entertainment
Events
Fencing
Fitness
Flooring
Florist
Furniture Assembly
Furniture Repair
Gardening
Gate Installation
Gift Delivery
Glaziers
Glass Services
Grocery Delivery
Hair Removal
Hairdressers
Handyman
Health & Wellness
Heating & Cooling
Home & Lifestyle
Home Automation and Security
Home Theatre
House Cleaning
Interior Designer
Kitchen Renovation
Kitchen Installation
Landscaping
Laundry
Lawn Care
Legal Services
Lessons
Local Mobile Mechanic
Locksmith
Makeup Artist
Marketing
Martial Arts
Mechanic
Models
Motorcycle Mechanic
Moped Repair
Music Lessons
Painting
Paving
Pest Control
Pet Care
Photographers


KeyboardInterrupt: 

In [21]:
# Display the category -> subcategories mapping collected so far.
taxo

{'Accountants': ['Budgeting Help',
  'Financial Advisor',
  'Financial Modelling',
  'Financial Planning',
  'Financial Reporting',
  'MYOB Training',
  'Mortgage Advisor',
  'Pension Advisor',
  'Tax Advisor',
  'XERO Training',
  'Accounting Tutor',
  'Algebra Tutor',
  'Art Tutor',
  'Biochemistry Tutor',
  'Biology Tutor',
  'Calligraphy Tutor',
  'Chemistry Tutor',
  'Chinese Tutor',
  'Economics Tutor',
  'Elocution Lessons',
  'Engineering Tutor',
  'English Tutor',
  'Environment Science Tutor',
  'French Tutor',
  'GMAT Tutor',
  'German Tutor',
  'History Tutor',
  'Italian Tutor',
  'Japanese Tutor',
  'Korean Lessons',
  'Language Tutor',
  'Mandarin Tutor',
  'Maths Tutor',
  'Physics Tutor',
  'Portuguese Tutor',
  'Public Speaking',
  'Python Tutor',
  'Science Tutor',
  'Spanish Lessons',
  'University Tutor'],
 'Admin': ['Data Entry',
  'Document Filing',
  'HR Services',
  'Office Work',
  'Personal Assistant',
  'Research Assistant',
  'Typist',
  'Virtual Assistant'