In [1]:
import pandas as pd
import json
import os
import nest_asyncio
from typing import List, Optional, Literal
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig
from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher
from crawl4ai import RateLimiter as CrawlRateLimiter
from pydantic import BaseModel, Field
from google import genai
from google.genai import types
from dotenv import load_dotenv
import re
import asyncio
from aiolimiter import AsyncLimiter

### Filter List

In [20]:
df = pd.read_json('venues.json')

In [21]:
df.head()

Unnamed: 0,id,name,location,contact,type,tagline,price_range_id,rating,num_ratings,url_slug,enable_discovery,enable_for_amex,deep_link,metadata,web_link
0,"{'resy': 84342, 'foursquare': None, 'google': ...",Like Music VIP Cancún,{'address_1': 'Av Xcaret Supermanzana 35 Manza...,"{'phone_number': None, 'url': None}",Cocktail Bar,,2,,0.0,like-music-vip-cancun,1,1,resy://resy.com/VenueDetails?venue_id=%7B%27re...,{'description': ' Like Music VIP Cancún is a C...,"https://resy.com/?venue_id={'resy': 84342, 'fo..."
1,"{'resy': 81110, 'foursquare': None, 'google': ...",Restaurante Da Enzo Playa del Carmen,"{'address_1': 'Calle 42 Entre Av. 10 y, 5 Av. ...","{'phone_number': None, 'url': None}",Italian,,2,,0.0,restaurante-da-enzo-playa-del-carmen,1,1,resy://resy.com/VenueDetails?venue_id=%7B%27re...,{'description': ' Restaurante Da Enzo Playa de...,"https://resy.com/?venue_id={'resy': 81110, 'fo..."
2,"{'resy': 78530, 'foursquare': None, 'google': ...",Restaurante El Plebe Bichi Teotihuacan,"{'address_1': 'Calle Emilio Carranza 222, 5585...","{'phone_number': None, 'url': None}",Seafood,,2,,0.0,restaurante-el-plebe-bichi-teotihuacan,1,1,resy://resy.com/VenueDetails?venue_id=%7B%27re...,{'description': ' Restaurante El Plebe Bichi T...,"https://resy.com/?venue_id={'resy': 78530, 'fo..."
3,"{'resy': 78730, 'foursquare': None, 'google': ...",Restaurante La Mentirosa Los Mochis,"{'address_1': 'Blvd Centenario 805, Centro, 81...","{'phone_number': None, 'url': None}",International,,2,,0.0,restaurante-la-mentirosa-los-mochis,1,1,resy://resy.com/VenueDetails?venue_id=%7B%27re...,{'description': ' Restaurante La Mentirosa Los...,"https://resy.com/?venue_id={'resy': 78730, 'fo..."
4,"{'resy': 75788, 'foursquare': None, 'google': ...",Restaurante Salmone's Morelia Suc. Siervo,"{'address_1': 'Av Siervo de La Nacion s/n, Agu...","{'phone_number': None, 'url': None}",Seafood,,2,,0.0,restaurante-salmones-morelia-suc-siervo,1,1,resy://resy.com/VenueDetails?venue_id=%7B%27re...,{'description': ' Restaurante Salmone's Moreli...,"https://resy.com/?venue_id={'resy': 75788, 'fo..."


In [22]:
df_flat = pd.json_normalize(df['location'])

In [23]:
df = pd.concat([df, df_flat.add_prefix('loc_')], axis=1)

In [24]:
df_nyc = df[df["loc_url_slug"] == 'new-york-ny'].reset_index(drop=True)

In [25]:
df_nyc_flat_url = pd.json_normalize(df_nyc['contact'])
df_nyc_flat_id = pd.json_normalize(df_nyc['id'])

In [26]:
df_nyc = df_nyc.drop(columns=['contact']).join(df_nyc_flat_url)
df_nyc = df_nyc.drop(columns=['id']).join(df_nyc_flat_id)

In [27]:
df_nyc = df_nyc.drop(columns=['location'])

In [28]:
df_nyc.columns

Index(['name', 'type', 'tagline', 'price_range_id', 'rating', 'num_ratings',
       'url_slug', 'enable_discovery', 'enable_for_amex', 'deep_link',
       'metadata', 'web_link', 'loc_address_1', 'loc_address_2',
       'loc_locality', 'loc_region', 'loc_postal_code', 'loc_cross_street_1',
       'loc_cross_street_2', 'loc_longitude', 'loc_latitude',
       'loc_neighborhood', 'loc_time_zone', 'loc_url_slug', 'loc_id',
       'phone_number', 'url', 'resy', 'foursquare', 'google'],
      dtype='object')

In [29]:
enrichdf = df_nyc[["resy", "foursquare", "google", "name", "type", "price_range_id", "rating", "num_ratings", "web_link", "loc_id", "loc_neighborhood", "phone_number", "url"]]

In [30]:
enrichdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2467 entries, 0 to 2466
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   resy              2467 non-null   int64  
 1   foursquare        1391 non-null   object 
 2   google            2466 non-null   object 
 3   name              2467 non-null   object 
 4   type              2467 non-null   object 
 5   price_range_id    2467 non-null   int64  
 6   rating            2439 non-null   float64
 7   num_ratings       2439 non-null   float64
 8   web_link          2467 non-null   object 
 9   loc_id            2467 non-null   object 
 10  loc_neighborhood  2467 non-null   object 
 11  phone_number      2293 non-null   object 
 12  url               2420 non-null   object 
dtypes: float64(2), int64(2), object(9)
memory usage: 250.7+ KB


### Crawl4All 

In [2]:
load_dotenv()

True

In [3]:
nest_asyncio.apply()

In [4]:
client = genai.Client(api_key=os.getenv("GENAI_API_KEY")) 

In [5]:
AI_LIMITER = AsyncLimiter(max_rate=500, time_period=60)

#### Define Data Models

In [6]:
class ClassifiedLink(BaseModel):
    url: str = Field(description="The href URL from the input.")
    text: str = Field(description="The anchor text from the input. May be gibberish.")
    category: Literal['ordering', 'gift_card', 'instagram', 'private_events', 'mailing_list', 'other'] = Field(
        description="The category: 'ordering', 'gift_card', 'instagram', 'private_events', 'mailing_list', or 'other'."
    )

class LinkCollection(BaseModel):
    classified_links: List[ClassifiedLink]

class ClassificationBatch(BaseModel):
    categories: List[Literal['ordering', 'gift_card', 'instagram', 'private_events', 'mailing_list', 'other']]

class RestaurantTechProfile(BaseModel):
    pos_system: Optional[str] = Field(None, description="Inferred POS (Toast, Square, etc)")
    tech_stack: List[str] = Field(default_factory=list, description="Other systems (Bentobox, OpenTable, etc)")
    ordering_provider: Optional[str] = Field(None, description="Who powers the online ordering?")
    instagram_handle: Optional[str] = Field(None, description="The extracted handle (e.g. 'thesmithnyc')")
    newsletter_status: str = Field("Not Found", description="Confirmed or Not Found")
    tripleseat_status: str = Field("Not Found", description="Confirmed, Suspected, or Not Found")


#### Categorization LLM Helper (GeminiFlash)

In [7]:
def classify_links_flash(links: List[dict]) -> LinkCollection:
    """
    Uses gemini-flash-latest (Gemini 2.5) to categorize links 
    instead of guessing with keywords.
    """
    # Prepare batch for LLM
    candidates = links[:100]
    
    #print("candidates links count:", len(candidates))
    
    if not candidates:
        return LinkCollection(classified_links=[])

    # We ask for a Map of Index -> Category to keep response small
    prompt = f"""
    You are a restaurant bot link classifier.
    Classify every one of the following links based on their text and href into:
    - 'ordering' (online ordering, takeout, delivery, 'order now', point of sale, POS)
    - 'gift_card' (Gift cards, merch, store)
    - 'instagram' (Social media links to Instagram)
    - 'private_events' (Private dining, event booking, party reservations)
    - 'mailing_list' (Newsletter signups, email signups, mailing lists)
    - 'other' (Menus, about, contact, locations, reservations)

    If the text is gibberish or empty, RELY MORE on the href URL to classify.

    Input Links:
    {json.dumps(candidates)}
    """

    #print("Flash Classification Prompt:", prompt)

    try:
        response = client.models.generate_content(
            model="gemini-flash-latest", # CURRENTLY GEMINI 2.5 FLASH
            contents=prompt,
            config={
                "temperature": 0.0,
                "response_mime_type": "application/json",
                "response_json_schema": ClassificationBatch.model_json_schema(),
            }
        )

        #print("Flash Classification Response:", response.text)
        
        # Parse map and rebuild list
        batch_result = ClassificationBatch.model_validate_json(response.text)
        final_links = []
        for i, link in enumerate(candidates):
            category = "other" # Default fallback
            if i < len(batch_result.categories):
                category = batch_result.categories[i]
                
            final_links.append(ClassifiedLink(
                url=link.get("href", "") or link.get("url", ""),
                text=link.get("text", ""),
                category=category
            ))

        return LinkCollection(classified_links=final_links)

    except Exception as e:
        print(f"Flash Classification Error: {e}")
        return LinkCollection(classified_links=[])

In [8]:
async def classify_links_flash_async(links: List[dict]) -> LinkCollection:
    """
    Uses gemini-flash-latest (Gemini 2.5) to categorize links async
    """
    candidates = links[:100]
    
    if not candidates:
        return LinkCollection(classified_links=[])

    prompt = f"""
    You are a restaurant bot link classifier.
    Classify every one of the following links based on their text and href into:
    - 'ordering' (online ordering, takeout, delivery, 'order now', point of sale, POS)
    - 'gift_card' (Gift cards, merch, store)
    - 'instagram' (Social media links to Instagram)
    - 'private_events' (Private dining, event booking, party reservations)
    - 'mailing_list' (Newsletter signups, email signups, mailing lists)
    - 'other' (Menus, about, contact, locations, reservations)

    If the text is gibberish or empty, RELY MORE on the href URL to classify.

    Input Links:
    {json.dumps(candidates)}
    """

    async with AI_LIMITER: # Limit rate
        try:
            response = await client.aio.models.generate_content(
                model="gemini-flash-latest", # CURRENTLY GEMINI 2.5 FLASH
                contents=prompt,
                config={
                    "temperature": 0.0,
                    "response_mime_type": "application/json",
                    "response_json_schema": ClassificationBatch.model_json_schema(),
                }
            )

            #print("Flash Classification Response:", response.text)
            
            # Parse map and rebuild list
            batch_result = ClassificationBatch.model_validate_json(response.text)
            final_links = []
            for i, link in enumerate(candidates):
                category = "other"
                if i < len(batch_result.categories):
                    category = batch_result.categories[i]
                    
                final_links.append(ClassifiedLink(
                    url=link.get("href", "") or link.get("url", ""),
                    text=link.get("text", ""),
                    category=category
                ))

            return LinkCollection(classified_links=final_links)

        except Exception as e:
            print(f"Flash Classification Error: {e}")
            return LinkCollection(classified_links=[])

#### Analyze Tech Stack (Gemini3)

In [9]:
def analyze_tech_stack_gemini3(
    classified_links: List[ClassifiedLink], 
    script_domains: List[str], 
    footer_text: str,
    deep_dive_signals: List[str]
) -> RestaurantTechProfile:
    """
    Uses gemini-3-pro-preview to reason about the signals found.
    """
    
    # Organize data for the model
    ordering_urls = [l.url for l in classified_links if l.category == "ordering"]
    gift_urls = [l.url for l in classified_links if l.category == "gift_card"]
    socials = [l.url for l in classified_links if l.category == "instagram"]
    newsletter_links = [l.url for l in classified_links if l.category == "mailing_list"]
    
    prompt = f"""
    Analyze these signals to determine the Restaurant's Tech Stack.
    
    1. Validated Ordering Links: {ordering_urls}
    2. Validated Gift Card Links: {gift_urls}

    3. **Deep Dive Signals (Ordering/Gift Pages):** 
    {json.dumps(deep_dive_signals, indent=2)}
    (IMPORTANT: These are links/redirects found AFTER clicking the ordering/gift buttons. 
     Look here for 3rd party POS domains like 'toasttab.com', 'spoton.com', 'clover.com'.)

    4. Loaded Scripts/Domains: {script_domains}
    5. Footer Text: {footer_text}
    6. Social Links: {socials}
    7. Newsletter/Mailing List Links: {newsletter_links}
    
    Task:
    - Identify the POS System (Point of Sale). 
      - PRIORITY: Look at "Deep Dive Signals". If a link redirects to or points to a known POS (Toast, Square, SpotOn, Upserve, Aloha, Heartland, Clover etc), that is the POS.
      - SECONDARY: Look at Scripts.
    - Identify the technologies the website uses (e.g., Website builders like Bentobox or Squarespace, online order builders like Sauce or Owner.com, reservation systems).
    - Extract the Instagram Handle.
    - Determine Newsletter/Mailing List status (Confirmed, Not Found).
    """

    #print("gemini3", prompt)

    try:
        response = client.models.generate_content(
            model="gemini-flash-latest", # REASONING MODEL
            contents=prompt,
            config={
                "thinking_config": types.ThinkingConfig(thinking_budget=-1),
                "response_mime_type": "application/json",
                "response_json_schema": RestaurantTechProfile.model_json_schema(),
            }
        )

        techprofile = RestaurantTechProfile.model_validate_json(response.text)
        #print("gemini3profile", techprofile)
        return techprofile
    except Exception as e:
        print(f"Gemini 3 Error: {e}")
        return RestaurantTechProfile()

In [10]:
async def analyze_tech_stack_async(
    classified_links: List[ClassifiedLink], 
    script_domains: List[str], 
    footer_text: str,
    deep_dive_signals: List[str]
) -> RestaurantTechProfile:
    """
    Uses thinking model to reason about the signals found.
    """
    
    # Organize data for the model
    ordering_urls = [l.url for l in classified_links if l.category == "ordering"]
    gift_urls = [l.url for l in classified_links if l.category == "gift_card"]
    socials = [l.url for l in classified_links if l.category == "instagram"]
    newsletter_links = [l.url for l in classified_links if l.category == "mailing_list"]
    
    prompt = f"""
    Analyze these signals to determine the Restaurant's Tech Stack.
    
    1. Validated Ordering Links: {ordering_urls}
    2. Validated Gift Card Links: {gift_urls}

    3. **Deep Dive Signals (Ordering/Gift Pages):** 
    {json.dumps(deep_dive_signals, indent=2)}
    (IMPORTANT: These are links/redirects found AFTER clicking the ordering/gift buttons. 
     Look here for 3rd party POS domains like 'toasttab.com', 'spoton.com', 'clover.com'.)

    4. Loaded Scripts/Domains: {script_domains}
    5. Footer Text: {footer_text}
    6. Social Links: {socials}
    7. Newsletter/Mailing List Links: {newsletter_links}
    
    Task:
    - Identify the POS System (Point of Sale). 
      - PRIORITY: Look at "Deep Dive Signals". If a link redirects to or points to a known POS (Toast, Square, SpotOn, Upserve, Aloha, Heartland, Clover etc), that is the POS.
      - SECONDARY: Look at Scripts.
    - Identify the technologies the website uses (e.g., Website builders like Bentobox or Squarespace, online order builders like Sauce or Owner.com, reservation systems).
    - Extract the Instagram Handle.
    - Determine Newsletter/Mailing List status (Confirmed, Not Found).
    """

    async with AI_LIMITER: # Limit concurrency

        try:
            response = await client.aio.models.generate_content(
                model="gemini-flash-latest", # REASONING MODEL
                contents=prompt,
                config={
                    "thinking_config": types.ThinkingConfig(thinking_budget=-1),
                    "response_mime_type": "application/json",
                    "response_json_schema": RestaurantTechProfile.model_json_schema(),
                }
            )

            techprofile = RestaurantTechProfile.model_validate_json(response.text)
            return techprofile
        except Exception as e:
            print(f"Gemini 3 Error: {e}")
            return RestaurantTechProfile()

#### Other Helper Functions

In [11]:
def extract_footer(soup):
    # --- LEVEL 1: The "Smoking Gun" Selectors (Specific Builders) ---
    # These attributes are definitive proof that an element is a footer template.
    high_priority_selectors = [
        '[data-elementor-post-type="footer"]',  # <--- THIS FIXES YOUR SPECIFIC HTML
        '[data-elementor-type="footer"]',       # Standard Elementor
        '.elementor-location-footer',           # Elementor Theme Builder
        '#footer-builder',
        '.fusion-footer',                       # Avada
        '.divi-builder #main-footer',           # Divi
    ]

    for selector in high_priority_selectors:
        element = soup.select_one(selector)
        if element:
            #print(f"Match found via High Priority: {selector}")
            return clean_text(element)

    # --- LEVEL 2: The "Name Game" (Class/ID Search) ---
    # We look for classes/IDs containing "footer", but we filter them.
    # We don't want to accidentally grab a wrapper like <div id="page-wrapper-footer-included">
    candidates = []
    
    # Find ALL divs, sections, and footers
    tags = soup.find_all(['div', 'section', 'footer'])
    
    for tag in tags:
        # Get string of classes and ID
        classes = " ".join(tag.get('class', []))
        id_val = tag.get('id', "")
        identifier = (classes + " " + id_val).lower()
        
        # Check if it identifies as a footer
        if 'footer' in identifier or 'colophon' in identifier:
            # FILTER: Ignore elements that are too large (likely page wrappers)
            # A footer usually has fewer than 2000 characters of text.
            text_len = len(tag.get_text(strip=True))
            if 10 < text_len < 2500:
                candidates.append(tag)

    # If we found candidates, return the LAST one in the DOM (closest to bottom)
    if candidates:
        #print("Match found via Class/ID Search")
        return clean_text(candidates[-1])

    # --- LEVEL 3: Text Anchors (Copyright & Developer Credits) ---
    # Your example didn't have "Copyright", but it had "Site Made With Love".
    # We look for these phrases and grab their parent container.
    keywords = [
        r'©', 
        r'&copy;', 
        r'copyright', 
        r'all rights reserved', 
        r'powered by', 
        r'made with love', 
        r'made by', 
        r'designed by'
    ]
    
    pattern = re.compile('|'.join(keywords), re.IGNORECASE)
    
    # Find text nodes matching keywords
    matches = soup.find_all(string=pattern)
    
    if matches:
        # Get the last match (closest to bottom)
        target = matches[-1]
        
        # Walk up 3 levels to find a container (div or section)
        parent = target.parent
        for _ in range(3):
            if parent.name in ['div', 'section', 'footer', 'aside']:
                #print(f"Match found via Text Anchor: {target.strip()[:20]}...")
                return clean_text(parent)
            parent = parent.parent

    return ""

def clean_text(element):
    """Clean up whitespace but preserve readability"""
    # Get text with newline separators
    text = element.get_text("\n", strip=True)
    # Remove excessive newlines
    text = re.sub(r'\n\s*\n', '\n', text)
    return text

In [42]:
def filter_urls(urls):
     """
    Remove social media URLs and Resy URLs from a list.
    Keep everything else.
    """
     BLOCKED_DOMAINS = [
            "instagram.com",
            "facebook.com",
            "fb.com",
            "tiktok.com",
            "twitter.com",
            "x.com",
            "youtube.com",
            "youtu.be",
            "linkedin.com",
            "pinterest.com",
            "snapchat.com",
            "threads.net",
            "resy.com"
        ]
     
     filtered = []
     for url in urls:
        if not isinstance(url, str):
            continue
        
        url = url.strip()
        if not url:
            continue
        
        netloc = urlparse(url).netloc.lower()
        if any(domain in netloc for domain in BLOCKED_DOMAINS):
            continue
        
        filtered.append(url)

     return filtered

#### Main Logic

In [29]:
async def process_restaurant(crawler: AsyncWebCrawler, start_url: str, config: CrawlerRunConfig):
    print(f"--- Processing: {start_url} ---")
    
    # 1. Crawl Homepage
    result = await crawler.arun(url=start_url, config=config)
    if not result.success:
        print(f"  ! Failed to crawl: {start_url}")
        return None

    # 2. Extract Basic Signals (Scripts & Footer)
    soup = BeautifulSoup(result.html, 'html.parser')
    scripts = set()
    for s in soup.find_all('script', src=True):
        domain = urlparse(s.get('src')).netloc
        if domain: scripts.add(domain)
    
    footer = extract_footer(soup)
    
    # Check Tripleseat on Homepage
    ts_found = False
    if "tripleseat.com" in result.html or soup.find(id="tripleseat-form"):
        ts_found = True

    # Check Newsletter on Homepage
    newsletter_found = False
    newsletter_keywords = ["newsletter", "mailing list", "subscribe", "sign up", "stay updated"]
    if any(kw in result.html.lower() for kw in newsletter_keywords):
        newsletter_found = True

    # 3. USE GEMINI FLASH: Classify Links
    # We merge internal and external links for classification
    all_links = [{"text": l['text'], "href": urljoin(start_url, l['href'])} 
                 for l in result.links.get('internal', []) + result.links.get('external', [])]
    
    link_collection = classify_links_flash(all_links)
    classified_links = link_collection.classified_links

    # We visit these pages to find:
    # A) Redirects (e.g. /order -> toasttab.com)
    # B) Links ON that page (e.g. /order -> Button href="toasttab.com")
    deep_dive_signals = []
    ordering_candidates = [l for l in classified_links if l.category == "ordering"]
    gift_candidates = [l for l in classified_links if l.category == "gift_card"]
    urls_to_drill = (ordering_candidates + gift_candidates)[:4]

    for link_obj in urls_to_drill:
        print(f"  > Drilling down into tech link: {link_obj.url}")
        try:
            sub_res = await crawler.arun(url=link_obj.url)

            if sub_res.success:
                # Signal A: Did we get redirected?
                # Compare the final URL to the one we clicked.
                # If we clicked /order and ended up on toasttab.com, that's a strong signal.
                if urlparse(sub_res.url).netloc != urlparse(link_obj.url).netloc:
                     deep_dive_signals.append(f"Redirect from {link_obj.text}: {sub_res.url}")

                # Signal B: Scan for External Links on this sub-page
                # This handles the case where the page is internal but contains a button to the POS.
                # We extract external links found on this sub-page.
                external_links = sub_res.links.get("external", [])

                for ext_link in external_links[:10]:
                    href = ext_link.get('href', '')
                    if href:
                        deep_dive_signals.append(f"Link on '{link_obj.text}' page: {href}")

                # Signal C: Capture scripts on this sub-page
                sub_soup = BeautifulSoup(sub_res.html, 'html.parser')
                for s in sub_soup.find_all('script', src=True):
                    domain = urlparse(s.get('src')).netloc
                    if domain: scripts.add(domain)
        except Exception as e:
            print(f"  ! Failed to drill down {link_obj.url}: {e}")
    
    # 4. USE GEMINI 3: Analyze Tech Stack
    # We pass the *clean, categorized* data to the smart model
    tech_profile = analyze_tech_stack_gemini3(
        classified_links, 
        list(scripts)[:50], 
        footer,
        deep_dive_signals
    )

    if ts_found: 
        tech_profile.tripleseat_status = "Confirmed (Homepage)"
    
    if newsletter_found:
        tech_profile.newsletter_status = "Confirmed (Homepage)"

    # 5. Navigate to Private Events (if not already found)
    # We look for the link categorized as 'private_events' by Flash
    events_link = next((l for l in classified_links if l.category == "private_events"), None)
    if events_link:
        print(f"  > Flash identified Events page: {events_link.url}")
        try:
            evt_res = await crawler.arun(url=events_link.url)
            if evt_res.success:
                if "tripleseat.com" in evt_res.html or "tripleseat" in evt_res.html.lower():
                    tech_profile.tripleseat_status = "Confirmed (Events Page)"
                elif tech_profile.tripleseat_status == "Not Found":
                    tech_profile.tripleseat_status = "Not Found on Events Page"
        except Exception as e:
            print(f"  ! Failed to process Events page {events_link.url}: {e}")

    return {
        "url": start_url,
        "pos": tech_profile.pos_system,
        "stack": tech_profile.tech_stack,
        "instagram": tech_profile.instagram_handle,
        "tripleseat": tech_profile.tripleseat_status,
        "newsletter": tech_profile.newsletter_status,
        "ordering_url": next((l.url for l in classified_links if l.category == "ordering"), None)
    }
    

In [152]:
async def main():
    
    urls = ["http://www.nycprimerib.com/"]
    
    results = []
    crawler_run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    async with AsyncWebCrawler(verbose=False) as crawler:
        for url in urls:
            try:
                data = await process_restaurant(crawler, url, config=crawler_run_config)
                if data: results.append(data)
            except Exception as e:
                print(f"Error on {url}: {e}")

    df = pd.DataFrame(results)

    final_df = pd.merge(df, enrichdf, on='url', how='left')

    display(final_df)

In [None]:
await main()

--- Processing: http://www.nycprimerib.com/ ---


Flash Classification Prompt: 
    You are a restaurant bot link classifier.
    Classify every one of the following links based on their text and href into:
    - 'ordering' (online ordering, takeout, delivery, 'order now', point of sale, POS)
    - 'gift_card' (Gift cards, merch, store)
    - 'instagram' (Social media links to Instagram)
    - 'private_events' (Private dining, event booking, party reservations)
    - 'mailing_list' (Newsletter signups, email signups, mailing lists)
    - 'other' (Menus, about, contact, locations, reservations)

    If the text is gibberish or empty, RELY MORE on the href URL to classify.

    Input Links:
    [{"text": "0", "href": "https://www.nycprimerib.com/cart"}, {"text": "", "href": "https://www.nycprimerib.com/holiday"}, {"text": "Skip to Content", "href": "https://www.nycprimerib.com/"}, {"text": "Food & Drink", "href": "https://www.nycprimerib.com/menus"}, {"text": "About", "href": "https://www.nycprimerib.com/about"}, {"text": "Private Dinin

  > Drilling down into tech link: https://www.nycprimerib.com/gift-cards


  > Flash identified Events page: https://www.nycprimerib.com/private-dining


Unnamed: 0,url,pos,stack,instagram,tripleseat,newsletter,ordering_url,resy,foursquare,google,name,type,price_range_id,rating,num_ratings,web_link,loc_id,loc_neighborhood,phone_number
0,http://www.nycprimerib.com/,,"[Squarespace, Resy, Harri, Secure Tree]",nycprimerib,Confirmed (Events Page),Confirmed (Homepage),https://www.nycprimerib.com/cart,834,585164b77220e62219c9aeb6,ChIJ-7AOKpRZwokRMq0XnG_eehU,4 Charles Prime Rib,Steakhouse,3,4.94094,28952.0,"https://resy.com/?venue_id={'resy': 834, 'four...",ny,West Village,


#### Concurrent

##### Prepare Data for Concurrent Processing

In [43]:
urlset = set(enrichdf["url"].to_list())

In [45]:
urllist = filter_urls(list(urlset))

In [46]:
allurls = sorted(urllist)

In [49]:
first100 = allurls[:100]

##### SYNCHRONOUS HELPERS

In [12]:
def _sync_extract_homepage_signals(html: str):
    """
    CPU-bound task: Parses homepage HTML to find scripts, footer, and Tripleseat.
    """
    soup = BeautifulSoup(html, 'lxml')
    
    # Extract Scripts
    scripts = set()
    for s in soup.find_all('script', src=True):
        domain = urlparse(s.get('src')).netloc
        if domain: scripts.add(domain)
    
    # Extract Footer
    footer = extract_footer(soup)
    
    # Check Tripleseat
    # Note: simple string check is fast, but finding ID requires soup
    ts_found = "tripleseat.com" in html or soup.find(id="tripleseat-form") is not None

    newsletter_found = False
    newsletter_keywords = ["newsletter", "mailing list", "subscribe", "sign up", "stay updated"]
    if any(kw in html.lower() for kw in newsletter_keywords):
        newsletter_found = True
    
    return scripts, footer, ts_found, newsletter_found

def _sync_extract_scripts_only(html: str):
    """
    CPU-bound task: Parses sub-page HTML just for scripts.
    """
    soup = BeautifulSoup(html, 'lxml')
    scripts = set()
    for s in soup.find_all('script', src=True):
        domain = urlparse(s.get('src')).netloc
        if domain: scripts.add(domain)
    return scripts

##### MAIN CONCURRENT LOGIC

In [13]:
async def process_restaurant_concurrent(crawler: AsyncWebCrawler, start_url: str, config: CrawlerRunConfig):
    print(f"--- Processing: {start_url} ---")
    
    # 1. Crawl Homepage
    result = await crawler.arun(url=start_url, config=config)
    if not result.success:
        print(f"  ! Failed to crawl: {start_url}")
        return None

    # 2. Extract Basic Signals (CPU Bound - Offload to Thread)
    # This prevents blocking the event loop while parsing 
    scripts, footer, ts_found, newsletter_found = await asyncio.to_thread(
        _sync_extract_homepage_signals, 
        result.html
    )

    # 3. USE GEMINI FLASH: Classify Links
    # We merge internal and external links for classification
    all_links = [{"text": l['text'], "href": urljoin(start_url, l['href'])} 
                 for l in result.links.get('internal', []) + result.links.get('external', [])]
    
    link_collection = await classify_links_flash_async(all_links)
    classified_links = link_collection.classified_links

    # We visit these pages to find:
    # A) Redirects (e.g. /order -> toasttab.com)
    # B) Links ON that page (e.g. /order -> Button href="toasttab.com")
    ordering_candidates = [l for l in classified_links if l.category == "ordering"]
    gift_candidates = [l for l in classified_links if l.category == "gift_card"]
    urls_to_drill_objs = (ordering_candidates + gift_candidates)[:4]
    urls_to_drill_strings = [u.url for u in urls_to_drill_objs]

    deep_dive_signals = []
    
    if urls_to_drill_strings:
        print(f"  > Drilling down {len(urls_to_drill_strings)} URLs for {start_url}...")
        
        # Use MemoryAdaptiveDispatcher to handle these sub-tasks safely in parallel
        # This prevents the browser from crashing if we are running 5 restaurants * 5 sub-links = 25 tabs
        sub_dispatcher = MemoryAdaptiveDispatcher(
            memory_threshold_percent=90.0,
            max_session_permit=5,
            rate_limiter=CrawlRateLimiter(base_delay=(0.5, 1.0))
        )
        
        sub_results = await crawler.arun_many(
            urls=urls_to_drill_strings,
            config=config,
            dispatcher=sub_dispatcher
        )

        for i, sub_res in enumerate(sub_results):
            original_link_obj = urls_to_drill_objs[i]
            if sub_res.success:
                # Redirect check
                if urlparse(sub_res.url).netloc != urlparse(original_link_obj.url).netloc:
                    deep_dive_signals.append(f"Redirect from {original_link_obj.text}: {sub_res.url}")
                # Link check
                external_links = sub_res.links.get("external", [])
                for ext_link in external_links[:10]:
                    href = ext_link.get('href', '')
                    if href: deep_dive_signals.append(f"Link on '{original_link_obj.text}' page: {href}")
                # Script check
                new_scripts = await asyncio.to_thread(_sync_extract_scripts_only, sub_res.html)
                scripts.update(new_scripts)

    
    # 4. Analyze Tech Stack
    tech_profile = await analyze_tech_stack_async(
        classified_links, 
        list(scripts)[:50], 
        footer,
        deep_dive_signals
    )

    if ts_found: tech_profile.tripleseat_status = "Confirmed (Homepage)"
    
    if newsletter_found: tech_profile.newsletter_status = "Confirmed (Homepage)"

    # 5. Navigate to Private Events (if not already found)
    # We look for the link categorized as 'private_events' by Flash
    events_link = next((l for l in classified_links if l.category == "private_events"), None)
    if events_link:
        print(f"  > Flash identified Events page: {events_link.url}")
        try:
            evt_res = await crawler.arun(url=events_link.url)
            if evt_res.success:
                if "tripleseat.com" in evt_res.html or "tripleseat" in evt_res.html.lower():
                    tech_profile.tripleseat_status = "Confirmed (Events Page)"
                elif tech_profile.tripleseat_status == "Not Found":
                    tech_profile.tripleseat_status = "Not Found on Events Page"
        except Exception as e:
            print(f"  ! Failed to process Events page {events_link.url}: {e}")

    return {
        "url": start_url,
        "pos": tech_profile.pos_system,
        "stack": tech_profile.tech_stack,
        "instagram": tech_profile.instagram_handle,
        "tripleseat": tech_profile.tripleseat_status,
        "newsletter": tech_profile.newsletter_status,
        "ordering_url": next((l.url for l in classified_links if l.category == "ordering"), None)
    }
    

In [50]:
async def main_concurrent():

    urls = first100

    output_file = "restaurant_results.csv"

    if not os.path.exists(output_file):
        pd.DataFrame(columns=["url", "pos", "stack", "instagram", "tripleseat", "newsletter", "ordering_url"]).to_csv(output_file, index=False)
        print(f"Created new file: {output_file}")
    else:
        print(f"Appending to existing file: {output_file}")
    
    browser_config = BrowserConfig(headless=True, verbose=False, text_mode=True)
    crawler_run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    concurrency_sem = asyncio.Semaphore(5)

    async def semaphore_worker(url, crawler):
        async with concurrency_sem:
            try:
                return await process_restaurant_concurrent(crawler, url, crawler_run_config)
            except Exception as e:
                print(f"Error processing {url}: {e}")
                return None
    
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Create tasks
        tasks = [semaphore_worker(url, crawler) for url in urls]
        
        print(f"Processing {len(urls)} restaurants with incremental saving...")
        
        # as_completed yields futures as they finish
        for future in asyncio.as_completed(tasks):
            result = await future
            
            if result:
                print(f"Finished: {result['url']} -> Saving.")
                
                # INCREMENTAL SAVE: Create 1-row DF and append to CSV
                df_chunk = pd.DataFrame([result])
                
                # Check if file exists to determine if we need a header (in case it was deleted mid-run)
                use_header = not os.path.exists(output_file)
                df_chunk.to_csv(output_file, mode='a', header=use_header, index=False)
                
            else:
                print(f"A task: {future} failed returned None.")

    print("\n--- Processing Complete ---")
    if os.path.exists(output_file):
        final_df = pd.read_csv(output_file)
        #print(final_df.to_string())

In [51]:
await main_concurrent()

Appending to existing file: restaurant_results.csv
Processing 100 restaurants with incremental saving...
--- Processing: http://carbonenewyork.com/ ---
--- Processing: http://harrysbarrestaurant.com/ ---
--- Processing: http://clintonstreetbaking.com/ ---
--- Processing: http://barbellynyc.com/ ---
--- Processing: http://housemanrestaurant.com/ ---


  > Drilling down 1 URLs for http://barbellynyc.com/...


  > Drilling down 2 URLs for http://housemanrestaurant.com/...
  > Drilling down 2 URLs for http://harrysbarrestaurant.com/...


  > Drilling down 2 URLs for http://carbonenewyork.com/...
--- Processing: http://blackironburger.com ---
Finished: http://barbellynyc.com/ -> Saving.


--- Processing: http://coloniaverdenyc.com/#/ ---
Finished: http://clintonstreetbaking.com/ -> Saving.


  > Flash identified Events page: https://www.harrysbarrestaurant.com/private-events-2


--- Processing: http://barmadonna.com ---
Finished: http://harrysbarrestaurant.com/ -> Saving.


  > Flash identified Events page: https://carbonenewyork.com/nyc-events


--- Processing: http://ilcortile.com ---
Finished: http://carbonenewyork.com/ -> Saving.


--- Processing: http://bombaygrillhouseforesthill.com/ ---
Finished: http://housemanrestaurant.com/ -> Saving.


  ! Failed to crawl: http://bombaygrillhouseforesthill.com/
--- Processing: http://cosmenyc.com/ ---
A task: <coroutine object _AsCompletedIterator._wait_for_one at 0x1460ff510> failed returned None.
  > Drilling down 1 URLs for http://coloniaverdenyc.com/#/...


  > Drilling down 2 URLs for http://barmadonna.com...


  > Drilling down 4 URLs for http://blackironburger.com...


  > Flash identified Events page: http://comparticatering.com/


--- Processing: http://cartawinebar.com ---
Finished: http://coloniaverdenyc.com/#/ -> Saving.
  > Drilling down 4 URLs for http://ilcortile.com...
  > Flash identified Events page: https://www.blackironburger.com/catering-inquiry-form


--- Processing: http://ilfioristanyc.com ---
Finished: http://barmadonna.com -> Saving.


--- Processing: http://bondisushi.com/ ---
Finished: http://blackironburger.com -> Saving.


  > Drilling down 3 URLs for http://cosmenyc.com/...


  > Flash identified Events page: https://www.tripleseat.com
  > Flash identified Events page: https://ilcortile.com/private-dining-choices
  > Drilling down 3 URLs for http://bondisushi.com/...


--- Processing: http://decades.pizza ---
Finished: http://cosmenyc.com/ -> Saving.


--- Processing: http://artesanorestaurant.com/ ---
Finished: http://ilcortile.com -> Saving.


  > Drilling down 1 URLs for http://cartawinebar.com...


  > Flash identified Events page: https://bondisushi.com/catering
  > Flash identified Events page: https://cartawinebar.com/events


--- Processing: http://jacobspickles.com/ ---
Finished: http://bondisushi.com/ -> Saving.


--- Processing: http://deptofculturebk.com ---
Finished: http://cartawinebar.com -> Saving.


  > Drilling down 3 URLs for http://artesanorestaurant.com/...
  > Drilling down 4 URLs for http://decades.pizza...


  > Drilling down 1 URLs for http://deptofculturebk.com...


  > Drilling down 4 URLs for http://jacobspickles.com/...
--- Processing: http://amityhallnyc.com/ ---
Finished: http://artesanorestaurant.com/ -> Saving.


--- Processing: http://jejunoodlebar.com/ ---
Finished: http://deptofculturebk.com -> Saving.


  > Flash identified Events page: https://resy.com/cities/new-york-ny/venues/decades-pizza/events/decades-2-year-anniversary-2025-02-27?date=2025-02-18&event_url_slug=decades-2-year-anniversary-2025-02-27&maxpartysize=50&seats=2


--- Processing: http://dimesnyc.com/ ---
Finished: http://decades.pizza -> Saving.


  > Flash identified Events page: https://www.jacobspickles.com/private-events-upper-west-side
  > Drilling down 1 URLs for http://dimesnyc.com/...


--- Processing: http://cactuswren.nyc ---
Finished: http://jacobspickles.com/ -> Saving.


  > Flash identified Events page: https://amityhallnyc.com/new-york-greenwich-village-amity-hall-downtown-party
--- Processing: http://juelanclub.com ---
Finished: http://jejunoodlebar.com/ -> Saving.


--- Processing: http://dinernyc.com/ ---
Finished: http://amityhallnyc.com/ -> Saving.


--- Processing: http://cafechelseanyc.com ---
Finished: http://dimesnyc.com/ -> Saving.


  > Drilling down 1 URLs for http://cactuswren.nyc...


  > Drilling down 1 URLs for http://cafechelseanyc.com...
  > Drilling down 1 URLs for http://dinernyc.com/...


  > Drilling down 1 URLs for http://juelanclub.com...


--- Processing: http://kawaomakase.com/ ---
Finished: http://cactuswren.nyc -> Saving.


  > Flash identified Events page: https://www.cafechelseanyc.com/private-events


--- Processing: http://dinnerpartybk.com ---
Finished: http://cafechelseanyc.com -> Saving.
  > Flash identified Events page: https://www.juelanclub.com/private-events


  > Flash identified Events page: https://www.dinernyc.com/parties


--- Processing: http://acespizzaspot.com ---
Finished: http://juelanclub.com -> Saving.


--- Processing: http://cafe-colette.com/ ---
Finished: http://dinernyc.com/ -> Saving.


--- Processing: http://dirtyfrench.com/ ---
Finished: http://kawaomakase.com/ -> Saving.


  > Drilling down 2 URLs for http://dinnerpartybk.com...
  > Drilling down 1 URLs for http://dirtyfrench.com/...


  > Drilling down 4 URLs for http://acespizzaspot.com...


  > Drilling down 2 URLs for http://cafe-colette.com/...


  ! Failed to crawl: http://ilfioristanyc.com
--- Processing: http://kesnewyork.com ---
A task: <coroutine object _AsCompletedIterator._wait_for_one at 0x125a76330> failed returned None.


  > Flash identified Events page: https://www.dinnerpartybk.com/reservation-info-1
--- Processing: http://kingscoimperial.com/ ---
Finished: http://dirtyfrench.com/ -> Saving.


--- Processing: http://arkrestaurants.com/bryant_park/ ---
Finished: http://dinnerpartybk.com -> Saving.


  > Drilling down 1 URLs for http://kesnewyork.com...


  > Flash identified Events page: mailto:hello@cafe-colette.com?subject=Private+Event


--- Processing: http://districtbarnyc.com/ ---
Finished: http://cafe-colette.com/ -> Saving.


  > Flash identified Events page: https://www.kesnewyork.com/large-party-event-requests


--- Processing: http://akimorinyc.com ---
Finished: http://kesnewyork.com -> Saving.
  > Flash identified Events page: mailto:party@acesperfectpizza.com?subject=I+WANT+TO+PARTY+


--- Processing: http://krupagrocery.com ---
Finished: http://acespizzaspot.com -> Saving.


  > Drilling down 4 URLs for http://kingscoimperial.com/...


  > Drilling down 3 URLs for http://krupagrocery.com...
  > Drilling down 3 URLs for http://akimorinyc.com...


  > Drilling down 4 URLs for http://districtbarnyc.com/...


  > Flash identified Events page: https://www.kingscoimperial.com/private-events-and-large-parties


--- Processing: http://auchevalnyc.com ---
Finished: http://kingscoimperial.com/ -> Saving.
  > Flash identified Events page: https://www.akimorinyc.com/events


--- Processing: http://drunkenmunkey.com ---
Finished: http://akimorinyc.com -> Saving.


--- Processing: http://alicenyc.com/ ---
Finished: http://krupagrocery.com -> Saving.


  > Flash identified Events page: https://districtbarnyc.com/?page_id=18


--- Processing: http://Zensushiomakase.com ---
Finished: http://districtbarnyc.com/ -> Saving.
  > Drilling down 3 URLs for http://drunkenmunkey.com...


  > Drilling down 1 URLs for http://auchevalnyc.com...


  > Drilling down 2 URLs for http://alicenyc.com/...


--- Processing: http://eckhartbeer.com ---
Finished: http://auchevalnyc.com -> Saving.
--- Processing: http://kuunbrooklyn.com ---
Finished: http://Zensushiomakase.com -> Saving.
  > Flash identified Events page: https://alicenyc.com/events


  > Flash identified Events page: https://bryantparkgrillnyc.com/corporate-brochure
--- Processing: http://lacontentanyc.com/ ---
Finished: http://drunkenmunkey.com -> Saving.


--- Processing: http://elencantodelola2.com/ ---
Finished: http://alicenyc.com/ -> Saving.


--- Processing: http://laissezfaire.nyc ---
Finished: http://arkrestaurants.com/bryant_park/ -> Saving.


  > Drilling down 1 URLs for http://eckhartbeer.com...


  > Drilling down 4 URLs for http://kuunbrooklyn.com...
  > Drilling down 1 URLs for http://laissezfaire.nyc...


  > Drilling down 4 URLs for http://elencantodelola2.com/...
--- Processing: http://elephantdistrict.com/ ---
Finished: http://eckhartbeer.com -> Saving.


  > Flash identified Events page: https://www.laissezfaire.nyc/private-events-1


--- Processing: http://achillesheelnyc.com/ ---
Finished: http://laissezfaire.nyc -> Saving.
--- Processing: http://ariawinebar.com/west-village.html ---
Finished: http://lacontentanyc.com/ -> Saving.


--- Processing: http://eliarestaurant.com/ ---
Finished: http://kuunbrooklyn.com -> Saving.


  > Drilling down 1 URLs for http://elephantdistrict.com/...


--- Processing: http://lanoxenyc.com/ ---
Finished: http://elencantodelola2.com/ -> Saving.
  > Drilling down 4 URLs for http://ariawinebar.com/west-village.html...
--- Processing: http://larinabk.com/ ---
Finished: http://elephantdistrict.com/ -> Saving.


--- Processing: http://elquijotenyc.com ---
Finished: http://achillesheelnyc.com/ -> Saving.
  > Drilling down 4 URLs for http://eliarestaurant.com/...


  > Drilling down 1 URLs for http://lanoxenyc.com/...


  > Flash identified Events page: https://ariawinebar.com/events-ariawv
--- Processing: http://boutrosbk.com/ ---
Finished: http://lanoxenyc.com/ -> Saving.


--- Processing: http://lebaratinnyc.com/ ---
Finished: http://ariawinebar.com/west-village.html -> Saving.
  > Drilling down 1 URLs for http://elquijotenyc.com...


  > Drilling down 4 URLs for http://larinabk.com/...


--- Processing: http://bellaunionnyc.com ---
Finished: http://eliarestaurant.com/ -> Saving.


  > Flash identified Events page: https://www.elquijotenyc.com/private-events


--- Processing: http://emmettsnyc.com/ ---
Finished: http://elquijotenyc.com -> Saving.


--- Processing: http://bourbonandbranchnyc.com ---
Finished: http://larinabk.com/ -> Saving.


  > Drilling down 1 URLs for http://boutrosbk.com/...
  > Flash identified Events page: http://lebaratinnyc.com/events-catering


--- Processing: http://lepetitvillagenyc.com ---
Finished: http://lebaratinnyc.com/ -> Saving.


  > Flash identified Events page: https://www.bellaunionnyc.com/parties
  > Drilling down 4 URLs for http://emmettsnyc.com/...


  > Drilling down 1 URLs for http://bourbonandbranchnyc.com...


--- Processing: http://cafeteriagroup.com/ ---
Finished: http://bellaunionnyc.com -> Saving.


--- Processing: http://empire-diner.com/ ---
Finished: http://boutrosbk.com/ -> Saving.


--- Processing: http://brindleroomny.com ---
Finished: http://cafeteriagroup.com/ -> Saving.
  > Flash identified Events page: https://www.bourbonandbranchnyc.com/private-events


--- Processing: http://lifeatmars.com/ ---
Finished: http://bourbonandbranchnyc.com -> Saving.


  > Drilling down 1 URLs for http://lepetitvillagenyc.com...
  > Drilling down 3 URLs for http://empire-diner.com/...
  > Flash identified Events page: https://www.emmettsnyc.com/private-events


--- Processing: http://antonsnyc.com ---
Finished: http://emmettsnyc.com/ -> Saving.


  > Drilling down 1 URLs for http://lifeatmars.com/...


  > Flash identified Events page: https://www.lepetitvillagenyc.com/private-events
  > Drilling down 4 URLs for http://brindleroomny.com...


--- Processing: http://epistrophynyc.com ---
Finished: http://lepetitvillagenyc.com -> Saving.


  > Flash identified Events page: https://empire-diner.com/events-catering


  > Flash identified Events page: http://lifeatmars.com/private-events


--- Processing: http://bricolage.nyc/ ---
Finished: http://empire-diner.com/ -> Saving.


--- Processing: http://lighthousebk.com/ ---
Finished: http://lifeatmars.com/ -> Saving.
--- Processing: http://berimbaunyc.com/ ---
Finished: http://epistrophynyc.com -> Saving.


--- Processing: http://estiatoriomilos.com/ ---
Finished: http://antonsnyc.com -> Saving.


  > Drilling down 2 URLs for http://bricolage.nyc/...


  > Flash identified Events page: https://brindleroomny.com/east-village-alphabet-city-the-brindle-room-party
  > Drilling down 2 URLs for http://berimbaunyc.com/...
--- Processing: http://littlecharli.com ---
Finished: http://lighthousebk.com/ -> Saving.


--- Processing: http://buathairamennyc.com/ ---
Finished: http://brindleroomny.com -> Saving.


  > Flash identified Events page: https://www.bricolage.nyc/private-events


--- Processing: http://fiveleavesny.com/ ---
Finished: http://bricolage.nyc/ -> Saving.


  > Drilling down 3 URLs for http://buathairamennyc.com/...
--- Processing: http://5ivespicegramercy.com ---
Finished: http://berimbaunyc.com/ -> Saving.


  > Drilling down 2 URLs for http://fiveleavesny.com/...


--- Processing: http://ariawinebar.com/hell-kitchen.html ---
Finished: http://littlecharli.com -> Saving.
  > Drilling down 4 URLs for http://estiatoriomilos.com/...


--- Processing: http://bathtubginnyc.com/ ---
Finished: http://fiveleavesny.com/ -> Saving.


--- Processing: http://flemingbylebilboquet.com/ ---
Finished: http://buathairamennyc.com/ -> Saving.
  > Drilling down 1 URLs for http://5ivespicegramercy.com...


--- Processing: http://americancutsteakhouse.com/ ---
Finished: http://ariawinebar.com/hell-kitchen.html -> Saving.
  > Flash identified Events page: https://www.estiatoriomilos.com/location/montreal/events


--- Processing: http://follianyc.com/ ---
Finished: http://estiatoriomilos.com/ -> Saving.


--- Processing: http://12chairscafe.com/ ---
Finished: http://flemingbylebilboquet.com/ -> Saving.
  > Drilling down 1 URLs for http://bathtubginnyc.com/...
  > Flash identified Events page: https://5ivespice.com/gramercy/events


--- Processing: http://amaranthrestaurant.com ---
Finished: http://5ivespicegramercy.com -> Saving.


--- Processing: http://foulwitchnyc.com ---
Finished: http://bathtubginnyc.com/ -> Saving.


  > Drilling down 3 URLs for http://follianyc.com/...


--- Processing: http://Rosehillrooftop.com ---
Finished: http://12chairscafe.com/ -> Saving.


  > Drilling down 1 URLs for http://foulwitchnyc.com...


  > Flash identified Events page: https://www.follianyc.com/private-parties
  > Drilling down 2 URLs for http://amaranthrestaurant.com...


--- Processing: http://anerestaurant.com ---
Finished: http://follianyc.com/ -> Saving.


  > Flash identified Events page: https://www.foulwitchnyc.com/private-events


--- Processing: http://frenchlouienyc.com/ ---
Finished: http://foulwitchnyc.com -> Saving.
  > Flash identified Events page: https://www.rosehillrooftop.com/theopen-1


--- Processing: http://amamibk.com/ ---
Finished: http://Rosehillrooftop.com -> Saving.


  > Flash identified Events page: https://www.amaranthrestaurant.com


--- Processing: http://bartettorooftop.com ---
Finished: http://amaranthrestaurant.com -> Saving.
  > Drilling down 2 URLs for http://anerestaurant.com...


  > Drilling down 3 URLs for http://frenchlouienyc.com/...
  > Flash identified Events page: https://www.anerestaurant.com/private-events


  > Drilling down 3 URLs for http://amamibk.com/...


  > Drilling down 1 URLs for http://bartettorooftop.com...


--- Processing: http://friendsinfood.com ---
Finished: http://anerestaurant.com -> Saving.


  > Flash identified Events page: https://frenchlouienyc.com/events


--- Processing: http://casettanyc.info ---
Finished: http://frenchlouienyc.com/ -> Saving.


  > Flash identified Events page: https://www.parchedhg.com/bartetto
--- Processing: http://bartabacny.com ---
Finished: http://amamibk.com/ -> Saving.
--- Processing: http://gooddaysbk.com ---
Finished: http://friendsinfood.com -> Saving.


--- Processing: http://Celestinebk.com ---
Finished: http://bartettorooftop.com -> Saving.


  > Drilling down 1 URLs for http://Celestinebk.com...


  > Drilling down 1 URLs for http://casettanyc.info...


--- Processing: http://casinonyc.info ---
Finished: http://bartabacny.com -> Saving.
  > Flash identified Events page: https://www.gooddaysbk.com/contact-8


--- Processing: http://arthurandsonsnyc.com ---
Finished: http://gooddaysbk.com -> Saving.


--- Processing: http://grandarmybar.com ---
Finished: http://Celestinebk.com -> Saving.


  > Flash identified Events page: https://www.casettanyc.info/private-events


--- Processing: http://charliebirdnyc.com/ ---
Finished: http://casettanyc.info -> Saving.


  > Drilling down 1 URLs for http://casinonyc.info...


  > Drilling down 1 URLs for http://arthurandsonsnyc.com...
  > Drilling down 3 URLs for http://grandarmybar.com...


  > Flash identified Events page: https://app.perfectvenue.com/venues/casino/hello


--- Processing: http://atoboynyc.com/#atoboynyc ---
Finished: http://casinonyc.info -> Saving.


  > Flash identified Events page: https://partylikeartie.com/


--- Processing: http://graziellasmenu.com/ ---
Finished: http://arthurandsonsnyc.com -> Saving.


  ! Failed to crawl: http://americancutsteakhouse.com/
--- Processing: http://6restaurant.com ---
A task: <coroutine object _AsCompletedIterator._wait_for_one at 0x14709fe00> failed returned None.
  > Flash identified Events page: https://www.grandarmybar.com/events-1


  > Drilling down 4 URLs for http://charliebirdnyc.com/...


--- Processing: http://claudnyc.com ---
Finished: http://grandarmybar.com -> Saving.


  > Drilling down 2 URLs for http://atoboynyc.com/#atoboynyc...


  > Drilling down 1 URLs for http://6restaurant.com...


  > Drilling down 2 URLs for http://claudnyc.com...
  > Flash identified Events page: https://www.charliebirdnyc.com/private-events


--- Processing: http://balthazarny.com/ ---
Finished: http://6restaurant.com -> Saving.
--- Processing: http://greenpointfish.com/ ---
Finished: http://atoboynyc.com/#atoboynyc -> Saving.


Finished: http://charliebirdnyc.com/ -> Saving.


  > Flash identified Events page: https://www.claudnyc.com/private-events
  > Drilling down 3 URLs for http://graziellasmenu.com/...


Finished: http://claudnyc.com -> Saving.
  > Drilling down 3 URLs for http://greenpointfish.com/...


  > Drilling down 2 URLs for http://balthazarny.com/...


Finished: http://greenpointfish.com/ -> Saving.
  > Flash identified Events page: https://tmt.spotapps.co/private-parties?callback_url=http%3A%2F%2Fgraziellasmenu.com%2F&spot_id=10828


Finished: http://graziellasmenu.com/ -> Saving.
  > Flash identified Events page: https://balthazarny.com/private-dining


Finished: http://balthazarny.com/ -> Saving.

--- Processing Complete ---


#### Tester

In [24]:
urls = [
        "https://www.andsonnyc.com",
        "https://www.rezdora.nyc/"
    ]
    
results = []
crawler_run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
async with AsyncWebCrawler(verbose=False) as crawler:
    for url in urls:
        try:
            data = await process_restaurant(crawler, url, config=crawler_run_config)
            if data: results.append(data)
        except Exception as e:
            print(f"Error on {url}: {e}")

df = pd.DataFrame(results)
display(df)

--- Processing: https://www.andsonnyc.com ---


--- Processing: https://www.rezdora.nyc/ ---


Unnamed: 0,result,soup,scripts,footer,ts_found
0,(url='https://www.andsonnyc.com' html='<!DOCTY...,"[html, [[<meta content=""A7vZI3v+Gz7JfuRolKNM4A...","{www.googletagmanager.com, cdn.jsdelivr.net, u...",Greenwich Village 62 West 9th Street between 5...,False
1,(url='https://www.rezdora.nyc/' html='<!DOCTYP...,"[html, [[<meta content=""A7vZI3v+Gz7JfuRolKNM4A...","{app-assets.getbento.com, theme-assets.getbent...",Facebook Twitter Instagram Hours & Location Me...,False


In [25]:
all_links = [{"text": l['text'], "href": urljoin("https://www.rezdora.nyc/", l['href'])} 
                 for l in df["result"][1].links.get('internal', []) + df["result"][1].links.get('external', [])]

In [26]:
classify_test_results = classify_links_flash(all_links)

In [27]:
test_links = classify_test_results.classified_links

In [33]:
deep_dive_signals = []
scripts = set()
ordering_candidates = [l for l in test_links if l.category == "ordering"]
gift_candidates = [l for l in test_links if l.category == "gift_card"]
urls_to_drill = (ordering_candidates + gift_candidates)[:4]

In [37]:
for link_obj in urls_to_drill:
        print(f"  > Drilling down into tech link: {link_obj.url}")
        try:
            async with AsyncWebCrawler(verbose=False) as crawler:
                sub_res = await crawler.arun(url=link_obj.url, config=crawler_run_config)

                if sub_res.success:
                    # Signal A: Did we get redirected?
                    # Compare the final URL to the one we clicked.
                    # If we clicked /order and ended up on toasttab.com, that's a strong signal.
                    if urlparse(sub_res.url).netloc != urlparse(link_obj.url).netloc:
                        deep_dive_signals.append(f"Redirect from {link_obj.text}: {sub_res.url}")

                    # Signal B: Scan for External Links on this sub-page
                    # This handles the case where the page is internal but contains a button to the POS.
                    # We extract external links found on this sub-page.
                    external_links = sub_res.links.get("external", [])

                    for ext_link in external_links[:15]:
                        href = ext_link.get('href', '')
                        if href:
                            deep_dive_signals.append(f"Link on '{link_obj.text}' page: {href}")

                    # Signal C: Capture scripts on this sub-page
                    sub_soup = BeautifulSoup(sub_res.html, 'html.parser')
                    for s in sub_soup.find_all('script', src=True):
                        domain = urlparse(s.get('src')).netloc
                        if domain: scripts.add(domain)
        except Exception as e:
            print(f"  ! Failed to drill down {link_obj.url}: {e}")

  > Drilling down into tech link: https://www.rezdora.nyc/gift-cards


In [41]:
tech_profile = analyze_tech_stack_gemini3(
        test_links, 
        list(scripts)[:50], 
        results[1]['footer'],
        deep_dive_signals # <--- Passing the deep dive content
    )

In [43]:
events_link = next((l for l in test_links if l.category == "private_events"), None)
if events_link:
    print(f"  > Flash identified Events page: {events_link.url}")
    try:
        async with AsyncWebCrawler(verbose=False) as crawler:
            evt_res = await crawler.arun(url=events_link.url)
            if evt_res.success:
                if "tripleseat.com" in evt_res.html or "tripleseat" in evt_res.html.lower():
                    tech_profile.tripleseat_status = "Confirmed (Events Page)"
                elif tech_profile.tripleseat_status == "Not Found":
                    tech_profile.tripleseat_status = "Not Found on Events Page"
    except Exception as e:
            print(f"  ! Failed to drill down {events_link.url}: {e}")

  > Flash identified Events page: https://www.rezdora.nyc/private-events


In [44]:
tech_profile

RestaurantTechProfile(pos_system='Toast', tech_stack=['BentoBox', 'Resy', 'AudioEye', 'Google Maps'], ordering_provider=None, instagram_handle='rezdoranyc', tripleseat_status='Confirmed (Events Page)')