<a href="https://colab.research.google.com/github/ajayarjun117/Ajay_/blob/main/copyofAjayJobAggregation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%writefile {BASE_DIR}/tools.py
from typing import Dict, Any, List, Optional
from dataclasses import dataclass
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from playwright.sync_api import sync_playwright
import re

@dataclass
class PageResult:
    """Result of rendering a page: where the browser ended up and what it saw."""
    # URL of the page after navigation; fetch_rendered_html_impl fills this
    # from the browser page's `url` attribute.
    final_url: str
    # Page markup; fetch_rendered_html_impl fills this from `page.content()`.
    html: str

def fetch_rendered_html_impl(url: str, wait_selector: Optional[str] = None, timeout_ms: int = 15000) -> PageResult:
    """Load ``url`` in headless Chromium and return the final URL and page HTML.

    The page is opened with ``wait_until="networkidle"``, the End key is
    pressed six times (900 ms apart) to trigger lazy-loaded content, and then,
    if ``wait_selector`` is given, the function waits for that selector.

    Args:
        url: Address to navigate to.
        wait_selector: Optional selector to wait for after scrolling. A failed
            wait is ignored and whatever has rendered so far is returned.
        timeout_ms: Timeout, in milliseconds, for the ``wait_selector`` wait
            only; it is not passed to the navigation itself.

    Returns:
        PageResult holding ``page.url`` and ``page.content()``.

    Raises:
        Any exception raised by Playwright during launch, navigation,
        scrolling or content capture propagates to the caller.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.set_extra_http_headers({
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124 Safari/537.36",
                "Accept-Language": "en-US,en;q=0.9"
            })
            page.goto(url, wait_until="networkidle")
            # gentle scroll to trigger lazy load
            for _ in range(6):
                page.keyboard.press("End")
                page.wait_for_timeout(900)
            if wait_selector:
                try:
                    page.wait_for_selector(wait_selector, timeout=timeout_ms)
                except Exception:
                    # Deliberately best-effort: if the selector never shows
                    # up, still return the HTML rendered so far.
                    pass
            html = page.content()
            final_url = page.url
        finally:
            # Close the browser even when navigation or capture fails, so a
            # failing URL does not leave a Chromium instance behind.
            browser.close()
    return PageResult(final_url=final_url, html=html)

def _text(el):
    return re.sub(r"\s+", " ", el.get_text(strip=True)) if el else None

# Substrings in an href that mark a link as job-related; compiled once.
_JOB_HREF_RE = re.compile(r"(careers|jobs|positions|opening|opportunit|apply)", re.I)
# Selectors tried in order, inside a link's enclosing card, for the location.
_LOCATION_SELECTORS = (".location", "[data-location]")
# Selectors tried in order, inside the card, for a description snippet.
_DESCRIPTION_SELECTORS = (".description", ".job-snippet", "p", "summary", "li")


def parse_generic_listings_impl(
    base_url: str,
    html: str,
    company: Optional[str] = None,
    max_results: Optional[int] = 100,
) -> List[Dict[str, Any]]:
    """Heuristically extract job-like links from arbitrary careers-page HTML.

    Every ``<a href>`` whose link text is at least four characters long and
    whose href contains one of the job-related substrings becomes a record.
    Location and description are looked up inside the nearest enclosing
    ``li``/``article``/``div`` (or the anchor itself when there is none).

    Args:
        base_url: URL used to resolve relative hrefs.
        html: Page markup to parse.
        company: Value stored unchanged in each record's ``company`` field.
        max_results: Maximum number of records to return (default 100, the
            previous fixed cap). ``None`` disables the cap; zero or a negative
            value yields an empty list.

    Returns:
        Records with keys ``job_title``, ``location``, ``company``,
        ``description_snippet`` and ``apply_link``, in document order and
        unique by ``(job_title, apply_link)``.
    """
    if max_results is not None and max_results <= 0:
        return []

    soup = BeautifulSoup(html, "lxml")
    out, seen = [], set()
    for a in soup.find_all("a", href=True):
        href = a["href"]
        title = _text(a)
        if not title or len(title) < 4:
            continue
        if not _JOB_HREF_RE.search(href):
            continue

        link = urljoin(base_url, href)
        key = (title, link)
        if key in seen:
            # Same title and target already recorded; keep the first one.
            continue
        seen.add(key)

        card = a.find_parent(["li", "article", "div"]) or a

        # The selectors are fixed literals, so lookups are not wrapped in a
        # catch-all handler that could hide real parsing problems.
        loc = None
        for sel in _LOCATION_SELECTORS:
            cand = card.select_one(sel)
            if cand:
                loc = _text(cand)
                break

        # First candidate with non-empty text that differs from the title.
        snippet = None
        for sel in _DESCRIPTION_SELECTORS:
            cand_text = _text(card.select_one(sel))
            if cand_text and cand_text != title:
                snippet = cand_text
                break

        out.append({
            "job_title": title,
            "location": loc,
            "company": company,
            "description_snippet": snippet,
            "apply_link": link,
        })
        if max_results is not None and len(out) >= max_results:
            break
    return out

# Matches a text node that labels a location; compiled once.
_LOCATION_LABEL_RE = re.compile("Location", re.I)


def parse_google_careers_impl(html: str, base_url: str = "https://careers.google.com/") -> List[Dict[str, Any]]:
    """Extract job records from Google careers page markup.

    Each ``[role="listitem"]``, ``li`` or ``article`` element that contains a
    link with both text and an href becomes one record.

    Args:
        html: Page markup to parse.
        base_url: URL used to resolve relative hrefs. Defaults to the careers
            host the function previously prepended by hand.

    Returns:
        Records with keys ``job_title``, ``location``, ``company`` (always
        ``"Google"``), ``description_snippet`` and ``apply_link``, unique by
        ``(job_title, apply_link)``.
    """
    soup = BeautifulSoup(html, "lxml")
    records, seen = [], set()
    for li in soup.select('[role="listitem"], li, article'):
        a = li.select_one("a[aria-label], a[jsname], a[href]") or li.find("a")
        if not a:
            continue
        title = _text(a)
        href = a.get("href")
        if not title or not href:
            continue

        # urljoin handles absolute, root-relative and path-relative hrefs;
        # plain string concatenation produced "…google.comjobs/…" for the
        # path-relative case.
        link = urljoin(base_url, href)
        key = (title, link)
        if key in seen:
            # Nested matches (e.g. an <li> inside an <article>) resolve to
            # the same link; keep only the first record.
            continue
        seen.add(key)

        # Prefer a dedicated location element and use its own text. Only
        # when falling back to a bare "Location" text node do we read the
        # parent element, since the node itself is just the label.
        loc_el = li.select_one("[data-location], .Gc7T-, .location")
        if loc_el is None:
            label = li.find(string=_LOCATION_LABEL_RE)
            loc_el = label.parent if label is not None else None
        loc = _text(loc_el)

        desc_el = li.select_one(".Zqg4Je, .job-snippet, p")
        records.append({
            "job_title": title,
            "location": loc,
            "company": "Google",
            "description_snippet": _text(desc_el),
            "apply_link": link,
        })
    return records

def parse_meta_careers_impl(base_url: str, html: str) -> List[Dict[str, Any]]:
    """Extract job records from Meta careers page markup.

    Each ``article``, ``.job-card``, ``._8_1x`` or ``li`` element containing
    an ``<a href>`` with link text and a non-empty href becomes one record.

    Args:
        base_url: URL used to resolve relative hrefs.
        html: Page markup to parse.

    Returns:
        Records with keys ``job_title``, ``location``, ``company`` (always
        ``"Meta"``), ``description_snippet`` and ``apply_link``, unique by
        ``(job_title, apply_link)``.
    """
    soup = BeautifulSoup(html, "lxml")
    out, seen = [], set()
    for card in soup.select("article, .job-card, ._8_1x, li"):
        a = card.find("a", href=True)
        if not a:
            continue
        title = _text(a)
        raw_href = a["href"]
        # Test the raw attribute: after urljoin an empty href turns into the
        # base URL, so a post-join emptiness check can never fire.
        if not title or not raw_href:
            continue

        href = urljoin(base_url, raw_href)
        key = (title, href)
        if key in seen:
            # Nested cards (e.g. an <li> inside an <article>) share the same
            # first link; keep only the first record.
            continue
        seen.add(key)

        loc = None
        for sel in ["[data-testid='location']", ".jobs-location", "._8_1y", "[aria-label*='Location']"]:
            el = card.select_one(sel)
            if el:
                loc = _text(el)
                break
        desc_el = card.select_one(".job-snippet, p, .description")
        out.append({
            "job_title": title,
            "location": loc,
            "company": "Meta",
            "description_snippet": _text(desc_el),
            "apply_link": href,
        })
    return out



Overwriting /content/job_crew/tools.py


In [None]:
%%writefile {BASE_DIR}/main.py
import json
from typing import List, Dict, Any, Optional
from crewai import Agent  # still using CrewAI agents for the POC narrative
from tools import (
    fetch_rendered_html_impl,
    parse_generic_listings_impl,
    parse_google_careers_impl,
    parse_meta_careers_impl,
)

# Agents without tools (to avoid version issues). We'll call functions directly.
# NOTE(review): in the code visible in this file, `navigator` and `extractor`
# are only constructed; rendering and extraction go through render_page() and
# extract_records() below, which call the tools.py functions directly.

# Describes the page-rendering role (return final_url + html for a careers URL).
navigator = Agent(
    role="Web Navigator",
    goal="Render a given careers URL (JS included) and return final_url + html.",
    backstory="Handles dynamic, JS-heavy pages and waits for relevant content to load.",
    verbose=True,
)

# Describes the extraction role (title, location, company, snippet, apply link).
extractor = Agent(
    role="DOM Extractor",
    goal="Extract jobs: title, location, company, description snippet, apply link.",
    backstory="Understands common careers page patterns; falls back to heuristics.",
    verbose=True,
)

def render_page(url: str) -> Dict[str, str]:
    """Render ``url`` and hand back its final URL and HTML as a plain dict."""
    page = fetch_rendered_html_impl(url)
    rendered: Dict[str, str] = {}
    rendered["final_url"] = page.final_url
    rendered["html"] = page.html
    return rendered

def extract_records(final_url: str, html: str, company_hint: Optional[str]) -> List[Dict[str, Any]]:
    """Pick the parser that matches ``final_url`` and return its job records.

    Args:
        final_url: URL the browser ended up on after redirects; it decides
            which site-specific parser runs.
        html: Rendered page markup.
        company_hint: Company name passed to the generic parser and used to
            fill in any record whose ``company`` is empty.

    Returns:
        The list of record dicts produced by the selected parser.
    """
    # The Google careers host redirects to www.google.com/about/careers/...,
    # so the post-redirect form must be recognised too; otherwise the generic
    # parser runs and picks up navigation links such as "Help" and "Sign in".
    google_markers = ("careers.google.com", "google.com/about/careers")
    meta_markers = ("metacareers.com", "facebook.com/careers")

    if any(marker in final_url for marker in google_markers):
        recs = parse_google_careers_impl(html)
    elif any(marker in final_url for marker in meta_markers):
        recs = parse_meta_careers_impl(final_url, html)
    else:
        recs = parse_generic_listings_impl(final_url, html, company=company_hint)

    if company_hint:
        for r in recs:
            if not r.get("company"):
                r["company"] = company_hint
    return recs

def run_on_url(url: str, company_hint: Optional[str] = None) -> List[Dict[str, Any]]:
    """Render ``url`` and then extract job records from the rendered page."""
    rendered = render_page(url)
    final_url = rendered["final_url"]
    html = rendered["html"]
    return extract_records(final_url, html, company_hint)

if __name__ == "__main__":
    import os

    # Each target pairs a URL with its company name explicitly. Inferring the
    # name from substrings of the URL labelled every unrecognised site
    # "Stripe".
    targets = [
        ("https://careers.google.com/jobs/results/", "Google"),   # Google
        ("https://www.metacareers.com/jobs/", "Meta"),            # Meta
        ("https://stripe.com/jobs/search", "Stripe"),             # Third site (example)
    ]

    # Maps each URL to its list of records, or to {"error": message} when the
    # run for that URL raised.
    all_results: Dict[str, Any] = {}
    for url, company_hint in targets:
        print(f"\n=== Running on {url} ===")
        try:
            recs = run_on_url(url, company_hint)
            print(f"Extracted {len(recs)} listings")
            for r in recs[:5]:
                print(r)
            all_results[url] = recs
        except Exception as e:
            # Top-level boundary: record the failure and continue with the
            # remaining URLs instead of aborting the whole run.
            print("ERROR:", e)
            all_results[url] = {"error": str(e)}

    out_path = "/content/job_crew/data/outputs.json"
    # Create the output directory if needed so the scraped results are not
    # lost to a missing-folder error at the very end of the run.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)
    print("\nSaved results ->", out_path)



Overwriting /content/job_crew/main.py


In [None]:
# Run the generated script through the notebook's shell escape.
# NOTE(review): BASE_DIR is not defined in the visible part of this notebook;
# presumably an earlier cell sets it - confirm.
!python {BASE_DIR}/main.py

import os
# List the data directory to check what the script wrote.
print("Data dir contents:", os.listdir(f"{BASE_DIR}/data"))



=== Running on https://careers.google.com/jobs/results/ ===
Extracted 12 listings
{'job_title': 'Careers', 'location': None, 'company': 'Google', 'description_snippet': None, 'apply_link': 'https://www.google.com/about/careers/applications/'}
{'job_title': 'work_outlinework_outlineJobsJobs', 'location': None, 'company': 'Google', 'description_snippet': None, 'apply_link': 'https://www.google.com/about/careers/applications/jobs/results/jobs/results'}
{'job_title': 'Help', 'location': None, 'company': 'Google', 'description_snippet': None, 'apply_link': 'https://support.google.com/googlecareers'}
{'job_title': 'Sign in', 'location': None, 'company': 'Google', 'description_snippet': None, 'apply_link': 'https://accounts.google.com/ServiceLogin?passive=1209600&continue=https%3A%2F%2Fwww.google.com%2Fabout%2Fcareers%2Fapplications%2Fjobs%2Fresults%2F&followup=https%3A%2F%2Fwww.google.com%2Fabout%2Fcareers%2Fapplications%2Fjobs%2Fresults%2F&ec=GAZA6QE'}
{'job_title': 'Job search', 'location

In [None]:
from google.colab import files
# Download the JSON file that main.py saved (same path as its out_path).
files.download("/content/job_crew/data/outputs.json")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>