Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions web_scraping/headless_chrome/.beamignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Generated by Beam SDK
.beamignore
pyproject.toml
.git
.idea
.python-version
.vscode
.venv
venv
__pycache__
.DS_Store
.config
drive/MyDrive
.coverage
.pytest_cache
.ipynb
.ruff_cache
.dockerignore
.ipynb_checkpoints
.env.local
.envrc
**/__pycache__/
**/.pytest_cache/
**/node_modules/
**/.venv/
*.pyc
.next/
.circleci
37 changes: 37 additions & 0 deletions web_scraping/headless_chrome/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Headless Browser API Example

A headless browser API, powered by Playwright, that captures full-page screenshots of websites.

## Deployment

Deploy the app on Beam:

```
beam deploy app.py:browser
```

## API Usage

Send a `POST` request to the endpoint with the following JSON body (if `url` is omitted, the app defaults to `https://example.com`):

```json
{
"url": "https://your-website-url"
}
```

### Example Request:

```json
{
"url": "https://example.com"
}
```

### Example Response:

```json
{
"output_url": "https://app.beam.cloud/output/id/9dfbb7a1-a3de-489c-a602-423b4c859f84"
}
```
49 changes: 49 additions & 0 deletions web_scraping/headless_chrome/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import asyncio
import tempfile
from pathlib import Path

from beam import Output, endpoint, Image, env

if env.is_remote():
    from playwright.async_api import async_playwright

# Container image for the endpoint: Python 3.11 with the Playwright package,
# followed by the Playwright CLI steps that download Chromium and install the
# system dependencies it needs.
image = Image(python_version="python3.11")
image = image.add_python_packages(["playwright"])
image = image.add_commands(
    [
        "playwright install chromium",
        "playwright install-deps chromium",
    ]
)


@endpoint(name="headless-browser", cpu=2, memory="16Gi", image=image)
async def browser(url: str = "https://example.com"):
    """Capture a full-page PNG screenshot of ``url`` and publish it as a Beam output.

    Args:
        url: Address of the page to load in headless Chromium.

    Returns:
        A dict with a single key, ``"output_url"``, holding the public URL of
        the saved screenshot (requested with ``expires=400``).
    """
    # NOTE(review): ``url`` is handed to Chromium unvalidated, so any scheme
    # the browser accepts will be loaded. Consider restricting it to
    # http/https if callers are not fully trusted.
    print(f"Navigating to: {url}")

    # Write into a per-request scratch directory instead of a fixed /tmp path:
    # invocations that share a container can no longer overwrite each other's
    # screenshot, and the file is removed once it has been published.
    with tempfile.TemporaryDirectory() as scratch_dir:
        output_path = str(Path(scratch_dir) / "screenshot.png")

        async with async_playwright() as playwright:
            chromium = await playwright.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--disable-dev-shm-usage"],
            )
            print("Browser launched successfully")

            try:
                page = await chromium.new_page()
                await page.set_viewport_size({"width": 1920, "height": 1080})

                await page.goto(url, wait_until="networkidle")
                print("Page loaded, waiting 2 seconds for any dynamic content...")

                # Grace period for content rendered after the network goes idle.
                await asyncio.sleep(2)

                await page.screenshot(path=output_path, full_page=True, type="png")
                print(f"Screenshot saved as: {output_path}")

            finally:
                # Always shut the browser down, even if navigation or the
                # screenshot raised.
                await chromium.close()

        # Publish while the scratch directory still exists; it is deleted when
        # the enclosing ``with`` block exits.
        output_file = Output(path=output_path)
        output_file.save()
        public_url = output_file.public_url(expires=400)

    return {"output_url": public_url}