In [None]:
!pip install docker pandas

In [2]:
import docker

# Connect to the Docker daemon using the configuration found in the environment.
client = docker.from_env()
print(client.containers.list())

# Smoke test: print the name and status of every container currently listed.
for container in client.containers.list():
    print(f"{container.name} {container.status}")


[<Container: c1240976b774>, <Container: 3cae7e00c508>, <Container: 40ff65947c29>, <Container: b150c5075120>, <Container: 71b46f87db7b>, <Container: b7436c433ff9>, <Container: af995a51e9dd>, <Container: 2cf8c0985868>, <Container: 96b7760c1605>, <Container: 3759e096be4e>, <Container: 57667dd05bc9>, <Container: 865931dafb0c>, <Container: 15bfe4e011dc>, <Container: 1045d898c339>, <Container: a0c8ab42d5de>, <Container: 34e8f993857b>, <Container: a126c9a346ba>, <Container: c28331acaa87>, <Container: b33844811935>, <Container: 7bdac95bdf74>, <Container: 3c1c1327764e>, <Container: d649bd564a06>, <Container: 814022a11c16>, <Container: 1b632b848342>, <Container: 7a19f5d91b04>, <Container: cdf7f2bc0299>, <Container: db9c654c3001>, <Container: 3e2c26e396eb>]
amazing_blackburn running
sociest-odoo-odoo-1 running
sociest-odoo-postgresql-1 running
jupyter_server running
docker-frontend-1 running
docker-app-1 running
docker-redis-1 running
docker-postgres-1 running
grafana running
node-exporter runnin

In [7]:
import subprocess, threading
import schedule, docker
import pandas as pd
import time, re
from datetime import datetime
import concurrent.futures

# Semaphore limiting how many crawler containers may run in parallel.
# The limit configured here is 1, i.e. crawls are fully serialized.
docker_semaphore = threading.Semaphore(1)

# Folder where the generated WACZ archives are stored.
OUTPUT_DIR = "./data2"

# The CSV carries a previously saved index as an unnamed first column; load it as the
# index so it does not appear as a spurious "Unnamed: 0" data column.
df = pd.read_csv("datos_entidades_sitiosweb_editados.csv", index_col=0)
df

Unnamed: 0,nombre,sitioweb,sigla
0,Vicepresidencia del Estado Plurinacional,https://www.vicepresidencia.gob.bo,VPEP
1,Ministerio de Relaciones Exteriores,https://www.cancilleria.gob.bo,MIN-RREE
2,Ministerio de Gobierno,http://www.mingobierno.gob.bo,MIN-GOB
3,Ministerio de Educación,https://www.minedu.gob.bo,MIN-EDU
4,Ministerio de Defensa,https://www.mindef.gob.bo,MIN-DEF
...,...,...,...
388,Universidad Andina Simón Bolivar,https://www.uasb.edu.bo/,UASB
389,Universidad Pedagógica,http://www.upedagogica.edu.bo/,UNIPED
390,Valores Unión S.A. Agencia de Bolsa,https://www.valoresunion.com.bo/,UNION-AB
391,Centro de la Cultura Plurinacional (Santa Cruz),http://www.fundacionculturalbcb.gob.bo/Q-CCP.html,RN-CCP


In [None]:
import subprocess, threading, queue
import schedule, time, re
from datetime import datetime
import concurrent.futures

# Global queue holding the pending crawl jobs.
job_queue = queue.Queue()

# Semaphore limiting how many docker commands may run at the same time (limit: 3).
docker_semaphore = threading.Semaphore(3)

# Output folder (mounted into each crawler container as /crawls).
OUTPUT_DIR = "./data2"

# Sites to crawl: every non-missing URL from the entities table plus a few extras.
# Missing values are dropped because the worker calls string methods on every URL.
URLS = df['sitioweb'].dropna().tolist()
URLS.extend([
    "https://computo.oep.org.bo",
    "https://sirepre.oep.org.bo",
    "https://www.gob.bo",
])
# De-duplicate while preserving order: two identical URLs in the same batch would be
# crawled into the same collection name (URL + timestamp) at the same time.
URLS = list(dict.fromkeys(URLS))


def run_docker_command(cmd, timeout=3600):
    """Run a command as a subprocess, bounded by a timeout and the global semaphore.

    The call blocks on ``docker_semaphore`` first, so the number of commands running
    at once is capped by whatever value that semaphore was created with.

    Args:
        cmd: Argument list passed to ``subprocess.Popen``.
        timeout: Maximum number of seconds to wait for the command to finish.

    Returns:
        The combined stdout/stderr text of the command, or a timeout message if the
        command did not finish within ``timeout`` seconds.
    """
    # Seconds granted for a graceful shutdown after the timeout before force-killing.
    grace_period = 30

    with docker_semaphore:
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
        try:
            stdout, _ = process.communicate(timeout=timeout)
            return stdout
        except subprocess.TimeoutExpired:
            # Ask the process to stop first. NOTE(review): for `docker run`, the CLI is
            # expected to forward SIGTERM to the container (signal proxying), whereas a
            # SIGKILL only kills the CLI and can leave the container running — confirm.
            process.terminate()
            try:
                process.communicate(timeout=grace_period)
            except subprocess.TimeoutExpired:
                process.kill()
                # Finish communication so the child is reaped and its pipe is closed.
                process.communicate()
            return f"Timeout alcanzado para el comando: {' '.join(cmd)}"


def crawler_job():
    """Enqueue one crawl job: a snapshot of ``URLS`` paired with the current timestamp."""
    timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    # Snapshot the URL list so the queued job keeps its own copy.
    batch = (list(URLS), timestamp)
    job_queue.put(batch)
    pending = job_queue.qsize()
    print(f"[{timestamp}] ✅ Nuevo job agregado a la cola (pendientes: {pending})")


def _build_crawl_command(url, timestamp):
    """Build the ``docker run`` argument list that crawls ``url`` into a WACZ collection."""
    import os  # local import: the notebook's import cells do not bring in ``os``

    # Collection name: the URL without its scheme, unsafe characters replaced by "_",
    # followed by the batch timestamp.
    safe_name = re.sub(r'[^a-zA-Z0-9_-]', '_', url.replace('https://', '').replace('http://', ''))
    collection_name = f"{safe_name}-{timestamp}"

    # Bind mounts need an absolute host path; a relative one may be rejected or treated
    # as a volume name depending on the docker CLI version.
    # NOTE(review): if this notebook runs inside a container that talks to the host's
    # Docker daemon, the path must be valid on the daemon's host — confirm.
    host_dir = os.path.abspath(OUTPUT_DIR)

    return [
        "docker", "run", "--rm",
        "-v", f"{host_dir}:/crawls",
        "webrecorder/browsertrix-crawler",
        "crawl",
        "--url", url,
        "--generateWACZ",
        "--blockads", "--adBlockMessage", "AD BLOCKED",
        "--blockMessage", "URL BLOCKED",
        "--allowHashUrls",
        "--useSitemap",
        "--waitUntil", "networkidle0",
        "--lang", "es",
        "--retries", "3",
        "--postLoadDelay", "1",
        "--collection", collection_name,
        "--workers", "4",
        "--clickSelector", "a",
        "--clickSelector", "button"
    ]


def worker():
    """Process queued crawl jobs one at a time, forever.

    Each job is a ``(urls, timestamp)`` tuple taken from ``job_queue``. Every URL of the
    job is crawled through ``run_docker_command`` using a small thread pool, and the
    first 300 characters of each crawl's output are printed as it completes.

    A failure while processing a batch is reported and the loop continues with the next
    job; ``task_done()`` is always called so the queue's bookkeeping stays consistent.
    """
    while True:
        urls, timestamp = job_queue.get()  # blocks until a job is available
        try:
            print(f"\n[{timestamp}] 🚀 Iniciando crawl de lote ({len(urls)} sitios)...")

            with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
                futures = {}
                for url in urls:
                    # Skip missing / non-text entries (e.g. NaN from the source table)
                    # instead of letting them abort the whole batch.
                    if not isinstance(url, str) or not url.strip():
                        print(f"⚠️ URL inválida omitida: {url!r}")
                        continue

                    cmd = _build_crawl_command(url, timestamp)
                    future = executor.submit(run_docker_command, cmd, 3600)
                    futures[future] = url

                for future in concurrent.futures.as_completed(futures):
                    url = futures[future]
                    try:
                        result = future.result()
                    except Exception as exc:
                        print(f"❌ El crawl para {url} generó una excepción: {exc}")
                    else:
                        # Only the first 300 characters of the output are shown.
                        print(f"✅ Resultado para {url}:\n{result[:300]}...\n")

            print(f"[{datetime.now().strftime('%Y%m%d_%H%M%S')}] 🏁 Lote terminado.\n")
        except Exception as exc:
            # Keep the worker thread alive so later jobs are still processed.
            print(f"[{timestamp}] ❌ El lote falló: {exc}")
        finally:
            job_queue.task_done()


# --- Start the queue worker in the background ---
threading.Thread(target=worker, daemon=True).start()

# Enqueue a new crawl job every 30 minutes; keep the handle so it can be cancelled.
crawl_schedule = schedule.every(30).minutes.do(crawler_job)

# First run immediately, without waiting for the first 30-minute tick.
crawler_job()

# Scheduler loop: runs until the cell is interrupted (Kernel -> Interrupt).
# Interrupting only stops the scheduling of new jobs; the daemon worker thread keeps
# running inside the kernel.
try:
    while True:
        schedule.run_pending()
        time.sleep(1)
except KeyboardInterrupt:
    print("⏹️ Planificador detenido por el usuario.")
finally:
    # Unregister the job so re-running this cell does not stack duplicate schedules
    # in the module-level scheduler.
    schedule.cancel_job(crawl_schedule)


[20250819_034819] ✅ Nuevo job agregado a la cola (pendientes: 1)

[20250819_034819] 🚀 Iniciando crawl de lote (396 sitios)...


KeyboardInterrupt: 