In [1]:
!pip install requests pandas



In [5]:
import requests
import os
import json
import time
import random
import concurrent.futures

# Buscador

In [None]:
BASE_URL = "https://servicios.seprec.gob.bo/api/empresas/buscarEmpresas"
params = {
    "filtro": "",
    "limite": 10,
    "pagina": 1
}

palabras_clave = ["a", "e", "i", "o", "u"]

data_dir = "data"

In [12]:
def fetch_data_for_word(palabra, base_url, params, data_dir):
    params = params.copy()
    params["filtro"] = palabra
    maxpages = 2
    page = 1
    consulted = 0
    failed = 0
    total = None

    def log_event(level, event, extra=None):
        log = {
            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            "level": level,
            "event": event,
            "palabra": palabra,
            "page": page,
        }
        if extra:
            log.update(extra)
        print(json.dumps(log))

    while page <= maxpages:
        log_event("INFO", "Procesando página")
        try:
            response = requests.get(base_url, params=params)
            if response.status_code != 200:
                log_event("ERROR", "Solicitud fallida", {"status_code": response.status_code})
                failed += 1
            data = response.json()
        except Exception as e:
            log_event("ERROR", "Excepción al solicitar página", {"error": str(e)})
            failed += 1
            page += 1
            params["pagina"] = page
            continue

        if not os.path.exists(data_dir):
            os.makedirs(data_dir)

        file_path = os.path.join(data_dir, f"result_{palabra}_{page}.json")
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)

        consulted += 1

        if page == 1:
            try:
                total_data = data["datos"]["total"]
                maxpages = total_data // params["limite"] + 1
            except Exception as e:
                log_event("WARN", "No se pudo determinar el número de páginas", {"error": str(e)})

        total = maxpages

        # Mostrar avance parcial después de cada página
        faltantes = (total_data if total_data is not None else 0) - (consulted + failed) if total_data is not None else 'N/A'
        print(f"[Avance] Palabra: {palabra}, Página: {page}, Visitadas: {consulted}, Fallidas: {failed}, Faltantes: {faltantes}, Totales: {total if total is not None else 'N/A'}")

        time.sleep(random.uniform(1, 3))
        page += 1
        params["pagina"] = page

    return {"palabra": palabra, "consulted": consulted, "failed": failed, "total": total}

In [13]:
def main(num_workers=5):
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = {
            executor.submit(fetch_data_for_word, palabra, BASE_URL, params, data_dir): palabra
            for palabra in palabras_clave
        }
        results = []
        for future in concurrent.futures.as_completed(futures):
            res = future.result()
            results.append(res)
            # Mostrar estadísticas parciales cada vez que termina un worker
            try:
                faltantes = res.get('total', 0) - (res.get('consulted', 0) + res.get('failed', 0)) if 'total' in res else 'N/A'
                print(f"Palabra: {res['palabra']}, Visitadas: {res['consulted']}, Fallidas: {res['failed']}, Faltantes: {faltantes}, Totales: {res.get('total', 'N/A')}")
            except Exception as e:
                print(f"Error mostrando estadísticas parciales: {e}")
    return results

In [None]:
resultados = main(num_workers=5)
print("Resultados finales:", resultados)

{"timestamp": "2025-08-29T15:43:13Z", "level": "INFO", "event": "Procesando p\u00e1gina", "palabra": "a", "page": 1}
{"timestamp": "2025-08-29T15:43:13Z", "level": "INFO", "event": "Procesando p\u00e1gina", "palabra": "e", "page": 1}
{"timestamp": "2025-08-29T15:43:13Z", "level": "INFO", "event": "Procesando p\u00e1gina", "palabra": "i", "page": 1}
{"timestamp": "2025-08-29T15:43:13Z", "level": "INFO", "event": "Procesando p\u00e1gina", "palabra": "o", "page": 1}
{"timestamp": "2025-08-29T15:43:13Z", "level": "INFO", "event": "Procesando p\u00e1gina", "palabra": "u", "page": 1}
[Avance] Palabra: a, Página: 1, Visitadas: 1, Fallidas: 0, Faltantes: 397496, Totales: 39750
[Avance] Palabra: e, Página: 1, Visitadas: 1, Fallidas: 0, Faltantes: 391736, Totales: 39174
[Avance] Palabra: a, Página: 1, Visitadas: 1, Fallidas: 0, Faltantes: 397496, Totales: 39750
[Avance] Palabra: e, Página: 1, Visitadas: 1, Fallidas: 0, Faltantes: 391736, Totales: 39174
[Avance] Palabra: i, Página: 1, Visitadas: 

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[Avance] Palabra: u, Página: 2292, Visitadas: 2292, Fallidas: 0, Faltantes: 316624, Totales: 31892
[Avance] Palabra: e, Página: 2305, Visitadas: 2305, Fallidas: 0, Faltantes: 389432, Totales: 39174
{"timestamp": "2025-08-29T18:34:26Z", "level": "INFO", "event": "Procesando p\u00e1gina", "palabra": "i", "page": 2300}
[Avance] Palabra: e, Página: 2305, Visitadas: 2305, Fallidas: 0, Faltantes: 389432, Totales: 39174
{"timestamp": "2025-08-29T18:34:26Z", "level": "INFO", "event": "Procesando p\u00e1gina", "palabra": "i", "page": 2300}
{"timestamp": "2025-08-29T18:34:26Z", "level": "INFO", "event": "Procesando p\u00e1gina", "palabra": "a", "page": 2308}
{"timestamp": "2025-08-29T18:34:26Z", "level": "INFO", "event": "Procesando p\u00e1gina", "palabra": "a", "page": 2308}
{"timestamp": "2025-08-29T18:34:28Z", "level": "INFO", "event": "Procesando p\u00e1gina", "palabra": "o", "page": 2292}
{"timestamp": "2025-08-29T18:34:28Z", "level": "INFO", "event": "Procesando p\u00e1gina", "palabra": "e

# Detail

In [1]:
BASE_URL = "https://servicios.seprec.gob.bo/api/empresas/informacionBasicaEmpresa/{empresa_id}/establecimiento/{idEstablecimiento}"

data_dir = "data"

In [2]:
def fetch_company_info(empresa_id, establecimiento_id, base_url, data_dir):
    def log_event(level, event, extra=None):
        log = {
            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            "level": level,
            "event": event,
            "empresa": empresa_id,
            "establecimiento": establecimiento_id,
        }
        if extra:
            log.update(extra)
        print(json.dumps(log))
    
    # Build the request URL with the provided empresa_id and establecimiento_id
    url = base_url.format(empresa_id=empresa_id, idEstablecimiento=establecimiento_id)
    log_event("INFO", "Procesando solicitud", {"url": url})
    
    try:
        response = requests.get(url)
        if response.status_code != 200:
            log_event("ERROR", "Solicitud fallida", {"status_code": response.status_code})
            return {"empresa": empresa_id, "establecimiento": establecimiento_id, "success": False, "status_code": response.status_code}
        data_resp = response.json()
    except Exception as e:
        log_event("ERROR", "Excepción al solicitar datos", {"error": str(e)})
        return {"empresa": empresa_id, "establecimiento": establecimiento_id, "success": False, "error": str(e)}
    
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    
    file_path = os.path.join(data_dir, f"result_{empresa_id}_{establecimiento_id}.json")
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data_resp, f, ensure_ascii=False, indent=4)
    
    log_event("INFO", "Solicitud completada exitosamente")
    return {"empresa": empresa_id, "establecimiento": establecimiento_id, "success": True, "status_code": response.status_code}


In [3]:
def main(
    num_workers=5,
    company_data=[("426118", "439571"), ],
):
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = {
            executor.submit(
                fetch_company_info, empresa_id, establecimiento_id, BASE_URL, data_dir
            ): (empresa_id, establecimiento_id)
            for empresa_id, establecimiento_id in company_data
        }
        results = []
        for future in concurrent.futures.as_completed(futures):
            res = future.result()
            results.append(res)
            try:
                print(
                    f"Empresa: {res['empresa']}, Establecimiento: {res['establecimiento']}, Success: {res['success']}, Status Code: {res.get('status_code')}"
                )
            except Exception as e:
                print(f"Error displaying result: {e}")
    return results

In [7]:
resultados = main(num_workers=5)
print("Resultados finales:", resultados)

{"timestamp": "2025-08-30T22:28:45Z", "level": "INFO", "event": "Procesando solicitud", "empresa": "426118", "establecimiento": "439571", "url": "https://servicios.seprec.gob.bo/api/empresas/informacionBasicaEmpresa/426118/establecimiento/439571"}
{"timestamp": "2025-08-30T22:28:46Z", "level": "INFO", "event": "Solicitud completada exitosamente", "empresa": "426118", "establecimiento": "439571"}
Empresa: 426118, Establecimiento: 439571, Success: True, Status Code: 200
Resultados finales: [{'empresa': '426118', 'establecimiento': '439571', 'success': True, 'status_code': 200}]
