<a href="https://colab.research.google.com/github/andres-chirinos/crawl-bolivian-websites/blob/main/notebooks/crawl_standard_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install warcio beautifulsoup4

Collecting warcio
  Downloading warcio-1.7.5-py2.py3-none-any.whl.metadata (16 kB)
Downloading warcio-1.7.5-py2.py3-none-any.whl (40 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.6/40.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: warcio
Successfully installed warcio-1.7.5


In [None]:
from google.colab import drive
import gzip
from warcio.archiveiterator import ArchiveIterator
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import pandas as pd
import os

In [None]:
def extract_outlinks(warc_file):
    outlinks = []

    # Abrir WARC (gz o normal)
    open_func = gzip.open if warc_file.endswith(".gz") else open
    with open_func(warc_file, "rb") as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type == "response":
                url = record.rec_headers.get_header("WARC-Target-URI")
                payload = record.content_stream().read()

                try:
                    soup = BeautifulSoup(payload, "html.parser")
                    for tag in soup.find_all(["a", "link", "script", "img", "iframe"]):
                        href = tag.get("href") or tag.get("src")
                        alt = tag.get("alt") or tag.gettext()
                        if href:
                            parsed = urlparse(href)
                            if parsed.netloc:  # Tiene dominio
                                outlinks.append((url, href, alt))
                except Exception as e:
                    pass  # algunos payloads no son HTML válido

    return outlinks

def extract_json_xhr_calls(warc_file):
    json_xhr_calls = []

    open_func = gzip.open if warc_file.endswith(".gz") else open
    with open_func(warc_file, "rb") as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type == "response":
                url = record.rec_headers.get_header("WARC-Target-URI")
                content_type = record.rec_headers.get_header("Content-Type")

                if content_type and ("application/json" in content_type or "application/x-xhr-json" in content_type):
                    json_xhr_calls.append(url)

    return json_xhr_calls

In [None]:
# PARAMS
zip_file = """/content/drive/Shareddrives/Repositorio de Datos/Data Lake/bolivia-web-archive/gob_bo/www_gob_bo_-20250824_015305.zip"""

In [None]:
drive.mount('/content/drive')

file = zip_file.split("/")[-1]
file_name = file.split(".")[0]

!unzip -q "{zip_file}" -d "/content"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
warc_dir = f"/content/{file_name}/archive"
all_outlinks = []

# Iterate through files in the specified directory
for root, _, files in os.walk(warc_dir):
    for file in files:
        if file.endswith(".warc.gz"): # Look for .warc.gz files
            warc_file = os.path.join(root, file)
            print(f"Processing: {warc_file}")
            outlinks = extract_outlinks(warc_file)
            all_outlinks.extend(outlinks)


print(f"Total outlinks extracted: {len(all_outlinks)}")

Processing: /content/www_gob_bo_-20250824_015305/archive/rec-bc4a53349c14-www_gob_bo_-20250824_015305-20250824015311432-3.warc.gz




Processing: /content/www_gob_bo_-20250824_015305/archive/rec-bc4a53349c14-www_gob_bo_-20250824_015305-20250824015311616-1.warc.gz




Processing: /content/www_gob_bo_-20250824_015305/archive/rec-bc4a53349c14-www_gob_bo_-20250824_015305-20250824015311411-0.warc.gz



Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(payload, "html.parser")


Processing: /content/www_gob_bo_-20250824_015305/archive/rec-bc4a53349c14-www_gob_bo_-20250824_015305-20250824015311464-2.warc.gz




Total outlinks extracted: 4


In [None]:
df = pd.DataFrame(all_outlinks, headers=["url","href","alt"])

In [None]:
df

Unnamed: 0,0,1,2
0,url,href,alt
1,url,href,alt
2,url,href,alt
3,url,href,alt


In [None]:
from urllib.parse import urlparse

df["href"] = df["href"].str.replace("http://", "https://")
# Standardize URLs by adding a trailing slash if it's a domain or a path without one
def standardize_url(url):
    parsed_url = urlparse(url)
    if parsed_url.path and not parsed_url.path.endswith('/') and '.' not in os.path.basename(parsed_url.path):
        return url + '/'
    elif not parsed_url.path and not parsed_url.query and not parsed_url.fragment:
         return url + '/'
    return url

df["href"] = df["href"].apply(standardize_url)

KeyError: 'href'

In [None]:
from urllib.parse import urlparse

# Extract the domain from the outlink URL
df['outlink_domain'] = df["href"].apply(lambda x: urlparse(x).netloc)

Unnamed: 0_level_0,count
outlink_domain,Unnamed: 1_level_1
www.openstreetmap.org,6227
datos.gob.bo,3670
agetic.gob.bo,1835
legas.minedu.gob.bo,128
lapaz.bo,71
...,...
www.ylb.gob.bo,1
ssulapaz.org,1
www.segip.gob.bo,1
www.probolivia.gob.bo,1


In [None]:
df["is_bo_domain"] = df["outlink_domain"].apply(lambda x: x.endswith(".bo"))

In [None]:
sitios = list(df[df["is_bo_domain"]==True][1].unique())
sitios

['https://datos.gob.bo/',
 'https://agetic.gob.bo/',
 'https://www.seprec.gob.bo/index.php/tramite37/',
 'https://legas.minedu.gob.bo/',
 'https://www.gestora.bo/',
 'https://www.seprec.gob.bo/index.php/tramite9/',
 'https://www.seprec.gob.bo/index.php/tramite29/',
 'https://www.seprec.gob.bo/index.php/tramite49/',
 'https://vortex.produccion.gob.bo/',
 'https://www.archivos.umss.edu.bo/wp1/',
 'https://fnse.gob.bo/index.php/programas-y-proyectos/centro-de-habilitacion-y-rehabilitacion/',
 'https://lapaz.bo/revalidaciones-pago-de-alquiler-de-nichos-de-cuerpo-menor-temporales/',
 'https://www.mindef.gob.bo/node/490/',
 'https://www.senapi.gob.bo/propiedad-intelectual/derecho-de-autor/',
 'https://usfx.bo/',
 'https://www.mindef.gob.bo/node/486/',
 'https://utp.minedu.gob.bo/',
 'https://www.ribb.gob.bo/web/site/reginfo/',
 'https://tramitesv2.agetic.gob.bo/admin/tramites/0?tipo=tramite/',
 'https://siscon.ypfb.gob.bo/procesos29506/',
 'https://www.seprec.gob.bo/index.php/tramite42/',
 '

In [None]:
len(sitios)

433

In [None]:
import gzip
from warcio.archiveiterator import ArchiveIterator

def extract_json_xhr_calls(warc_file):
    json_xhr_calls = []

    open_func = gzip.open if warc_file.endswith(".gz") else open
    with open_func(warc_file, "rb") as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type == "response":
                url = record.rec_headers.get_header("WARC-Target-URI")
                content_type = record.rec_headers.get_header("Content-Type")

                if content_type : #and ("application/json" in content_type or "application/x-xhr-json" in content_type):
                    json_xhr_calls.append((url,content_type))

    return json_xhr_calls

In [None]:
import os

warc_dir = "/content/www_gob_bo_-20250824_015305/archive"
all_json_xhr_calls = []

# Iterate through files in the specified directory
for root, _, files in os.walk(warc_dir):
    for file in files:
        if file.endswith(".warc.gz"):
            warc_file = os.path.join(root, file)
            print(f"Processing JSON/XHR calls from: {warc_file}")
            json_xhr_calls = extract_json_xhr_calls(warc_file)
            all_json_xhr_calls.extend(json_xhr_calls)

print(f"Total JSON/XHR calls extracted: {len(all_json_xhr_calls)}")

Processing JSON/XHR calls from: /content/www_gob_bo_-20250824_015305/archive/rec-bc4a53349c14-www_gob_bo_-20250824_015305-20250824015311432-3.warc.gz
Processing JSON/XHR calls from: /content/www_gob_bo_-20250824_015305/archive/rec-bc4a53349c14-www_gob_bo_-20250824_015305-20250824015311616-1.warc.gz
Processing JSON/XHR calls from: /content/www_gob_bo_-20250824_015305/archive/rec-bc4a53349c14-www_gob_bo_-20250824_015305-20250824015311411-0.warc.gz
Processing JSON/XHR calls from: /content/www_gob_bo_-20250824_015305/archive/rec-bc4a53349c14-www_gob_bo_-20250824_015305-20250824015311464-2.warc.gz
Total JSON/XHR calls extracted: 18193
Extracted JSON/XHR Calls:
('https://www.gob.bo/eventos-de-vida', 'application/http; msgtype=response')
('https://www.gob.bo/_next/static/css/28e3b03a6887665e.css', 'application/http; msgtype=response')
('https://www.gob.bo/_next/static/css/22777d5f94aecdf1.css', 'application/http; msgtype=response')
('https://www.gob.bo/_next/static/css/d645a791e5bd11f8.css', 

In [None]:
df_2 = pd.DataFrame(all_json_xhr_calls)

In [None]:
df_2

Unnamed: 0,0,1
0,https://www.gob.bo/eventos-de-vida,application/http; msgtype=response
1,https://www.gob.bo/_next/static/css/28e3b03a68...,application/http; msgtype=response
2,https://www.gob.bo/_next/static/css/22777d5f94...,application/http; msgtype=response
3,https://www.gob.bo/_next/static/css/d645a791e5...,application/http; msgtype=response
4,https://www.gob.bo/_next/static/chunks/main-ap...,application/http; msgtype=response
...,...,...
18188,https://www.gob.bo/entidades/gobierno-autonomo...,application/http; msgtype=response
18189,https://www.gob.bo/categorias/impuestos?_rsc=f...,application/http; msgtype=response
18190,https://www.gob.bo/categorias/empresas?_rsc=f74k1,application/http; msgtype=response
18191,https://www.gob.bo/about?_rsc=f74k1,application/http; msgtype=response
