In [1]:
import pandas as pd
import json
import glob
import yaml
from datetime import datetime
from urllib.parse import urlparse

In [2]:
# Load the aggregated summary and flatten each site's per-day downtime
# mapping into long-format rows (date, time, domain, type="down").
with open("../history/summary.json") as summary_file:
    data = json.load(summary_file)

df = pd.concat(
    [
        # One frame per monitored site: (date, minutes) pairs tagged with the
        # site's host name and a constant "down" row type.
        pd.DataFrame(
            site["dailyMinutesDown"].items(), columns=["date", "time"]
        ).assign(domain=urlparse(site["url"]).netloc, type="down")
        for site in data
    ],
    ignore_index=True,
)

In [3]:
# Collect every YAML file in the "history" folder. Sorted so the order of the
# generated rows does not depend on filesystem enumeration order.
yaml_files = sorted(glob.glob("../history/*.yml"))
records = []


def _to_iso_date(value, field, file):
    """Return ``value`` as a ``YYYY-MM-DD`` string, or ``None`` if unusable.

    Args:
        value: Raw field value read from the YAML document. It may be an ISO
            8601 string (optionally ending in ``Z``) or an object the YAML
            loader has already parsed (a ``datetime``, or a date-like object
            exposing ``isoformat()``).
        field: Name of the YAML key, used only in the error message.
        file: Path of the YAML file, used only in the error message.

    Returns:
        The calendar date in ISO format, or ``None`` when the value is
        missing/falsy or cannot be interpreted. Failures are reported with
        ``print`` rather than raised so one bad file does not stop the run.
    """
    if not value:
        return None
    try:
        if isinstance(value, str):
            # "Z" is rewritten because datetime.fromisoformat only accepts
            # that suffix on newer Python versions.
            value = datetime.fromisoformat(value.replace("Z", "+00:00"))
        if isinstance(value, datetime):
            value = value.date()
        # Date-only values have no .date() method but do have isoformat().
        return value.isoformat()
    except Exception as e:
        print(f"Error al procesar {field} en {file}: {e}")
        return None


for file in yaml_files:
    with open(file) as f:
        data = yaml.safe_load(f)
    if not isinstance(data, dict):
        # An empty or non-mapping document has no url/startTime to read.
        print(f"Error al procesar {file}: contenido YAML no válido")
        continue
    # Derive the domain from the URL.
    domain = urlparse(data.get("url", "")).netloc
    # startTime marks when monitoring of the domain was created.
    creation_date = _to_iso_date(data.get("startTime"), "startTime", file)
    # An unparseable lastUpdated becomes None instead of leaking the raw value
    # into the "date" column.
    last_update = _to_iso_date(data.get("lastUpdated"), "lastUpdated", file)

    records.append(
        {"domain": domain, "date": creation_date, "time": 0, "type": "created"}
    )
    records.append(
        {"domain": domain, "date": last_update, "time": 0, "type": "last_updated"}
    )

# Build one frame holding the creation / last-update markers of every domain.
df_domain_created = pd.DataFrame(records)
# Drop marker rows left over from a previous run of this cell so that
# re-executing it does not append duplicates; a no-op on a fresh kernel.
df = pd.concat(
    [df[~df["type"].isin(["created", "last_updated"])], df_domain_created],
    ignore_index=True,
)
df

Unnamed: 0,date,time,domain,type
0,2025-04-20,24,www.anh.gob.bo,down
1,2025-03-27,13,www.anh.gob.bo,down
2,2025-03-18,31,www.anh.gob.bo,down
3,2025-03-07,33,www.anh.gob.bo,down
4,2025-02-22,13,www.anh.gob.bo,down
...,...,...,...,...
1129,2025-04-22,0,www.rree.gob.bo,last_updated
1130,2020-08-10,0,www.google.com,created
1131,2024-11-24,0,www.google.com,last_updated
1132,2024-11-25,0,www.cns.gob.bo,created


In [4]:
# Normalise column types before export: calendar dates for "date", integers
# for "time", plain strings for "domain" and a categorical "type".
df = df.assign(
    date=pd.to_datetime(df["date"]).dt.date,
    time=df["time"].astype(int),
    domain=df["domain"].astype(str),
    type=df["type"].astype("category"),
)
df

Unnamed: 0,date,time,domain,type
0,2025-04-20,24,www.anh.gob.bo,down
1,2025-03-27,13,www.anh.gob.bo,down
2,2025-03-18,31,www.anh.gob.bo,down
3,2025-03-07,33,www.anh.gob.bo,down
4,2025-02-22,13,www.anh.gob.bo,down
...,...,...,...,...
1129,2025-04-22,0,www.rree.gob.bo,last_updated
1130,2020-08-10,0,www.google.com,created
1131,2024-11-24,0,www.google.com,last_updated
1132,2024-11-25,0,www.cns.gob.bo,created


In [5]:
# Persist the combined frame next to the source history files; the row index
# carries no information, so it is not written.
output_path = "../history/summary.parquet"
df.to_parquet(output_path, index=False)