In [None]:
from datetime import date

from diario_crawler.core import GazetteCrawler, CrawlerConfig
from diario_crawler.utils import setup_logging

# Configure project logging at DEBUG level (presumably the most verbose
# setting -- confirm against diario_crawler.utils.setup_logging).
setup_logging(level='DEBUG')

# Crawler built from a config that only fixes the start date (1 Oct 2025);
# every other option is left at CrawlerConfig's defaults.
crawler = GazetteCrawler(CrawlerConfig(start_date=date(2025, 10, 1)))

# Bare last expression so the notebook displays the crawler object.
crawler

In [None]:
# Top-level `await` is valid here because the IPython/Jupyter kernel runs
# cells inside an event loop; it would be a SyntaxError in a plain script.
res = await crawler.run()
# Bare last expression: show the value returned by the crawl.
res

In [None]:
# Number of items returned by `crawler.run()` in the previous cell.
len(res)

In [1]:
from datetime import date

from diario_crawler.core import GazetteCrawler, CrawlerConfig
from diario_crawler.storage import ParquetStorage

config = CrawlerConfig(
    start_date=date(2025, 11, 1),
    end_date=date(2025, 11, 5),
    batch_size=10
)

storage = ParquetStorage(
    base_path="data/raw",
    partition_by="day"
)

crawler = GazetteCrawler(config=config, storage=storage)

editions = await crawler.run_and_save()

In [2]:
# Display the editions read back through `storage` (the ParquetStorage
# instance created in the crawl cell above).
storage.load_editions()

[<GazetteEdition id=2555 date=2025-11-03 articles=8>,
 <GazetteEdition id=2556 date=2025-11-04 articles=9>,
 <GazetteEdition id=2557 date=2025-11-05 articles=14>]

In [2]:
import sys
from pathlib import Path
import pandas as pd


def read_parquets(base_dir: Path) -> pd.DataFrame:
    """Recursively load every ``.parquet`` file under a directory into one frame.

    Files are read in sorted path order so the row order of the result is
    reproducible across runs and filesystems. Each loaded frame gets a
    ``__source_file__`` column holding the path it was read from. Files that
    fail to load are reported on stdout and skipped (best-effort read).

    Args:
        base_dir: Directory to search. A ``Path`` is expected, but a plain
            string or other path-like value is accepted as well.

    Returns:
        The concatenation of all successfully read files with a fresh integer
        index, or an empty ``DataFrame`` when no file was found or none could
        be read.
    """
    # Path.rglob yields entries in filesystem order, which is not stable;
    # sorting makes the concatenated output deterministic.
    parquet_files = sorted(Path(base_dir).rglob("*.parquet"))
    if not parquet_files:
        print(f"[!] Nenhum arquivo .parquet encontrado em {base_dir}")
        return pd.DataFrame()

    frames = []
    for parquet_file in parquet_files:
        try:
            frame = pd.read_parquet(parquet_file)
            # Keep provenance so rows can be traced back to their file.
            frame["__source_file__"] = str(parquet_file)
        except Exception as exc:
            # Deliberate best-effort: one unreadable file must not abort the
            # whole inspection, but the failure is always reported.
            print(f"[ERRO] Falha ao ler {parquet_file}: {exc}")
            continue
        frames.append(frame)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


def main(base_path: str = "data/raw"):
    """Print a quick overview of the crawled data stored as parquet files.

    Reads every parquet file under ``<base_path>/gazettes`` and
    ``<base_path>/articles``, prints the row count, dtypes and a 5-row sample
    of each, then left-joins articles to editions on ``edition_id`` and prints
    a sample of the joined result.

    Args:
        base_path: Root directory that contains the ``gazettes`` and
            ``articles`` sub-directories.

    Returns:
        None. All results are written to stdout.
    """
    root = Path(base_path)

    print("=== Lendo edições ===")
    editions_df = read_parquets(root / "gazettes")
    print(f"→ Total de edições: {len(editions_df)}")
    if not editions_df.empty:
        print(editions_df.dtypes)
        print(editions_df.head(5), "\n")

    print("=== Lendo artigos ===")
    articles_df = read_parquets(root / "articles")
    print(f"→ Total de artigos: {len(articles_df)}")
    if not articles_df.empty:
        print(articles_df.dtypes)
        print(articles_df.head(5), "\n")

    # Example: join articles to their edition via edition_id.
    if not editions_df.empty and not articles_df.empty:
        # The same edition_id can be present in more than one parquet file
        # (presumably after repeated crawler runs -- confirm). A left join
        # against duplicated keys would repeat every article once per copy
        # and inflate the merged row count, so keep one row per edition.
        unique_editions = editions_df.drop_duplicates(subset="edition_id", keep="last")
        dropped = len(editions_df) - len(unique_editions)
        if dropped:
            print(f"[!] {dropped} edições duplicadas ignoradas no merge")

        merged = articles_df.merge(
            unique_editions,
            on="edition_id",
            how="left",
            suffixes=("_article", "_edition"),
        )
        print(f"→ Total após merge: {len(merged)} registros")
        print("→ Amostra combinada:")
        # NOTE(review): "publication_date_article" only exists if both frames
        # carry a publication_date column -- verify against the article schema.
        print(merged[["edition_id", "publication_date_article", "title"]].head(5))

In [3]:
# Run the inspection with the default base path ("data/raw").
main()

=== Lendo edições ===
→ Total de edições: 9
edition_id           string[python]
publication_date     string[python]
edition_number                int64
supplement                  boolean
edition_type_id               int64
edition_type_name    string[python]
pdf_url              string[python]
total_articles                int64
processed_at         string[python]
__source_file__              object
dtype: object
  edition_id publication_date  edition_number  supplement  edition_type_id  \
0       2555       2025-11-03            3586       False                1   
1       2556       2025-11-04            3587       False                1   
2       2557       2025-11-05            3588       False                1   
3       2555       2025-11-03            3586       False                1   
4       2556       2025-11-04            3587       False                1   

     edition_type_name                                            pdf_url  \
0  Diário do Município  https://diar