In [20]:
import zipfile
import os

def count_zip_rows(zip_file):
    """Counts rows in text files within a zip archive.

    Args:
        zip_file (str): Path to the zip file.

    Returns:
        dict: Dictionary with zip filename as key and row count as value.
    """
    row_counts = {}
    with zipfile.ZipFile(zip_file, 'r') as zip:
        for info in zip.infolist():
            if not info.is_dir():  # Skip directories within the zip
                with zip.open(info.filename) as file:
                    # Replace this with logic to handle specific file formats (e.g., csv.reader)
                    row_counts[info.filename] = sum(1 for line in file)
    
    return row_counts

def list_and_count_zip_rows(folder_path):
    """Lists zip files in a folder and counts rows within each zip.

    Args:
        folder_path (str): Path to the folder containing zip files.

    Returns:
        list: List of dictionaries, each with zip filename and row count.
    """
    zip_data = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.zip'):
            zip_file_path = os.path.join(folder_path, filename)
            row_counts = count_zip_rows(zip_file_path)
            zip_count={'filename': filename, 'row_counts': row_counts}
            print(zip_count)
            zip_data.append(zip_count)
    
    return zip_data

# Example usage
folder_path = os.path.join(os.getcwd(), '..', 'data', 'DOWNLOAD_FILES')
zip_info = list_and_count_zip_rows(folder_path)

for item in zip_info:
    print(f"File: {item['filename']}")
    for filename, row_count in item['row_counts'].items():
        print(f"\t- {filename}: {row_count} rows")

{'filename': 'Socios1.zip', 'row_counts': {'K3241.K03200Y1.D40511.SOCIOCSV': 2019150}}
{'filename': 'Empresas9.zip', 'row_counts': {'K3241.K03200Y9.D40511.EMPRECSV': 4494860}}
{'filename': 'Empresas7.zip', 'row_counts': {'K3241.K03200Y7.D40511.EMPRECSV': 4494860}}
{'filename': 'Estabelecimentos3.zip', 'row_counts': {'K3241.K03200Y3.D40511.ESTABELE': 4753435}}
{'filename': 'Simples.zip', 'row_counts': {'F.K03200$W.SIMPLES.CSV.D40511': 39065196}}
{'filename': 'Estabelecimentos1.zip', 'row_counts': {'K3241.K03200Y1.D40511.ESTABELE': 4753435}}
{'filename': 'Paises.zip', 'row_counts': {'F.K03200$Z.D40511.PAISCSV': 255}}
{'filename': 'Empresas2.zip', 'row_counts': {'K3241.K03200Y2.D40511.EMPRECSV': 4494860}}
{'filename': 'Naturezas.zip', 'row_counts': {'F.K03200$Z.D40511.NATJUCSV': 90}}
{'filename': 'Socios0.zip', 'row_counts': {'K3241.K03200Y0.D40511.SOCIOCSV': 5976547}}
{'filename': 'Qualificacoes.zip', 'row_counts': {'F.K03200$Z.D40511.QUALSCSV': 68}}
{'filename': 'Municipios.zip', 'row_c

In [2]:
""" 
  - Nome do projeto : ETL - CNPJs da Receita Federal do Brasil
  - Objetivo        : Baixar, transformar e carregar dados da Receita Federal do Brasil
"""
import time

from setup.base import get_sink_folder, init_database
from core.etl import CNPJ_ETL

# Start the timer
start_time = time.time()


# Data folders
download_folder, extract_folder = get_sink_folder()

ano = str(2025)
mes = str(5).zfill(2)

database_name = f"dadosrfb_{ano}{mes}"

# Database setup
database = init_database(database_name)

host_url='https://arquivos.receitafederal.gov.br/dados/cnpj'
data_url = f'{host_url}/dados_abertos_cnpj/{ano}-{mes}'
layout_url=f'https://www.gov.br/receitafederal/dados/cnpj-metadados.pdf'

# ETL setup
scrapper = CNPJ_ETL(
    database, data_url, layout_url, download_folder, extract_folder, 
    is_parallel=True, delete_zips=True
)

audits = scrapper.retrieve_data()

KeyboardInterrupt: 