In [1]:
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup
import base64
from xml.dom import minidom
import time
from datetime import datetime
import os.path
import xml.etree.ElementTree as ET
import warnings

# Install a blanket "ignore" filter: no category or module restriction is given,
# so every warning raised after this point is silenced.
warnings.filterwarnings("ignore")

In [2]:
# Global settings
path_dataset = '../dataset/'
file_list_fii = f'{path_dataset}BDovI.csv'
label_document = 'Dividend' # Dividend|Monthly|Quarterly

category_document = None
type_document = None

# label -> (document category id, document type id, slug used in the output paths)
_document_settings = {
    'Dividend': (14, 41, 'dividend'),
    'Monthly': (6, 40, 'monthly'),
    'Quarterly': (6, 45, 'quarter'),
}

# Reject any label that has no entry in the table above.
if label_document not in _document_settings:
    raise ValueError('Documento não especificado')

category_document, type_document, _document_slug = _document_settings[label_document]
path_cvm_records = f'{path_dataset}cvm_{_document_slug}_records.json'
path_folder_cvm = f'{path_dataset}cvm_{_document_slug}'

In [3]:
# DataFrame with the list of FIIs: read the CSV and keep everything from the third column onwards
df_list_fii = pd.read_csv(file_list_fii).iloc[:, 2:]
df_list_fii

Unnamed: 0,Codigo,Nome,CNPJ
0,ABCP11,FUNDO DE INVESTIMENTO IMOBILIÁRIO GRAND PLAZA ...,01.201.140/0001-90
1,AFHI11,AF INVEST CRI FUNDO DE INVESTIMENTO IMOBILIÁRI...,36.642.293/0001-58
2,AIEC11,AUTONOMY EDIFÍCIOS CORPORATIVOS FUNDO DE INVES...,35.765.826/0001-26
3,ALMI11,FUNDO DE INVESTIMENTO IMOBILIÁRIO FII TORRE AL...,07.122.725/0001-00
4,ALZM11,ALIANZA MULTIESTRATÉGIA FUNDO DE INVESTIMENTO ...,34.847.063/0001-08
...,...,...,...
420,XPPR11,XP PROPERTIES FUNDO DE INVESTIMENTO IMOBILIÁRIO,30.654.849/0001-40
421,XPSF11,XP SELECTION FUNDO DE FUNDOS DE INVESTIMENTO I...,30.983.020/0001-90
422,YUFI11,YUCA FUNDO DE INVESTIMENTO IMOBILIÁRIO,36.445.551/0001-06
423,ZAVI11,ZAVIT REAL ESTATE FUND FUNDO DE INVESTIMENTO I...,40.575.940/0001-23


In [4]:
# Download the CVM documents of a single fund
def download_cvm_document(category_document, type_document, path_records, path_folder, ticker, cnpj, timeout=60):
    """Download every document returned by the FNET search for one fund.

    The search endpoint is queried with the given category/type ids and CNPJ.
    Each returned document is fetched by id and handed to ``save_cvm_file``
    with ``is_base64=True``. Once all documents are stored, the fund is
    appended to the records file through ``add_ticker_records``.

    Parameters
    ----------
    category_document, type_document
        Values sent as ``idCategoriaDocumento`` and ``idTipoDocumento``.
    path_records
        JSON file that keeps one entry per processed ticker.
    path_folder
        Folder that receives the downloaded XML files.
    ticker
        Fund code; used as the prefix of every file name.
    cnpj
        Value sent as the ``cnpj`` query parameter.
    timeout
        Seconds to wait on each HTTP request before giving up (default 60).

    Raises
    ------
    requests.HTTPError
        If the search or a document download answers with an error status.
    ValueError
        If a document has a ``formatoDataReferencia`` other than 2 or 3.
    """
    url_search = f'https://fnet.bmfbovespa.com.br/fnet/publico/pesquisarGerenciadorDocumentosDados?d=1&s=0&l=0&o%5B0%5D%5BdataReferencia%5D=asc&tipoFundo=1&idCategoriaDocumento={category_document}&idTipoDocumento={type_document}&idEspecieDocumento=0&situacao=A&cnpj={cnpj}'
    # Without a timeout a stalled connection would block the whole run forever.
    response = requests.get(url_search, timeout=timeout)
    response.raise_for_status()

    data = json.loads(response.text)
    docs = data['data']
    records_filtered = data['recordsFiltered']
    records_total = data['recordsTotal']

    # `i` disambiguates consecutive documents that share the same reference date.
    i = 0
    prev_year, prev_month, prev_day = 0, 0, 0

    for row in docs:
        id_documento = row['id']
        version = row['versao']
        format_date_reference = int(row['formatoDataReferencia'])
        date_reference = row['dataReferencia']

        if format_date_reference == 2:
            # NOTE(review): the two parts are unpacked as (year, month). If the API
            # sends MM/YYYY the values end up swapped in the file name — confirm
            # against a real response before relying on the name layout.
            year, month = [int(part) for part in date_reference.split('/')]
            if year == prev_year and month == prev_month:
                i = i + 1
            else:
                i = 0
            file_name = f'{ticker}_{year}_{month}_v{version}_{i}'
            prev_year, prev_month = year, month
        elif format_date_reference == 3:
            day, month, year = [int(part) for part in date_reference.split('/')]
            if year == prev_year and month == prev_month and day == prev_day:
                i = i + 1
            else:
                i = 0
            file_name = f'{ticker}_{year}_{month}_{day}_v{version}_{i}'
            prev_year, prev_month, prev_day = year, month, day
        else:
            raise ValueError("Data de referência sem formato especificado.")

        url_document = f'https://fnet.bmfbovespa.com.br/fnet/publico/downloadDocumento?id={id_documento}'
        content_xml = requests.get(url_document, timeout=timeout)
        # Fail here instead of saving an HTTP error page as if it were a document.
        content_xml.raise_for_status()

        save_cvm_file(path_folder, file_name, content_xml.text, True)

    # Register the ticker only after all its documents are stored: the records
    # file is what the resume logic reads, so writing it earlier would make an
    # interrupted run skip this ticker's remaining documents on restart.
    add_ticker_records(path_records, ticker, cnpj, records_filtered, records_total)

    print(f'FIM {ticker}, registros={records_total}')

In [5]:
def save_cvm_file(path_folder, file_name, content, is_base64):
    """Write one document to ``{path_folder}/{file_name}.xml`` as UTF-8 bytes.

    Parameters
    ----------
    path_folder
        Destination folder; created (with parents) when it does not exist yet.
    file_name
        File name without the ``.xml`` extension.
    content
        Document text. When ``is_base64`` is truthy it is base64-decoded and
        the decoded bytes are interpreted as UTF-8; otherwise it is used as is.
    is_base64
        Whether ``content`` is base64-encoded.

    Raises
    ------
    UnicodeDecodeError
        If the base64 payload does not decode to valid UTF-8.
    """
    if is_base64:
        xml_string = base64.b64decode(content).decode('utf-8')
    else:
        xml_string = content

    # open() does not create missing directories, so make sure the target exists.
    if path_folder:
        os.makedirs(path_folder, exist_ok=True)

    path = f"{path_folder}/{file_name}.xml"

    with open(path, 'wb') as file:
        file.write(xml_string.encode('utf-8'))

In [6]:
def add_ticker_records(path_records, ticker, cnpj, records_filtered, records_total):
    """Append one ticker entry to the JSON list stored in ``path_records``.

    The file holds a JSON array. It is created when missing, and a zero-length
    file is treated as an empty array. The entry has the keys ``ticker``,
    ``CNPJ``, ``recordsFiltered`` and ``recordsTotal``.

    Raises
    ------
    json.JSONDecodeError
        If the file has content that is not valid JSON.
    """
    data = []
    if os.path.exists(path_records):
        with open(path_records, 'r', encoding='utf-8') as file:
            raw = file.read()
        # An empty file (e.g. left behind by an interrupted write) means "no records yet".
        if raw.strip():
            data = json.loads(raw)

    item = {
        "ticker":ticker,
        "CNPJ":cnpj,
        "recordsFiltered":records_filtered,
        "recordsTotal":records_total
    }

    data.append(item)

    # Write to a sibling temp file and swap it in, so an interruption during the
    # dump cannot leave the records file truncated.
    tmp_path = f'{path_records}.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as file:
        json.dump(data, file)
    os.replace(tmp_path, path_records)

In [7]:
def get_cvm_document_next_index(path_records, fii_list=None):
    """Return the row position at which the download loop should resume.

    Reads the last entry of the JSON records file, finds its ticker in the
    ``Codigo`` column of the fund list and returns the position right after it.

    Parameters
    ----------
    path_records
        JSON file holding a list of entries with a ``ticker`` key.
    fii_list
        DataFrame with a ``Codigo`` column. Defaults to the notebook-level
        ``df_list_fii``.

    Returns
    -------
    int
        Position after the last recorded ticker. Returns 0 when the records
        file is missing, unreadable, empty or malformed, or when the ticker is
        not present in the list.
    """
    try:
        with open(path_records, 'r', encoding='utf-8') as file:
            data = json.load(file)
        ticker = data[-1]['ticker']

        if fii_list is None:
            fii_list = df_list_fii

        # Use the row position, not the index label: the caller addresses rows with iloc.
        return list(fii_list['Codigo']).index(ticker) + 1
    except (OSError, ValueError, LookupError, TypeError):
        # Only the "nothing usable to resume from" cases fall back to the start;
        # KeyboardInterrupt and programming errors are no longer swallowed.
        return 0

In [8]:
# Resume from the row that follows the last ticker stored in the records file.
x1 = get_cvm_document_next_index(path_cvm_records)
x2 = len(df_list_fii)
# For a short trial run, cap the range instead: x2 = x1 + 3

for i in range(x1, x2):
    # Only reachable when x2 was overridden past the end of the list.
    if len(df_list_fii) <= i:
        print('Fim da importação')
        break

    ticker, cnpj = df_list_fii.iloc[i][['Codigo', 'CNPJ']]

    print(f"Linha {i}: {ticker} - {cnpj}")
    download_cvm_document(
        category_document,
        type_document,
        path_cvm_records,
        path_folder_cvm,
        ticker,
        cnpj,
    )

Linha 0: ABCP11 - 01.201.140/0001-90
FIM ABCP11, registros=86
Linha 1: AFHI11 - 36.642.293/0001-58
FIM AFHI11, registros=44
Linha 2: AIEC11 - 35.765.826/0001-26
FIM AIEC11, registros=46
