In [4]:
from typing import List
import pandas as pd
import os

In [17]:
 # Show floats with two fixed decimal places in DataFrame/Series output.
 pd.set_option('display.float_format', lambda value: '%.2f' % value)

In [21]:
 # Allow up to 1000 rows to be rendered before pandas truncates the output.
 pd.set_option('display.max_rows', 1000)

In [47]:
# os.path.abspath('') resolves to the current working directory; the
# directory one level above it is used as the root for data lookups.
root_path = os.path.split(os.path.abspath(''))[0]
# Full path of the parquet file: <root_path>/data/df_fundamentalista.parquet
consoliado_path = os.path.join(root_path, "data", "df_fundamentalista.parquet")

In [48]:
# Load the parquet file, keep only the first four characters of each
# symbol, then drop rows that are fully duplicated.
df = (
    pd.read_parquet(consoliado_path)
    .assign(symbol=lambda frame: frame['symbol'].str[0:4])
    .drop_duplicates()
)

In [49]:
# (rows, columns) of the frame after the de-duplication step.
df.shape

(1168, 327)

In [50]:
describe = df.describe()
# The "count" row of describe() does not include NaN values.
describe_count = describe.loc['count'].sort_values()

# Columns with more than 1000 counted values, with the two identifier
# columns placed first.
valid_columns = [
    'symbol',
    'asOfDate',
    *describe_count.index[describe_count.values > 1000],
]

In [51]:
def get_columns() -> List[str]:
    """Return the columns filtered from the raw fundamentals data.

    Returns:
        A new list holding the identifier columns ("symbol", "asOfDate")
        followed by the selected metric columns, always in the same order.
    """
    identifier_columns = ["symbol", "asOfDate"]
    metric_columns = [
        "NetIncomeFromContinuingOperations",
        "ReconciledDepreciation",
        "ChangeInCashSupplementalAsReported",
        "ChangeInWorkingCapital",
        "InvestingCashFlow",
        "BeginningCashPosition",
        "FinancingCashFlow",
        "EndCashPosition",
        "OperatingCashFlow",
        "LongTermDebtAndCapitalLeaseObligation",
        "ChangesInCash",
        "FreeCashFlow",
        "SellingGeneralAndAdministration",
        "TotalDebt",
        "TaxProvision",
        "NetPPE",
        "Payables",
        "NetInterestIncome",
        "CommonStock",
        "CapitalStock",
        "CashAndCashEquivalents",
        "InvestedCapital",
        "TotalCapitalization",
        "NetIncomeFromContinuingAndDiscontinuedOperation",
        "NetIncome",
        "NetIncomeCommonStockholders",
        "TaxRateForCalcs",
        "TaxEffectOfUnusualItems",
        "TotalRevenue",
        "NetIncomeContinuousOperations",
        "PretaxIncome",
        "OrdinarySharesNumber",
        "OperatingRevenue",
        "NetIncomeFromContinuingOperationNetMinorityInterest",
        "NetIncomeIncludingNoncontrollingInterests",
        "NormalizedIncome",
        "DilutedNIAvailtoComStockholders",
        "ShareIssued",
        "NetTangibleAssets",
        "TotalEquityGrossMinorityInterest",
        "TotalAssets",
        "TangibleBookValue",
        "CommonStockEquity",
        "TotalLiabilitiesNetMinorityInterest",
        "StockholdersEquity",
    ]
    return identifier_columns + metric_columns

In [52]:
# Keep only the columns listed by get_columns(), as an independent copy.
df = df.loc[:, get_columns()].copy()

In [53]:
# First rows whose NetIncomeFromContinuingOperations value is missing.
df.loc[
    df['NetIncomeFromContinuingOperations'].isna(),
    ['symbol', 'asOfDate', 'NetIncomeFromContinuingOperations'],
].head()

Unnamed: 0,symbol,asOfDate,NetIncomeFromContinuingOperations
21,AFLT,2020-06-30,
27,AIRB,2020-09-30,
39,ALPK,2020-06-30,
64,AMBP,2020-09-30,
65,AMBP,2020-12-31,


In [54]:
# All rows recorded for the symbol 'POWE'.
df.loc[df['symbol'].eq('POWE')]

Unnamed: 0,symbol,asOfDate,NetIncomeFromContinuingOperations,ReconciledDepreciation,ChangeInCashSupplementalAsReported,ChangeInWorkingCapital,InvestingCashFlow,BeginningCashPosition,FinancingCashFlow,EndCashPosition,...,NormalizedIncome,DilutedNIAvailtoComStockholders,ShareIssued,NetTangibleAssets,TotalEquityGrossMinorityInterest,TotalAssets,TangibleBookValue,CommonStockEquity,TotalLiabilitiesNetMinorityInterest,StockholdersEquity
1489,POWE,2020-09-30,,,,,,,,,...,,,89654856.0,206743000.0,208768000.0,762192000.0,206743000.0,208949000.0,553424000.0,208949000.0


In [55]:
# Find missing dates
for symbol in df['symbol'].unique():
    df_symbol = df[df['symbol'] == symbol]
    if df_symbol.shape[0] < 4:
        print(f'Missing values for {symbol}, total data {df_symbol.shape[0]} of 4')

Missing values for AERI, total data 2 of 4
Missing values for AFLT, total data 3 of 4
Missing values for AGRO, total data 3 of 4
Missing values for AIRB, total data 2 of 4
Missing values for ALPK, total data 3 of 4
Missing values for AVLL, total data 1 of 4
Missing values for BOAS, total data 3 of 4
Missing values for CMIG, total data 2 of 4
Missing values for COGN, total data 3 of 4
Missing values for CPFE, total data 1 of 4
Missing values for CSED, total data 3 of 4
Missing values for CURY, total data 3 of 4
Missing values for DMVF, total data 3 of 4
Missing values for ELMD, total data 1 of 4
Missing values for ENJU, total data 1 of 4
Missing values for ESPA, total data 1 of 4
Missing values for GMAT, total data 1 of 4
Missing values for IGBR, total data 3 of 4
Missing values for INTB, total data 3 of 4
Missing values for JALL, total data 1 of 4
Missing values for LAVV, total data 3 of 4
Missing values for MBLY, total data 1 of 4
Missing values for MELK, total data 3 of 4
Missing val

In [None]:
# NOTE(review): unfinished, never-executed cell. get_columns() selects no
# column named '', so df[''] is expected to raise KeyError here.
# TODO: complete the intended filter or delete this cell.
df[df['']]