In [None]:
import calcbench as cb
import pandas as pd
import math
import numpy as np
from decimal import Decimal
from tqdm import tqdm, tqdm_notebook
from IPython.core.debugger import set_trace
from tqdm.notebook import tqdm
import random
import qgrid

# Hook tqdm into pandas and label the resulting progress bars "my bar!".
# NOTE(review): presumably this is what enables `progress_apply`; no cell in
# view calls it, so confirm whether this line is still needed.
tqdm.pandas(desc="my bar!")

In [None]:
# Universe for the study: whatever Calcbench returns for the "DJIA" index
# (presumably the current Dow Jones Industrial Average constituents -- confirm).
tickers = cb.tickers(index="DJIA")

In [None]:
# Download the point-in-time history one company per request, then stitch the
# per-company frames together in a single concat.  Growing `data` with
# DataFrame.append inside the loop re-copies everything accumulated so far on
# every iteration, and DataFrame.append no longer exists in pandas >= 2.0.
company_frames = []
for ticker in tqdm(tickers):
    company_frames.append(
        cb.point_in_time(
            company_identifiers=[ticker],
            all_face=True,
            include_preliminary=True,
            include_xbrl=True,
            all_history=True,
            include_trace=True,
            period_type="combined",
        )
    )
# ignore_index=True gives the same fresh 0..n-1 index the original obtained via
# reset_index(drop=True); an empty ticker list still yields an empty frame.
data = pd.concat(company_frames, ignore_index=True) if company_frames else pd.DataFrame()

In [None]:
def preliminary_and_revision(group: pd.DataFrame) -> bool:
    """
    Tell whether a group holds both kinds of row needed to compare a
    preliminary figure with a later XBRL figure.

    The group qualifies when it contains at least one row flagged
    ``preliminary`` but not ``XBRL`` and at least one ``XBRL`` row whose
    ``revision_number`` is greater than zero.
    """
    is_unconfirmed_preliminary = group.preliminary & ~group.XBRL
    is_xbrl_revision = group.XBRL & (group.revision_number > 0)

    if len(group[is_unconfirmed_preliminary]) == 0:
        return False
    return len(group[is_xbrl_revision]) > 0

In [None]:
def _decimal_parts(value):
    """Return the normalized ``DecimalTuple`` of ``value``'s shortest decimal form.

    The value goes through ``repr(float(value))`` instead of ``Decimal(value)``
    because ``Decimal`` built straight from a float expands the exact binary
    value (``Decimal(1.23)`` is 1.2299999999999999822...).  Those digits never
    match the digits of ``Decimal(1230.0)``, even though the two numbers differ
    only by a power of ten.
    """
    return Decimal(repr(float(value))).normalize().as_tuple()


def error_type(g):
    """
    Label how the preliminary value of one group differs from its XBRL value.

    Parameters
    ----------
    g : pd.DataFrame
        Rows of a single group.  Needs a boolean ``preliminary`` column and a
        numeric ``value`` column, with at least one preliminary row and one
        non-preliminary row.

    Returns
    -------
    pd.DataFrame
        A new frame with the rows of ``g`` plus two columns that are constant
        across the group:

        * ``diff`` -- ``(preliminary - XBRL) / XBRL``; ``inf``/``-inf``/``nan``
          when the XBRL value is zero.
        * ``break_type`` -- one of ``"sign"``, ``"scale"``, ``"close_enough"``,
          ``"quarter_error"`` or ``"unknown"``.

    Raises
    ------
    IndexError
        If ``g`` lacks a preliminary row or a non-preliminary row.
    """
    # Only the first row on each side is compared.
    preliminary_value = g[g.preliminary].iloc[0].value
    XBRL_value = g[~g.preliminary].iloc[0].value

    preliminary_decimal = _decimal_parts(preliminary_value)
    XBRL_decimal = _decimal_parts(XBRL_value)

    # Relative difference, with the zero-denominator case spelled out so the
    # result does not depend on whether the scalar is a Python or numpy float.
    delta = preliminary_value - XBRL_value
    if XBRL_value != 0:
        diff = delta / XBRL_value
    elif delta > 0:
        diff = math.inf
    elif delta < 0:
        diff = -math.inf
    else:
        diff = math.nan

    if preliminary_decimal.digits == XBRL_decimal.digits:
        if preliminary_decimal.exponent == XBRL_decimal.exponent:
            # Same digits and same magnitude: only the sign can differ.
            # NOTE(review): two identical values also land here -- confirm
            # that upstream filtering never passes such a group.
            break_type = "sign"
        else:
            # Same digits, different power of ten (e.g. thousands vs units).
            break_type = "scale"
    elif math.isclose(preliminary_value, XBRL_value, rel_tol=0.05):
        break_type = "close_enough"
    elif math.isclose(preliminary_value * 4, XBRL_value, rel_tol=0.1):
        # The preliminary figure is roughly a quarter of the XBRL figure.
        break_type = "quarter_error"
    else:
        break_type = "unknown"

    # assign() builds a new frame, so the group handed in by groupby.apply is
    # never mutated.  The dict form is used because "diff" is also the name of
    # a DataFrame method.
    return g.assign(**{"diff": diff, "break_type": break_type})

In [None]:
# Columns that together identify one reported figure for one period.
group_by = ["metric", "calendar_year", "calendar_period"]
# Materialize the groupby into a list of (key, sub-frame) pairs, keeping the
# groups in first-appearance order rather than sorted by key.
groups = list(data.groupby(group_by, sort=False))

In [None]:
# For each company: keep the groups that have both an unconfirmed preliminary
# row and a revised XBRL row, classify each group with error_type, and collect
# the classified rows.  The frames are concatenated once at the end because
# DataFrame.append inside the loop re-copies the accumulated result on every
# iteration and was removed in pandas 2.0.
break_frames = []
for ticker in tqdm(data.ticker.unique()):
    preliminary_and_revision_lines = (
        data[data.ticker == ticker]
        .groupby(group_by)
        .filter(preliminary_and_revision)
        .reset_index(drop=True)
    )
    # group_keys=False keeps the group keys out of the result's index; without
    # it, newer pandas adds them as index levels and the sort below fails
    # because each key is then both an index level and a column.
    breaks = preliminary_and_revision_lines.groupby(group_by, group_keys=False).apply(
        error_type
    )
    if breaks.empty:
        continue
    breaks = breaks.sort_values(group_by + ["revision_number"]).reset_index(drop=True)
    break_frames.append(breaks)
# No ignore_index here: each company's rows keep their own 0..n-1 index, exactly
# as repeated DataFrame.append calls produced.
all_breaks = pd.concat(break_frames) if break_frames else pd.DataFrame()

In [None]:
# Per-metric row counts in all_breaks divided by per-metric counts of
# preliminary rows in data.
# NOTE(review): the numerator counts every row of a broken group (preliminary
# and XBRL alike), while the denominator counts preliminary rows only --
# confirm this is the intended ratio.
all_breaks.groupby("metric").count().div(
    data[data.preliminary.eq(True)].groupby("metric").count()
)

In [None]:
# Non-null counts of the preliminary rows in all_breaks, one row per calendar year.
(
    all_breaks.loc[all_breaks.preliminary.eq(True)]
    .groupby(["calendar_year"])
    .count()
    .sort_values("calendar_year")
)

In [None]:
# Non-null counts of preliminary rows per metric, ordered by the count in the
# "preliminary" column (ascending).
data.loc[data.preliminary].groupby("metric").count().sort_values("preliminary")

In [None]:
# (rows, columns) of the preliminary rows in the full data set.
data.loc[data.preliminary].shape

In [None]:
# (rows, columns) of the preliminary rows that belong to a classified break.
all_breaks.loc[all_breaks.preliminary].shape