In [None]:
import calcbench as cb
import pandas as pd
import math
import numpy as np
from decimal import Decimal
from tqdm import tqdm, tqdm_notebook
from IPython.core.debugger import set_trace
from tqdm import tqdm
import random

# Register tqdm's pandas integration, labelling any resulting progress bar
# "my bar!".
# NOTE(review): no progress_* call is visible in this section of the notebook,
# so this registration may be unused -- confirm before removing.
tqdm.pandas(desc="my bar!")

In [None]:
# Turn on calcbench's backoff behaviour before any API calls are made.
# NOTE(review): presumably this retries failed/throttled requests with a
# delay -- confirm against the calcbench client documentation.
cb.enable_backoff()

In [None]:
# Ticker universe for the analysis, requested from calcbench by index name.
# NOTE(review): presumably the Dow Jones Industrial Average constituents --
# confirm what `index='DJIA'` resolves to.
tickers = cb.tickers(index='DJIA')

In [None]:
# Download the point-in-time history for each ticker and stack the results
# into a single frame called `data` with a fresh 0..n-1 index.
#
# NOTE(review): `tickers[:1]` limits the download to the first ticker only;
# this looks like a development shortcut -- widen the slice for a full run.
#
# The per-ticker frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copies everything accumulated so far on every iteration.
ticker_frames = []
for ticker in tqdm_notebook(tickers[:1]):
    d = cb.point_in_time(
        company_identifiers=[ticker],
        all_face=True,
        include_preliminary=True,
        include_xbrl=True,
        all_history=True,
        include_trace=True,
    )
    ticker_frames.append(d)
# pd.concat raises on an empty list, so fall back to an empty frame when no
# ticker was processed.
data = (
    pd.concat(ticker_frames, ignore_index=True)
    if ticker_frames
    else pd.DataFrame()
)

In [None]:
def preliminary_and_revision(group):
    """Tell whether a group's preliminary value differs from its first revision.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to inspect. Reads the ``preliminary`` (boolean), ``revision_number``
        and ``value`` columns.

    Returns
    -------
    bool
        ``False`` when the group has exactly one row, has no preliminary row,
        or has no non-preliminary row with ``revision_number == 1``.
        Otherwise the result of comparing (``!=``) the ``value`` of the first
        preliminary row with the ``value`` of the first such revision row.
    """
    # A lone row has nothing to be compared against.
    if len(group) == 1:
        return False

    is_preliminary = group.preliminary
    preliminary_rows = group[is_preliminary]
    if len(preliminary_rows) == 0:
        return False

    # Only the first non-preliminary revision is considered.
    is_first_revision = group.revision_number == 1
    first_revision_rows = group[~is_preliminary & is_first_revision]
    if len(first_revision_rows) == 0:
        return False

    reported_first = preliminary_rows.iloc[0]["value"]
    revised_first = first_revision_rows.iloc[0]["value"]
    return reported_first != revised_first

In [None]:
def error_type(g):
    """Classify how a group's preliminary value differs from its XBRL value.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of one group. Must contain a boolean ``preliminary`` column and a
        numeric ``value`` column, with at least one preliminary and one
        non-preliminary row. If a ``revision_number`` column is present, the
        non-preliminary row with ``revision_number == 1`` is preferred as the
        XBRL value so the comparison matches ``preliminary_and_revision``;
        otherwise the first non-preliminary row is used.

    Returns
    -------
    pandas.DataFrame
        A new frame (``g`` is not modified) with the same index as ``g`` plus
        two columns that hold the same value on every row:

        ``diff``
            ``(preliminary - XBRL) / XBRL``. A zero XBRL value gives ``inf``
            or ``nan`` instead of raising.
        ``break_type``
            ``"sign"``  - same digits and same exponent,
            ``"scale"`` - same digits, different exponent,
            ``"close_enough"`` - digits differ but the values are within 5 %,
            ``"unknown"`` - anything else.

    Raises
    ------
    IndexError
        If the group has no preliminary row or no non-preliminary row.
    """
    preliminary_value = g[g.preliminary].iloc[0]["value"]

    xbrl_rows = g[~g.preliminary]
    if "revision_number" in xbrl_rows.columns:
        # Compare against the same row the upstream filter looked at.
        first_revision = xbrl_rows[xbrl_rows.revision_number == 1]
        if not first_revision.empty:
            xbrl_rows = first_revision
    XBRL_value = xbrl_rows.iloc[0]["value"]

    # Go through str() so the Decimal holds the shortest decimal repr of the
    # number: Decimal(0.1) would expand to the full binary approximation and
    # never match the digits of 100, and Decimal() rejects NumPy integers.
    preliminary_decimal = Decimal(str(preliminary_value)).normalize().as_tuple()
    XBRL_decimal = Decimal(str(XBRL_value)).normalize().as_tuple()

    # NumPy scalar division yields inf/nan for a zero denominator instead of
    # raising ZeroDivisionError as plain Python floats would.
    with np.errstate(divide="ignore", invalid="ignore"):
        numerator = np.float64(preliminary_value) - np.float64(XBRL_value)
        diff = numerator / np.float64(XBRL_value)

    if preliminary_decimal.digits == XBRL_decimal.digits:
        if preliminary_decimal.exponent == XBRL_decimal.exponent:
            break_type = "sign"
        else:
            break_type = "scale"
    elif math.isclose(preliminary_value, XBRL_value, rel_tol=0.05):
        break_type = "close_enough"
    else:
        break_type = "unknown"

    # assign() builds a new frame, so the caller's group is left untouched.
    return g.assign(**{"diff": diff, "break_type": break_type})

In [None]:
# Columns that together identify one reported figure.
group_by = ["ticker", "metric", "calendar_year", "calendar_period"]
# Materialise the (key, frame) pairs in first-seen order (sort=False) so they
# can be inspected or indexed as a plain list.
groups = list(data.groupby(group_by, sort=False))

In [None]:
# For every ticker, keep the groups whose preliminary value differs from the
# first XBRL revision, classify each difference with `error_type`, and stack
# the per-ticker results into `all_breaks`.
#
# The per-ticker frames are gathered in a list and concatenated once.
# DataFrame.append returns a new frame rather than mutating in place, so
# calling it without using the result left `all_breaks` empty; the method was
# also removed in pandas 2.0.
per_ticker_breaks = []
for ticker in tqdm_notebook(data.ticker.unique()):
    preliminary_and_revision_lines = (
        data[data.ticker == ticker]
        .groupby(group_by)
        .filter(preliminary_and_revision)
        .reset_index(drop=True)
    )
    # group_keys=False keeps the group columns out of the result index; with
    # pandas >= 2.0 defaults they would be both index levels and columns,
    # which makes sort_values(group_by) raise an ambiguity error.
    breaks = preliminary_and_revision_lines.groupby(
        group_by, group_keys=False
    ).apply(error_type)
    if breaks.empty:
        continue
    breaks = breaks.sort_values(group_by).reset_index(drop=True)
    per_ticker_breaks.append(breaks)
# pd.concat raises on an empty list, so fall back to an empty frame when no
# ticker produced any break.
all_breaks = (
    pd.concat(per_ticker_breaks, ignore_index=True)
    if per_ticker_breaks
    else pd.DataFrame()
)

In [None]:
# Classify every group of `preliminary_and_revision_lines` with `error_type`.
# NOTE(review): `preliminary_and_revision_lines` is whatever the per-ticker
# loop assigned on its final iteration, so this covers only the last ticker
# processed -- confirm that is intended.
error_types = preliminary_and_revision_lines.groupby(group_by).apply(error_type)

In [None]:
# Classify each group of `preliminary_and_revision_lines`, order the rows by
# the grouping columns and renumber the index from zero.
# NOTE(review): `preliminary_and_revision_lines` is the value left over from
# the last iteration of the per-ticker loop -- confirm that is intended.
#
# group_keys=False keeps the group columns out of the result index; with
# pandas >= 2.0 defaults they would be both index levels and columns, which
# makes sort_values(group_by) raise an ambiguity error.
breaks = (
    preliminary_and_revision_lines.groupby(group_by, group_keys=False)
    .apply(error_type)
    .sort_values(group_by)
    .reset_index(drop=True)
)

In [None]:
# Show the classified breaks as this cell's output.
breaks

In [None]:
# Export the classified breaks to an Excel workbook in the working directory,
# leaving the positional index out of the sheet.
breaks.to_excel(r"breaks.xlsx", index=False)