In [72]:
import calcbench as cb
import pandas as pd
import math
import numpy as np
from decimal import Decimal
from tqdm import tqdm, tqdm_notebook
from IPython.core.debugger import set_trace
from tqdm.notebook import tqdm
import random
import qgrid
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from scipy.stats.mstats import winsorize

# NOTE(review): presumably enables retry-with-backoff for Calcbench API calls —
# confirm against the calcbench client documentation.
cb.enable_backoff()

# Register tqdm with pandas so that `.progress_apply` (used further down on a
# groupby) is available; `desc` is the label shown next to the progress bar.
tqdm.pandas(desc="my bar!")

In [3]:
# Ticker list for the "SP500" index as returned by Calcbench; it drives the
# per-ticker download in the data-loading cell.
tickers = cb.tickers(index="SP500")

In [168]:
# Columns of the point-in-time frame that are kept once the index
# (ticker, metric, period, revision_number) has been set.
relevant_columns = ["preliminary", "value", "trace_url", "XBRL", "filing_type"]

In [169]:
# Load the quarterly point-in-time history for every ticker.
#
# The Calcbench download is slow, so the result is cached in a pickle file.
# Set REFRESH_RAW_DATA to True to download again and overwrite the cache.
pickle_file_name = "sp500_PIT.pkl"
REFRESH_RAW_DATA = False
if REFRESH_RAW_DATA:
    # Collect the per-ticker frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every iteration.
    per_ticker_frames = [
        cb.point_in_time(
            company_identifiers=[ticker],
            all_face=True,
            include_preliminary=True,
            include_xbrl=True,
            all_history=True,
            period_type="quarterly",
            include_trace=True,
        )
        for ticker in tqdm(tickers)
    ]
    raw_data = pd.concat(per_ticker_frames)
    raw_data.to_pickle(pickle_file_name)
else:
    # NOTE: unpickling can execute arbitrary code; only load a cache file that
    # was produced by this notebook.
    raw_data = pd.read_pickle(pickle_file_name)
data = raw_data.reset_index(drop=True)

# Keep only rows whose fiscal_period is a quarter number (1-4). The .copy()
# makes the column assignment below act on an independent frame instead of a
# slice of the previous one (avoids SettingWithCopyWarning).
data = data[data.fiscal_period.isin([1, 2, 3, 4])].copy()
data["period"] = pd.PeriodIndex(
    year=data.fiscal_year, quarter=data.fiscal_period, freq="Q"
)
data = data.set_index(["ticker", "metric", "period", "revision_number"])
data = data[relevant_columns]

In [6]:
# Difference from previous XBRL value
#
# Step 1: for every (ticker, metric, period), look up the `value` that sits one
# row earlier among the XBRL rows of the same ticker/metric.
# NOTE(review): "previous" is purely positional — this assumes the XBRL rows are
# already in chronological order within each ticker/metric group; confirm.
previous_XBRL_value = (
    data[data.XBRL]
    .groupby(["ticker", "metric"])
    .shift(1)["value"]  # shift each ticker/metric group down one row, keep the shifted value
    .groupby(["ticker", "metric", "period"])
    .first()  # first non-null shifted value per ticker/metric/period
    .rename("previous_XBRL_value")
)

# Step 2: attach that value to the preliminary rows; the left join keeps every
# preliminary row and leaves NaN where no previous XBRL value exists.
joined = data[data.preliminary].join(
    previous_XBRL_value,
    how="left",
)

joined = joined[~joined.index.duplicated()]  # Keep only the first row per repeated index entry (the index is not unique here; the cause is not established).
# Step 3: relative change of the preliminary value versus the previous XBRL
# value. The assignment aligns on the index, so rows of `data` that are not in
# `joined` receive NaN.
data["diff_from_previous"] = (joined.value - joined.previous_XBRL_value) / joined.value
# Infinite ratios (division by a zero value) and missing ratios are both
# replaced by 0.
data["diff_from_previous"] = (
    data["diff_from_previous"].replace([np.inf, -np.inf], np.nan).fillna(0)
)

In [181]:
def preliminary_and_revision(group: pd.DataFrame) -> bool:
    """
    Report whether a group holds both an unconfirmed preliminary row and a
    revised XBRL row.

    True only when at least one row is preliminary without being XBRL and at
    least one XBRL row has a ``revision_number`` index level greater than zero.
    """
    is_revised = group.index.get_level_values("revision_number") > 0
    has_unconfirmed_preliminary = (group.preliminary & ~group.XBRL).any()
    has_revised_XBRL = (group.XBRL & is_revised).any()
    return bool(has_unconfirmed_preliminary and has_revised_XBRL)

In [8]:
# Collapses break labels to a 0/1 value: "close_enough" maps to 0, every other
# break label maps to 1. The integer key 0 is the placeholder that the
# `.fillna(0)` calls insert for rows without a label, and also maps to 0.
ERROR_TYPE_TO_BOOL = {
    **dict.fromkeys(("unknown", "scale", "sign", "quarter_error"), 1),
    **dict.fromkeys(("close_enough", 0), 0),
}

In [54]:
def error_type(g: pd.DataFrame) -> pd.DataFrame:
    """
    Label how the preliminary value of one group differs from its XBRL value.

    g is single period/metric group. It must contain at least one row with
    ``preliminary`` True and at least one with ``preliminary`` False; the first
    of each supplies the two values that are compared.

    Returns a new frame (``g`` is not modified) with a ``break_type`` column
    holding the same label on every row:

    * ``"close_enough"``  - the values are equal or within 5% of each other
    * ``"sign"``          - same digits and magnitude, opposite sign
    * ``"scale"``         - same significant digits, different power of ten
    * ``"quarter_error"`` - preliminary * 4 is within 10% of the XBRL value
    * ``"unknown"``       - anything else, including NaN / infinite values

    Raises ``IndexError`` if either kind of row is missing from the group.
    """
    preliminary_value = g[g.preliminary].iloc[0].value
    XBRL_value = g[~g.preliminary].iloc[0].value

    if not (math.isfinite(preliminary_value) and math.isfinite(XBRL_value)):
        # NaN/inf have no digits to compare; without this guard two NaNs would
        # compare as "same digits, same exponent" and be reported as "sign".
        break_type = "unknown"
    elif preliminary_value == XBRL_value:
        # Identical values are not a break at all. This must be checked before
        # the digit comparison, which cannot tell "equal" from "sign flipped".
        break_type = "close_enough"
    else:
        # Go through repr() so the Decimal carries the shortest decimal form of
        # the float (0.35 -> digits 3,5) rather than its exact binary expansion
        # (0.34999999999999997779...), which would never match a rescaled value.
        preliminary_decimal = (
            Decimal(repr(float(preliminary_value))).normalize().as_tuple()
        )
        XBRL_decimal = Decimal(repr(float(XBRL_value))).normalize().as_tuple()
        if preliminary_decimal.digits == XBRL_decimal.digits:
            if preliminary_decimal.exponent == XBRL_decimal.exponent:
                # Same digits and exponent but unequal values: only the sign differs.
                break_type = "sign"
            else:
                break_type = "scale"
        elif math.isclose(preliminary_value, XBRL_value, rel_tol=0.05):
            break_type = "close_enough"
        elif math.isclose(preliminary_value * 4, XBRL_value, rel_tol=0.1):
            break_type = "quarter_error"
        else:
            break_type = "unknown"

    # assign() returns a new frame, so the caller's group is left untouched.
    return g.assign(break_type=break_type)

In [157]:
def z_score(g: pd.DataFrame):
    """
    Standardise every value in the group against the group's XBRL values.

    The mean of the XBRL rows' ``value`` is subtracted from each row's
    ``value`` and the result is divided by the standard deviation of those
    XBRL values. Returns a Series aligned with ``g``.
    """
    xbrl_only = g.loc[g.XBRL, "value"]
    centre = xbrl_only.mean()
    spread = xbrl_only.std()
    return (g["value"] - centre) / spread

In [202]:
# Classify every preliminary-vs-XBRL break, one ticker at a time.
#
# The loop works on per-ticker local frames and never reassigns `data`:
# overwriting `data` inside the loop left it holding only the first ticker, so
# every later ticker matched nothing and later cells lost the full frame.
#
# Only the first MAX_TICKERS tickers are processed; set it to None for a full run.
MAX_TICKERS = 3
ticker_level = data.index.get_level_values("ticker")
per_ticker_breaks = []
for ticker in tqdm(ticker_level.unique()[:MAX_TICKERS]):
    ticker_data = data[ticker_level == ticker]
    # Keep only metric/period groups that have both an unconfirmed preliminary
    # row and a revised XBRL row.
    revised = ticker_data.groupby(["metric", "period"]).filter(preliminary_and_revision)
    if revised.empty:
        # Nothing to classify for this ticker; applying over an empty groupby
        # would yield a frame without the break_type column.
        continue
    # Preliminary z-score is currently disabled:
    # revised["preliminary_z_score"] = revised.groupby(
    #     level="metric", as_index=False, group_keys=False
    # ).apply(z_score)
    # group_keys=False keeps the original 4-level index; otherwise pandas >= 2
    # prepends the group keys and duplicates ticker/metric/period in the index.
    classified = revised.groupby(
        level=["ticker", "metric", "period"], group_keys=False
    ).progress_apply(error_type)
    per_ticker_breaks.append(classified)

# Concatenate once (DataFrame.append was removed in pandas 2.0 and is quadratic
# in a loop), then turn infinities into NaN without mutating in place.
all_breaks = (
    pd.concat(per_ticker_breaks) if per_ticker_breaks else pd.DataFrame()
).replace([np.inf, -np.inf], np.nan)

  0%|          | 0/1 [00:00<?, ?it/s]

my bar!:   0%|          | 0/124 [00:00<?, ?it/s]

In [203]:
# Preview only the first rows; rendering the whole frame bloats the notebook.
all_breaks.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,preliminary,value,trace_url,XBRL,filing_type,preliminary_z_score,break_type
ticker,metric,period,revision_number,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
MMM,AcquisitionDivestitures,2019Q4,1,False,4280000000.0,http://www.calcbench.com/trace?metric=Acquisit...,True,10-K,,unknown
MMM,AcquisitionDivestitures,2019Q4,0,True,4748000000.0,http://www.calcbench.com/trace?nonxbrlfactIDs=...,False,8-K,,unknown
MMM,CAPEXgross,2019Q4,1,False,538000000.0,http://www.calcbench.com/trace?metric=CAPEXgro...,True,10-K,0.707107,unknown
MMM,CAPEXgross,2019Q4,0,True,1699000000.0,http://www.calcbench.com/trace?nonxbrlfactIDs=...,False,8-K,7.908431,unknown
MMM,CAPEXgross,2021Q1,1,False,310000000.0,http://www.calcbench.com/trace?metric=CAPEXgro...,True,10-Q,-0.707107,sign
MMM,CAPEXgross,2021Q1,0,True,-310000000.0,http://www.calcbench.com/trace?nonxbrlfactIDs=...,False,8-K,-4.552775,sign
MMM,EBIT,2016Q3,1,False,1904000000.0,http://www.calcbench.com/trace?metric=EBIT&yea...,True,10-Q,0.315518,close_enough
MMM,EBIT,2016Q3,0,True,1920000000.0,http://www.calcbench.com/trace?nonxbrlfactIDs=...,False,8-K,0.364792,close_enough
MMM,EBIT,2017Q1,1,False,1782000000.0,http://www.calcbench.com/trace?metric=EBIT&yea...,True,10-Q,-0.060192,close_enough
MMM,EBIT,2017Q1,0,True,1737000000.0,http://www.calcbench.com/trace?nonxbrlfactIDs=...,False,8-K,-0.198774,close_enough


In [None]:
# Persist the classified breaks. This runs before the `previous_error` column
# is added in the next cell, so the pickle does not contain that column.
all_breaks.to_pickle("sp_500_breaks.pkl")

In [None]:
# Calculate previous error
# For each preliminary row, take the break_type of the preceding preliminary
# row of the same ticker/metric (NaN for the first row of each group).
# NOTE(review): "preceding" is positional — assumes all_breaks is ordered by
# period within each ticker/metric; confirm.
only_preliminary = all_breaks[all_breaks.preliminary]
previous_error = only_preliminary.groupby(["ticker", "metric"])["break_type"].shift()
# Index-aligned assignment: non-preliminary rows of all_breaks get NaN.
all_breaks["previous_error"] = previous_error

In [None]:
# Left-join the break labels onto every preliminary row of `data`, matching on
# the full index. Preliminary rows with no entry in all_breaks keep NaN for
# break_type and previous_error.
# The feature cell below reads `diff_from_previous` from this result, so `data`
# must still carry that column here.
preliminary_with_break_type = data[data.preliminary].merge(
    all_breaks[all_breaks.preliminary][["break_type", "previous_error"]],
    how="left",
    left_index=True,
    right_index=True,
)

In [None]:
# Feature matrix for the classifier: one row per preliminary value.
#
# Index levels listed in one_hot_columns are added as integer codes via
# factorize (not a true one-hot encoding); the list is currently empty.
one_hot_columns = []  # ["ticker", "metric"]
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of preliminary_with_break_type (SettingWithCopyWarning).
X = preliminary_with_break_type[["previous_error"]].copy()
# Bucket the relative difference into (up to) 10 quantile bins, numbered from 0.
# duplicates="drop" merges repeated bin edges: diff_from_previous is 0 for
# every row without a previous XBRL value, and when several quantiles coincide
# qcut would otherwise raise "Bin edges must be unique".
X["diff_from_previous"] = pd.qcut(
    preliminary_with_break_type["diff_from_previous"],
    10,
    labels=False,
    duplicates="drop",
)
# Missing previous error -> placeholder 0 -> mapped to 0 by ERROR_TYPE_TO_BOOL.
X["previous_error"] = X["previous_error"].fillna(0).map(ERROR_TYPE_TO_BOOL)
for column in one_hot_columns:
    X[column] = X.index.get_level_values(column).factorize()[0]

In [None]:
from sklearn import preprocessing

In [None]:
# NOTE(review): min_max_scaler is created here but not referenced by any later
# cell visible in this notebook — confirm whether it is still needed.
min_max_scaler = preprocessing.MinMaxScaler()

In [None]:
# Target vector: break_type collapsed to 0/1 through ERROR_TYPE_TO_BOOL.
# Rows without a break_type are filled with the placeholder 0, which maps to 0.
y = preliminary_with_break_type["break_type"].fillna(0).map(ERROR_TYPE_TO_BOOL)

In [None]:
# Hold out 10% of the rows for evaluation. A fixed random_state makes the split
# (and therefore the confusion matrix below) reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)

In [None]:
# Fit a decision tree on the training split. random_state pins the tree's
# internal feature shuffling so repeated runs build the same tree.
clf = DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train, y_train)

In [None]:
# Predicted 0/1 labels for the held-out rows.
y_predicted = clf.predict(X_test)

In [None]:
# Rows are the true labels, columns the predicted labels.
confusion_matrix(y_test, y_predicted)