In [467]:
import calcbench as cb
import pandas as pd
import math
import numpy as np
from decimal import Decimal
from tqdm import tqdm, tqdm_notebook
from IPython.core.debugger import set_trace
from tqdm.notebook import tqdm
import random
import qgrid
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from scipy.stats.mstats import winsorize
from sklearn import preprocessing

cb.enable_backoff()

tqdm.pandas(desc="my bar!")

In [3]:
tickers = cb.tickers(index="SP500")

In [168]:
relevant_columns = [
    "preliminary",
    "value",
    "trace_url",
    "XBRL",
    "filing_type",
]

In [454]:
pickle_file_name = "dow_PIT.pkl"
if False:
    raw_data = pd.DataFrame()
    for ticker in tqdm(tickers):
        d = cb.point_in_time(
            company_identifiers=[ticker],
            all_face=True,
            include_preliminary=True,
            include_xbrl=True,
            all_history=True,
            period_type="quarterly",
            include_trace=True,
        )
        raw_data = raw_data.append(d)
    raw_data.to_pickle(pickle_file_name)
else:
    raw_data = pd.read_pickle(pickle_file_name)
data = raw_data.reset_index(drop=True)

data = data[data.fiscal_period.isin([1, 2, 3, 4])]
data["period"] = pd.PeriodIndex(
    year=data.fiscal_year, quarter=data.fiscal_period, freq="Q"
)
data = data.set_index(["ticker", "metric", "period", "revision_number"])
data = data[relevant_columns]
#data = data[data.index.get_level_values("ticker") == "MMM"]

In [434]:
# Difference from previous XBRL value
previous_XBRL_value = (
    data[data.XBRL]
    .groupby(["ticker", "metric"])
    .shift(1)["value"]
    .groupby(["ticker", "metric", "period"])
    .first()
    .rename("previous_XBRL_value")
)

joined = data[data.preliminary].join(
    previous_XBRL_value,
    how="left",
)

joined = joined[~joined.index.duplicated()]  # There are duplicates, because.
data["diff_from_previous"] = (joined.value - joined.previous_XBRL_value) / joined.value
data["diff_from_previous"] = (
    data["diff_from_previous"].replace([np.inf, -np.inf], np.nan).fillna(0)
)

In [435]:
def preliminary_and_revision(group: pd.DataFrame) -> bool:
    """
    Does this group have a preliminary line and an XBRL line?
    """
    unconfirmed_preliminary = group[group.preliminary & ~group.XBRL].shape[0]
    revisions = group.index.get_level_values("revision_number") > 0
    XBRL_revision = group[group.XBRL & revisions].shape[0]
    return bool(unconfirmed_preliminary and XBRL_revision)

In [436]:
ERROR_TYPE_TO_BOOL = {
    "close_enough": 0,
    "unknown": 1,
    "scale": 1,
    "sign": 1,
    "quarter_error": 1,
    0: 0,
}

In [437]:
def error_type(g: pd.DataFrame):
    """
    g is single period/metric group
    """
    preliminary_row = g[g.preliminary].iloc[0]
    preliminary_value = preliminary_row.value
    XBRL_value = g[~g.preliminary].iloc[0].value
    preliminary_decimal = Decimal(preliminary_value).normalize().as_tuple()
    XBRL_decimal = Decimal(XBRL_value).normalize().as_tuple()
    d = pd.DataFrame(g)
    if preliminary_decimal.digits == XBRL_decimal.digits:
        if preliminary_decimal.exponent == XBRL_decimal.exponent:
            break_type = "sign"
        else:
            break_type = "scale"
    elif math.isclose(preliminary_value, XBRL_value, rel_tol=0.05):
        break_type = "close_enough"
    elif math.isclose(preliminary_value * 4, XBRL_value, rel_tol=0.1):
        break_type = "quarter_error"
    else:
        break_type = "unknown"
    d["break_type"] = break_type
    return d

In [438]:
group = None


def z_score(g: pd.DataFrame):
    global group
    group = g
    """
    How different from the preceding XBRL values is the preliminary value?
    """
    # print(g)
    return g["value"].expanding().apply(window_z_score, raw=False)

In [439]:
z_score_window = None
z_score_XBRL = None


def window_z_score(window):
    global z_score_window
    global z_score_XBRL
    z_score_window = window
    # print( data.loc[window.index])
    #print(window.shape, window.index)
    this_data = data.loc[window.index]
    xbrl = this_data[this_data.XBRL]
    # xbrl = window[window[:, 1]]
    return np.abs((window.iloc[-1] - xbrl.value.mean()) / xbrl.value.std())

In [452]:
group

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,preliminary,value,trace_url,XBRL,filing_type
ticker,metric,period,revision_number,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MMM,PaymentsOfDividendsCommonStock,2019Q4,1,False,828000000.0,http://www.calcbench.com/trace?metric=Payments...,True,10-K
MMM,PaymentsOfDividendsCommonStock,2019Q4,0,True,3316000000.0,http://www.calcbench.com/trace?nonxbrlfactIDs=...,False,8-K
MMM,PaymentsOfDividendsCommonStock,2021Q1,1,False,858000000.0,http://www.calcbench.com/trace?metric=Payments...,True,10-Q
MMM,PaymentsOfDividendsCommonStock,2021Q1,0,True,-858000000.0,http://www.calcbench.com/trace?nonxbrlfactIDs=...,False,8-K


In [440]:
numeric_diff_column = "preliminary_z_score"

In [455]:
all_breaks = pd.DataFrame()
for ticker in tqdm(data.index.get_level_values("ticker").unique()[:3]):
    ticker_data = data[data.index.get_level_values("ticker") == ticker]
    #data = data[data.index.get_level_values("metric") == "Goodwill"]
    ticker_data = ticker_data.groupby(["metric", "period"]).filter(preliminary_and_revision)
    preliminary_z_score = ticker_data.groupby(
        level="metric", as_index=False, group_keys=False
    ).apply(z_score)
    ticker_data[numeric_diff_column] = preliminary_z_score.T

    ticker_data = ticker_data.groupby(level=["ticker", "metric", "period"]).progress_apply(error_type)
    all_breaks = all_breaks.append(ticker_data)
all_breaks.replace([np.inf, -np.inf], np.nan, inplace=True)

  0%|          | 0/3 [00:00<?, ?it/s]

my bar!:   0%|          | 0/124 [00:00<?, ?it/s]

my bar!:   0%|          | 0/617 [00:00<?, ?it/s]

my bar!:   0%|          | 0/154 [00:00<?, ?it/s]

In [458]:
# Calculate previous error
only_preliminary = all_breaks[all_breaks.preliminary]
previous_error = only_preliminary.groupby(["ticker", "metric"])["break_type"].shift()
all_breaks["previous_error"] = previous_error

In [474]:
data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,preliminary,value,trace_url,XBRL,filing_type
ticker,metric,period,revision_number,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MMM,AccountsPayable,2008Q4,0,False,1.301000e+09,http://www.calcbench.com/trace?metric=Accounts...,True,10-Q
MMM,AccountsPayable,2009Q2,0,True,1.243000e+09,http://www.calcbench.com/trace?metric=Accounts...,True,8-K
MMM,AccountsPayable,2009Q3,0,True,1.404000e+09,http://www.calcbench.com/trace?metric=Accounts...,True,8-K
MMM,AccountsPayable,2009Q4,0,False,1.453000e+09,http://www.calcbench.com/trace?metric=Accounts...,True,10-K
MMM,AccountsPayable,2010Q1,0,True,1.582000e+09,http://www.calcbench.com/trace?metric=Accounts...,True,8-K
...,...,...,...,...,...,...,...,...
DIS,TreasuryStockValue,2020Q2,0,False,9.070000e+08,http://www.calcbench.com/trace?metric=Treasury...,True,10-Q
DIS,TreasuryStockValue,2020Q3,0,False,9.070000e+08,http://www.calcbench.com/trace?metric=Treasury...,True,10-Q
DIS,TreasuryStockValue,2020Q4,0,False,9.070000e+08,http://www.calcbench.com/trace?metric=Treasury...,True,10-K
DIS,TreasuryStockValue,2021Q1,0,False,9.070000e+08,http://www.calcbench.com/trace?metric=Treasury...,True,10-Q


In [475]:
preliminary_with_break_type = data[data.preliminary].merge(
    all_breaks[all_breaks.preliminary][["break_type", "previous_error", numeric_diff_column]],
    how="left",
    left_index=True,
    right_index=True,
)

In [483]:
preliminary_with_break_type

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,preliminary,value,trace_url,XBRL,filing_type,break_type,previous_error,preliminary_z_score
ticker,metric,period,revision_number,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
MMM,AccountsPayable,2009Q2,0,True,1.243000e+09,http://www.calcbench.com/trace?metric=Accounts...,True,8-K,,,
MMM,AccountsPayable,2009Q3,0,True,1.404000e+09,http://www.calcbench.com/trace?metric=Accounts...,True,8-K,,,
MMM,AccountsPayable,2010Q1,0,True,1.582000e+09,http://www.calcbench.com/trace?metric=Accounts...,True,8-K,,,
MMM,AccountsPayable,2010Q2,0,True,1.756000e+09,http://www.calcbench.com/trace?metric=Accounts...,True,8-K,,,
MMM,AccountsPayable,2010Q3,0,True,1.649000e+09,http://www.calcbench.com/trace?metric=Accounts...,True,8-K,,,
...,...,...,...,...,...,...,...,...,...,...,...
DIS,TreasuryStockValue,2011Q4,0,True,-2.865600e+10,http://www.calcbench.com/trace?nonxbrlfactIDs=...,False,8-K,,,
DIS,TreasuryStockValue,2012Q1,0,True,-2.945600e+10,http://www.calcbench.com/trace?nonxbrlfactIDs=...,False,8-K,,,
DIS,TreasuryStockValue,2012Q2,0,True,-3.032500e+10,http://www.calcbench.com/trace?nonxbrlfactIDs=...,False,8-K,,,
DIS,TreasuryStockValue,2012Q3,0,True,-3.069800e+10,http://www.calcbench.com/trace?nonxbrlfactIDs=...,False,8-K,,,


In [481]:
one_hot_columns = []  # ["ticker", "metric"]
X = preliminary_with_break_type[["previous_error"]]
X[numeric_diff_column] = pd.qcut(
    preliminary_with_break_type[numeric_diff_column], 10, labels=False
).fillna(0)
X["previous_error"] = X["previous_error"].fillna(0).map(ERROR_TYPE_TO_BOOL)
for column in one_hot_columns:
    X[column] = X.index.get_level_values(column).factorize()[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [484]:
y = preliminary_with_break_type["break_type"].fillna(0).map(ERROR_TYPE_TO_BOOL)

In [486]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [487]:
clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [488]:
y_predicted = clf.predict(X_test)

In [489]:
confusion_matrix(y_test, y_predicted)

array([[3943,    5],
       [   8,   48]], dtype=int64)