In [72]:
import calcbench as cb
import pandas as pd
import math
import numpy as np
from decimal import Decimal
from tqdm import tqdm, tqdm_notebook
from IPython.core.debugger import set_trace
from tqdm.notebook import tqdm
import random
import qgrid
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from scipy.stats.mstats import winsorize

cb.enable_backoff()

tqdm.pandas(desc="my bar!")

In [3]:
tickers = cb.tickers(index="SP500")

In [168]:
relevant_columns = [
    "preliminary",
    "value",
    "trace_url",
    "XBRL",
    "filing_type",
]

In [236]:
pickle_file_name = "dow_PIT.pkl"
if False:
    raw_data = pd.DataFrame()
    for ticker in tqdm(tickers):
        d = cb.point_in_time(
            company_identifiers=[ticker],
            all_face=True,
            include_preliminary=True,
            include_xbrl=True,
            all_history=True,
            period_type="quarterly",
            include_trace=True,
        )
        raw_data = raw_data.append(d)
    raw_data.to_pickle(pickle_file_name)
else:
    raw_data = pd.read_pickle(pickle_file_name)
data = raw_data.reset_index(drop=True)

data = data[data.fiscal_period.isin([1, 2, 3, 4])]
data["period"] = pd.PeriodIndex(
    year=data.fiscal_year, quarter=data.fiscal_period, freq="Q"
)
data = data.set_index(["ticker", "metric", "period", "revision_number"])
data = data[relevant_columns]
data = data[data.index.get_level_values("ticker") == "MMM"]

In [6]:
# Difference from previous XBRL value
previous_XBRL_value = (
    data[data.XBRL]
    .groupby(["ticker", "metric"])
    .shift(1)["value"]
    .groupby(["ticker", "metric", "period"])
    .first()
    .rename("previous_XBRL_value")
)

joined = data[data.preliminary].join(
    previous_XBRL_value,
    how="left",
)

joined = joined[~joined.index.duplicated()]  # There are duplicates, because.
data["diff_from_previous"] = (joined.value - joined.previous_XBRL_value) / joined.value
data["diff_from_previous"] = (
    data["diff_from_previous"].replace([np.inf, -np.inf], np.nan).fillna(0)
)

In [181]:
def preliminary_and_revision(group: pd.DataFrame) -> bool:
    """
    Does this group have a preliminary line and an XBRL line?
    """
    unconfirmed_preliminary = group[group.preliminary & ~group.XBRL].shape[0]
    revisions = group.index.get_level_values("revision_number") > 0
    XBRL_revision = group[group.XBRL & revisions].shape[0]
    return bool(unconfirmed_preliminary and XBRL_revision)

In [8]:
ERROR_TYPE_TO_BOOL = {
    "close_enough": 0,
    "unknown": 1,
    "scale": 1,
    "sign": 1,
    "quarter_error": 1,
    0: 0,
}

In [54]:
def error_type(g: pd.DataFrame):
    """
    g is single period/metric group
    """
    preliminary_row = g[g.preliminary].iloc[0]
    preliminary_value = preliminary_row.value
    XBRL_value = g[~g.preliminary].iloc[0].value
    preliminary_decimal = Decimal(preliminary_value).normalize().as_tuple()
    XBRL_decimal = Decimal(XBRL_value).normalize().as_tuple()
    d = pd.DataFrame(g)
    if preliminary_decimal.digits == XBRL_decimal.digits:
        if preliminary_decimal.exponent == XBRL_decimal.exponent:
            break_type = "sign"
        else:
            break_type = "scale"
    elif math.isclose(preliminary_value, XBRL_value, rel_tol=0.05):
        break_type = "close_enough"
    elif math.isclose(preliminary_value * 4, XBRL_value, rel_tol=0.1):
        break_type = "quarter_error"
    else:
        break_type = "unknown"
    d["break_type"] = break_type
    return d

In [356]:
windows = None
group = None


def z_score(g: pd.DataFrame):
    global windows
    global group
    group = g
    """
    How different from the preceding XBRL values is the preliminary value?
    """
    # print(g)
    return g["value"].expanding().apply(window_z_score, raw=False)

In [361]:
all_breaks = pd.DataFrame()
for ticker in tqdm(data.index.get_level_values("ticker").unique()[:3]):
    data = data[data.index.get_level_values("ticker") == ticker]
    data = data[data.index.get_level_values("metric") == "Goodwill"]
    data = data.groupby(["metric", "period"]).filter(preliminary_and_revision)
    preliminary_z_score = data.groupby(
        level="metric", as_index=False, group_keys=False
    ).apply(z_score)
    data["preliminary_z_score"] = preliminary_z_score.T

    data = data.groupby(level=["ticker", "metric", "period"]).progress_apply(error_type)
    all_breaks = all_breaks.append(data)
all_breaks.replace([np.inf, -np.inf], np.nan, inplace=True)

  0%|          | 0/1 [00:00<?, ?it/s]

(1,) MultiIndex([('MMM', 'Goodwill', '2011Q2', 1)],
           names=['ticker', 'metric', 'period', 'revision_number'])
(2,) MultiIndex([('MMM', 'Goodwill', '2011Q2', 1),
            ('MMM', 'Goodwill', '2011Q2', 0)],
           names=['ticker', 'metric', 'period', 'revision_number'])
(3,) MultiIndex([('MMM', 'Goodwill', '2011Q2', 1),
            ('MMM', 'Goodwill', '2011Q2', 0),
            ('MMM', 'Goodwill', '2011Q3', 1)],
           names=['ticker', 'metric', 'period', 'revision_number'])
(4,) MultiIndex([('MMM', 'Goodwill', '2011Q2', 1),
            ('MMM', 'Goodwill', '2011Q2', 0),
            ('MMM', 'Goodwill', '2011Q3', 1),
            ('MMM', 'Goodwill', '2011Q3', 0)],
           names=['ticker', 'metric', 'period', 'revision_number'])
(5,) MultiIndex([('MMM', 'Goodwill', '2011Q2', 1),
            ('MMM', 'Goodwill', '2011Q2', 0),
            ('MMM', 'Goodwill', '2011Q3', 1),
            ('MMM', 'Goodwill', '2011Q3', 0),
            ('MMM', 'Goodwill', '2011Q4', 1)],
        

           names=['ticker', 'metric', 'period', 'revision_number'])
(52,) MultiIndex([('MMM', 'Goodwill', '2011Q2', 1),
            ('MMM', 'Goodwill', '2011Q2', 0),
            ('MMM', 'Goodwill', '2011Q3', 1),
            ('MMM', 'Goodwill', '2011Q3', 0),
            ('MMM', 'Goodwill', '2011Q4', 1),
            ('MMM', 'Goodwill', '2011Q4', 0),
            ('MMM', 'Goodwill', '2012Q1', 1),
            ('MMM', 'Goodwill', '2012Q1', 0),
            ('MMM', 'Goodwill', '2012Q2', 1),
            ('MMM', 'Goodwill', '2012Q2', 0),
            ('MMM', 'Goodwill', '2012Q3', 1),
            ('MMM', 'Goodwill', '2012Q3', 0),
            ('MMM', 'Goodwill', '2013Q1', 1),
            ('MMM', 'Goodwill', '2013Q1', 0),
            ('MMM', 'Goodwill', '2013Q2', 1),
            ('MMM', 'Goodwill', '2013Q2', 0),
            ('MMM', 'Goodwill', '2013Q3', 1),
            ('MMM', 'Goodwill', '2013Q3', 0),
            ('MMM', 'Goodwill', '2014Q1', 1),
            ('MMM', 'Goodwill', '2014Q1', 0),
      

my bar!:   0%|          | 0/29 [00:00<?, ?it/s]

In [362]:
all_breaks[]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,preliminary,value,trace_url,XBRL,filing_type,preliminary_z_score,break_type
ticker,metric,period,revision_number,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
MMM,Goodwill,2011Q2,1,False,7233000000.0,http://www.calcbench.com/trace?metric=Goodwill...,True,10-Q,,unknown
MMM,Goodwill,2011Q2,0,True,9258000000.0,http://www.calcbench.com/trace?nonxbrlfactIDs=...,False,8-K,,unknown
MMM,Goodwill,2011Q3,1,False,7140000000.0,http://www.calcbench.com/trace?metric=Goodwill...,True,10-Q,-0.707107,unknown
MMM,Goodwill,2011Q3,0,True,9092000000.0,http://www.calcbench.com/trace?nonxbrlfactIDs=...,False,8-K,28.976171,unknown
MMM,Goodwill,2011Q4,1,False,7047000000.0,http://www.calcbench.com/trace?metric=Goodwill...,True,10-K,-1.0,unknown
MMM,Goodwill,2011Q4,0,True,8963000000.0,http://www.calcbench.com/trace?nonxbrlfactIDs=...,False,8-K,19.602151,unknown
MMM,Goodwill,2012Q1,1,False,7090000000.0,http://www.calcbench.com/trace?metric=Goodwill...,True,10-Q,-0.46908,unknown
MMM,Goodwill,2012Q1,0,True,8955000000.0,http://www.calcbench.com/trace?nonxbrlfactIDs=...,False,8-K,22.859829,unknown
MMM,Goodwill,2012Q2,1,False,7069000000.0,http://www.calcbench.com/trace?metric=Goodwill...,True,10-Q,-0.632334,unknown
MMM,Goodwill,2012Q2,0,True,8911000000.0,http://www.calcbench.com/trace?nonxbrlfactIDs=...,False,8-K,24.255695,unknown


In [348]:
z_score_window = None
z_score_XBRL = None


def window_z_score(window):
    global z_score_window
    global z_score_XBRL
    z_score_window = window
    # print( data.loc[window.index])
    print(window.shape, window.index)
    this_data = data.loc[window.index]
    xbrl = this_data[this_data.XBRL]
    # xbrl = window[window[:, 1]]
    return (window.iloc[-1] - xbrl.value.mean()) / xbrl.value.std()

In [349]:
z_scores = group["value"].expanding().apply(window_z_score, raw=False)

(1,) MultiIndex([('MMM', 'Goodwill', '2011Q2', 1)],
           names=['ticker', 'metric', 'period', 'revision_number'])
(2,) MultiIndex([('MMM', 'Goodwill', '2011Q2', 1),
            ('MMM', 'Goodwill', '2011Q2', 0)],
           names=['ticker', 'metric', 'period', 'revision_number'])
(3,) MultiIndex([('MMM', 'Goodwill', '2011Q2', 1),
            ('MMM', 'Goodwill', '2011Q2', 0),
            ('MMM', 'Goodwill', '2011Q3', 1)],
           names=['ticker', 'metric', 'period', 'revision_number'])
(4,) MultiIndex([('MMM', 'Goodwill', '2011Q2', 1),
            ('MMM', 'Goodwill', '2011Q2', 0),
            ('MMM', 'Goodwill', '2011Q3', 1),
            ('MMM', 'Goodwill', '2011Q3', 0)],
           names=['ticker', 'metric', 'period', 'revision_number'])
(5,) MultiIndex([('MMM', 'Goodwill', '2011Q2', 1),
            ('MMM', 'Goodwill', '2011Q2', 0),
            ('MMM', 'Goodwill', '2011Q3', 1),
            ('MMM', 'Goodwill', '2011Q3', 0),
            ('MMM', 'Goodwill', '2011Q4', 1)],
        

In [351]:
z_scores

ticker  metric    period  revision_number
MMM     Goodwill  2011Q2  1                        NaN
                          0                        NaN
                  2011Q3  1                  -0.707107
                          0                  28.976171
                  2011Q4  1                  -1.000000
                          0                  19.602151
                  2012Q1  1                  -0.469080
                          0                  22.859829
                  2012Q2  1                  -0.632334
                          0                  24.255695
                  2012Q3  1                   1.073029
                          0                  24.808174
                  2013Q1  1                   1.179866
                          0                  23.300199
                  2013Q2  1                   0.873275
                          0                  22.573032
                  2013Q3  1                   1.655349
                       

In [None]:
# Calculate previous error
only_preliminary = all_breaks[all_breaks.preliminary]
previous_error = only_preliminary.groupby(["ticker", "metric"])["break_type"].shift()
all_breaks["previous_error"] = previous_error

In [None]:
preliminary_with_break_type = data[data.preliminary].merge(
    all_breaks[all_breaks.preliminary][["break_type", "previous_error"]],
    how="left",
    left_index=True,
    right_index=True,
)

In [None]:
one_hot_columns = []  # ["ticker", "metric"]
X = preliminary_with_break_type[["previous_error"]]
X["diff_from_previous"] = pd.qcut(
    preliminary_with_break_type["diff_from_previous"], 10, labels=False
)
X["previous_error"] = X["previous_error"].fillna(0).map(ERROR_TYPE_TO_BOOL)
for column in one_hot_columns:
    X[column] = X.index.get_level_values(column).factorize()[0]

In [None]:
from sklearn import preprocessing

In [None]:
min_max_scaler = preprocessing.MinMaxScaler()

In [None]:
y = preliminary_with_break_type["break_type"].fillna(0).map(ERROR_TYPE_TO_BOOL)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [None]:
clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [None]:
y_predicted = clf.predict(X_test)

In [None]:
confusion_matrix(y_test, y_predicted)