In [1]:
import calcbench as cb
import pandas as pd
import math
import numpy as np
from decimal import Decimal
from tqdm import tqdm, tqdm_notebook
from IPython.core.debugger import set_trace
from tqdm.notebook import tqdm
import random
import qgrid
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

cb.enable_backoff()

tqdm.pandas(desc="my bar!")

In [2]:
tickers = cb.tickers(index="DJIA")

In [3]:
relevant_columns = [
    "preliminary",
    "revision_number",
    "value",
    "trace_url",
    "XBRL",
    "filing_type",
]

In [4]:
data = pd.DataFrame()
for ticker in tqdm(tickers):
    d = cb.point_in_time(
        company_identifiers=[ticker],
        all_face=True,
        include_preliminary=True,
        include_xbrl=True,
        all_history=True,
        period_type="quarterly",
        include_trace=True,
    )
    data = data.append(d)
data.reset_index(inplace=True, drop=True)

data = data[data.fiscal_period.isin([1, 2, 3, 4])]
data["period"] = pd.PeriodIndex(
    year=data.fiscal_year, quarter=data.fiscal_period, freq="Q"
)
data = data.set_index(
    [
        "ticker",
        "metric",
        "period",
    ]
)
data = data[relevant_columns]

  0%|          | 0/30 [00:00<?, ?it/s]

In [58]:
# Difference from previous XBRL value
previous_XBRL_value = (
    data[data.XBRL]
    .groupby(["ticker", "metric"])
    .shift(1)["value"]
    .groupby(["ticker", "metric", "period"])
    .first()
    .rename("previous_XBRL_value")
)

joined = data[data.preliminary].join(
    previous_XBRL_value,
    how="left",
)
# data["diff_from_previous"] = (joined.value - joined.previous_XBRL_value) / joined.value

In [5]:
def preliminary_and_revision(group: pd.DataFrame) -> bool:
    """
    Does this group have a preliminary line and an XBRL line?
    """
    unconfirmed_preliminary = group[group.preliminary & ~group.XBRL].shape[0]
    revisions = group.revision_number > 0
    XBRL_revision = group[group.XBRL & revisions].shape[0]
    return bool(unconfirmed_preliminary and XBRL_revision)

In [6]:
ERROR_TYPE_TO_BOOL = {
    "close_enough": 0,
    "unknown": 1,
    "scale": 1,
    "sign": 1,
    "quarter_error": 1,
    0: 0,
}

In [7]:
def error_type(g):
    preliminary_row = g[g.preliminary].iloc[0]
    preliminary_value = preliminary_row.value
    XBRL_value = g[~g.preliminary].iloc[0].value
    preliminary_decimal = Decimal(preliminary_value).normalize().as_tuple()
    XBRL_decimal = Decimal(XBRL_value).normalize().as_tuple()
    diff = (preliminary_value - XBRL_value) / XBRL_value
    d = pd.DataFrame(g)
    d["diff"] = diff
    if preliminary_decimal.digits == XBRL_decimal.digits:
        if preliminary_decimal.exponent == XBRL_decimal.exponent:
            break_type = "sign"
        else:
            break_type = "scale"
    elif math.isclose(preliminary_value, XBRL_value, rel_tol=0.05):
        break_type = "close_enough"
    elif math.isclose(preliminary_value * 4, XBRL_value, rel_tol=0.1):
        break_type = "quarter_error"
    else:
        break_type = "unknown"
    d["break_type"] = break_type
    g[g.preliminary]["error"] = break_type
    return d

In [8]:
groups_with_preliminary_and_revision = data.groupby(data.index).filter(
    preliminary_and_revision
)

In [9]:
all_breaks = groups_with_preliminary_and_revision.groupby(
    groups_with_preliminary_and_revision.index
).progress_apply(error_type)

my bar!:   0%|          | 0/5438 [00:00<?, ?it/s]

  diff = (preliminary_value - XBRL_value) / XBRL_value


In [10]:
# Calculate previous error
only_preliminary = all_breaks[all_breaks.preliminary]
previous_error = only_preliminary.groupby(["ticker", "metric"])["break_type"].shift()
all_breaks["previous_error"] = previous_error

In [24]:
preliminary_with_break_type = data[data.preliminary].merge(
    all_breaks[all_breaks.preliminary][["break_type", "previous_error"]],
    how="left",
    left_index=True,
    right_index=True,
)

In [25]:
one_hot_columns = ["ticker", "metric"]
X = preliminary_with_break_type[["previous_error"]]
X["previous_error"] = X["previous_error"].fillna(0).map(ERROR_TYPE_TO_BOOL)
for column in one_hot_columns:
    X[column] = X.index.get_level_values(column).factorize()[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [26]:
y = preliminary_with_break_type["break_type"].fillna(0).map(ERROR_TYPE_TO_BOOL)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [28]:
clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [29]:
y_predicted = clf.predict(X_test)

In [30]:
confusion_matrix(y_test, y_predicted)

array([[3536,   18],
       [  89,  354]], dtype=int64)

In [31]:
confusion_matrix(y_test, y_predicted)

array([[3536,   18],
       [  89,  354]], dtype=int64)