In [None]:
%pip install "calcbench-api-client[Pandas]"

In [34]:
import pyarrow.parquet as pq
import pyarrow.compute as pc
import pandas as pd
import calcbench as cb
from scipy.stats import zscore
from datetime import date
from qgridnext import show_grid

### The input file is the Calcbench Point-In-Time data set; see https://github.com/calcbench/notebooks/blob/master/standardized_numeric_point_in_time.ipynb

### Email us@calcbench.com if you want a historical file.


In [2]:
# Earliest quarter end kept in the data set; XBRL data starts ~2010.
first_date = pd.to_datetime(date(year=2010, month=1, day=1))

# Latest quarter end kept in the data set.
last_date = pd.to_datetime(date(year=2024, month=9, day=30))

In [3]:
# Location of the Calcbench point-in-time parquet extract.
# NOTE(review): this is an absolute path on one machine; point it at your own copy.
POINT_IN_TIME_FILE = "C:/Users/andre/Downloads/standardized_data_oct_23.parquet"

# Load the whole extract as a pyarrow Table; it is filtered before conversion to pandas.
d = pq.read_table(POINT_IN_TIME_FILE)

In [4]:
# Expense line items; these are summed later when net income is recomputed.
expenses = ["CostOfRevenue", "SGAExpense", "InterestExpense", "IncomeTaxes"]

# Keep only the rows whose metric is one of the line items we model. The filter is applied
# on the pyarrow Table so that only the matching rows are converted to pandas.
wanted_metrics = ["Revenue", "NetIncome", *expenses]
expr = pc.field("metric").isin(wanted_metrics)
filtered_table = d.filter(expr)
metric_data = filtered_table.to_pandas()

In [5]:
# Express values in millions for readability.
# NOTE: this overwrites "value" in place, so re-running the cell divides a second time.
value_as_float = metric_data["value"].astype(float)
metric_data["value"] = value_as_float / 1e6

In [6]:
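# Remove small companies (currently disabled).
# NOTE: `revenue` is not defined in the visible cells, and values were already scaled to
# millions above, so the 1e7 threshold needs revisiting before this is re-enabled.

# revenue = revenue[revenue["value"] > 1e7]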

In [7]:
# Keep quarterly records only: drop every row whose "fiscal_period" index label
# ends in "-0" (the non-quarterly periods).
fiscal_periods = metric_data.index.get_level_values("fiscal_period")
non_quarterly = fiscal_periods.str.endswith("-0")
metric_data = metric_data[~non_quarterly]

## Add SIC code metadata


In [8]:
# Fetch the company list from Calcbench; the "ticker" and "sic_code" columns
# are joined onto the metric data in the next cell.
company_data = cb.companies(entire_universe=True)

In [9]:
# Attach each company's SIC code to its metric rows (inner join on ticker, so
# rows without a matching company record are dropped), then keep only the
# columns used downstream.
sic_lookup = company_data[["ticker", "sic_code"]]
kept_columns = ["ticker", "period_end", "sic_code", "value", "metric"]
merged = metric_data.reset_index().merge(sic_lookup, on="ticker").loc[:, kept_columns]

##### Split the SIC code into its four digit levels. GICS codes would probably be better, but Calcbench does not have them.

###### https://www.sec.gov/corpfin/division-of-corporation-finance-standard-industrial-classification-sic-code-list


In [10]:
# Break the SIC code into one string column per digit, from the thousands digit
# (top level) down to the units digit (fourth level).
sic = merged["sic_code"]
merged["top_level_SIC_code"] = (sic // 1000).astype("string")
merged["second_level_SIC_code"] = ((sic % 1000) // 100).astype("string")
merged["third_level_SIC_code"] = ((sic % 100) // 10).astype("string")
merged["fourth_level_SIC_code"] = (sic % 10).astype("string")

In [11]:
# "ds" is the quarter-end date assigned to each observation. Adding QuarterEnd(n=1)
# rolls a date forward to the next quarter end; a period_end that already falls on a
# quarter end therefore moves to the FOLLOWING quarter end. normalize() zeroes the time.
rolled_to_quarter_end = merged["period_end"] + pd.offsets.QuarterEnd(n=1)
merged["ds"] = rolled_to_quarter_end.dt.normalize()

In [12]:
# The model builder does not like nulls, so discard every row that has a null
# in any column.
merged = merged.dropna(axis=0, how="any")

In [13]:
# The Calcbench data set contains revisions, so there can be several rows per
# (ticker, ds, metric). Keep one row per key: groupby().first() takes, for each
# column, the first non-null value in the current row order.
# NOTE(review): this assumes the rows are ordered with the original filing first;
# nothing in this notebook sorts them - confirm against the source file.
# After this cell `merged` is indexed by (ticker, ds, metric).
merged = merged.groupby(["ticker", "ds", "metric"]).first()

In [14]:
# Only keep companies for which we have more than a year (four quarters) of history.
# `merged` holds one row per (ticker, ds, metric), so a plain row count would let a
# company with a single quarter of several metrics through. Count the distinct
# quarter-end dates ("ds" index level) instead.
MINIMUM_QUARTERS_OF_HISTORY = 4
merged = merged.groupby("ticker").filter(
    lambda g: g.index.get_level_values("ds").nunique() > MINIMUM_QUARTERS_OF_HISTORY
)

In [15]:
# Throw out outliers: observations that lie more than OUTLIER_Z_SCORE_THRESHOLD
# standard deviations from the mean of their own (ticker, metric) series.
OUTLIER_Z_SCORE_THRESHOLD = 4
z_scores = merged.groupby(["ticker", "metric"])["value"].transform(zscore)

# Compare the absolute z-score so that extreme negative values are removed as well as
# extreme positive ones. zscore yields NaN for a series with zero variance (constant
# values or a single observation); such rows are not outliers, so keep them rather than
# letting the NaN comparison silently drop them.
is_not_outlier = z_scores.abs().lt(OUTLIER_Z_SCORE_THRESHOLD) | z_scores.isna()
merged = merged[is_not_outlier]

In [23]:
# Build a wide frame with one row per (SIC levels, ticker, ds) and one column per
# metric, so the line items can be combined arithmetically.
account_index_columns = [
    "top_level_SIC_code",
    "second_level_SIC_code",
    "third_level_SIC_code",
    "fourth_level_SIC_code",
    "ticker",
    "ds",
    "metric",
]
values_by_key = merged.reset_index().set_index(account_index_columns)["value"]
accounts = values_by_key.unstack("metric")

KeyError: "None of ['metric'] are in the columns"

In [17]:
# Net income implied by the modelled line items: revenue minus the sum of the expense
# columns. sum(axis=1) skips missing values, so an absent expense counts as zero.
total_expenses = accounts[expenses].sum(axis=1)
calculated_net_income = accounts["Revenue"].sub(total_expenses)

In [18]:
# "Other" is the residual between reported net income and the figure implied by the
# modelled line items, so that Revenue - expenses + Other equals NetIncome.
accounts["Other"] = accounts["NetIncome"].sub(calculated_net_income)

In [40]:
# NetIncome has served its purpose (it is now captured by the other columns plus
# "Other"), so remove it from the wide frame.
accounts = accounts.drop(columns="NetIncome")

In [41]:
# Back to long format: move the "metric" column level into the index and name the
# single value column "y". `merged` is re-bound to this new frame from here on.
merged = accounts.stack("metric").to_frame(name='y')

In [51]:
# Turn the "ds" index level into a regular column so it can be filtered on below;
# the remaining levels stay in the index.
merged = merged.reset_index(level="ds")

In [52]:
# Restrict the data to the [first_date, last_date] window, both ends inclusive.
within_window = merged["ds"].between(first_date, last_date)
merged = merged[within_window]

In [54]:
# Drop companies without current data, i.e. whose most recent "ds" is not last_date.
latest_ds_per_ticker = merged.groupby("ticker")["ds"].transform("max")
merged = merged[latest_ds_per_ticker == last_date]

In [55]:
# Drop companies with too little data. An observation here is one row of `merged`,
# i.e. one metric value for one "ds"; a company must have strictly more than the minimum.
MINIMUM_NUMBER_OF_OBSERVATIONS = 40
merged = merged.groupby("ticker").filter(
    lambda company_rows: len(company_rows) > MINIMUM_NUMBER_OF_OBSERVATIONS
)

In [58]:
# Move the remaining index levels into ordinary columns; the pivot in the next cell
# refers to them by column name.
merged = merged.reset_index()

In [59]:
# Columns that together identify one series (one metric of one company).
pivot_columns = [
    "ticker",
    "top_level_SIC_code",
    "second_level_SIC_code",
    "third_level_SIC_code",
    "fourth_level_SIC_code",
    "metric",
]

# Forward fill missing data. Pivoting to one column per series puts every "ds" on the
# row axis, so quarters a series is missing show up as NaN and can be filled from the
# previous quarter. Gaps before a series' first observation have nothing to fill from
# and remain NaN.
wide_by_quarter = merged.pivot(index="ds", columns=pivot_columns, values="y")
wide_filled = wide_by_quarter.ffill()

# Back to long format, with "ds" restored as an ordinary column.
merged = wide_filled.melt(ignore_index=False, value_name="y").reset_index()

In [60]:
# Inspect the final long-format frame that is written to parquet below.
merged

Unnamed: 0,ds,ticker,top_level_SIC_code,second_level_SIC_code,third_level_SIC_code,fourth_level_SIC_code,metric,y
0,2010-03-31,ALCO,0,1,0,0,CostOfRevenue,
1,2010-06-30,ALCO,0,1,0,0,CostOfRevenue,
2,2010-09-30,ALCO,0,1,0,0,CostOfRevenue,23.057
3,2010-12-31,ALCO,0,1,0,0,CostOfRevenue,5.513
4,2011-03-31,ALCO,0,1,0,0,CostOfRevenue,14.062
...,...,...,...,...,...,...,...,...
1527269,2023-09-30,NVRI,8,9,0,0,Other,-16.486
1527270,2023-12-31,NVRI,8,9,0,0,Other,-10.702
1527271,2024-03-31,NVRI,8,9,0,0,Other,-51.664
1527272,2024-06-30,NVRI,8,9,0,0,Other,-15.738


In [61]:
# Persist the prepared frame to a parquet file in the working directory;
# index=False leaves the DataFrame index out of the file.
merged.to_parquet("test_train_data.parquet", index=False)