In [1]:
# Install the Calcbench API client (with its Pandas extra) into this kernel's environment.
# NOTE(review): the version is not pinned, so a re-run may pick up a newer release.
%pip install "calcbench-api-client[Pandas]"

Note: you may need to restart the kernel to use updated packages.


In [2]:
import calcbench as cb
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq
from ipydatagrid import DataGrid
from qgridnext import show_grid
from scipy.stats import zscore

### File is the Calcbench Point-In-Time data, see https://github.com/calcbench/notebooks/blob/master/standardized_numeric_point_in_time.ipynb
### Talk to us@calcbench.com if you want a historical file.

In [3]:
# Local copy of the Calcbench point-in-time parquet export; adjust the path for your machine.
standardized_data_path = "C:/Users/andre/Downloads/standardized_data_oct_23.parquet"
d = pq.read_table(standardized_data_path)

In [4]:
# Only dealing with NetIncome for now: select rows whose `metric` field equals "NetIncome".
expr = pc.field("metric") == "NetIncome"

In [5]:
# Apply the metric filter on the Arrow table, then convert the surviving rows to pandas.
filtered_table = d.filter(expr)
metric_data = filtered_table.to_pandas()

In [6]:
# Cast the reported values to float64 so they are numeric for the later steps.
metric_data = metric_data.assign(value=metric_data["value"].astype("float64"))

In [7]:
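# Remove small companies (currently disabled).
# NOTE: this threshold was written for revenue; applied to NetIncome it would also drop loss-making companies.
# metric_data = metric_data[metric_data["value"] > 1e7]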

In [8]:
# Quarterly only: drop every row whose fiscal_period label ends in "-0"
# (presumably the full-year period — confirm against the Calcbench data description).
fiscal_period_labels = metric_data.index.get_level_values("fiscal_period")
metric_data = metric_data[~fiscal_period_labels.str.endswith("-0")]

## Add SIC Code meta-data

In [9]:
# Fetch company metadata from Calcbench for the entire universe; the `ticker` and
# `sic_code` columns are used in the merge below.
company_data = cb.companies(entire_universe=True)

In [10]:
# Attach each company's SIC code to its metric rows (inner join on ticker), then keep
# only the columns needed downstream.
sic_lookup = company_data[["ticker", "sic_code"]]
merged = metric_data.reset_index().merge(sic_lookup, on="ticker")
merged = merged[["ticker", "period_end", "sic_code", "value"]]

##### Split the SIC code into its four digit levels. GICS codes would probably be better, but Calcbench does not have them.
###### https://www.sec.gov/corpfin/division-of-corporation-finance-standard-industrial-classification-sic-code-list

In [11]:
# Break the SIC code into its individual digits (thousands, hundreds, tens, units),
# each stored as a string column.
sic = merged["sic_code"]
merged["top_level_SIC_code"] = (sic // 1000).astype("string")
merged["second_level_SIC_code"] = ((sic % 1000) // 100).astype("string")
merged["third_level_SIC_code"] = ((sic % 100) // 10).astype("string")
merged["fourth_level_SIC_code"] = (sic % 10).astype("string")

In [12]:
# `ds` is `period_end` moved forward by one QuarterEnd offset, normalized to midnight.
# NOTE(review): a `period_end` that already falls on a quarter end is rolled to the NEXT
# quarter end (the saved output shows 2010-06-30 -> 2010-09-30) — confirm this one-quarter
# shift is intended rather than QuarterEnd(0).
merged["ds"] = (merged["period_end"] + pd.offsets.QuarterEnd()).dt.normalize()

In [13]:
# Get rid of rows with any null values; the model builder does not like nulls.
merged = merged.dropna(how="any")

In [14]:
# Only keep wholesale/retail trade companies, i.e. SIC codes 5xxx (currently disabled).
# merged = merged[merged["top_level_SIC_code"] == "5"]

In [15]:
# Get first record per (ticker, ds); there are revisions in the Calcbench data set.
# `.first()` takes the first non-null value of each column within the group, and the
# result is indexed by (ticker, ds).
# NOTE(review): "first" follows the current row order — presumably the original filing
# comes before its revisions; confirm the source file is ordered that way.
merged = merged.groupby(["ticker", "ds"]).first()

In [16]:
# Keep only tickers with more than four rows, i.e. more than a year of quarterly history.
merged = merged.groupby("ticker").filter(lambda ticker_rows: len(ticker_rows) > 4)

In [17]:
# Throw out outliers: drop observations whose value lies 4 or more standard deviations
# from that ticker's own mean, in EITHER direction. Comparing the raw z-score against 4
# only removed unusually large values and kept extreme negative ones (large losses).
# Rows whose z-score is NaN compare False and are therefore dropped as well.
z_scores = merged.groupby("ticker")["value"].transform(zscore)
merged = merged[z_scores.abs() < 4]

In [21]:
# NOTE(review): the saved output below shows a `y` column, so it was captured after the
# rename cell further down had already run; on a fresh top-to-bottom run this cell
# displays the column as `value`.
merged

Unnamed: 0_level_0,Unnamed: 1_level_0,period_end,sic_code,y,top_level_SIC_code,second_level_SIC_code,third_level_SIC_code,fourth_level_SIC_code
ticker,ds,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0000739708,2010-09-30,2010-06-30,4832,-77207000.0,4,8,3,2
0000739708,2010-12-31,2010-09-30,4832,-150392000.0,4,8,3,2
0000739708,2011-03-31,2010-12-31,4832,-55627000.0,4,8,3,2
0000739708,2011-06-30,2011-03-31,4832,-131363000.0,4,8,3,2
0000739708,2011-09-30,2011-06-30,4832,-37975000.0,4,8,3,2
...,...,...,...,...,...,...,...,...
talo,2023-09-30,2023-06-30,1311,13677000.0,1,3,1,1
talo,2023-12-31,2023-09-30,1311,-2103000.0,1,3,1,1
talo,2024-03-31,2023-12-31,1311,85898000.0,1,3,1,1
talo,2024-06-30,2024-03-31,1311,-112439000.0,1,3,1,1


In [18]:
# Rename the metric column from "value" to "y".
merged = merged.rename(columns={"value": "y"})

In [23]:
# Flatten the (ticker, ds) index back into ordinary columns and write the result to parquet.
output_frame = merged.reset_index()
output_frame.to_parquet("net_income.parquet", index=False)