In [None]:
# Install the Calcbench API client together with its "Pandas" extra into the
# kernel's environment (presumably what provides the `calcbench` import used by
# this notebook -- confirm).
%pip install "calcbench-api-client[Pandas]"

In [2]:
import os

import calcbench as cb
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq
from ipydatagrid import DataGrid
from qgridnext import show_grid
from scipy.stats import zscore

### The input file is the Calcbench Point-In-Time data; see https://github.com/calcbench/notebooks/blob/master/standardized_numeric_point_in_time.ipynb
### Email us@calcbench.com if you want a historical file.

In [3]:
# Location of the Calcbench point-in-time parquet export.  The path used to be
# hard-coded to one user's Downloads folder; it can now be overridden by
# setting the CALCBENCH_STANDARDIZED_DATA environment variable, and falls back
# to the original location when the variable is not set.
STANDARDIZED_DATA_PATH = os.environ.get(
    "CALCBENCH_STANDARDIZED_DATA",
    "C:/Users/andre/Downloads/standardized_data_oct_23.parquet",
)

# Load the whole export as an Arrow table; filtering happens in later cells.
d = pq.read_table(STANDARDIZED_DATA_PATH)

In [7]:
# TODO: to keep several metrics at once, filter with pc.field("metric").isin(["Revenue", "CostOfRevenue"])

Object `isin` not found.


In [3]:
# Only a single metric is handled for now.  `expr` is an Arrow compute
# expression that is true for rows whose "metric" column equals that metric.
TARGET_METRIC = "Revenue"
expr = pc.field("metric") == TARGET_METRIC

In [4]:
# Filter on the Arrow table first, then convert only the matching rows to a
# pandas DataFrame.
metric_data = (
    d
    .filter(expr)
    .to_pandas()
)

In [5]:
# Make sure the "value" column is plain float before any arithmetic is done on it.
metric_data = metric_data.astype({"value": float})

In [6]:
# Remove small companies

# (disabled) metric_data = metric_data[metric_data["value"] > 1e7]

In [7]:
# Quarterly only: drop every row whose "fiscal_period" index label ends in
# "-0" (presumably the full-year period -- confirm against the Calcbench
# period encoding).
fiscal_periods = metric_data.index.get_level_values("fiscal_period")
is_excluded_period = fiscal_periods.str.endswith("-0")
metric_data = metric_data[~is_excluded_period]

## Add SIC Code meta-data

In [8]:
# Fetch company metadata from Calcbench.  `entire_universe=True` presumably
# requests every company rather than a subset (confirm in the client docs).
# Only the "ticker" and "sic_code" columns are needed for the SIC join.
company_data = cb.companies(
    entire_universe=True,
)

In [9]:
# Attach each company's SIC code to its metric rows.  The index of
# `metric_data` is flattened first so that "ticker" is available as a join
# column; the join is pandas' default inner join, so rows without a matching
# company record are dropped.
sic_lookup = company_data[["ticker", "sic_code"]]
merged = metric_data.reset_index().merge(sic_lookup, on="ticker")

# Keep only the columns the rest of the notebook works with.
merged = merged[["ticker", "period_end", "sic_code", "value"]]

##### Split the SIC code into its four digit levels. GICS codes would probably be better, but Calcbench does not have them.
###### https://www.sec.gov/corpfin/division-of-corporation-finance-standard-industrial-classification-sic-code-list

In [10]:
# Break the numeric SIC code into its four decimal digits (thousands,
# hundreds, tens, units) and store each digit as a string column so it can be
# treated as a category rather than a number.
sic = merged["sic_code"]
sic_digits = {
    "top_level_SIC_code": sic // 1000,
    "second_level_SIC_code": (sic % 1000) // 100,
    "third_level_SIC_code": (sic % 100) // 10,
    "fourth_level_SIC_code": sic % 10,
}
for level_column, digit in sic_digits.items():
    merged[level_column] = digit.astype("string")

In [11]:
# Build the model's date column "ds": move each period end forward to a
# calendar quarter end and strip any time-of-day component.
# Requires `import pandas as pd` in the imports cell; without it this cell
# fails with a NameError on a fresh kernel.
# NOTE(review): with the default n=1, QuarterEnd() moves a date that already
# sits on a quarter end to the *next* quarter end, so such rows are shifted by
# a full quarter -- confirm this is the intended alignment.
quarter_end = merged["period_end"] + pd.offsets.QuarterEnd()
merged["ds"] = quarter_end.dt.normalize()

In [12]:
# Get rid of rows with a null in any column; the model builder does not like
# nulls.  `dropna()` does this directly, without transposing the whole frame
# just to build a per-row mask.
merged = merged.dropna()

In [13]:
# only get retail companies 
#merged = merged[merged['top_level_SIC_code'] == '5']

In [14]:
# The Calcbench data set contains revisions, so one (ticker, ds) pair can occur
# more than once.  Collapse each pair to a single row by taking the first entry
# of every column; the two grouping keys become the index of the result.
# NOTE(review): "first" follows the current row order -- nothing here sorts by
# a revision date, so confirm the incoming order is the intended one.
record_keys = ["ticker", "ds"]
merged = merged.groupby(record_keys).first()

In [15]:
# Keep only companies with more than a year of history, i.e. more than four
# rows (one row per quarter).
MIN_HISTORY_ROWS = 4


def _has_enough_history(ticker_rows):
    """Return True when a ticker's group has more than MIN_HISTORY_ROWS rows."""
    return len(ticker_rows) > MIN_HISTORY_ROWS


merged = merged.groupby("ticker").filter(_has_enough_history)

In [16]:
# Throw out outliers: drop rows whose value lies 4 or more standard deviations
# away from that ticker's own mean, in either direction.  Testing the signed
# z-score (`z < 4`) only removed unusually high values and kept arbitrarily
# low ones, so the magnitude is compared instead.
# Rows with a NaN z-score (e.g. a ticker whose values are all identical) fail
# the comparison and are dropped as well.
Z_SCORE_LIMIT = 4
z_scores = merged.groupby("ticker")["value"].transform(zscore)
merged = merged[z_scores.abs() < Z_SCORE_LIMIT]

In [18]:
# From here on the "value" column is called "y".
merged = merged.rename(columns={"value": "y"})

In [19]:
# Turn the index back into ordinary columns and write the result next to the
# notebook; the (now default) integer index is not stored in the file.
training_rows = merged.reset_index()
training_rows.to_parquet("test_train_data.parquet", index=False)