In [3]:
from __future__ import annotations

import numpy as np
import pandas as pd
from research.fractional_momentum.fractional_momentum.config.experiment_config import (
    ExperimentConfig,
)

In [4]:
jkp_data = pd.read_csv(ExperimentConfig.PATH_INPUT / "crsp_all_data.csv")
jkp_data = jkp_data.rename(columns={c: c.lower() for c in jkp_data.columns})
jkp_data = jkp_data.dropna(subset=["permno"])
jkp_data["permno"] = jkp_data["permno"].astype(int)
jkp_data["date"] = pd.to_datetime(jkp_data["date"])
jkp_data = jkp_data.sort_values(["date", "permno"])
jkp_data = jkp_data.drop_duplicates(subset=["date", "permno"])
jkp_data = jkp_data.set_index(["date", "permno"])
jkp_data.head()

  jkp_data = pd.read_csv(ExperimentConfig.PATH_INPUT / "crsp_all_data.csv")


Unnamed: 0_level_0,Unnamed: 1_level_0,nameendt,shrcd,exchcd,siccd,ncusip,ticker,comnam,shrcls,tsymbol,naics,...,cfacpr,cfacshr,openprc,numtrd,retx,vwretd,vwretx,ewretd,ewretx,sprtrn
date,permno,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2000-01-03,10001,,11,3,4920,29274A10,EWST,ENERGY WEST INC,,EWST,,...,1.5,1.5,8.4375,2.0,0.007353,-0.006803,-0.00681,0.002878,0.00286,-0.009549
2000-01-03,10002,,11,3,6020,83623410,SABC,SOUTH ALABAMA BANCORPORATION INC,,SABC,,...,1.0,1.0,12.5,6.0,-0.015385,-0.006803,-0.00681,0.002878,0.00286,-0.009549
2000-01-03,10009,,11,3,6030,46334710,IROQ,IROQUOIS BANCORP INC,,IROQ,,...,1.0,1.0,14.75,10.0,0.135593,-0.006803,-0.00681,0.002878,0.00286,-0.009549
2000-01-03,10012,,11,3,3670,24871930,DPAC,DENSE PACIFIC MICROSYSTEMS INC,,DPAC,,...,1.0,1.0,8.375,409.0,-0.003891,-0.006803,-0.00681,0.002878,0.00286,-0.009549
2000-01-03,10016,,11,3,3560,81002230,SCTT,SCOTT TECHNOLOGIES INC,,SCTT,,...,1.0,1.0,18.125,25.0,0.039735,-0.006803,-0.00681,0.002878,0.00286,-0.009549


In [5]:
CRSP_IGNORED = [-66, -77, -88, -99]

jkp_data = jkp_data[
    (jkp_data["ret"] != CRSP_IGNORED[0])
    & (jkp_data["ret"] != CRSP_IGNORED[1])
    & (jkp_data["ret"] != CRSP_IGNORED[2])
    & (jkp_data["ret"] != CRSP_IGNORED[3])
]

In [6]:
jkp_data["prc_new"] = np.abs(jkp_data["prc"]) / jkp_data["cfacpr"].ffill().fillna(
    1
).replace(0, 1.0)
jkp_data["prc"] = jkp_data["prc_new"]

In [7]:
jkp_data["ret"] = jkp_data["ret"].replace("C", np.nan).astype(float)

In [8]:
jkp_data["dolvol"] = jkp_data["vol"] * jkp_data["prc"]

In [9]:
MIN_OBS = 5

n_obs = jkp_data.groupby("permno").apply(lambda x: (x["prc"] >= 0).sum())
valid_ids = n_obs[n_obs > MIN_OBS].index

In [10]:
N_TOP_LIQUID = 5_000

dolvol = (
    jkp_data[jkp_data.index.get_level_values("permno").isin(valid_ids)]
    .groupby("permno")["dolvol"]
    .mean()
)
valid_ids = dolvol.sort_values(ascending=False).iloc[:N_TOP_LIQUID].index

In [11]:
selected = jkp_data[jkp_data.index.get_level_values("permno").isin(valid_ids)]
selected["spread"] = (selected["ask"] - selected["bid"]) / selected["prc"]
spread = selected.groupby("permno")["spread"].mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected["spread"] = (selected["ask"] - selected["bid"]) / selected["prc"]


In [12]:
spread.min(), spread.max(), spread.median()

(np.float64(-7.058486800494879e-05),
 np.float64(0.3935876939468288),
 np.float64(0.0026028029163351124))

In [13]:
spread.quantile(0.9)

np.float64(0.010285263877724424)

In [14]:
dolvol.sort_values(ascending=False).iloc[5000]

np.float64(5605295.355753517)

In [15]:
pivoted_prices = (
    jkp_data.loc[jkp_data.index.get_level_values("permno").isin(valid_ids)]
    .reset_index()
    .pivot_table(index="date", columns="permno", values="prc")
)
pivoted_prices = pivoted_prices.replace(0, np.nan)

pivoted_prices.columns = [f"{col}_Price" for col in pivoted_prices.columns]

pivoted_prices = pivoted_prices.replace(np.inf, np.nan)

pivoted_prices = np.abs(pivoted_prices)

In [16]:
pivoted_returns = (
    jkp_data.loc[jkp_data.index.get_level_values("permno").isin(valid_ids)]
    .reset_index()
    .pivot_table(index="date", columns="permno", values="ret")
)

In [17]:
full_df = pivoted_prices.merge(
    pivoted_returns, left_index=True, right_index=True, how="inner"
)

In [18]:
data_df = pd.read_csv(ExperimentConfig.PATH_OUTPUT / "data_df.csv")
data_df["date"] = pd.to_datetime(data_df["date"])
data_df = data_df.set_index("date")

In [19]:
full_df = full_df.merge(data_df[["acc_rate", "spx"]], left_index=True, right_index=True)

In [20]:
full_df.to_csv(ExperimentConfig.PATH_OUTPUT / "liquid_data.csv")

In [21]:
pd.DataFrame([price.split("_Price")[0] for price in pivoted_prices]).to_csv(
    ExperimentConfig.PATH_OUTPUT / "liquid_stocks_list.csv",
    index=False,
)

In [22]:
len([price.split("_Price")[0] for price in pivoted_prices])

5000