In [1]:
from __future__ import annotations

from pathlib import Path

import pandas as pd

from run import Dataset

# Dataset configuration for this notebook; DATASET.DF_FILENAME is the CSV name
# used further down for both loading and saving the data frame.
TOP_N = 500
DATASET = Dataset.TOPN_US.value(topn=TOP_N)

In [2]:
# Factor names (values of the "name" column) to keep from the raw JKP file.
SELECTED_FACTORS = ["low_risk", "momentum", "size", "quality", "value"]

# Load the raw long-format factor file and keep only the selected factors.
jkp_factors = pd.read_csv(Path("../../data/jkp_raw") / "jkp_factors.csv")
# .copy() makes the filtered rows an independent frame, so the column
# assignment below does not write into a slice of the original
# (which raises SettingWithCopyWarning on pandas < 3).
jkp_factors = jkp_factors[jkp_factors["name"].isin(SELECTED_FACTORS)].copy()
jkp_factors["date"] = pd.to_datetime(jkp_factors["date"])

In [3]:
# Reshape long -> wide: one row per date, one column per factor name, cells
# holding "ret". pivot_table's default aggregation (mean) applies if a
# (date, name) pair occurs more than once.
factors = pd.pivot_table(jkp_factors, values="ret", index="date", columns="name")
factors

name,low_risk,momentum,quality,size,value
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1926-01-02,,,,0.002354,
1926-01-04,,,,0.005459,
1926-01-05,,,,0.003777,
1926-01-06,,,,-0.000590,
1926-01-07,,,,0.002057,
...,...,...,...,...,...
2024-12-24,-0.003400,0.001616,-0.000962,0.001315,-0.002292
2024-12-26,-0.003880,-0.000633,-0.002414,0.008669,-0.001183
2024-12-27,0.007210,-0.004166,0.000425,-0.003512,0.005444
2024-12-30,0.003394,0.000445,-0.001043,0.001675,0.004711


In [4]:
# Load the dataset CSV and index it by its parsed "date" column.
dataset_path = Path("../../data/output") / DATASET.DF_FILENAME
data_df = (
    pd.read_csv(dataset_path)
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .set_index("date")
)

In [5]:
# Read the price sheet (first 6 rows of the workbook are skipped), index it by
# date in ascending order and convert the "PX_LAST" level into simple
# period-over-period returns stored under the column name "spx".
spx_path = Path("../../data/gw_replication") / "spx.xlsx"
spx = (
    pd.read_excel(spx_path, skiprows=6)
    .rename(columns={"Date": "date", "PX_LAST": "spx"})
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .set_index("date")
    .sort_index()
    .loc[:, ["spx"]]
    .pct_change()  # first row has no predecessor, so it is NaN
)
spx

Unnamed: 0_level_0,spx
date,Unnamed: 1_level_1
1950-01-30,
1950-01-31,0.001763
1950-02-01,0.000000
1950-02-02,0.010557
1950-02-03,0.003482
...,...
2025-01-22,0.006138
2025-01-23,0.005314
2025-01-24,-0.002855
2025-01-27,-0.014581


In [6]:
# Read the daily file, parse its YYYYMMDD "Date" column into a datetime index
# and keep the "RF" column divided by 100 as a Series named "rf".
rf_path = Path("../../data/ff") / "FFDaily.xlsx"
rf = (
    pd.read_excel(rf_path)
    .rename(columns={"Date": "date", "RF": "rf"})
    .assign(date=lambda frame: pd.to_datetime(frame["date"], format="%Y%m%d"))
    .set_index("date")["rf"]
    .div(100)
)
rf

date
1926-07-01    0.00009
1926-07-02    0.00009
1926-07-06    0.00009
1926-07-07    0.00009
1926-07-08    0.00009
               ...   
2024-08-26    0.00022
2024-08-27    0.00022
2024-08-28    0.00022
2024-08-29    0.00022
2024-08-30    0.00022
Name: rf, Length: 25817, dtype: float64

In [7]:
from qamsi.market_data.risk_free_conventions import accrue_risk_free_rate

# Replace the daily `rf` series with the output of the project's accrual helper.
# NOTE(review): `rf` is rebound from itself, so executing this cell twice in the
# same kernel feeds already-processed rates back into the helper; re-run the
# cell that loads `rf` before re-running this one.
# The exact accrual convention lives in qamsi and is not visible here — TODO confirm.
rf = accrue_risk_free_rate(rf_rate=rf)

In [8]:
# Left-join the accrued rate onto the return dates as column "acc_rate";
# return dates with no matching rate keep NaN in that column.
acc_rate = rf.rename("acc_rate")
spx = pd.merge(spx, acc_rate, how="left", left_index=True, right_index=True)
spx

Unnamed: 0_level_0,spx,acc_rate
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1950-01-30,,0.00008
1950-01-31,0.001763,0.00004
1950-02-01,0.000000,0.00004
1950-02-02,0.010557,0.00004
1950-02-03,0.003482,0.00004
...,...,...
2025-01-22,0.006138,
2025-01-23,0.005314,
2025-01-24,-0.002855,
2025-01-27,-0.014581,


In [9]:
# Subtract "acc_rate" from the "spx" return row by row and keep the result as a
# one-column frame named "spx". Rows without an acc_rate value come out NaN.
spx_minus_rate = spx["spx"] - spx["acc_rate"]
spx = spx_minus_rate.to_frame(name="spx")
spx

Unnamed: 0_level_0,spx
date,Unnamed: 1_level_1
1950-01-30,
1950-01-31,0.001723
1950-02-01,-0.000040
1950-02-02,0.010517
1950-02-03,0.003442
...,...
2025-01-22,
2025-01-23,
2025-01-24,
2025-01-27,


In [10]:
# Left-join the `spx` column(s) onto the factor table by date.
# Any column of the same name left over from a previous execution of this cell
# is dropped first; otherwise merge would keep both copies under "_x"/"_y"
# suffixes and those would propagate into data_df.
factors = factors.drop(columns=spx.columns, errors="ignore").merge(
    spx, left_index=True, right_index=True, how="left"
)

In [11]:
# Left-join the factor (and spx) columns onto the dataset by date.
# The notebook later saves data_df to the same CSV it was loaded from, so on a
# second run (or a re-executed cell) data_df can already contain these columns.
# Drop the stale copies first so the merge replaces them instead of emitting
# "_x"/"_y" suffixed duplicates. On a fresh dataset nothing is dropped.
data_df = data_df.drop(columns=factors.columns, errors="ignore").merge(
    factors, left_index=True, right_index=True, how="left"
)
data_df.shape

(11324, 2484)

In [12]:
# Left-join the accrued rate onto the dataset by date as column "acc_rate".
# An "acc_rate" column left over from a previous run (the saved CSV is also the
# input of this notebook) or from re-executing this cell is dropped first, so
# the merge cannot produce "acc_rate_x"/"acc_rate_y" duplicates.
data_df = data_df.drop(columns="acc_rate", errors="ignore").merge(
    rf.rename("acc_rate"), left_index=True, right_index=True, how="left"
)
data_df.shape

(11324, 2485)

In [13]:
# Sanity check: list the final columns. The merged factor columns, "spx" and
# "acc_rate" should each appear exactly once, with no "_x"/"_y" suffixes.
data_df.columns

Index(['10006', '10078', '10095', '10104', '10107', '10108', '10119', '10137',
       '10138', '10145',
       ...
       'lagged_target', 'target_rolling_mean', 'target_rolling_vol',
       'low_risk', 'momentum', 'quality', 'size', 'value', 'spx', 'acc_rate'],
      dtype='object', length=2485)

In [14]:
# data_df[factors.columns].to_csv(Path("../../data/output") / "factors.csv")

In [15]:
# data_df[data_df.columns.difference(factors.columns.tolist() + ["spx"])].to_csv(Path("../../data/output") / "spx_data.csv")

In [16]:
# Persist the augmented frame.
# NOTE: this is the same path data_df was loaded from earlier in the notebook,
# so the input file is overwritten in place with the extra columns.
output_path = Path("../../data/output") / DATASET.DF_FILENAME
data_df.to_csv(output_path)