In [1]:
!rm -rf /kaggle/working/multimodal-eq-sizing
!git clone https://github.com/brianrp09232000/multimodal-eq-sizing.git /kaggle/working/multimodal-eq-sizing
!pip install -r /kaggle/working/multimodal-eq-sizing/requirements.txt

Cloning into '/kaggle/working/multimodal-eq-sizing'...
remote: Enumerating objects: 448, done.[K
remote: Counting objects: 100% (68/68), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 448 (delta 46), reused 30 (delta 30), pack-reused 380 (from 3)[K
Receiving objects: 100% (448/448), 173.26 KiB | 2.25 MiB/s, done.
Resolving deltas: 100% (256/256), done.


In [2]:
import sys
import pathlib
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import yfinance as yf

In [3]:
np.seterr(invalid="ignore")

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [4]:
repo_root = pathlib.Path("/kaggle/working/multimodal-eq-sizing")
sys.path.append(str(repo_root))

In [5]:
from src.data.loaders import (
    get_single_ticker_history,
    get_tickers_history,
    get_return_data,
    get_excess_return,
    get_vix_data,
    get_spread_z,
    get_sector_map,
    get_adv_dollar
)

In [6]:
""" Active this code after PR approved
from src.data.features.price_features import (
compute_momentum_rank,
get_log_mktcap,
compute_SPY_r1
)
"""

' Active this code after PR approved\nfrom src.data.features.price_features import (\ncompute_momentum_rank,\nget_log_mktcap,\ncompute_SPY_r1\n)\n'

In [7]:
def get_date_range(df: pd.DataFrame) -> tuple:
    grouped_by_date = df.groupby(["ticker"]).agg(['min', 'max', 'count'])["Date"]
    start = grouped_by_date["min"].min()
    end = grouped_by_date["max"].max()
    return start, end

In [8]:
df = get_return_data("/kaggle/input/news-trading/return_data.csv")
start, end = get_date_range(df)

In [9]:
start

Timestamp('2010-01-04 05:00:00+0000', tz='UTC')

In [10]:
end

Timestamp('2018-12-28 05:00:00+0000', tz='UTC')

In [11]:
df['ticker'].unique()

array(['AAPL', 'MSFT', 'AMZN', 'GOOG', 'META', 'WMT', 'JPM', 'TSLA',
       'NFLX', 'ADBE'], dtype=object)

# Add 12-1 momentum and momentum rank

In [12]:
def compute_momentum_rank(input_df):
    """
    Compute 12-1 momentum and cross-sectional momentum rank for each ticker.
    12-1 momentum = Close[t-21] / Close[t-252] - 1.
    cross-sectional rank each day = position / N.
    ----------
    Input dataset must contain columns ['ticker', 'Date']; 
    ----------
    Output dataset adds additional columns:['mom_12_1','mom_rank']
    """

    # Fetch data from yfinance starting at a buffer_start date to support 12â€“1 momentum calculations
    start = input_df['Date'].min() 
    end = input_df['Date'].max() 
    buffer_start = start - timedelta(days=400)
    df = get_tickers_history(list(input_df['ticker'].unique()), buffer_start, end)
    df = df.sort_values(['ticker', 'Date']).reset_index(drop=True)

    # Compute 12-1 momentum
    df['Close_t_21']  = df.groupby('ticker')['Close'].shift(21)
    df['Close_t_252'] = df.groupby('ticker')['Close'].shift(252)
    df['mom_12_1'] = df['Close_t_21'] / df['Close_t_252'] - 1

    # Compute cross-sectional 12-1 momentum ranking
    df['mom_position'] = (
        df.groupby('Date')['mom_12_1']
        .rank(method='first')
    )
    df['N'] = df.groupby('Date')['ticker'].transform('count')
    df['mom_rank'] = df['mom_position'] / df['N']

    # Drop buffer dates data; Keep mom_12_1 and mom_rank columns
    df2 = df[df['Date']>=start]
    df2 = df2[['Date','ticker','mom_12_1','mom_rank']]
    
    # Merge 12-1 momentum and rank into the input dataset
    output_df = pd.merge(input_df, df2, on =['ticker','Date'], how='left')
    return output_df

In [13]:
df = compute_momentum_rank(df)
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,mom_12_1,mom_rank
0,2010-01-04 05:00:00+00:00,6.487649,6.520174,6.455732,6.505279,493729600,0.0,0.0,AAPL,1.162314,0.875
1,2010-01-05 05:00:00+00:00,6.523214,6.553307,6.482178,6.516527,601904800,0.0,0.0,AAPL,1.077395,0.875
2,2010-01-06 05:00:00+00:00,6.516527,6.542364,6.406185,6.412873,552160000,0.0,0.0,AAPL,1.078262,0.875
3,2010-01-07 05:00:00+00:00,6.436583,6.444183,6.354511,6.401018,477131200,0.0,0.0,AAPL,1.076146,0.875
4,2010-01-08 05:00:00+00:00,6.392506,6.444181,6.354814,6.443573,447610800,0.0,0.0,AAPL,1.048220,0.875
...,...,...,...,...,...,...,...,...,...,...,...
21904,2018-12-21 05:00:00+00:00,219.550003,222.259995,207.369995,208.800003,8828100,0.0,0.0,ADBE,0.261004,0.800
21905,2018-12-24 05:00:00+00:00,206.899994,212.580002,204.949997,205.160004,2940500,0.0,0.0,ADBE,0.294569,0.900
21906,2018-12-26 05:00:00+00:00,206.529999,222.960007,206.250000,222.949997,5897900,0.0,0.0,ADBE,0.288914,0.900
21907,2018-12-27 05:00:00+00:00,219.990005,225.169998,214.160004,225.139999,3931100,0.0,0.0,ADBE,0.329741,0.800


# Add log_mkt_cap

In [14]:
def get_log_mktcap(input_df):
    """
    Compute log(market capitalization) for each ticker.
    log(market cap_t) = log(Close_t) * SharesOutstanding. 
    * Note that Yahoo only provides the latest shares outstanding, so we don't have the historical 
    shares outstanding at time t. The calculation method for market capitalization may not be accurate.
    ----------
    Input dataset must contain columns ['ticker', 'Date']; 
    ----------
    Output dataset adds additional columns:['log_mktcap']
    """
    start = input_df['Date'].min() 
    end = input_df['Date'].max()
    tickers = list(input_df['ticker'].unique())
    
    rows = []
    for ticker in tickers:
        stock = yf.Ticker(ticker)

        # Get CURRENT shares outstanding
        shares = stock.info.get("sharesOutstanding", None)

        # Fetch historical price
        df_price = get_single_ticker_history(ticker, start, end)

        # Add ticker column
        df_price["ticker"] = ticker
        df_price = df_price.reset_index()

        # Compute market cap and log market cap
        if shares is not None:
            df_price["mktcap"] = df_price["Close"] * shares
            df_price["log_mktcap"] = (df_price["mktcap"]).apply(
                lambda x: None if pd.isna(x) else np.log(x)
            )
        else:
            df_price["market_cap"] = None
            df_price["log_mktcap"] = None

        rows.append(df_price)
    # concat all tickers and merge with the input dataset    
    mktcap = pd.concat(rows, ignore_index=True)[['Date','ticker','log_mktcap']]
    output_df = pd.merge(input_df, mktcap, on=['Date','ticker'], how='left')
    
    return output_df

In [15]:
df = get_log_mktcap(df)
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,mom_12_1,mom_rank,log_mktcap
0,2010-01-04 05:00:00+00:00,6.487649,6.520174,6.455732,6.505279,493729600,0.0,0.0,AAPL,1.162314,0.875,25.275460
1,2010-01-05 05:00:00+00:00,6.523214,6.553307,6.482178,6.516527,601904800,0.0,0.0,AAPL,1.077395,0.875,25.277188
2,2010-01-06 05:00:00+00:00,6.516527,6.542364,6.406185,6.412873,552160000,0.0,0.0,AAPL,1.078262,0.875,25.261153
3,2010-01-07 05:00:00+00:00,6.436583,6.444183,6.354511,6.401018,477131200,0.0,0.0,AAPL,1.076146,0.875,25.259303
4,2010-01-08 05:00:00+00:00,6.392506,6.444181,6.354814,6.443573,447610800,0.0,0.0,AAPL,1.048220,0.875,25.265929
...,...,...,...,...,...,...,...,...,...,...,...,...
21904,2018-12-21 05:00:00+00:00,219.550003,222.259995,207.369995,208.800003,8828100,0.0,0.0,ADBE,0.261004,0.800,25.193803
21905,2018-12-24 05:00:00+00:00,206.899994,212.580002,204.949997,205.160004,2940500,0.0,0.0,ADBE,0.294569,0.900,25.176217
21906,2018-12-26 05:00:00+00:00,206.529999,222.960007,206.250000,222.949997,5897900,0.0,0.0,ADBE,0.288914,0.900,25.259374
21907,2018-12-27 05:00:00+00:00,219.990005,225.169998,214.160004,225.139999,3931100,0.0,0.0,ADBE,0.329741,0.800,25.269149


# Add SPY r1

In [16]:
def compute_SPY_r1 (input_df):
    """
    Compute SPY r1 = (Close_t/Clost_t_1) -1
    ----------
    Input dataset must contain columns ['Date']. 
    ----------
    Output dataset adds additional columns:['spy_r1']
    """
    start = input_df['Date'].min() 
    end = input_df['Date'].max()
    buffer_start = start - timedelta(days=7)
    
    df_spy = get_single_ticker_history("SPY", buffer_start, end)
    df_spy["spy_r1"] = df_spy["Close"] / df_spy["Close"].shift(1) - 1
    df_spy = df_spy[df_spy['Date']>=start]
    df_spy = df_spy[['Date','spy_r1']]
    
    output_df = pd.merge(input_df, df_spy, on=['Date'], how='left')
    return output_df

In [17]:
df = compute_SPY_r1(df)
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,mom_12_1,mom_rank,log_mktcap,spy_r1
0,2010-01-04 05:00:00+00:00,6.487649,6.520174,6.455732,6.505279,493729600,0.0,0.0,AAPL,1.162314,0.875,25.275460,0.016959
1,2010-01-05 05:00:00+00:00,6.523214,6.553307,6.482178,6.516527,601904800,0.0,0.0,AAPL,1.077395,0.875,25.277188,0.002647
2,2010-01-06 05:00:00+00:00,6.516527,6.542364,6.406185,6.412873,552160000,0.0,0.0,AAPL,1.078262,0.875,25.261153,0.000704
3,2010-01-07 05:00:00+00:00,6.436583,6.444183,6.354511,6.401018,477131200,0.0,0.0,AAPL,1.076146,0.875,25.259303,0.004222
4,2010-01-08 05:00:00+00:00,6.392506,6.444181,6.354814,6.443573,447610800,0.0,0.0,AAPL,1.048220,0.875,25.265929,0.003327
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21904,2018-12-21 05:00:00+00:00,219.550003,222.259995,207.369995,208.800003,8828100,0.0,0.0,ADBE,0.261004,0.800,25.193803,-0.020490
21905,2018-12-24 05:00:00+00:00,206.899994,212.580002,204.949997,205.160004,2940500,0.0,0.0,ADBE,0.294569,0.900,25.176217,-0.026423
21906,2018-12-26 05:00:00+00:00,206.529999,222.960007,206.250000,222.949997,5897900,0.0,0.0,ADBE,0.288914,0.900,25.259374,0.050525
21907,2018-12-27 05:00:00+00:00,219.990005,225.169998,214.160004,225.139999,3931100,0.0,0.0,ADBE,0.329741,0.800,25.269149,0.007677


# Add excess return

In [18]:
def add_excess_return(df, start, end):
    excess_return_df = get_excess_return(df, start, end)
    df = df.merge(excess_return_df, on=["ticker", "Date"], how="left")
    return df

In [19]:
df = add_excess_return(df, start, end)

# Add market regime VIX z-score

In [20]:
def add_vix_z(df, start, end):
    vix_z_df = get_vix_data(start, end)
    format_str = "%Y-%m-%d"
    vix_z_df["Date"] = vix_z_df["Date"].dt.strftime(format_str)
    df["Date"] = df["Date"].dt.strftime(format_str) 
    df = df.merge(vix_z_df, on=["Date"], how="left")
    df['Date'] = pd.to_datetime(df['Date'], utc=True)
    return df

In [21]:
df = add_vix_z(df, start, end)

Yay!ðŸ¥³


# Add spread z-score

In [22]:
def add_spread_z(existing_df: pd.DataFrame, buffer_days=380) -> pd.DataFrame:
    """
    Use existing OHLCV df, pull buffered history, compute young-safe spread_z on the combined
    Then merge back only the target window rows to prevent nulls.
    """
    df = existing_df.copy()
    start, end = df["Date"].min(), df["Date"].max()

    tickers = sorted(df['ticker'].unique())
    fetch_start = start - timedelta(days=buffer_days)
    fetch_end   = end

    # You already have get_tickers_history(tickers, start, end)
    hist = get_tickers_history(tickers, fetch_start, fetch_end)
    hist["Date"] = pd.to_datetime(hist["Date"], utc=True)

    # Combine buffer + existing; keep existing rows on overlap
    combined = pd.concat([hist, df], ignore_index=True)
    combined = combined.sort_values(['ticker', "Date"])
    combined = combined.drop_duplicates(subset=['ticker', "Date"], keep="last")

    # Compute young-safe spread_z on the full combined range
    combined = get_spread_z(combined)

    # Merge only computed columns back to target window
    cols_to_merge = ['ticker', 'Date', "spread_z"]
    out = df.merge(combined[cols_to_merge], on=['ticker', 'Date'], how="left")

    # Final minimal, causal clean-up to guarantee NON-NULL spread_z in target window:
    # 1) per-ticker forward-fill (past only), 2) same-day cross-section median, 3) final 0
    out["spread_z"] = (
        out.groupby('ticker')["spread_z"].ffill()
           .fillna(out.groupby('Date')["spread_z"].transform("median"))
           .fillna(0.0)
    ).clip(-3, 3)

    return out

In [23]:
df = add_spread_z(df)

# Add sector

In [24]:
def add_sector(df):
    tickers = df["ticker"].unique()
    sector_map = get_sector_map(tickers)
    df = df.join(sector_map, on="ticker")
    return df

In [25]:
df = add_sector(df)

# Add dollar-volume ADV 

In [26]:
def add_adv_dollar(df):
    adv_df = get_adv_dollar(df)
    
    df = df.merge(
        adv_df,
        on=["Date", "ticker"],
        how="left",
    )
    return df

In [27]:
df = add_adv_dollar(df)

# Final Complete Dataset

In [28]:
df.to_csv('final_dataset.csv', index=False)

In [29]:
get_return_data("/kaggle/working/final_dataset.csv")

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,mom_12_1,...,log_mktcap,spy_r1,o2c_return,excess_return,VIX_Close,VIX_z,spread_z,sector,dollar_volume,adv_dollar
0,2010-01-04 00:00:00+00:00,6.487649,6.520174,6.455732,6.505279,493729600,0.0,0.0,AAPL,1.162314,...,25.275460,0.016959,0.002718,-0.005826,20.040001,-1.249591,-0.231351,Technology,3.211849e+09,
1,2010-01-05 00:00:00+00:00,6.523214,6.553307,6.482178,6.516527,601904800,0.0,0.0,AAPL,1.077395,...,25.277188,0.002647,-0.001025,-0.004292,19.350000,-1.314181,-0.015453,Technology,3.922329e+09,
2,2010-01-06 00:00:00+00:00,6.516527,6.542364,6.406185,6.412873,552160000,0.0,0.0,AAPL,1.078262,...,25.261153,0.000704,-0.015906,-0.017580,19.160000,-1.323599,0.871331,Technology,3.540932e+09,
3,2010-01-07 00:00:00+00:00,6.436583,6.444183,6.354511,6.401018,477131200,0.0,0.0,AAPL,1.076146,...,25.259303,0.004222,-0.005525,-0.011605,19.059999,-1.324017,1.082035,Technology,3.054125e+09,
4,2010-01-08 00:00:00+00:00,6.392506,6.444181,6.354814,6.443573,447610800,0.0,0.0,AAPL,1.048220,...,25.265929,0.003327,0.007989,0.002018,18.129999,-1.413993,0.266572,Technology,2.884213e+09,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21904,2018-12-21 00:00:00+00:00,219.550003,222.259995,207.369995,208.800003,8828100,0.0,0.0,ADBE,0.261004,...,25.193803,-0.020490,-0.048964,-0.024485,30.110001,2.909232,-1.825644,Technology,1.843307e+09,1.187080e+09
21905,2018-12-24 00:00:00+00:00,206.899994,212.580002,204.949997,205.160004,2940500,0.0,0.0,ADBE,0.294569,...,25.176217,-0.026423,-0.008410,0.011252,36.070000,3.000000,-2.249653,Technology,6.032730e+08,1.198173e+09
21906,2018-12-26 00:00:00+00:00,206.529999,222.960007,206.250000,222.949997,5897900,0.0,0.0,ADBE,0.288914,...,25.259374,0.050525,0.079504,0.036236,30.410000,2.812017,-2.164573,Technology,1.314937e+09,1.222124e+09
21907,2018-12-27 00:00:00+00:00,219.990005,225.169998,214.160004,225.139999,3931100,0.0,0.0,ADBE,0.329741,...,25.269149,0.007677,0.023410,0.000736,29.959999,2.675121,-2.151093,Technology,8.850479e+08,1.232904e+09


In [30]:
!rm -rf /kaggle/working/multimodal-eq-sizing