In [18]:
import pandas as pd
import yfinance as yf
from datetime import timedelta

In [4]:
# Clone the project repository into the current working directory; its src/
# folder is added to sys.path further down so its modules can be imported.
!git clone https://github.com/brianrp09232000/multimodal-eq-sizing.git

Cloning into 'multimodal-eq-sizing'...
remote: Enumerating objects: 270, done.[K
remote: Counting objects: 100% (43/43), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: Total 270 (delta 18), reused 5 (delta 3), pack-reused 227 (from 1)[K
Receiving objects: 100% (270/270), 88.76 KiB | 1.23 MiB/s, done.
Resolving deltas: 100% (137/137), done.


In [7]:
import os
# Display the current working directory (the relative `git clone` above lands here).
os.getcwd()

'/kaggle/working'

In [12]:
import sys

# Make the cloned repo's src/ directory importable. The membership check keeps
# the cell idempotent: re-running it no longer appends duplicate sys.path entries.
src_dir = "/kaggle/working/multimodal-eq-sizing/src"
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [14]:
from data.loaders import get_return_data, get_single_ticker_history, get_tickers_history

In [None]:
"""
import kagglehub

# Download latest version
path = kagglehub.dataset_download("shtrausslearning/news-trading")

print("Path to dataset files:", path)

return_data = pd.read_csv("/kaggle/input/news-trading/return_data.csv")
"""

In [16]:
# Ticker universe for the study, written as one whitespace-separated string
# and split into a list.
tickers = 'AAPL MSFT AMZN GOOG AMD NVDA TSLA YELP NFLX ADBE BA AIG'.split()

# Sample window bounds as UTC-aware timestamps.
start = pd.to_datetime('2010-01-01', utc=True)
end = pd.to_datetime('2018-12-31', utc=True)

# Price history for every ticker over the sample window.
input_df = get_tickers_history(tickers, start, end)

In [19]:
# Start the fetch 400 calendar days before `start` so the 252-row lookback
# used by the momentum calculation has history available.
buffer_start = start - timedelta(days=400)

# Fetch the buffered history, then order it by ticker and date.
df = (
    get_tickers_history(input_df['ticker'].unique().tolist(), buffer_start, end)
    .sort_values(by=['ticker', 'Date'])
    .reset_index(drop=True)
)

In [24]:
# Calculate 12-1 momentum: Close[t-21] / Close[t-252] - 1, per ticker
# (shifts are in rows, so `df` must be sorted by ticker and date).
df['Close_t_21']  = df.groupby('ticker')['Close'].shift(21)
df['Close_t_252'] = df.groupby('ticker')['Close'].shift(252)
df['mom_12_1'] = df['Close_t_21'] / df['Close_t_252'] - 1

# Calculate 12-1 momentum rank within each date
df['mom_position'] = (
df
.groupby('Date')['mom_12_1']
.rank(method='first')
)
# N counts only tickers with a defined momentum on that date. rank() skips NaN
# values, so counting every ticker row would compress mom_rank below 1.0
# whenever some tickers lack enough price history.
df['N'] = df.groupby('Date')['mom_12_1'].transform('count')
df['mom_rank'] = df['mom_position'] / df['N']

In [25]:
# Keep only rows on or after `start` (i.e. discard the buffer period) and only
# the key columns plus the two momentum features.
df2 = df.loc[df['Date'] >= start, ['Date', 'ticker', 'mom_12_1', 'mom_rank']]

# Left-join the momentum features onto the input dataset by ticker and date.
output_df = input_df.merge(df2, how='left', on=['ticker', 'Date'])

In [26]:
# Inspect the merged result: the input columns plus mom_12_1 and mom_rank.
output_df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,mom_12_1,mom_rank
0,2010-01-04 00:00:00-05:00,6.400988,6.433079,6.369498,6.418383,493729600,0.0,0.0,AAPL,1.162314,0.800000
1,2010-01-05 00:00:00-05:00,6.436077,6.465768,6.395588,6.429479,601904800,0.0,0.0,AAPL,1.077394,0.800000
2,2010-01-06 00:00:00-05:00,6.429478,6.454970,6.320610,6.327209,552160000,0.0,0.0,AAPL,1.078263,0.800000
3,2010-01-07 00:00:00-05:00,6.350604,6.358102,6.269628,6.315514,477131200,0.0,0.0,AAPL,1.076145,0.800000
4,2010-01-08 00:00:00-05:00,6.307117,6.358102,6.269929,6.357502,447610800,0.0,0.0,AAPL,1.048219,0.800000
...,...,...,...,...,...,...,...,...,...,...,...
26496,2018-12-24 00:00:00-05:00,31.095769,32.152419,30.659689,31.305422,5266300,0.0,0.0,AIG,-0.256927,0.250000
26497,2018-12-26 00:00:00-05:00,31.305417,32.387226,30.642913,32.362068,6190300,0.0,0.0,AIG,-0.255689,0.250000
26498,2018-12-27 00:00:00-05:00,32.068555,32.982643,31.657633,32.982643,6799200,0.0,0.0,AIG,-0.248971,0.083333
26499,2018-12-28 00:00:00-05:00,33.125202,33.318083,32.169184,32.387222,6410300,0.0,0.0,AIG,-0.257539,0.166667


In [None]:
def compute_momentum_rank(input_df):
    """
    Compute 12-1 momentum and cross-sectional momentum rank for each ticker.
    12-1 momentum = Close[t-21] / Close[t-252] - 1 (shifts are in rows of the
    per-ticker price history, i.e. trading days).
    cross-sectional rank each day = position / N, where position is the
    ascending rank of mom_12_1 among tickers on that date (ties broken by
    ticker order) and N is the number of tickers with a defined mom_12_1 on
    that date. Tickers without enough history get NaN for both features.
    ----------
    Input dataset must contain columns ['ticker', 'Date'];
    existing 'mom_12_1' / 'mom_rank' columns are recomputed, not duplicated.
    ----------
    Output dataset adds additional columns:['mom_12_1','mom_rank']
    The input dataset itself is not modified.
    """
    feature_cols = ['mom_12_1', 'mom_rank']

    # Drop stale feature columns so that re-running on an already enriched
    # frame does not produce suffixed duplicates (mom_12_1_x / mom_12_1_y).
    base_df = input_df.drop(columns=feature_cols, errors='ignore')

    # Nothing to fetch for an empty input: min()/max() would be NaT.
    if base_df.empty:
        return base_df.assign(**{col: float('nan') for col in feature_cols})

    # Fetch prices starting at a buffer_start date to support the 252-row lookback.
    start = base_df['Date'].min()
    end = base_df['Date'].max()
    buffer_start = start - timedelta(days=400)
    # Pad the end by one day so the last input date is still covered if the
    # loader treats `end` as exclusive (yfinance does -- TODO confirm for
    # get_tickers_history). Extra rows are discarded by the left merge below.
    fetch_end = end + timedelta(days=1)
    prices = get_tickers_history(list(base_df['ticker'].unique()), buffer_start, fetch_end)
    prices = prices.sort_values(['ticker', 'Date']).reset_index(drop=True)

    # Compute 12-1 momentum per ticker.
    close_by_ticker = prices.groupby('ticker')['Close']
    prices['mom_12_1'] = close_by_ticker.shift(21) / close_by_ticker.shift(252) - 1

    # Compute cross-sectional 12-1 momentum ranking. rank() skips NaN values,
    # so the denominator counts only non-null momentum values on each date;
    # this keeps the best-ranked ticker at exactly 1.0.
    mom_by_date = prices.groupby('Date')['mom_12_1']
    prices['mom_rank'] = mom_by_date.rank(method='first') / mom_by_date.transform('count')

    # Drop buffer dates data; keep only the merge keys and the two features.
    features = prices.loc[prices['Date'] >= start, ['Date', 'ticker'] + feature_cols]

    # Merge 12-1 momentum and rank into the input dataset.
    return pd.merge(base_df, features, on=['ticker', 'Date'], how='left')

In [None]:
# Check n_obs per year per ticker
# pd.to_datetime makes the year extraction work whether 'date' is already a
# datetime column or was read from CSV as plain strings (where .dt would raise).
return_data['year'] = pd.to_datetime(return_data['date']).dt.year
count_df = return_data.groupby(['ticker', 'year']).size().reset_index(name='n_obs')
# Distinct observation counts across all (ticker, year) pairs.
count_df["n_obs"].unique()

# log market price