In [4]:
import pandas as pd
import yfinance as yf
from datetime import timedelta

In [8]:
!git clone https://github.com/brianrp09232000/multimodal-eq-sizing.git

Cloning into 'multimodal-eq-sizing'...
remote: Enumerating objects: 284, done.[K
remote: Counting objects: 100% (57/57), done.[K
remote: Compressing objects: 100% (54/54), done.[K
remote: Total 284 (delta 23), reused 5 (delta 3), pack-reused 227 (from 1)[K
Receiving objects: 100% (284/284), 95.86 KiB | 1.28 MiB/s, done.
Resolving deltas: 100% (142/142), done.


In [5]:
import os
os.getcwd()

'/kaggle/working'

In [11]:
import sys
sys.path.append("/kaggle/working/multimodal-eq-sizing/src")

In [12]:
from data.loaders import get_return_data, get_single_ticker_history, get_tickers_history

In [8]:
"""
import kagglehub

# Download latest version
path = kagglehub.dataset_download("shtrausslearning/news-trading")

print("Path to dataset files:", path)

return_data = pd.read_csv("/kaggle/input/news-trading/return_data.csv")
"""

'\nimport kagglehub\n\n# Download latest version\npath = kagglehub.dataset_download("shtrausslearning/news-trading")\n\nprint("Path to dataset files:", path)\n\nreturn_data = pd.read_csv("/kaggle/input/news-trading/return_data.csv")\n'

In [13]:
tickers = ['AAPL','MSFT','AMZN','GOOG',
           'AMD','NVDA','TSLA','YELP',
           'NFLX','ADBE','BA','AIG']

start = pd.to_datetime('2010-01-01', utc=True)
end = pd.to_datetime('2018-12-31', utc=True)

input_df = get_tickers_history(tickers, start, end)

In [10]:
buffer_start = start - timedelta(days=400)
df = get_tickers_history(list(input_df['ticker'].unique()), buffer_start, end)

# sort by ticker and date
df = df.sort_values(['ticker', 'Date']).reset_index(drop=True)

In [14]:
# Calculate 12-1momentum 
df['Close_t_21']  = df.groupby('ticker')['Close'].shift(21)
df['Close_t_252'] = df.groupby('ticker')['Close'].shift(252)
df['mom_12_1'] = df['Close_t_21'] / df['Close_t_252'] - 1

# Calculate 12-1momentum rank
df['mom_position'] = (
df
.groupby('Date')['mom_12_1']
.rank(method='first')
)
df['N'] = df.groupby('Date')['ticker'].transform('count')
df['mom_rank'] = df['mom_position'] / df['N']

In [15]:
# Drop buffer dates data; Keep mom_12_1 and mom_rank columns
df2 = df[df['Date']>=start]
df2 = df2[['Date','ticker','mom_12_1','mom_rank']]

# Merge 12-1 momentum and rank into the input dataset
output_df = pd.merge(input_df, df2, on =['ticker','Date'], how='left')

In [16]:
output_df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,mom_12_1,mom_rank
0,2010-01-04 00:00:00-05:00,6.400988,6.433079,6.369498,6.418383,493729600,0.0,0.0,AAPL,1.162314,0.800000
1,2010-01-05 00:00:00-05:00,6.436079,6.465770,6.395590,6.429481,601904800,0.0,0.0,AAPL,1.077394,0.800000
2,2010-01-06 00:00:00-05:00,6.429480,6.454973,6.320613,6.327211,552160000,0.0,0.0,AAPL,1.078262,0.800000
3,2010-01-07 00:00:00-05:00,6.350605,6.358103,6.269629,6.315515,477131200,0.0,0.0,AAPL,1.076145,0.800000
4,2010-01-08 00:00:00-05:00,6.307116,6.358101,6.269928,6.357501,447610800,0.0,0.0,AAPL,1.048221,0.800000
...,...,...,...,...,...,...,...,...,...,...,...
26496,2018-12-24 00:00:00-05:00,31.095763,32.152414,30.659684,31.305416,5266300,0.0,0.0,AIG,-0.256927,0.250000
26497,2018-12-26 00:00:00-05:00,31.305417,32.387226,30.642913,32.362068,6190300,0.0,0.0,AIG,-0.255689,0.250000
26498,2018-12-27 00:00:00-05:00,32.068570,32.982658,31.657648,32.982658,6799200,0.0,0.0,AIG,-0.248971,0.083333
26499,2018-12-28 00:00:00-05:00,33.125210,33.318091,32.169192,32.387230,6410300,0.0,0.0,AIG,-0.257539,0.166667


In [30]:
def compute_momentum_rank(input_df):
    """
    Compute 12-1 momentum and cross-sectional momentum rank for each ticker.
    12-1 momentum = Close[t-21] / Close[t-252] - 1.
    cross-sectional rank each day = position / N.
    ----------
    Input dataset must contain columns ['ticker', 'Date']; 
    ----------
    Output dataset adds additional columns:['mom_12_1','mom_rank']
    """

    # Fetch data from yfinance starting at a buffer_start date to support 12–1 momentum calculations
    start = input_df['Date'].min() 
    end = input_df['Date'].max() 
    buffer_start = start - timedelta(days=400)
    df = get_tickers_history(list(input_df['ticker'].unique()), buffer_start, end)
    df = df.sort_values(['ticker', 'Date']).reset_index(drop=True)

    # Compute 12-1 momentum
    df['Close_t_21']  = df.groupby('ticker')['Close'].shift(21)
    df['Close_t_252'] = df.groupby('ticker')['Close'].shift(252)
    df['mom_12_1'] = df['Close_t_21'] / df['Close_t_252'] - 1

    # Compute cross-sectional 12-1 momentum ranking
    df['mom_position'] = (
        df.groupby('Date')['mom_12_1']
        .rank(method='first')
    )
    df['N'] = df.groupby('Date')['ticker'].transform('count')
    df['mom_rank'] = df['mom_position'] / df['N']

    # Drop buffer dates data; Keep mom_12_1 and mom_rank columns
    df2 = df[df['Date']>=start]
    df2 = df2[['Date','ticker','mom_12_1','mom_rank']]
    
    # Merge 12-1 momentum and rank into the input dataset
    output_df = pd.merge(input_df, df2, on =['ticker','Date'], how='left')

    return output_df

In [31]:
compute_momentum_rank(input_df)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,mom_12_1,mom_rank
0,2010-01-04 00:00:00-05:00,6.400988,6.433079,6.369498,6.418383,493729600,0.0,0.0,AAPL,1.162314,0.800000
1,2010-01-05 00:00:00-05:00,6.436079,6.465770,6.395591,6.429481,601904800,0.0,0.0,AAPL,1.077394,0.800000
2,2010-01-06 00:00:00-05:00,6.429479,6.454972,6.320612,6.327210,552160000,0.0,0.0,AAPL,1.078263,0.800000
3,2010-01-07 00:00:00-05:00,6.350603,6.358101,6.269627,6.315513,477131200,0.0,0.0,AAPL,1.076146,0.800000
4,2010-01-08 00:00:00-05:00,6.307116,6.358101,6.269928,6.357501,447610800,0.0,0.0,AAPL,1.048220,0.800000
...,...,...,...,...,...,...,...,...,...,...,...
26496,2018-12-24 00:00:00-05:00,31.095765,32.152416,30.659686,31.305418,5266300,0.0,0.0,AIG,-0.256927,0.250000
26497,2018-12-26 00:00:00-05:00,31.305425,32.387233,30.642920,32.362076,6190300,0.0,0.0,AIG,-0.255689,0.250000
26498,2018-12-27 00:00:00-05:00,32.068559,32.982647,31.657637,32.982647,6799200,0.0,0.0,AIG,-0.248971,0.083333
26499,2018-12-28 00:00:00-05:00,33.125202,33.318083,32.169184,32.387222,6410300,0.0,0.0,AIG,-0.257539,0.166667


In [None]:
# Check n_obs per year per brand
return_data['year'] = return_data['date'].dt.year
count_df = return_data.groupby(['ticker', 'year']).size().reset_index(name='n_obs')
count_df["n_obs"].unique()

# log market price

In [17]:
input_df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker
0,2010-01-04 00:00:00-05:00,6.400988,6.433079,6.369498,6.418383,493729600,0.0,0.0,AAPL
1,2010-01-05 00:00:00-05:00,6.436079,6.465770,6.395590,6.429481,601904800,0.0,0.0,AAPL
2,2010-01-06 00:00:00-05:00,6.429480,6.454973,6.320613,6.327211,552160000,0.0,0.0,AAPL
3,2010-01-07 00:00:00-05:00,6.350605,6.358103,6.269629,6.315515,477131200,0.0,0.0,AAPL
4,2010-01-08 00:00:00-05:00,6.307116,6.358101,6.269928,6.357501,447610800,0.0,0.0,AAPL
...,...,...,...,...,...,...,...,...,...
2259,2018-12-24 00:00:00-05:00,31.095763,32.152414,30.659684,31.305416,5266300,0.0,0.0,AIG
2260,2018-12-26 00:00:00-05:00,31.305417,32.387226,30.642913,32.362068,6190300,0.0,0.0,AIG
2261,2018-12-27 00:00:00-05:00,32.068570,32.982658,31.657648,32.982658,6799200,0.0,0.0,AIG
2262,2018-12-28 00:00:00-05:00,33.125210,33.318091,32.169192,32.387230,6410300,0.0,0.0,AIG


In [14]:
import numpy as np
def get_log_mktcap(input_df):
    """
    Compute log(market capitalization) for each ticker.
    log(market cap_t) = log(Close_t) * SharesOutstanding. 
    * Note that yahoo only provides the latest shares outstanding, so we don't have the historical 
    shares outstanding at time t. This caculation method of getting market capitalization would not be precise.
    * In addition, some tickers may don't have SharesOutstanding information in yfinance.
    ----------
    Input dataset must contain columns ['ticker', 'Date']; 
    ----------
    Output dataset adds additional columns:['log_mktcap']
    """
    start = input_df['Date'].min() 
    end = input_df['Date'].max()
    tickers = list(input_df['ticker'].unique())
    
    rows = []
    for ticker in tickers:
        stock = yf.Ticker(ticker)

        # Get CURRENT shares outstanding
        shares = stock.info.get("sharesOutstanding", None)

        # Fetch historical price
        df_price = stock.history(start=start, end=end)

        # Add ticker column
        df_price["ticker"] = ticker
        df_price = df_price.reset_index()

        # Compute market cap and log market cap
        if shares is not None:
            df_price["mktcap"] = df_price["Close"] * shares
            df_price["log_mktcap"] = (df_price["mktcap"]).apply(
                lambda x: None if pd.isna(x) else np.log(x)
            )
        else:
            df_price["market_cap"] = None
            df_price["log_mktcap"] = None

        rows.append(df_price)
    # concat all tickers and merge with the input dataset    
    mktcap = pd.concat(rows, ignore_index=True)[['Date','ticker','log_mktcap']]
    output_df = pd.merge(input_df, mktcap, on=['Date','ticker'], how='left')
    
    return output_df

In [15]:
get_log_mktcap(input_df)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,log_mktcap
0,2010-01-04 00:00:00-05:00,6.400988,6.433079,6.369498,6.418383,493729600,0.0,0.0,AAPL,25.275460
1,2010-01-05 00:00:00-05:00,6.436079,6.465770,6.395591,6.429481,601904800,0.0,0.0,AAPL,25.277188
2,2010-01-06 00:00:00-05:00,6.429479,6.454972,6.320612,6.327210,552160000,0.0,0.0,AAPL,25.261153
3,2010-01-07 00:00:00-05:00,6.350603,6.358101,6.269627,6.315513,477131200,0.0,0.0,AAPL,25.259303
4,2010-01-08 00:00:00-05:00,6.307116,6.358101,6.269928,6.357501,447610800,0.0,0.0,AAPL,25.265929
...,...,...,...,...,...,...,...,...,...,...
26496,2018-12-24 00:00:00-05:00,31.095765,32.152416,30.659686,31.305418,5266300,0.0,0.0,AIG,23.550086
26497,2018-12-26 00:00:00-05:00,31.305425,32.387233,30.642920,32.362076,6190300,0.0,0.0,AIG,23.583281
26498,2018-12-27 00:00:00-05:00,32.068559,32.982647,31.657637,32.982647,6799200,0.0,0.0,AIG,23.602276
26499,2018-12-28 00:00:00-05:00,33.125202,33.318083,32.169184,32.387222,6410300,0.0,0.0,AIG,23.584058


# SPY r1

In [27]:
def compute_SPY_r1 (input_df):
    """
    Compute SPY r1 = (Close_t/Clost_t_1) -1
    ----------
    Input dataset must contain columns ['Date']; 
    ----------
    Output dataset adds additional columns:['spy_r1']
    """
    start = input_df['Date'].min() 
    end = input_df['Date'].max()
    buffer_start = start - timedelta(days=7)
    
    df_spy = get_single_ticker_history("SPY", buffer_start, end)
    df_spy["spy_r1"] = df_spy["Close"] / df_spy["Close"].shift(1) - 1
    df_spy = df_spy[df_spy['Date']>=start]
    df_spy = df_spy[['Date','ticker','spy_r1']]
    
    output_df = pd.merge(input_df, df_spy, on=['Date'], how='left')
    return output_df

In [28]:
spy = compute_SPY_r1(input_df)
spy

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker_x,ticker_y,spy_r1
0,2010-01-04 00:00:00-05:00,6.400988,6.433079,6.369498,6.418383,493729600,0.0,0.0,AAPL,SPY,0.016959
1,2010-01-05 00:00:00-05:00,6.436079,6.465770,6.395591,6.429481,601904800,0.0,0.0,AAPL,SPY,0.002647
2,2010-01-06 00:00:00-05:00,6.429479,6.454972,6.320612,6.327210,552160000,0.0,0.0,AAPL,SPY,0.000704
3,2010-01-07 00:00:00-05:00,6.350603,6.358101,6.269627,6.315513,477131200,0.0,0.0,AAPL,SPY,0.004221
4,2010-01-08 00:00:00-05:00,6.307116,6.358101,6.269928,6.357501,447610800,0.0,0.0,AAPL,SPY,0.003327
...,...,...,...,...,...,...,...,...,...,...,...
26496,2018-12-24 00:00:00-05:00,31.095765,32.152416,30.659686,31.305418,5266300,0.0,0.0,AIG,SPY,-0.026423
26497,2018-12-26 00:00:00-05:00,31.305425,32.387233,30.642920,32.362076,6190300,0.0,0.0,AIG,SPY,0.050525
26498,2018-12-27 00:00:00-05:00,32.068559,32.982647,31.657637,32.982647,6799200,0.0,0.0,AIG,SPY,0.007677
26499,2018-12-28 00:00:00-05:00,33.125202,33.318083,32.169184,32.387222,6410300,0.0,0.0,AIG,SPY,-0.001290
