In [574]:
!rm -rf /kaggle/working/multimodal-eq-sizing
!git clone https://github.com/brianrp09232000/multimodal-eq-sizing.git /kaggle/working/multimodal-eq-sizing
!pip install -r /kaggle/working/multimodal-eq-sizing/requirements.txt

Cloning into '/kaggle/working/multimodal-eq-sizing'...
remote: Enumerating objects: 647, done.[K
remote: Counting objects: 100% (267/267), done.[K
remote: Compressing objects: 100% (229/229), done.[K
remote: Total 647 (delta 172), reused 50 (delta 37), pack-reused 380 (from 3)[K
Receiving objects: 100% (647/647), 479.71 KiB | 15.47 MiB/s, done.
Resolving deltas: 100% (382/382), done.


In [575]:
import sys
import pathlib
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [576]:
# Silence NumPy "invalid value" floating-point warnings for the rest of the session.
# np.seterr returns the settings that were in effect before the call, which is
# what the cell displays.
np.seterr(invalid="ignore")

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'ignore'}

In [577]:
# Location of the cloned repository; adding it to sys.path makes its `src`
# package importable from the notebook.
repo_root = pathlib.Path("/kaggle/working/multimodal-eq-sizing")
# Guard the append so re-running this cell does not pile up duplicate entries.
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

In [578]:
from src.data.loaders import (
    get_tickers_history,
    get_return_data,
    get_excess_return,
    get_vix_data,
    get_spread_z,
    get_sector_map,
    get_adv_dollar
)

# Find Tickers with the Most Headlines

In [579]:
from src.data.universe import tickers_with_most_headlines

In [580]:
# Load the combined financial-news headlines. The headline-count helpers in this
# notebook look for a date column ('date' or 'Date') and a ticker column
# ('ticker' or 'Stock_symbol') in this frame.
news_df = pd.read_csv("/kaggle/input/financial-news/combined.csv") 

In [581]:
def count_headlines_per_ticker(news_df, start=None, end=None):
    """Count the number of headlines for each ticker symbol.

    Input: news_df pandas dataframe with a date column ('date' or 'Date') and
               a ticker column ('ticker' or 'Stock_symbol')
           optional: start, end inclusive bounds; each is converted with str()
               and compared against the date column
    Output: pandas dataframe with two columns, 'ticker' and 'count', ordered
                from the most to the fewest headlines. When a required column
                is missing, a message is printed and an empty dataframe with
                the same two columns is returned."""

    #check columns in dataframe
    columns = list(news_df.columns)
    has_date = ('date' in columns) or ('Date' in columns)
    has_ticker = ('ticker' in columns) or ('Stock_symbol' in columns)
    if not (has_date and has_ticker):
        print('input dataframe does not have both ticker and date columns')
        #keep the output schema so callers can still sort/select on 'count'
        return pd.DataFrame(columns=['ticker', 'count'])

    #find column names
    date_col = 'date' if 'date' in columns else 'Date'
    ticker_col = 'ticker' if 'ticker' in columns else 'Stock_symbol'

    #filter dates
    #NOTE(review): if the date column holds strings this comparison is
    #lexicographic - confirm its format lines up with str(start)/str(end)
    if start is not None:
        news_df = news_df[news_df[date_col] >= str(start)]
    if end is not None:
        news_df = news_df[news_df[date_col] <= str(end)]

    #count occurrences of each ticker (value_counts sorts by descending count)
    headline_counts = news_df[ticker_col].value_counts()

    #move the ticker labels out of the index into a regular 'ticker' column
    return headline_counts.rename_axis('ticker').reset_index(name='count')



def tickers_with_most_headlines(news_df, start=None, end=None, n=200):
    """Find the tickers with the most headlines.

    NOTE(review): this definition shadows the tickers_with_most_headlines
    imported from src.data.universe earlier in the notebook.

    Input: news_df pandas dataframe with date and ticker columns (see
               count_headlines_per_ticker for the accepted column names)
           optional: start, end date bounds forwarded to
               count_headlines_per_ticker
           optional: n integer, number of top tickers to return
    Output: pandas dataframe with the headline count per ticker for the n
                tickers with the most headlines, highest count first. An
                empty dataframe is returned when there is nothing to rank."""

    #count headlines for each ticker
    df = count_headlines_per_ticker(news_df, start, end)

    #nothing to rank (e.g. the required columns were missing): sorting an empty
    #frame that has no 'count' column would raise a KeyError
    if df.empty:
        return df

    #limit dataframe to n tickers
    ranked = df.sort_values(['count'], ascending=False)
    return ranked.iloc[:n].reset_index(drop=True)

In [582]:
# Analysis window (UTC); the same bounds are reused for the news filter and the
# price download.
start = pd.Timestamp('2010-01-04 05:00:00+0000', tz='UTC')
end   = pd.Timestamp('2018-12-28 05:00:00+0000', tz='UTC')
# Rank tickers by headline count inside the window and keep the top 300. That is
# more than the 200 kept later, which leaves room to drop tickers without price data.
tickers = tickers_with_most_headlines(news_df, str(start), str(end), 300)

# Get Price Data

In [583]:
# Download the yfinance price history for every ticker in the tickers frame.
# Tickers that have no data make yfinance print its "possibly delisted" message.
df = get_tickers_history(tickers['ticker'].tolist(), start=start, end=end)

$FB: possibly delisted; no price data found  (1d 2010-01-04 05:00:00+00:00 -> 2018-12-29 05:00:00+00:00) (Yahoo error = "Data doesn't exist for startDate = 1262581200, endDate = 1546059600")
  return pd.concat(tickers_history_dfs)


In [584]:
# Keep the first 200 distinct tickers that came back with price data, then
# restrict both the price frame and the ticker ranking to that set.
keep_tickers = df['ticker'].unique()[:200].tolist()
df = df.loc[df['ticker'].isin(keep_tickers)]
tickers = tickers.loc[tickers['ticker'].isin(keep_tickers)]

In [585]:
def get_date_range(df: pd.DataFrame) -> tuple:
    """Return the earliest and latest "Date" found across all tickers.

    Args:
        df: frame with at least a "ticker" and a "Date" column.

    Returns:
        (start, end): the smallest per-ticker minimum date and the largest
        per-ticker maximum date.
    """
    # Aggregate only the "Date" column. Aggregating every column and selecting
    # "Date" afterwards is wasted work and fails on columns that cannot be
    # ordered with min/max.
    per_ticker = df.groupby("ticker")["Date"].agg(["min", "max"])
    return per_ticker["min"].min(), per_ticker["max"].max()

In [586]:
#df = get_return_data("/kaggle/input/news-trading/return_data.csv")
#start, end = get_date_range(df)

# Add excess return

In [587]:
def add_excess_return(df, start, end):
    """Left-merge the output of get_excess_return onto df.

    Args:
        df: price frame with "ticker" and "Date" columns.
        start, end: bounds forwarded unchanged to get_excess_return.

    Returns:
        A new frame with every row of df plus the columns that
        get_excess_return produced, matched on ("ticker", "Date").
    """
    return df.merge(
        get_excess_return(df, start, end),
        on=["ticker", "Date"],
        how="left",
    )

In [588]:
# Rebind df to the price frame with the excess-return columns merged in.
df = add_excess_return(df, start, end)

# Add market regime VIX z-score

In [589]:
def add_vix_z(df, start, end):
    """Left-merge the VIX data returned by get_vix_data onto df by calendar day.

    Both frames are matched on the "%Y-%m-%d" rendering of their "Date"
    column, so the time of day is ignored for the join.

    Args:
        df: price frame with a datetime "Date" column.
        start, end: bounds forwarded unchanged to get_vix_data.

    Returns:
        A new frame with every row of df plus the VIX columns. Its "Date"
        column is rebuilt from the day string as a UTC timestamp, so any
        time-of-day component is dropped. The input df is left unmodified.
    """
    format_str = "%Y-%m-%d"

    vix_z_df = get_vix_data(start, end)
    # Build the day-string join key on copies (assign) instead of overwriting
    # the "Date" column of the caller's frame in place.
    vix_by_day = vix_z_df.assign(Date=vix_z_df["Date"].dt.strftime(format_str))
    prices_by_day = df.assign(Date=df["Date"].dt.strftime(format_str))

    merged = prices_by_day.merge(vix_by_day, on=["Date"], how="left")
    # Turn the join key back into a timezone-aware timestamp.
    merged["Date"] = pd.to_datetime(merged["Date"], utc=True)
    return merged

In [590]:
# Rebind df to the frame with the VIX columns merged in; "Date" comes back as
# day-level UTC timestamps.
df = add_vix_z(df, start, end)

Yay!🥳


# Add spread z-score

In [591]:
def add_spread_z(existing_df: pd.DataFrame, buffer_days=380) -> pd.DataFrame:
    """
    Add a clipped, non-null "spread_z" column to an existing OHLCV frame.

    History starting buffer_days before the earliest date in existing_df is
    fetched with get_tickers_history, combined with the existing rows, and
    passed to get_spread_z. Only the rows of the original window are returned.

    Args:
        existing_df: frame with "ticker" and "Date" columns plus whatever
            columns get_spread_z needs.
        buffer_days: number of calendar days of extra history fetched before
            the first date of existing_df.

    Returns:
        A copy of existing_df with a "spread_z" column that contains no nulls
        and is clipped to [-3, 3].
    """
    df = existing_df.copy()
    start, end = df["Date"].min(), df["Date"].max()

    tickers = sorted(df['ticker'].unique())
    fetch_start = start - timedelta(days=buffer_days)

    hist = get_tickers_history(tickers, fetch_start, end)
    hist["Date"] = pd.to_datetime(hist["Date"], utc=True)

    # Combine buffer + existing. df is concatenated last and the sort is
    # stable, so keep="last" keeps the existing row when both have the same key.
    # NOTE(review): overlap is detected on exact timestamps - confirm that
    # hist["Date"] and df["Date"] carry the same time of day, otherwise the
    # same trading day appears twice in `combined`.
    combined = (
        pd.concat([hist, df], ignore_index=True)
        .sort_values(['ticker', 'Date'])
        .drop_duplicates(subset=['ticker', 'Date'], keep="last")
    )

    # Compute spread_z on the full combined range
    combined = get_spread_z(combined)

    # Merge only the computed column back onto the target window
    out = df.merge(
        combined[['ticker', 'Date', 'spread_z']],
        on=['ticker', 'Date'],
        how="left",
    )

    # Clean-up that guarantees a non-null spread_z in the target window:
    # 1) per-ticker forward-fill, 2) same-day cross-section median, 3) final 0.
    # The forward-fill runs on a chronologically sorted view so that only past
    # values are carried forward even when the input rows are not in date
    # order; the result realigns with `out` through the index labels.
    chronological = out.sort_values(['ticker', 'Date'])
    past_filled = chronological.groupby('ticker')["spread_z"].ffill()
    same_day_median = out.groupby('Date')["spread_z"].transform("median")
    out["spread_z"] = (
        past_filled.fillna(same_day_median).fillna(0.0).clip(-3, 3)
    )

    return out

In [592]:
# Rebind df to the frame with the clipped spread_z column added (default
# 380-day history buffer).
df = add_spread_z(df)

# Add sector

In [593]:
def add_sector(df):
    """Join the result of get_sector_map onto df using the "ticker" column.

    Args:
        df: frame with a "ticker" column.

    Returns:
        A new frame with the columns of get_sector_map's result appended;
        rows are matched by looking up each row's ticker in that result's index.
    """
    unique_tickers = df["ticker"].unique()
    return df.join(get_sector_map(unique_tickers), on="ticker")

In [594]:
# Rebind df to the frame with the sector-map columns joined in.
df = add_sector(df)

# Add dollar-volume ADV 

In [595]:
def add_adv_dollar(df):
    """Left-merge the output of get_adv_dollar onto df.

    Args:
        df: frame with "Date" and "ticker" columns.

    Returns:
        A new frame with every row of df plus the columns that
        get_adv_dollar produced, matched on ("Date", "ticker").
    """
    return df.merge(get_adv_dollar(df), on=["Date", "ticker"], how="left")

In [596]:
# Rebind df to the frame with the dollar-volume columns merged in.
df = add_adv_dollar(df)

# Add Next-Day Excess Return

In [597]:
# Target column: each row receives the excess_return of the following row of the
# same ticker; the last row of every ticker has no successor and becomes NaN.
# NOTE(review): this is the *next day* only if rows are in ascending Date order
# within each ticker - confirm.
df["next_day_excess_return"] = df.groupby('ticker')['excess_return'].shift(-1)

In [598]:
# Independent copy of the feature frame as it stands before the news flag is merged in.
# NOTE(review): df_org is not referenced again in the cells shown here.
df_org = df.copy()

In [599]:
# Display the assembled price/feature frame.
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,Adj Close,o2c_return,excess_return,VIX_Close,VIX_z,spread_z,sector,dollar_volume,adv_dollar,next_day_excess_return
0,2010-06-29 00:00:00+00:00,1.266667,1.666667,1.169333,1.592667,281494500.0,0.0,0.0,TSLA,,0.257368,0.274441,34.130001,2.072601,-3.000000,Consumer Cyclical,4.483270e+08,,-0.069262
1,2010-06-30 00:00:00+00:00,1.719333,2.028000,1.553333,1.588667,257806500.0,0.0,0.0,TSLA,,-0.075998,-0.069262,34.540001,2.129714,-1.413205,Consumer Cyclical,4.095687e+08,,-0.117819
2,2010-07-01 00:00:00+00:00,1.666667,1.728000,1.351333,1.464000,123282000.0,0.0,0.0,TSLA,,-0.121600,-0.117819,32.860001,1.780547,-1.381747,Consumer Cyclical,1.804848e+08,,-0.156392
3,2010-07-02 00:00:00+00:00,1.533333,1.540000,1.247333,1.280000,77097000.0,0.0,0.0,TSLA,,-0.165217,-0.156392,30.120001,1.236204,-0.352851,Consumer Cyclical,9.868416e+07,,-0.187070
4,2010-07-06 00:00:00+00:00,1.333333,1.333333,1.055333,1.074000,103003500.0,0.0,0.0,TSLA,,-0.194500,-0.187070,29.650000,1.142517,-0.453971,Consumer Cyclical,1.106258e+08,,-0.065481
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20240,2018-12-21 00:00:00+00:00,219.550003,222.259995,207.369995,208.800003,8828100.0,0.0,0.0,ADBE,,-0.048964,-0.024485,30.110001,2.909232,-1.825644,Technology,1.843307e+09,1.187080e+09,0.011252
20241,2018-12-24 00:00:00+00:00,206.899994,212.580002,204.949997,205.160004,2940500.0,0.0,0.0,ADBE,,-0.008410,0.011252,36.070000,3.000000,-2.249653,Technology,6.032730e+08,1.198173e+09,0.036236
20242,2018-12-26 00:00:00+00:00,206.529999,222.960007,206.250000,222.949997,5897900.0,0.0,0.0,ADBE,,0.079504,0.036236,30.410000,2.812017,-2.164573,Technology,1.314937e+09,1.222124e+09,0.000736
20243,2018-12-27 00:00:00+00:00,219.990005,225.169998,214.160004,225.139999,3931100.0,0.0,0.0,ADBE,,0.023410,0.000736,29.959999,2.675121,-2.151093,Technology,8.850479e+08,1.232904e+09,-0.012221


# Add News Flag

In [600]:
from src.data.features.news_features import count_headlines_all_days


def add_news_flag(news_df, price_df, start=None, end=None):
    """adds a new news flag column: 0=no news, 1=news
    input: news_df with 'date', 'ticker', and other columns
            price_df with 'Date', 'ticker', and other columns
            optional start and end Timestamps (inclusive bounds on the news dates)
    output: dataframe holding the price_df rows that have a matching
            (date, ticker) entry in the headline counts, with the columns
            'date' and 'news flag' appended
    """

    #count headlines per ticker per day
    news_count = count_headlines_all_days(news_df)

    #transpose so that every row is one day, then filter the rows by date
    #assumes count_headlines_all_days returns one row per ticker and one
    #column per day - TODO confirm
    news_count = news_count.T
    news_count['date'] = pd.to_datetime(list(news_count.index), utc=True)
    if start is not None:
        news_count = news_count[news_count['date'] >= start]
    if end is not None:
        news_count = news_count[news_count['date'] <= end]

    #convert to long format: one row per (date, ticker)
    #'date' is the id column, so it must not be listed among the value columns
    ticker_cols = [col for col in news_count.columns if col != 'date']
    news_count = news_count.melt(id_vars=['date'], value_vars=ticker_cols,
                  var_name='ticker', value_name='news flag')

    # change count to flag: 0=no news, 1=news
    # values below 2 are kept as they are, everything else becomes 1
    counts = news_count['news flag']
    news_count['news flag'] = counts.where(counts < 2, 1)
    news_count['date'] = pd.to_datetime(news_count['date'], utc=True)
    news_count = news_count.sort_values(['date', 'ticker'])

    #add news flag: 0=no news, 1=news
    #NOTE(review): this is an inner merge, so price rows without a matching
    #(date, ticker) entry in news_count are dropped - confirm this is intended
    price_df = pd.merge(price_df, news_count, left_on=['Date','ticker'],
              right_on=['date','ticker'])

    return price_df

In [601]:
# Rebind df to the price frame merged with the per-day news flag; the news
# counts are limited to the [start, end] window.
df = add_news_flag(news_df, df, start=start, end=end)

# Split Data

In [602]:
# Chronological split on Date: 'train' before 2015-01-01, 'val' from 2015-01-01
# up to (not including) 2017-01-01, 'test' from 2017-01-01 onward.
# The conditions are checked in order, so the later cut-off takes precedence.
df['split'] = np.select(
    [df['Date'] >= "2017-01-01", df['Date'] >= "2015-01-01"],
    ['test', 'val'],
    default='train',
)

# Final Complete Dataset

In [603]:
# Write the final dataset without the index column; the relative path resolves
# against the current working directory.
df.to_csv('final_dataset.csv', index=False)

In [604]:
# Write the ranked ticker list (restricted to the tickers kept for the dataset)
# without the index column.
tickers.to_csv('top_tickers.csv', index=False)

In [605]:
# Reload the saved file through the project loader and display the result.
# NOTE(review): this absolute path matches the relative to_csv path only when
# the working directory is /kaggle/working - confirm.
get_return_data("/kaggle/working/final_dataset.csv")

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,Adj Close,...,VIX_Close,VIX_z,spread_z,sector,dollar_volume,adv_dollar,next_day_excess_return,date,news flag,split
0,2010-06-29 00:00:00+00:00,1.266667,1.666667,1.169333,1.592667,281494500.0,0.0,0.0,TSLA,,...,34.130001,2.072601,-3.000000,Consumer Cyclical,4.483270e+08,,-0.069262,2010-06-29 00:00:00+00:00,0,train
1,2010-06-30 00:00:00+00:00,1.719333,2.028000,1.553333,1.588667,257806500.0,0.0,0.0,TSLA,,...,34.540001,2.129714,-1.413205,Consumer Cyclical,4.095687e+08,,-0.117819,2010-06-30 00:00:00+00:00,0,train
2,2010-07-01 00:00:00+00:00,1.666667,1.728000,1.351333,1.464000,123282000.0,0.0,0.0,TSLA,,...,32.860001,1.780547,-1.381747,Consumer Cyclical,1.804848e+08,,-0.156392,2010-07-01 00:00:00+00:00,0,train
3,2010-07-02 00:00:00+00:00,1.533333,1.540000,1.247333,1.280000,77097000.0,0.0,0.0,TSLA,,...,30.120001,1.236204,-0.352851,Consumer Cyclical,9.868416e+07,,-0.187070,2010-07-02 00:00:00+00:00,0,train
4,2010-07-06 00:00:00+00:00,1.333333,1.333333,1.055333,1.074000,103003500.0,0.0,0.0,TSLA,,...,29.650000,1.142517,-0.453971,Consumer Cyclical,1.106258e+08,,-0.065481,2010-07-06 00:00:00+00:00,0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20232,2018-12-21 00:00:00+00:00,219.550003,222.259995,207.369995,208.800003,8828100.0,0.0,0.0,ADBE,,...,30.110001,2.909232,-1.825644,Technology,1.843307e+09,1.187080e+09,0.011252,2018-12-21 00:00:00+00:00,0,test
20233,2018-12-24 00:00:00+00:00,206.899994,212.580002,204.949997,205.160004,2940500.0,0.0,0.0,ADBE,,...,36.070000,3.000000,-2.249653,Technology,6.032730e+08,1.198173e+09,0.036236,2018-12-24 00:00:00+00:00,0,test
20234,2018-12-26 00:00:00+00:00,206.529999,222.960007,206.250000,222.949997,5897900.0,0.0,0.0,ADBE,,...,30.410000,2.812017,-2.164573,Technology,1.314937e+09,1.222124e+09,0.000736,2018-12-26 00:00:00+00:00,0,test
20235,2018-12-27 00:00:00+00:00,219.990005,225.169998,214.160004,225.139999,3931100.0,0.0,0.0,ADBE,,...,29.959999,2.675121,-2.151093,Technology,8.850479e+08,1.232904e+09,-0.012221,2018-12-27 00:00:00+00:00,0,test


In [606]:
# Delete the cloned repository from the working directory now that the notebook is done with it.
!rm -rf /kaggle/working/multimodal-eq-sizing