# Clone GitHub Repo and Install Dependencies 

In [2]:
import os

!rm -rf multimodal-eq-sizing
!git clone https://github.com/brianrp09232000/multimodal-eq-sizing.git
!pip install -r multimodal-eq-sizing/requirements.txt

Cloning into 'multimodal-eq-sizing'...
remote: Enumerating objects: 843, done.[K
remote: Counting objects: 100% (162/162), done.[K
remote: Compressing objects: 100% (69/69), done.[K
remote: Total 843 (delta 123), reused 93 (delta 93), pack-reused 681 (from 3)[K
Receiving objects: 100% (843/843), 886.58 KiB | 14.78 MiB/s, done.
Resolving deltas: 100% (518/518), done.


# Imports

In [1]:
import sys
import pathlib
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [2]:
np.seterr(invalid="ignore")

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [3]:
# Uses the current directory where the notebook is running
repo_root = pathlib.Path("multimodal-eq-sizing")
sys.path.append(str(repo_root.resolve())) # .resolve() gets the full absolute path locally

In [5]:
from src.data.loaders import (
    get_tickers_history,
    get_return_data,
    get_excess_return,
    get_vix_data,
    get_spread_z,
    get_sector_map,
    get_adv_dollar
)

from src.data.features.price_features import calculate_leg_one_features
from src.data.features.news_features import built_news_features
from src.data.universe import tickers_with_most_headlines
from src.data.universe import tickers_with_most_headlines

In [None]:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', None)
# pd.set_option('display.max_rows', 100)

# Functions Definition

In [6]:
def count_headlines_per_ticker(news_df, start=None, end=None):
    """Counts the number of headlines for each ticker symbol 
    Input: news_df pandas dataframe with ticker column for ticker symbols
    Output: pandas dataframe containing two columns: ticker names and the 
                number of headlines for the ticker"""
    
    #check columns in dataframe
    columns = list(news_df.columns)
    if (('date' not in columns) and ('Date' not in columns)) or (('ticker' not in columns) and ('Stock_symbol' not in columns)):
        print('input dataframe does not have both ticker and date columns')
        return pd.DataFrame()
    
    #find column names
    date_col = 'date' if 'date' in columns else 'Date'
    ticker_col = 'ticker' if 'ticker' in columns else 'Stock_symbol'

    #filter dates
    if start is not None: 
        start_filter = news_df[date_col] >= str(start)
        news_df = news_df[start_filter]
    if end is not None: 
        end_filter = news_df[date_col] <= str(end)
        news_df = news_df[end_filter]
    
    # Count occurrences in a specific column
    headline_counts = news_df[ticker_col].value_counts()
    df = headline_counts.to_frame(name='count')
    df['ticker'] = list(df.index)
    df = df.reset_index(drop=True)
    
    return df[['ticker','count']]



def tickers_with_most_headlines(news_df, start=None, end=None, n=200):
    """Finds the tickers with the most headlines 
    Input: news_df pandas dataframe with ticker column for ticker symbols
            optional: n interger, number of top tickers to return
    Output: list containing the number of headlines per ticker
                for the tickers with the most headlines"""
    
    #count headlines for each ticker
    df = count_headlines_per_ticker(news_df, start, end)

    #limit dataframe to n tickers
    df = df.sort_values(['count'], ascending=False)
    df = df[:n]
    df.reset_index(drop=True, inplace=True)
    
    return df


def get_date_range(df: pd.DataFrame) -> tuple:
    grouped_by_date = df.groupby(["ticker"]).agg(['min', 'max', 'count'])["Date"]
    start = grouped_by_date["min"].min()
    end = grouped_by_date["max"].max()
    return start, end


def add_sector(df):
    tickers = df["ticker"].unique()
    sector_map = get_sector_map(tickers)
    df = df.join(sector_map, on="ticker")
    
    # remove None values in 'sector', these are ETFs not stocks
    df = df.dropna(subset=['sector'])
    return df


def add_excess_return(df, start, end):
    excess_return_df = get_excess_return(df, start, end)
    df = df.merge(excess_return_df, on=["ticker", "Date"], how="left")
    return df


def add_vix_z(df, start, end):
    vix_z_df = get_vix_data(start, end)
    format_str = "%Y-%m-%d"
    vix_z_df["Date"] = vix_z_df["Date"].dt.strftime(format_str)
    df["Date"] = df["Date"].dt.strftime(format_str) 
    df = df.merge(vix_z_df, on=["Date"], how="left")
    df['Date'] = pd.to_datetime(df['Date'], utc=True)
    return df


def add_spread_z(existing_df: pd.DataFrame, buffer_days=380) -> pd.DataFrame:
    """
    Use existing OHLCV df, pull buffered history, compute young-safe spread_z on the combined
    Then merge back only the target window rows to prevent nulls.
    """
    df = existing_df.copy()
    start, end = df["Date"].min(), df["Date"].max()

    tickers = sorted(df['ticker'].unique())
    fetch_start = start - timedelta(days=buffer_days)
    fetch_end   = end

    # You already have get_tickers_history(tickers, start, end)
    hist = get_tickers_history(tickers, fetch_start, fetch_end)
    hist["Date"] = pd.to_datetime(hist["Date"], utc=True)

    # Combine buffer + existing; keep existing rows on overlap
    combined = pd.concat([hist, df], ignore_index=True)
    combined = combined.sort_values(['ticker', "Date"])
    combined = combined.drop_duplicates(subset=['ticker', "Date"], keep="last")

    # Compute young-safe spread_z on the full combined range
    combined = get_spread_z(combined)

    # Merge only computed columns back to target window
    cols_to_merge = ['ticker', 'Date', "spread_z"]
    out = df.merge(combined[cols_to_merge], on=['ticker', 'Date'], how="left")

    # Final minimal, causal clean-up to guarantee NON-NULL spread_z in target window:
    # 1) per-ticker forward-fill (past only), 2) same-day cross-section median, 3) final 0
    out["spread_z"] = (
        out.groupby('ticker')["spread_z"].ffill()
           .fillna(out.groupby('Date')["spread_z"].transform("median"))
           .fillna(0.0)
    ).clip(-3, 3)

    return out


def add_adv_dollar(df):
    adv_df = get_adv_dollar(df)
    
    df = df.merge(
        adv_df,
        on=["Date", "ticker"],
        how="left",
    )
    return df


def count_headlines_all_days(news_df):
    """Counts the number of headlines for each ticker symbol each day
    Input: news_df pandas dataframe with ticker column for ticker symbols and date for the headline date
    Output: pandas dataframe containing the number of headlines per ticker per day
                indexes are dates in string and tickers as the column names"""
    
    #check columns in dataframe
    columns = list(news_df.columns)
    if (('date' not in columns) and ('Date' not in columns)) or (('ticker' not in columns) and ('Stock_symbol' not in columns)):
        print('input dataframe does not have both ticker and date columns')
        return pd.DataFrame()
    
    #find column names
    date_col = 'date' if 'date' in columns else 'Date'
    ticker_col = 'ticker' if 'ticker' in columns else 'Stock_symbol'
    
    # Count occurrences in the date column
    headline_dates = news_df[date_col]#.str[:10]#.value_counts()
    df = pd.DataFrame({ticker_col: news_df[ticker_col],
                       date_col: headline_dates})
    
    # count headlines per day per ticker
    df = df.groupby([date_col, ticker_col]).size().unstack(fill_value=0)
    
    #create list of dates needed
    format_code = "%Y-%m-%d"# Corresponds to 'YYYY-MM-DD'
    set_of_dates = set(df.index)
    date_min = start#datetime.strptime(min('2010-01-04',min(set_of_dates)), format_code).date() #datetime(2000,1,1).date()#
    date_max = end #datetime.strptime(max('2018-12-28',max(set_of_dates)), format_code).date()
    date_lst = [(date_min+timedelta(i)) for i in range(int((date_max-date_min).days)+1)]
    
    #find dates not in dataframe
    missing_dates = dict([(day,int(0)) for day in set(date_lst).difference(set(df.index))])
    
    #add missing dates to dataframe
    tickers = list(set(df.columns))
    tickers.sort()
    empty_dict = dict([(ticker, missing_dates) for ticker in tickers])
    add_dates = pd.DataFrame(empty_dict)
    df = pd.concat([df, add_dates], ignore_index=False)
    
    #sort rows and columns
    df = df.sort_index()
    df = df.T
    df = df.sort_index()
    
    return df


def add_news_flag(news_df, price_df):
    """adds a new news flag column: 0=no news, 1=news
    input: news_df with 'date', 'ticker', and other columns
            price_df with 'Date', 'ticker', and other columns
            optional start and end Timestamps
    output: dataframe df
    """

    #count headlines per ticker per day
    news_count = count_headlines_all_days(news_df)
    
    #filter count_df by date
    news_count = news_count.T
    news_count['date'] = pd.to_datetime(list(news_count.index), utc=True)
    
    #convert news_count df to different format
    news_cols = list(news_count.columns)
    news_count = news_count.melt(id_vars=['date'], value_vars=news_cols, 
                  var_name='ticker', value_name='news flag')

    # change count to flag: 0=no news, 1=news
    news_count['news flag'] = [flag if flag < 2 else 1 for flag in news_count['news flag']]
    news_count['date'] = pd.to_datetime(news_count['date'], utc=True)
    news_count.sort_values(['date','ticker'], inplace=True)
    
    #add news flag: 0=no news, 1=news
    price_df = pd.merge(price_df, news_count, left_on=['Date','ticker'], 
              right_on=['date','ticker'])

    return price_df

# Data Loading/Sourcing

## Load News Dataset

In [9]:
# Get dataset from Kaggle Hub
import kagglehub
dir_path = kagglehub.dataset_download("zeroadamantium/nasdaq-news", force_download=True)
file_name = "nasdaq_news.csv"
path = os.path.join(dir_path, file_name)
print(path)

# This will print all the folders in the input directory
print(os.listdir("/kaggle/input"))

# Once you see the folder name above, replace 'YOUR_FOLDER_NAME' below to see the files inside
# print(os.listdir("/kaggle/input/YOUR_FOLDER_NAME"))

/kaggle/input/nasdaq-news/nasdaq_news.csv
['news-trading', 'nasdaq-news']


In [10]:
filename = "/kaggle/input/nasdaq-news/nasdaq_news.csv"
news_df = pd.read_csv(filename)
news_df = news_df.drop(columns=['Article'])

# 1. Check Local (Git/PC) first
# if os.path.exists(filename):
#     news_df = pd.read_csv(filename)
#     print(f"Success: Loaded {filename} from local folder.")

In [11]:
start = pd.Timestamp('2010-01-04 05:00:00+0000', tz='UTC')
end   = pd.Timestamp('2018-12-28 05:00:00+0000', tz='UTC')

# Limit news to start and stop times
news_df['Date'] = pd.to_datetime(list(news_df['Date']), utc=True)
news_df = news_df[news_df['Date'] >= start]
news_df = news_df[news_df['Date'] <= end]
print(news_df.shape)

tickers = tickers_with_most_headlines(news_df, str(start), str(end), 300)

## Source Pricing Dataset (Yahoo Finance)

In [12]:
# Get yfinance ticker history for all tickers in tickers df
# yfinance will produce the "possibly delisted" message for tickers without information
df = get_tickers_history(list(tickers['ticker']), start=start, end=end)

# Remove unnecessary columns
df = df.drop(['Capital Gains','Adj Close'], axis=1)

# Limit df to only 200 tickers and tickers with data
keep_tickers = list(df['ticker'].drop_duplicates()[:200])
df = df[df['ticker'].isin(keep_tickers)]
tickers = tickers[tickers['ticker'].isin(keep_tickers)]

ERROR:yfinance:$X: possibly delisted; no timezone found
ERROR:yfinance:$DISH: possibly delisted; no timezone found
ERROR:yfinance:$WBA: possibly delisted; no timezone found
ERROR:yfinance:$FL: possibly delisted; no timezone found
ERROR:yfinance:$SPWR: possibly delisted; no price data found  (1d 2010-01-04 05:00:00+00:00 -> 2018-12-29 05:00:00+00:00) (Yahoo error = "Data doesn't exist for startDate = 1262581200, endDate = 1546059600")
ERROR:yfinance:$BRK: possibly delisted; no price data found  (1d 2010-01-04 05:00:00+00:00 -> 2018-12-29 05:00:00+00:00)
ERROR:yfinance:$DFS: possibly delisted; no timezone found
ERROR:yfinance:$PXD: possibly delisted; no timezone found
ERROR:yfinance:$AI: possibly delisted; no price data found  (1d 2010-01-04 05:00:00+00:00 -> 2018-12-29 05:00:00+00:00) (Yahoo error = "Data doesn't exist for startDate = 1262581200, endDate = 1546059600")
ERROR:yfinance:$MRO: possibly delisted; no timezone found
ERROR:yfinance:$AMTD: possibly delisted; no price data found 

# Data Preprocessing and Enrichment

## Add Pricing Features

In [13]:
# 1. Add Sector
df = add_sector(df)
print(df.shape)

(391851, 10)


In [14]:
# 2. Add Leg One Indicators
leg_one_inds = calculate_leg_one_features(df)
print(leg_one_inds.shape)

(391851, 19)


In [15]:
# 3. Add Excess Return
df = add_excess_return(df, start, end)
print(df.shape)

(391851, 12)


In [16]:
# 4. Add VIX_Z
df = add_vix_z(df, start, end)
print(df.shape)

Yay!ðŸ¥³
(391851, 14)


In [17]:
# 5. Add SPREAD_Z
df = add_spread_z(df)
print(df.shape)

(391851, 15)


In [18]:
# 6. Add ADV_DOLLAR
df = add_adv_dollar(df)
print(df.shape)

(391851, 17)


In [19]:
# 7. Preprocessing and Clean up

# Change Date formats to UTCdf['Date'] = pd.to_datetime(df['Date'], utc=True).dt.normalize()
leg_one_inds['Date'] = pd.to_datetime(leg_one_inds['Date'], utc=True).dt.normalize()

# Select only the new features
join_keys = ['ticker', 'Date']
new_features = [col for col in leg_one_inds.columns if col not in df.columns or col in join_keys]
leg_one_clean = leg_one_inds[new_features].copy()

# Remove duplicate rows in the indicators
leg_one_clean = leg_one_clean.drop_duplicates(subset=join_keys)

# Merge
df = df.merge(
    leg_one_clean, 
    on=join_keys, 
    how='left'
)

# Verification
print(f"New shape: {df.shape}")
print("New columns added:", list(set(df.columns) - set(leg_one_inds.columns).symmetric_difference(set(new_features))))
# (Simple print of added columns for sanity check)
print("Added Columns:", [c for c in new_features if c not in join_keys])

New shape: (391851, 26)
New columns added: ['vol_parkinson_20d', 'dollar_volume', 'VIX_Close', 'r1', 'r5', 'mom_rank', 'spread_z', 'o2c_return', 'VIX_z', 'ticker', 'Date', 'r10', 'excess_return', 'mom_12_1', 'adv_dollar', 'spy_r1', 'trend_ema_diff', 'vol_realized_20d']
Added Columns: ['r1', 'r5', 'r10', 'trend_ema_diff', 'vol_realized_20d', 'vol_parkinson_20d', 'mom_12_1', 'mom_rank', 'spy_r1']


In [20]:
# 8. Add NEXT_DAY EXCESS RETURN
df["next_day_excess_return"] = df.groupby('ticker')['excess_return'].shift(-1)

In [21]:
# 9. Add flags train/val/test flags to identify split sets
df['split'] = 'train'
df.loc[df['Date'] >="2015-01-01", "split"] = "val"
df.loc[df['Date'] >= "2017-01-01", "split"] = 'test'

print(df.shape)

(391851, 28)


In [22]:
# 10. Add NEWS_FLAG
df = add_news_flag(news_df, df)
print(df.shape)

(391686, 30)


In [25]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,sector,...,trend_ema_diff,vol_realized_20d,vol_parkinson_20d,mom_12_1,mom_rank,spy_r1,next_day_excess_return,split,date,news flag
0,2010-01-05 00:00:00+00:00,53.698801,54.428219,53.66407,53.94194,13469263.0,0.0,0.0,GE,Industrials,...,0.032598,,,,,0.002647,-0.006825,train,2010-01-05 00:00:00+00:00,0
1,2010-01-06 00:00:00+00:00,53.94191,54.254516,53.629303,53.66404,11573422.0,0.0,0.0,GE,Industrials,...,0.024568,,,,,0.000704,0.043662,train,2010-01-06 00:00:00+00:00,0
2,2010-01-07 00:00:00+00:00,53.768241,57.241645,53.594575,56.44276,38701038.0,0.0,0.0,GE,Industrials,...,0.344084,,,,,0.004221,0.01181,train,2010-01-07 00:00:00+00:00,0
3,2010-01-08 00:00:00+00:00,56.651173,57.971062,56.512232,57.658455,24019636.0,0.0,0.0,GE,Industrials,...,0.727206,,,,,0.003328,-0.001118,train,2010-01-08 00:00:00+00:00,0
4,2010-01-11 00:00:00+00:00,58.45735,58.631016,57.450062,58.214211,15999249.0,0.0,0.0,GE,Industrials,...,1.077108,,,,,0.001396,0.01418,train,2010-01-11 00:00:00+00:00,0


## Checkpoint: Save datasets

In [45]:
!mkdir /kaggle/working/datasets
news_df.to_csv('/kaggle/working/datasets/filtered_news_dataset.csv', index=False)
tickers.to_csv('/kaggle/working/datasets/top_tickers.csv', index=False)
df.to_csv('/kaggle/working/datasets/prices_dataset.csv', index=False)

In [59]:
import datetime as _dt
import kagglehub

# Save files to Kaggle Hub
handle = "zeroadamantium/nasdaq-news"
local_dataset_dir = "/kaggle/working/datasets"
current_date = _dt.date.today().strftime("%Y-%m-%d")
kagglehub.dataset_upload(handle, local_dataset_dir, version_notes= f"Prepare Data Notebook {current_date}")

Uploading Dataset https://www.kaggle.com/datasets/zeroadamantium/nasdaq-news ...
Starting upload for file /kaggle/working/datasets/prices_dataset.csv


Uploading: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 199M/199M [00:02<00:00, 97.9MB/s]

Upload successful: /kaggle/working/datasets/prices_dataset.csv (190MB)
Starting upload for file /kaggle/working/datasets/filtered_news_dataset.csv



Uploading: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 74.7M/74.7M [00:00<00:00, 99.3MB/s]

Upload successful: /kaggle/working/datasets/filtered_news_dataset.csv (71MB)
Starting upload for file /kaggle/working/datasets/top_tickers.csv



Uploading: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1.78k/1.78k [00:00<00:00, 8.85kB/s]

Upload successful: /kaggle/working/datasets/top_tickers.csv (2KB)





Your dataset has been created.
Files are being processed...
See at: https://www.kaggle.com/datasets/zeroadamantium/nasdaq-news


## Add News Features

In [7]:
news_features_df = built_news_features(
    ner_text_column = "Article_title",
    output_path = "/kaggle/working/datasets/news_features.pkl",
    file_path = "/kaggle/working/datasets/filtered_news_dataset.csv",
    chunk_size = 100_000,
)

Loading NER model...


Device set to use cuda:0


Loading encoder model for z_news (FinBERT)...




Loading dataset in chunks...
Found 1 CSV files:
  - /kaggle/working/datasets/filtered_news_dataset.csv
Processing chunk with 100000 rows...


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Encoding headline embeddings (FinBERT) on 100000 rows...
Running NER on column 'Article_title' (batched) for 100000 rows...
Processing chunk with 100000 rows...
Encoding headline embeddings (FinBERT) on 100000 rows...
Running NER on column 'Article_title' (batched) for 100000 rows...
Processing chunk with 100000 rows...
Encoding headline embeddings (FinBERT) on 100000 rows...
Running NER on column 'Article_title' (batched) for 100000 rows...
Processing chunk with 100000 rows...
Encoding headline embeddings (FinBERT) on 100000 rows...
Running NER on column 'Article_title' (batched) for 100000 rows...
Processing chunk with 100000 rows...
Encoding headline embeddings (FinBERT) on 100000 rows...
Running NER on column 'Article_title' (batched) for 100000 rows...
Processing chunk with 100000 rows...
Encoding headline embeddings (FinBERT) on 100000 rows...
Running NER on column 'Article_title' (batched) for 100000 rows...
Processing chunk with 100000 rows...
Encoding headline embeddings (FinB

In [8]:
news_features_df.shape

(549379, 10)

## Checkpoint: Save News Features Dataset

In [9]:
news_features_df.to_csv("/kaggle/working/datasets/news_features.csv", index=False)

In [10]:
import datetime as _dt
import kagglehub

# Save files to Kaggle Hub
handle = "zeroadamantium/nasdaq-news"
local_dataset_dir = "/kaggle/working/datasets"
current_date = _dt.date.today().strftime("%Y-%m-%d")
kagglehub.dataset_upload(handle, local_dataset_dir, version_notes= f"Added News Features {current_date}")

Uploading Dataset https://www.kaggle.com/datasets/zeroadamantium/nasdaq-news ...
Starting upload for file /kaggle/working/datasets/news_features.csv


Uploading: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 6.90G/6.90G [01:09<00:00, 98.9MB/s]

Upload successful: /kaggle/working/datasets/news_features.csv (6GB)
Starting upload for file /kaggle/working/datasets/prices_dataset.csv



Uploading: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 199M/199M [00:02<00:00, 98.4MB/s] 

Upload successful: /kaggle/working/datasets/prices_dataset.csv (190MB)
Starting upload for file /kaggle/working/datasets/filtered_news_dataset.csv



Uploading: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 75.5M/75.5M [00:01<00:00, 71.6MB/s]

Upload successful: /kaggle/working/datasets/filtered_news_dataset.csv (72MB)
Starting upload for file /kaggle/working/datasets/top_tickers.csv



Uploading: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1.78k/1.78k [00:00<00:00, 8.57kB/s]

Upload successful: /kaggle/working/datasets/top_tickers.csv (2KB)
Starting upload for file /kaggle/working/datasets/news_features.pkl



Uploading: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1.76G/1.76G [00:07<00:00, 228MB/s]

Upload successful: /kaggle/working/datasets/news_features.pkl (2GB)





Your dataset has been created.
Files are being processed...
See at: https://www.kaggle.com/datasets/zeroadamantium/nasdaq-news


In [18]:
print(news_features_df.shape)
news_features_df.head()

(549379, 10)


Unnamed: 0,Date,Stock_symbol,velocity,novelty,earnings_flag,guidance_flag,merger_flag,rating_flag,z_news,entities_today
0,2010-01-12 00:00:00+00:00,AA,1,0.0,0,0,0,0,"[-0.09942879, -0.7900245, -1.5315706, -0.03880...",[]
1,2010-01-18 00:00:00+00:00,AA,1,1.0,0,0,0,0,"[-0.12355294, -0.22699751, -0.54240745, -0.222...","[ci, group, ti]"
2,2010-02-03 00:00:00+00:00,AA,1,0.0,0,0,0,0,"[-0.31843555, -0.499312, -1.284549, 0.3205093,...",[]
3,2010-02-16 00:00:00+00:00,AA,1,1.0,0,0,0,0,"[-0.32647607, -0.513768, -0.8571328, 1.0574433...",[bulls]
4,2010-03-02 00:00:00+00:00,AA,1,1.0,0,0,0,0,"[0.57808304, -0.20705728, -1.0697136, 0.525321...","[citigroup, pandit]"


## Checkpoint: Load Prices Dataset

In [19]:
prices_dataset_df = pd.read_csv("/kaggle/working/datasets/prices_dataset.csv")
prices_dataset_df['Date'] = pd.to_datetime(prices_dataset_df['Date'], utc=True)
print(prices_dataset_df.shape)
prices_dataset_df.head()

(391686, 30)


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,sector,...,trend_ema_diff,vol_realized_20d,vol_parkinson_20d,mom_12_1,mom_rank,spy_r1,next_day_excess_return,split,date,news flag
0,2010-01-05 00:00:00+00:00,53.698801,54.428219,53.66407,53.94194,13469263.0,0.0,0.0,GE,Industrials,...,0.032598,,,,,0.002647,-0.006825,train,2010-01-05 00:00:00+00:00,0
1,2010-01-06 00:00:00+00:00,53.94191,54.254516,53.629303,53.66404,11573422.0,0.0,0.0,GE,Industrials,...,0.024568,,,,,0.000704,0.043662,train,2010-01-06 00:00:00+00:00,0
2,2010-01-07 00:00:00+00:00,53.768241,57.241645,53.594575,56.44276,38701038.0,0.0,0.0,GE,Industrials,...,0.344084,,,,,0.004221,0.01181,train,2010-01-07 00:00:00+00:00,0
3,2010-01-08 00:00:00+00:00,56.651173,57.971062,56.512232,57.658455,24019636.0,0.0,0.0,GE,Industrials,...,0.727206,,,,,0.003328,-0.001118,train,2010-01-08 00:00:00+00:00,0
4,2010-01-11 00:00:00+00:00,58.45735,58.631016,57.450062,58.214211,15999249.0,0.0,0.0,GE,Industrials,...,1.077108,,,,,0.001396,0.01418,train,2010-01-11 00:00:00+00:00,0


# Final Dataset: Merge Prices and News Features Datasets

In [26]:
processed_multimodal_eq_sizing_dataset_df = pd.merge(
    prices_dataset_df,
    news_features_df,
    left_on = ['Date', 'ticker'],
    right_on = ['Date', 'Stock_symbol'],
    how = 'left'
)

In [57]:
processed_multimodal_eq_sizing_dataset_df = processed_multimodal_eq_sizing_dataset_df.drop(columns=['Stock_symbol'])

In [68]:
print(processed_multimodal_eq_sizing_dataset_df.shape)
processed_multimodal_eq_sizing_dataset_df.sample(5)

(391686, 38)


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,sector,...,date,news flag,velocity,novelty,earnings_flag,guidance_flag,merger_flag,rating_flag,z_news,entities_today
79298,2018-03-21 00:00:00+00:00,14.012327,14.277348,13.955939,14.187128,6282800.0,0.0,0.0,EPD,Energy,...,2018-03-21 00:00:00+00:00,1,1.0,1.0,0.0,0.0,0.0,0.0,"[0.22823562, -0.042468984, -0.74258715, -0.145...","[e, mp, ra energy, se, sr]"
237209,2014-09-25 00:00:00+00:00,8.955884,8.95868,8.841244,8.885982,17915100.0,0.0,0.0,CSX,Industrials,...,2014-09-25 00:00:00+00:00,1,1.0,0.5,1.0,0.0,0.0,0.0,"[-0.21329702, -0.78418994, 0.33254468, -0.8496...","[cs, x]"
296435,2016-08-03 00:00:00+00:00,6.379677,6.616366,6.277718,6.503483,1983400.0,0.0,0.0,CC,Basic Materials,...,2016-08-03 00:00:00+00:00,0,,,,,,,,
253505,2016-07-27 00:00:00+00:00,26.793909,27.254408,25.43367,25.596615,11405300.0,0.0,0.0,DVN,Energy,...,2016-07-27 00:00:00+00:00,1,2.0,1.0,1.0,0.0,0.0,0.0,"[-0.09879635, 0.087337136, 0.23949988, -0.8212...","[ana, ap, c, d, dar, devon energy, ko, vn]"
211375,2018-02-05 00:00:00+00:00,89.849939,91.08405,86.147615,86.276169,939500.0,0.0,0.0,CE,Basic Materials,...,2018-02-05 00:00:00+00:00,1,1.0,0.4,0.0,0.0,1.0,0.0,"[0.21518224, -0.36966068, -1.3591042, -0.10072...","[ce, lane, ni plastics l. l. c, om, se]"


## Chekpoint: Save Final Multimodal Eq Sizing Dataset

In [69]:
processed_multimodal_eq_sizing_dataset_df.to_pickle("/kaggle/working/datasets/proc_multimodal_eq_sizing_dataset.pkl")

In [70]:
# processed_multimodal_eq_sizing_dataset_df.to_csv("/kaggle/working/datasets/proc_multimodal_eq_sizing_dataset.csv")

In [71]:
import datetime as _dt
import kagglehub

# Save files to Kaggle Hub
handle = "zeroadamantium/nasdaq-news"
local_dataset_dir = "/kaggle/working/datasets"
current_date = _dt.date.today().strftime("%Y-%m-%d")
kagglehub.dataset_upload(handle, local_dataset_dir, version_notes= f"Final Dataset {current_date}")

Uploading Dataset https://www.kaggle.com/datasets/zeroadamantium/nasdaq-news ...
Starting upload for file /kaggle/working/datasets/proc_multimodal_eq_sizing_dataset.pkl


Uploading: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 550M/550M [00:02<00:00, 185MB/s] 

Upload successful: /kaggle/working/datasets/proc_multimodal_eq_sizing_dataset.pkl (525MB)
Starting upload for file /kaggle/working/datasets/news_features.csv



Uploading: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 6.90G/6.90G [01:21<00:00, 84.3MB/s]

Upload successful: /kaggle/working/datasets/news_features.csv (6GB)
Starting upload for file /kaggle/working/datasets/prices_dataset.csv



Uploading: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 199M/199M [00:01<00:00, 110MB/s] 

Upload successful: /kaggle/working/datasets/prices_dataset.csv (190MB)
Starting upload for file /kaggle/working/datasets/filtered_news_dataset.csv



Uploading: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 75.5M/75.5M [00:00<00:00, 93.5MB/s]

Upload successful: /kaggle/working/datasets/filtered_news_dataset.csv (72MB)
Starting upload for file /kaggle/working/datasets/top_tickers.csv



Uploading: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1.78k/1.78k [00:00<00:00, 8.86kB/s]

Upload successful: /kaggle/working/datasets/top_tickers.csv (2KB)
Starting upload for file /kaggle/working/datasets/news_features.pkl



Uploading: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1.76G/1.76G [00:07<00:00, 222MB/s]

Upload successful: /kaggle/working/datasets/news_features.pkl (2GB)





Your dataset has been created.
Files are being processed...
See at: https://www.kaggle.com/datasets/zeroadamantium/nasdaq-news


In [72]:
# get_return_data("/kaggle/working/final_dataset.csv")