# Experiment 1: Prompting for News Factors

This notebook tests the prompts to generate the news factors and anonymize entities.

## Prepare Notebook

In [None]:
import os
import sys
import logging
import warnings
warnings.filterwarnings("ignore")

INSTALL_DEPS = True
if INSTALL_DEPS:
    %pip install openai==1.51.2

%load_ext dotenv

NEWS_PATH = os.getenv("NEWS_PATH")
HISTORIC_PATH = os.getenv("HISTORIC_PATH")
LLM_PROMPTS_PATH = os.getenv("LLM_PROMPTS_PATH")
LLM_OUTPUT_PATH = os.getenv("LLM_OUTPUT_PATH")
LOGS_PATH = os.getenv("LOGS_PATH")
paths = [LOGS_PATH, LLM_OUTPUT_PATH]
for path in paths:
    if path and not os.path.exists(path):
        os.makedirs(path)

if "KAGGLE_KERNEL_RUN_TYPE" in os.environ:
    logging.info("Running in Kaggle...")
    for dirname, _, filenames in os.walk("/kaggle/input"):
        for filename in filenames:
            print(os.path.join(dirname, filename))
    DATA_PATH = "/kaggle/input/drl-dataset-quant"
    sys.path.insert(1, "/kaggle/usr/lib/drlutil")

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from openai import OpenAI

# Make the `utils` directory under the current working directory importable
# (presumably where `data_utils` lives — verify against the repo layout).
module_path = os.path.abspath(os.path.join(os.getcwd(), 'utils'))
if module_path not in sys.path:
    sys.path.append(module_path)

from data_utils import load_yaml_template, prepare_yaml_with_articles, call_openai_to_extract_news, combine_news_by_month, process_news_with_llm, TICKER_COMPANY_NAME_MAP

## Environment and Constants

In [None]:
# Date window for the news sample, written as YYYYMMDD.
START_DATE = '20180528'
END_DATE = '20180712'
# Model name and API key come from the environment, never from the notebook itself.
OPENAI_MODEL = os.getenv("OPENAI_MODEL") # Prefer a stronger reasoning model such as 4o or o1.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

## Initialize LLM

In [None]:
# Single OpenAI client instance, passed to every LLM helper call below.
OPENAI_CLIENT = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
# Prompt template file handed to the news-extraction helpers.
PROMPT_YML = f'{LLM_PROMPTS_PATH}/news_analyst_prompt_v1.yml'
TICKERS = ["AAPL", "MSFT", "GOOGL", "TSLA", "AMZN", "META"]
# Index 3 selects "TSLA"; the data files loaded below are chosen by this ticker.
TARGET = TICKERS[3]

In [None]:
# Sample window bounds as timezone-aware (UTC) timestamps.
sample_start_date = pd.to_datetime(START_DATE, format='%Y%m%d').tz_localize('UTC')
sample_end_date = pd.to_datetime(END_DATE, format='%Y%m%d').tz_localize('UTC')

# Read the target's news, parse timestamps as UTC and index the frame by them.
grouped_news = (
    pd.read_parquet(f'{NEWS_PATH}/{TARGET}_2012-01-01_2025-01-21.parquet')
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime'], utc=True))
    .set_index('datetime')
)

# Keep rows inside the window: start inclusive, end exclusive.
grouped_news = grouped_news.loc[
    (grouped_news.index >= sample_start_date) & (grouped_news.index < sample_end_date)
]

grouped_news.tail(1)


## Get News and Summarize

In [None]:
# Three hand-written snippets about Apple, used as a smoke test for the prompt.
test_news = [
    "Stocks slipped into negative territory during early trading today, reversing some of yesterday's gains. Monday's rally raised questions about its sustainability, driven by an unusual combination: crude oil prices and shares of Apple Inc. (NASDAQ: AAPL). The day saw every major benchmark climb higher, with analysts debating the reasoning behind Berkshire Hathaway's recent investment in Apple—a sector Warren Buffett has historically avoided.",
    "Buffett, known for his skepticism toward technology investments due to challenges in predicting future earnings, didn't personally make the over $1 billion bet on AAPL. Instead, his investment managers, Ted Weschler and Todd Combs, took the initiative, reportedly without requiring Buffett's direct approval.",
    "Apple's CEO, Tim Cook, is preparing for the company's earnings call next week, scheduled for March 25, 2016, where updates on its performance and future product plans are expected. By this time, the iPhone 6 had already been released in 2014, with anticipation now shifting toward newer models and updates.",
]

# Run the extraction helper once on the fixed sample and show what it returns.
llm_response = call_openai_to_extract_news(
    articles=test_news,
    news_yml_file=PROMPT_YML,
    ticker='AAPL',
    target_name='Apple Inc.',
    date='2016-03',
    client=OPENAI_CLIENT,
    model=OPENAI_MODEL,
    LLM_OUTPUT_PATH=LLM_OUTPUT_PATH,
)
llm_response

In [None]:

# Pass the filtered news through combine_news_by_month (defined in data_utils).
# NOTE: this rebinds `grouped_news`, so re-running only this cell feeds the
# already-combined frame back in; re-run the loading cell first.
grouped_news = combine_news_by_month(grouped_news)


In [None]:

# Inspect the last three rows of the combined news frame.
grouped_news.tail(3)

In [None]:
# Run the LLM over the combined news for TARGET.
# The company name is looked up from the ticker instead of being hard-coded:
# the previous literal 'Apple Inc.' did not match TARGET ("TSLA").
# Assumes TICKER_COMPANY_NAME_MAP is a dict keyed by ticker symbol — TODO confirm
# in data_utils; unknown tickers fall back to the symbol itself.
results = process_news_with_llm(
    grouped_news,
    ticker=TARGET,
    target_name=TICKER_COMPANY_NAME_MAP.get(TARGET, TARGET),
    news_yml_file=PROMPT_YML,
    llm_client=OPENAI_CLIENT,
    llm_model=OPENAI_MODEL,
    LLM_OUTPUT_PATH=LLM_OUTPUT_PATH,
)
results


In [None]:
# Print each entry under the 'factors' key of the first result, one per line.
# Indexing with [0] means this raises if `results` is empty.
for factor in results[0]['factors']:
    print(factor)

In [None]:
# Widen the date window for the full run (this rebinds the earlier constants).
START_DATE = '20120101'
END_DATE = '20200101'

# Despite its name, `output_file` is the engineered dataset that gets read here.
output_file = f"{HISTORIC_PATH}/engineered_{TARGET}_data.parquet"

# Load, convert 'Date' to UTC midnight timestamps, and index the frame by it.
stock_aug_data = (
    pd.read_parquet(output_file)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], utc=True).dt.normalize())
    .set_index('Date')
)
stock_aug_data.tail(3)

In [None]:
from datetime import timedelta
from itertools import chain
import numpy as np

def generate_monthly_news_factors(ticker_df, ticker, LLM_OUTPUT_PATH, news_yml_file, client, model, start_date, end_date,
                                  lookback_rows=5, max_articles=35):
    """Call the news-extraction LLM at each month boundary and tag rows with a factor-file name.

    The frame is restricted to ``start_date <= index <= end_date`` (both parsed as
    UTC). Rows are then walked in index order; whenever the (year, month) of a row
    differs from the previous row's, the ``content`` of the last ``lookback_rows``
    rows up to and including that row is flattened into one article list and, if
    non-empty, sent to ``call_openai_to_extract_news``. That row and every later
    row are then labelled with ``f'{date}_news.yml'``.

    Args:
        ticker_df: DataFrame with a datetime index comparable to UTC timestamps
            and a ``content`` column. Assumes each non-null ``content`` cell is an
            iterable of article strings (it is flattened with ``chain``) — TODO confirm.
        ticker: Ticker symbol; also passed to the helper as ``target_name``.
        LLM_OUTPUT_PATH: Forwarded unchanged to ``call_openai_to_extract_news``.
        news_yml_file: Prompt template path, forwarded to the helper.
        client: LLM client, forwarded to the helper.
        model: Model name, forwarded to the helper.
        start_date: Inclusive lower bound, anything ``pd.to_datetime`` accepts.
        end_date: Inclusive upper bound, anything ``pd.to_datetime`` accepts.
        lookback_rows: Number of trailing rows whose content is collected (default 5).
        max_articles: Maximum number of articles sent per call (default 35).

    Returns:
        A filtered copy of ``ticker_df`` with an added ``news_factor_file`` column.
        Rows before the first labelled boundary are back-filled with that first label.

    Notes:
        * The first row never triggers a call, and no call is made for the final
          month because calls only happen when a new month starts.
        * The look-back window includes the first row of the new month, while the
          ``date`` argument sent to the helper names the previous month.
        * NOTE(review): the stored name embeds the full timestamp of the boundary
          row; whether it matches the file written by the helper cannot be
          verified from this notebook.
    """
    start_date = pd.to_datetime(start_date, utc=True)
    end_date = pd.to_datetime(end_date, utc=True)
    in_window = (ticker_df.index >= start_date) & (ticker_df.index <= end_date)
    filtered_ticker_df = ticker_df[in_window].copy()

    # Start with an object-dtype column: the labels are strings, and writing
    # strings into a float NaN column relies on a deprecated silent upcast.
    filtered_ticker_df['news_factor_file'] = np.full(len(filtered_ticker_df), np.nan, dtype=object)

    current_month = None
    # Only the index labels are needed, so iterate those instead of building rows.
    for date in tqdm(filtered_ticker_df.index, total=len(filtered_ticker_df), desc="Processing rows"):
        month = (date.year, date.month)
        if month == current_month:
            continue

        if current_month is not None:
            recent_content = filtered_ticker_df.loc[:date, 'content'].iloc[-lookback_rows:].dropna()
            articles = list(chain.from_iterable(recent_content))
            if articles:
                call_openai_to_extract_news(
                    articles=articles[:max_articles],
                    news_yml_file=news_yml_file,
                    ticker=ticker,
                    target_name=ticker,
                    date=f"{current_month[0]}-{current_month[1]:02d}",
                    client=client,
                    model=model,
                    LLM_OUTPUT_PATH=LLM_OUTPUT_PATH
                )
                factors_file = f'{date}_news.yml'
                filtered_ticker_df.loc[date:, 'news_factor_file'] = factors_file
        current_month = month

    # Each assignment covers the boundary row through the end, so gaps can only
    # sit before the first label; filling once after the loop gives the same
    # result as filling on every iteration.
    filtered_ticker_df['news_factor_file'] = filtered_ticker_df['news_factor_file'].bfill().ffill()
    return filtered_ticker_df


# Generate the factor-file labels for TARGET over START_DATE..END_DATE.
processed_data = generate_monthly_news_factors(
    ticker_df=stock_aug_data.copy(),
    ticker=TARGET,
    LLM_OUTPUT_PATH=LLM_OUTPUT_PATH,
    news_yml_file=PROMPT_YML,
    client=OPENAI_CLIENT,
    model=OPENAI_MODEL,
    start_date=START_DATE,
    end_date=END_DATE,
)
processed_data.tail(3)