# Experiment 2: LLM+RL Testbench for Risk Profiles

In this notebook we run our hybrid-architecture test bench.

## Notebook Setup



In [None]:
import os
import sys
import logging
import warnings
import pickle
import numpy as np
import pandas as pd
import time
warnings.filterwarnings("ignore")

%load_ext dotenv

# Input/output locations. Each one can be overridden through the environment
# variable of the same name; the literal is the fallback when it is unset.
FUNDAMENTALS_PATH = os.getenv("FUNDAMENTALS_PATH", '/fundamentals')
LLM_PROMPTS_PATH = os.getenv("LLM_PROMPTS_PATH", '/prompts')
HISTORIC_PATH = os.getenv("HISTORIC_PATH", '/historic')
MACRO_PATH = os.getenv("MACRO_PATH", '/macro')
OPTIONS_PATH = os.getenv("OPTIONS_PATH", '/options')
RL_OUTPUT_PATH = os.getenv("RL_OUTPUT_PATH", '/rl_data')
LLM_OUTPUT_PATH = os.getenv("LLM_OUTPUT_PATH", '/llm_data')
LOGS_PATH = os.getenv("LOGS_PATH", '/logs')

# Directories created up front. `exist_ok=True` replaces the former
# exists()-then-makedirs() pair, which could race with another process
# creating the same directory between the check and the call.
paths = [LLM_OUTPUT_PATH, LOGS_PATH, 'images']
for path in paths:
    if path:  # an env var set to the empty string yields '' -> nothing to create
        os.makedirs(path, exist_ok=True)

DATA_PATH = './data'

# Make the local `utils` directory importable so that the helper modules
# imported below can be resolved from the notebook's working directory.
module_path = os.path.abspath(os.path.join(os.getcwd(), 'utils'))
if module_path not in sys.path:
    sys.path.append(module_path)

from tqdm.notebook import tqdm
from openai import OpenAI
from rl_agent_utils import *
from data_utils import *

## LLM Setup

In [None]:
# Model name and API key are read from the environment; either value is None
# when the corresponding variable is not set.
OPENAI_MODEL = os.environ.get("OPENAI_MODEL")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Single shared client instance, reused for every LLM request in this notebook.
OPENAI_CLIENT = OpenAI(api_key=OPENAI_API_KEY)


# RL Environment

In [None]:
# --- Simulator arguments ---------------------------------------------------
# These values are forwarded as keyword arguments of the same name to
# `simulator.simulateNewStrategy` in the experiment cell.
# NOTE(review): `observationSpace` is forwarded there as well but is not
# assigned in this cell -- presumably it comes from the star imports; confirm.
money = 100_000.0
stateLength = 30
actionSpace = 2
bounds = [1, 30]
step = 1
numberOfEpisodes = 50

# Candidate cost levels expressed in percent; the middle entry is the one
# used, converted to a fraction.
percentageCosts = [0, 0.1, 0.2]
transactionCosts = percentageCosts[1] / 100

simulator = TradingSimulator()

# --- Experiment window and repetitions ---------------------------------------
STARTDATE = '2012-01-01'
SPLITDATE = '2018-01-01'
ENDDATE = '2020-01-01'
N_EXPERIMENTS = 10

# Display name -> ticker symbol. Only the ticker is used to build file names.
STOCKS = dict(
    Tesla='TSLA',
    Apple='AAPL',
    Facebook='META',
    Amazon='AMZN',
    Google='GOOGL',
    Microsoft='MSFT',
)

# Run Replication

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    # NOTE: unpickling can execute arbitrary code. Only files previously
    # written by this notebook should ever be read here.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, overwriting any existing file."""
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)


# NOTE(review): RISK_EXPERIMENT, PROMPT_VERSION, PERSONA, the HIGH_/LOW_
# RISK_PROFILE and OBJECTIVES constants and `observationSpace` are not assigned
# in the cells visible here -- presumably they come from the star imports or an
# earlier cell; confirm before running.

# Both directories depend only on the experiment configuration, so they are
# computed (and the RL one created) once instead of once per stock.
rl_output_dir = f'{RL_OUTPUT_PATH}/response/{RISK_EXPERIMENT}/{PROMPT_VERSION}'
output_dir = f'{LLM_OUTPUT_PATH}/response/{RISK_EXPERIMENT}/{PROMPT_VERSION}'
os.makedirs(rl_output_dir, exist_ok=True)

# Artifact key -> file-name suffix. The insertion order is the write order.
RESULT_SUFFIXES = {
    'train': 'train_results',
    'test': 'test_results',
    'sharpe_train': 'sharpe_train_results',
    'sharpe_test': 'sharpe_test_results',
    'time': 'time_results',
    'q_train': 'q_train',
    'q_test': 'q_test',
}
# Artifacts that must all exist for a stock to be considered done, listed in
# the order of the STOCK_RESULTS tuple. The Sharpe files are written but are
# neither required for a cache hit nor loaded back.
CACHED_KEYS = ('train', 'test', 'time', 'q_train', 'q_test')

STOCK_RESULTS = {}
for stock in tqdm(STOCKS.values(), disable=False, desc="Running work Bench..."):
    result_files = {
        key: f'{rl_output_dir}/{stock}_{suffix}.pkl'
        for key, suffix in RESULT_SUFFIXES.items()
    }

    # Load the engineered price data and attach the LLM-generated strategy.
    # NOTE(review): this runs even when the RL results below are already
    # cached; kept as-is because the call may have side effects of its own.
    input_file = f"{HISTORIC_PATH}/engineered_{stock}_data.parquet"
    engineered_df = pd.read_parquet(input_file)
    engineered_df.set_index('Date', inplace=True)
    engineered_df = generate_strategy_for_ticker(
        ticker_df=engineered_df,
        ticker=stock,
        LLM_OUTPUT_PATH=output_dir,
        persona=PERSONA,
        HIGH_RISK_PROFILE=HIGH_RISK_PROFILE if RISK_EXPERIMENT == 'r' else LOW_RISK_PROFILE,
        HIGH_OBJECTIVES=HIGH_OBJECTIVES if RISK_EXPERIMENT == 'r' else LOW_OBJECTIVES,
        client=OPENAI_CLIENT,
        model=OPENAI_MODEL,
        strategy_yaml_file=f'{LLM_PROMPTS_PATH}/strat_prompt_{PROMPT_VERSION}.yml',
        news_yaml_file=f'{LLM_PROMPTS_PATH}/analyst_prompt_v1.yml' if PROMPT_VERSION in ['v4'] else None,
        start_date=STARTDATE,
        end_date=ENDDATE,
        max_news=5,
        time_horizon='monthly',
    )

    # Reuse earlier results when every required artifact is already on disk.
    cached_paths = [result_files[key] for key in CACHED_KEYS]
    if all(os.path.exists(p) for p in cached_paths):
        STOCK_RESULTS[stock] = tuple(_load_pickle(p) for p in cached_paths)
        continue

    train_results = []
    test_results = []
    sharpe_train_results = []
    sharpe_test_results = []
    train_times = []
    test_times = []

    for _ in tqdm(range(N_EXPERIMENTS), desc=f"Running test episodes for {stock}...", disable=False, leave=False):
        # The timed call returns both the training and the testing
        # environment, so this duration covers the whole simulator call.
        start_train_time = time.time()
        strat, train_env, qt0, qt1, test_env, q0, q1 = simulator.simulateNewStrategy(
            stock_df=engineered_df.copy(),
            startingDate=STARTDATE,
            endingDate=ENDDATE,
            splitingDate=SPLITDATE,
            verbose=True,
            plotTraining=False,
            rendering=False,
            showPerformance=False,
            saveStrategy=False,
            money=money,
            observationSpace=observationSpace,
            actionSpace=actionSpace,
            stateLength=stateLength,
            bounds=bounds,
            step=step,
            numberOfEpisodes=numberOfEpisodes,
            transactionCosts=transactionCosts,
            ticker_symbol=stock,
        )
        train_times.append(time.time() - start_train_time)

        analyser = PerformanceEstimator(train_env.data)
        train_results.append(analyser.getComputedPerformance())
        sharpe_train_results.append(analyser.computeSharpeRatio())

        # This timer only wraps the performance computation on the test
        # environment's data, not the simulation itself.
        start_test_time = time.time()
        analyser = PerformanceEstimator(test_env.data)
        test_results.append(analyser.getComputedPerformance())
        sharpe_test_results.append(analyser.computeSharpeRatio())
        test_times.append(time.time() - start_test_time)

    time_results = {
        'avg_train_time': sum(train_times) / N_EXPERIMENTS,
        'avg_test_time': sum(test_times) / N_EXPERIMENTS,
    }
    # Only the Q-values returned by the LAST experiment run are kept.
    q_train_values = (qt0, qt1)
    q_test_values = (q0, q1)
    STOCK_RESULTS[stock] = (train_results, test_results, time_results, q_train_values, q_test_values)

    artifacts = {
        'train': train_results,
        'test': test_results,
        'sharpe_train': sharpe_train_results,
        'sharpe_test': sharpe_test_results,
        'time': time_results,
        'q_train': q_train_values,
        'q_test': q_test_values,
    }
    for key, path in result_files.items():
        _dump_pickle(artifacts[key], path)

## Aggregate Results and T-Tests

In [None]:
from pprint import pprint
from scipy.stats import ttest_rel

# Compare this experiment's per-run test metrics against the stored baseline
# results, stock by stock, and write one summary row per stock.
rl_output_dir = f'{RL_OUTPUT_PATH}/response/{RISK_EXPERIMENT}/{PROMPT_VERSION}'
replication_dir = "./papers/bm_experiment"

# One single-row frame per stock; concatenated once after the loop instead of
# re-copying the growing frame on every iteration.
summary_rows = []

for stock in tqdm(STOCKS.values(), disable=False, desc="Testing stock workbench..."):

    new_results = safe_pickle_load(f'{rl_output_dir}/{stock}_test_results.pkl')
    baseline_results = safe_pickle_load(f'{replication_dir}/{stock}_bm_test_results.pkl')
    time_results = safe_pickle_load(f'{rl_output_dir}/{stock}_time_results.pkl')

    # metric name -> paired value lists, one entry per experiment run.
    # zip() stops at the shorter list, so surplus runs on either side are
    # ignored; only metrics present in both frames are kept.
    metrics_data = {}
    for new_df, base_df in zip(new_results, baseline_results):
        new_metrics = new_df.set_index("Metric")["Value"].to_dict()
        base_metrics = base_df.set_index("Metric")["Value"].to_dict()

        for metric, new_value in new_metrics.items():
            if metric not in base_metrics:
                continue
            paired = metrics_data.setdefault(metric, {'new': [], 'base': []})
            paired['new'].append(new_value)
            paired['base'].append(base_metrics[metric])

    stock_summary = {}
    for metric, sets in metrics_data.items():
        # Missing metric values are treated as 0 before aggregation.
        new_data = pd.Series(sets['new']).fillna(0)
        base_data = pd.Series(sets['base']).fillna(0)

        mean_val = round(new_data.mean(), 2)
        std_val = round(new_data.std(), 2)

        # Values are appended in pairs above, so both series always have the
        # same length; a paired t-test needs at least two pairs.
        if len(new_data) > 1:
            _, p_value = ttest_rel(new_data, base_data)
            p_value = round(p_value, 2)
        else:
            # No test was possible. Report NaN rather than 0, which would
            # read as a maximally significant result.
            p_value = float('nan')

        stock_summary[metric] = [mean_val, std_val, p_value]

    # Rows are the statistics, columns the metrics; unstack() flattens this to
    # a (metric, statistic) index, which becomes the columns of a single row.
    stock_df = pd.DataFrame(stock_summary, index=["Mean", "+/-", "P-Value"])
    stock_df = stock_df.unstack().to_frame().T
    stock_df.index = [stock]

    stock_df['Avg Train Time (s)'] = time_results['avg_train_time']
    stock_df['Avg Test Time (s)'] = time_results['avg_test_time']

    summary_rows.append(stock_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
final_summary_df = pd.concat(summary_rows) if summary_rows else pd.DataFrame()

final_summary_df.to_csv('final_summary_vs_replication.csv', index=True)
pprint(final_summary_df.T)
