# Experiment - Baseline Prompt with Exemplars and EDA

This notebook analyzes the exemplar trades and attempts to explain the causes behind the expert's actions. Additionally, we run an exploratory data analysis (EDA) of the prompts to see what the LLM is focusing on.

For the Writer-Judge setup, refer to the Writer-Judge notebook. The Writer-Judge loop is used to fine-tune the selected features and instructions.

## Prepare Notebook

In [None]:
import os
import sys
import logging
import warnings
warnings.filterwarnings("ignore")

INSTALL_DEPS = False
if INSTALL_DEPS:
    %pip install openai==1.51.2

%load_ext dotenv

FUNDAMENTALS_PATH = os.getenv("FUNDAMENTALS_PATH")
LLM_PROMPTS_PATH = os.getenv("LLM_PROMPTS_PATH")
FUNDAMENTALS_PATH = os.getenv("FUNDAMENTALS_PATH")
HISTORIC_PATH = os.getenv("HISTORIC_PATH")
MACRO_PATH = os.getenv("MACRO_PATH")
OPTIONS_PATH = os.getenv("OPTIONS_PATH")
LLM_OUTPUT_PATH = os.getenv("LLM_OUTPUT_PATH")
LOGS_PATH = os.getenv("LOGS_PATH")
paths = [LLM_OUTPUT_PATH, LOGS_PATH]
for path in paths:
    if path and not os.path.exists(path):
        os.makedirs(path)

if "KAGGLE_KERNEL_RUN_TYPE" in os.environ:
    logging.info("Running in Kaggle...")
    for dirname, _, filenames in os.walk("/kaggle/input"):
        for filename in filenames:
            print(os.path.join(dirname, filename))
    DATA_PATH = "/kaggle/input/drl-dataset-quant"
    FUNDAMENTALS_PATH = DATA_PATH + FUNDAMENTALS_PATH
    HISTORIC_PATH = DATA_PATH + HISTORIC_PATH
    MACRO_PATH = DATA_PATH + MACRO_PATH
    OPTIONS_PATH = DATA_PATH + OPTIONS_PATH

    sys.path.insert(1, "/kaggle/usr/lib/drlutil")

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from openai import OpenAI

module_path = os.path.abspath(os.path.join(os.getcwd(), 'utils'))
if module_path not in sys.path:
    sys.path.append(module_path)

from data_utils import *
from rl_agent_utils import *

## Environment and Constants

In [None]:
# Date bounds (YYYYMMDD strings) and the ticker whose engineered data is loaded below.
START_DATE = '20120101'
END_DATE = '20200101'
TARGET = "AAPL"
# Model name and API key come from the environment.
OPENAI_MODEL = os.getenv("OPENAI_MODEL") # Prefer a stronger reasoning model such as GPT-4o or o1.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [None]:
# Load the engineered dataset for TARGET and index it by its 'Date' column.
output_file = f"{HISTORIC_PATH}/engineered_{TARGET}_data.parquet"
stock_aug_data = pd.read_parquet(output_file)
stock_aug_data.set_index('Date', inplace=True)
# Display the last three rows as the cell output.
stock_aug_data.tail(3)


# Expert Examples

In [None]:
# Build the expert trades from a copy of the engineered data so the source frame is not modified.
# NOTE(review): expert_trades is not defined in the visible cells; presumably it comes from the utils star-imports.
expert_df = expert_trades(stock_aug_data.copy())
expert_df.tail(3)


In [None]:
# Timezone-aware bounds of the sample window (May through July 2019).
sample_start_date = pd.Timestamp("2019-05-01 00:00:00+02:00")
sample_end_date = pd.Timestamp("2019-07-30 00:00:00+02:00")

# Slice both the engineered data and the expert trades to the same window.
engineered_sample_df = stock_aug_data.loc[sample_start_date:sample_end_date]
expert_sample_df = expert_df.loc[sample_start_date:sample_end_date]
engineered_sample_df.tail(3)

# Initialize LLM

In [None]:
import yaml
import pickle
from pydantic import BaseModel
from enum import Enum
from textwrap import dedent
from typing import List
from pprint import pprint


def enum_to_str_representer(dumper, data):
    """Represent an enum member in YAML through the string stored in its ``value``."""
    member_value = data.value
    return dumper.represent_str(member_value)

# Dump Action members as plain strings when contexts are serialized to YAML.
# NOTE(review): Action is not defined in the visible cells; presumably it comes from the utils star-imports.
yaml.add_representer(Action, enum_to_str_representer)

# Shared OpenAI client used by the cells below.
OPENAI_CLIENT = OpenAI(api_key=OPENAI_API_KEY)


In [None]:
# NOTE(review): RISK_EXPERIMENT is not referenced anywhere else in the visible notebook.
RISK_EXPERIMENT = 'r'
# YAML prompt template used as the default `yaml_file` of generate_trade_prompt.
TRAIN_PROMPT_YML = f'{LLM_PROMPTS_PATH}/writer_trainer_v1.yml'

# Tickers mined for the corpus and the number of sampled dates per ticker.
TICKERS = ["AAPL", "MSFT", "GOOGL", "TSLA", "AMZN", "META"]
PROMPT_SAMPLES = 5

In [None]:
from typing import List
from pydantic import BaseModel


# Structured-output schema used as the default `response_format` of train_trade_strategy.
# Documented with comments rather than a docstring so the schema itself stays unchanged.
class ExpertReview(BaseModel):
    explanation: str  # read back as the "explanation" entry of the response
    features: List[str]  # read back as the "features" entry of the response

# Alternative structured-output schema with reflection fields.
# NOTE(review): not referenced elsewhere in the visible notebook.
class ReflectTradeStrategy(BaseModel):
    reflection: str
    suggestions: List[str]
    features: List[str]
    explanation: str

def train_trade_strategy(step, context, persona, client, model, response_format=ExpertReview, top_k_tokens=5, max_retries=2):
    """Query the LLM for a structured trade explanation and collect diagnostics.

    A single user message built from ``step`` and ``context`` is sent through
    ``client.beta.chat.completions.parse`` with log-probabilities enabled.
    The call is attempted up to ``max_retries`` times; any exception raised
    while calling or post-processing counts as a failed attempt.

    Args:
        step: Step label embedded in the prompt text.
        context: Prompt body (the notebook passes a YAML dump).
        persona: Accepted for signature compatibility; not used in the request.
        client: Client exposing ``beta.chat.completions.parse``.
        model: Model name forwarded to the API.
        response_format: Schema forwarded as ``response_format``.
        top_k_tokens: Forwarded as ``top_logprobs``.
        max_retries: Maximum number of attempts.

    Returns:
        dict with ``features`` and ``explanation`` (``np.nan`` when absent from
        the parsed payload), ``long_token_proba`` / ``short_token_proba`` (the
        log-probability of the last generated token containing the respective
        ``Action`` value, ``-inf`` when none does), ``perplexity``, ``entropy``
        and a ``tokens_meta`` dict with token counts and the estimated cost.

    Raises:
        Exception: When every attempt failed; chained to the last error.
    """
    # Hard-coded prices per one million tokens used for the cost estimate.
    # NOTE(review): these look model-specific; confirm they match `model`.
    prompt_price_per_million = 0.15
    completion_price_per_million = 0.6

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            response = client.beta.chat.completions.parse(
                model=model,
                messages=[
                    {"role": "user", "content": f"Step: {step}\nPrompt: {context}\n"}
                ],
                temperature=0.7,
                top_p=1,
                top_logprobs=top_k_tokens,
                response_format=response_format,
                max_completion_tokens=3500,
                frequency_penalty=1,
                presence_penalty=0.25,
                logprobs=True,
            )

            choice = response.choices[0]
            parsed = choice.message.parsed
            # Check for a missing payload before touching its attributes, so
            # the failure is reported clearly instead of as an AttributeError.
            if parsed is None:
                raise Exception("trade_strategy None!")
            trade_strategy = parsed if isinstance(parsed, dict) else parsed.__dict__

            usage = response.usage
            prompt_tokens = usage.prompt_tokens
            completion_tokens = usage.completion_tokens
            cost = ((prompt_tokens / 1_000_000) * prompt_price_per_million) + (
                (completion_tokens / 1_000_000) * completion_price_per_million
            )

            token_logprobs = choice.logprobs.content
            uncertainty = calc_uncertainty_metrics(token_logprobs)

            # Keep the log-probability of the last token mentioning each action.
            long_token_proba = -float('inf')
            short_token_proba = -float('inf')
            for token_probs in token_logprobs:
                if Action.LONG.value in token_probs.token:
                    long_token_proba = token_probs.logprob
                if Action.SHORT.value in token_probs.token:
                    short_token_proba = token_probs.logprob

            return {
                "features": trade_strategy.get('features', np.nan),
                "explanation": trade_strategy.get('explanation', np.nan),
                "long_token_proba": long_token_proba,
                "short_token_proba": short_token_proba,
                "perplexity": uncertainty["perplexity"],
                "entropy": uncertainty["entropy"],
                "tokens_meta": {
                    "total_tokens": usage.total_tokens,
                    "prompt_tokens": prompt_tokens,
                    "completion_tokens": completion_tokens,
                    "cost": cost
                },
            }

        except Exception as e:
            last_error = e
            logging.error(f"Error in LLM call: {e}")
            # Only announce a retry when another attempt will actually happen.
            if attempt < max_retries:
                logging.info(f"Retrying LLM call ({attempt}/{max_retries})...")

    raise Exception(f"None strategy encountered after {max_retries} retries") from last_error

# Build the prompt context from the first row of the sample window.
# NOTE(review): PERSONA, HIGH_RISK_PROFILE and HIGH_OBJECTIVES are not defined in the
# visible cells; presumably they come from the utils star-imports.
context = update_historical_data_context(engineered_sample_df.head(1),
                                         PERSONA,
                                         HIGH_RISK_PROFILE,
                                         HIGH_OBJECTIVES,
                                         expert_df=expert_sample_df)


# Fill the YAML prompt template with the context and serialize it for the LLM.
train_template = load_yaml_template(TRAIN_PROMPT_YML)
filled_template = fill_yaml_template(context, train_template)
context_yaml = yaml.dump(filled_template, default_flow_style=True, allow_unicode=True)

# One end-to-end call to inspect the shape of the response.
trade_strategy = train_trade_strategy(1, context_yaml, PERSONA, OPENAI_CLIENT, OPENAI_MODEL)

pprint(trade_strategy)

In [None]:
def generate_trade_prompt(engineered_df,
                            anchor_date,
                            risk_profile,
                            objectives,
                            persona,
                            client, model,
                            prev_trade_decision = None,
                            prev_trade_returns = None,
                            prev_trade_days = None,
                            yaml_file = TRAIN_PROMPT_YML,
                            Peak_Returns=None,
                            Trough_Returns=None,
                            expert_df=None):
    """Build the prompt for one anchor date and ask the LLM to explain the expert trade.

    The context is built from the first row of ``engineered_df`` that falls in
    the window ``[anchor_date - 1 day, anchor_date]``.

    Args:
        engineered_df: Date-indexed engineered data.
        anchor_date: Timestamp the prompt is anchored to.
        risk_profile: Forwarded to the context builder as ``HIGH_RISK_PROFILE``.
        objectives: Forwarded to the context builder as ``HIGH_OBJECTIVES``.
        persona: Persona forwarded to the context builder and the LLM call.
        client: OpenAI client forwarded to ``train_trade_strategy``.
        model: Model name forwarded to ``train_trade_strategy``.
        prev_trade_decision: Previous strategy, forwarded to the context.
        prev_trade_returns: Returns of the previous strategy, forwarded to the context.
        prev_trade_days: Days in the previous strategy, forwarded to the context.
        yaml_file: Path of the YAML prompt template.
        Peak_Returns: Forwarded to the context builder.
        Trough_Returns: Forwarded to the context builder.
        expert_df: Expert trades forwarded to the context builder.

    Returns:
        The dict returned by ``train_trade_strategy`` with an extra
        ``trade_action`` entry (1 when the context's ``Expert_Action`` is
        ``"Long"``, otherwise 0), or ``None`` when the window holds no rows.
    """
    sample_engineered_df = engineered_df.loc[anchor_date - pd.Timedelta(days=1):anchor_date]
    if sample_engineered_df.empty:
        # A single falsy value lets callers skip the date with `if not ...`;
        # the previous (None, None, None) tuple was truthy.
        return None

    train_template = load_yaml_template(yaml_file)

    # Update context with historical data, using the caller-supplied persona.
    context = update_historical_data_context(
        engineered_df=sample_engineered_df.head(1),
        persona=persona,
        HIGH_RISK_PROFILE=risk_profile,
        HIGH_OBJECTIVES=objectives,
        Last_LLM_Strat=prev_trade_decision,
        Last_LLM_Strat_Returns=prev_trade_returns,
        Last_LLM_Strat_Days = prev_trade_days,
        Peak_Returns=Peak_Returns,
        Trough_Returns=Trough_Returns,
        expert_df=expert_df,
    )
    filled_template = fill_yaml_template(context, train_template)
    context_yaml = yaml.dump(filled_template, default_flow_style=True, allow_unicode=True)

    trade_strategy_response = train_trade_strategy(1, context_yaml, persona, client, model)
    # The trade action is taken from the expert label, not from the LLM output.
    trade_strategy_response["trade_action"] = 1 if context['Expert_Action'] == "Long" else 0

    return trade_strategy_response

# Generate the explanation for a single anchor date inside the sample window.
strategy = generate_trade_prompt(engineered_df=engineered_sample_df,
                                anchor_date=pd.to_datetime("20190612", utc=True),
                                risk_profile=HIGH_RISK_PROFILE,
                                objectives=HIGH_OBJECTIVES,
                                persona=PERSONA,
                                client=OPENAI_CLIENT,
                                model=OPENAI_MODEL,
                                expert_df=expert_sample_df)
pprint(strategy)

# Prompt Training

In [None]:
def llm_trades(engineered_df, risk_profile, objectives, persona, client, model, prompt_frequency='Monthly', expert_df=None):
    """Prompt the LLM on the first business day of each period and annotate the data.

    For every index date equal to the first business day of its month
    (``'Monthly'``) or week (``'Weekly'``), ``generate_trade_prompt`` is called
    and its result is written to that row and all later rows, so each decision
    holds until the next prompt overwrites it.

    Args:
        engineered_df: Date-indexed engineered data. It is copied; the caller's
            frame is left untouched.
        risk_profile: Forwarded to ``generate_trade_prompt``.
        objectives: Forwarded to ``generate_trade_prompt``.
        persona: Forwarded to ``generate_trade_prompt``.
        client: OpenAI client forwarded to ``generate_trade_prompt``.
        model: Model name forwarded to ``generate_trade_prompt``.
        prompt_frequency: ``'Weekly'`` or ``'Monthly'``.
        expert_df: Expert trades forwarded to ``generate_trade_prompt``.

    Returns:
        The annotated copy of ``engineered_df``.

    Raises:
        ValueError: If ``prompt_frequency`` is not ``'Weekly'`` or ``'Monthly'``.
    """
    if prompt_frequency not in ['Weekly', 'Monthly']:
        raise ValueError("Invalid prompt_frequency. Choose 'Weekly' or 'Monthly'.")

    # Work on a private copy: callers pass slices of a larger frame, and writing
    # into those would modify (or silently fail to modify) the parent.
    engineered_df = engineered_df.copy()

    columns_to_fill = [
        'strategy', 'strategy_probas', 'explanation',
        'long_conf_score', 'short_conf_score', 'long_token_proba', 'short_token_proba',
        'perplexity', 'news_factors',
        'tokens_meta_strat', 'tokens_meta_news', 'tokens_meta_proba', 'entropy'
    ]
    for column in columns_to_fill:
        engineered_df[column] = np.nan

    previous_year = None
    previous_month = None
    previous_week = None
    for date in tqdm(engineered_df.index, desc="Generating strategies..."):
        if prompt_frequency == 'Monthly':
            already_prompted = previous_month == date.month and previous_year == date.year
            period_start = date.replace(day=1)
        else:
            already_prompted = previous_week == date.isocalendar()[1] and previous_year == date.year
            # Monday of the current week. DateOffset keeps the timezone, whereas
            # to_period('W').start_time is naive and never equals a tz-aware date.
            period_start = date - pd.DateOffset(days=date.weekday())

        if already_prompted:
            continue

        first_business_day = pd.date_range(period_start, periods=1, freq='B')[0]
        # NOTE(review): if that business day has no row in the data (for
        # example a market holiday), no prompt is issued for the period.
        if date != first_business_day:
            continue

        previous_year = date.year
        if prompt_frequency == 'Monthly':
            previous_month = date.month
        else:
            previous_week = date.isocalendar()[1]

        trade_decision = generate_trade_prompt(
            engineered_df=engineered_df,
            anchor_date=date,
            risk_profile=risk_profile,
            objectives=objectives,
            persona=persona,
            client=client,
            model=model,
            prev_trade_decision=None,
            prev_trade_returns=None,
            prev_trade_days=None,
            Peak_Returns=None,
            Trough_Returns=None,
            expert_df=expert_df
        )

        # Anything that is not a populated decision dict means "no decision".
        if not trade_decision or not isinstance(trade_decision, dict):
            continue

        # The decision applies from this date onwards until the next prompt.
        engineered_df.loc[date:, 'trade_action'] = trade_decision['trade_action']
        engineered_df.loc[date:, 'evaluation_iteration'] = 1
        engineered_df.loc[date:, 'evaluation_score'] = None
        engineered_df.loc[date:, 'acceptance_rate'] = 1
        engineered_df.loc[date:, 'features'] = ','.join(trade_decision['features'])
        engineered_df.loc[date:, 'explanation'] = trade_decision['explanation']
        engineered_df.loc[date:, 'perplexity'] = trade_decision['perplexity']
        engineered_df.loc[date:, 'entropy'] = trade_decision['entropy']
        engineered_df.loc[date:, 'long_token_proba'] = trade_decision['long_token_proba']
        engineered_df.loc[date:, 'short_token_proba'] = trade_decision['short_token_proba']
        # A dict cannot be broadcast by .loc, so repeat it once per remaining row.
        remaining_index = engineered_df.loc[date:].index
        engineered_df.loc[date:, 'tokens_meta_strat'] = pd.Series(
            [trade_decision["tokens_meta"]] * len(remaining_index), index=remaining_index
        )

    # Assign the filled frame back: calling bfill/ffill with inplace=True on a
    # column selection only changes a temporary copy.
    engineered_df[columns_to_fill] = engineered_df[columns_to_fill].bfill().ffill()

    return engineered_df

# Run the prompting loop over the sample window with the default (monthly) frequency.
llm_decisions_df = llm_trades(
    engineered_df=engineered_sample_df,
    risk_profile=HIGH_RISK_PROFILE,
    objectives=HIGH_OBJECTIVES,
    persona=PERSONA,
    client=OPENAI_CLIENT,
    model=OPENAI_MODEL,
    expert_df=expert_sample_df
)

llm_decisions_df.tail(1)

In [None]:
# Show each distinct explanation produced for this window.
pprint(llm_decisions_df['explanation'].unique())

In [None]:
# Score the annotated decisions and plot the resulting trades.
# NOTE(review): evaluate_trading_metrics and plot_llm_trade are not defined in the
# visible cells; presumably they come from the utils star-imports.
llm_trading_metrics, llm_trades_df = evaluate_trading_metrics(llm_decisions_df)
plot_llm_trade(llm_trades_df)

llm_trading_metrics

In [None]:
# Second sample window: January through February 2019.
sample_start_date = pd.Timestamp("2019-01-01 00:00:00+01:00")
sample_end_date = pd.Timestamp("2019-03-01 00:00:00+01:00")

engineered_sample_df = stock_aug_data.loc[sample_start_date:sample_end_date]
expert_sample_df = expert_df.loc[sample_start_date:sample_end_date]

# NOTE(review): the full expert_df is passed here, not the expert_sample_df slice
# computed above — confirm this is intended.
llm_decisions_df = llm_trades(
    engineered_df=engineered_sample_df,
    risk_profile=HIGH_RISK_PROFILE,
    objectives=HIGH_OBJECTIVES,
    persona=PERSONA,
    client=OPENAI_CLIENT,
    model=OPENAI_MODEL,
    expert_df=expert_df
)

In [None]:
# Show each distinct explanation produced for this window.
pprint(llm_decisions_df['explanation'].unique())

In [None]:
# Third sample window: April through May 2019.
sample_start_date = pd.Timestamp("2019-04-01 00:00:00+01:00")
sample_end_date = pd.Timestamp("2019-05-30 00:00:00+01:00")

engineered_sample_df = stock_aug_data.loc[sample_start_date:sample_end_date]
expert_sample_df = expert_df.loc[sample_start_date:sample_end_date]

# NOTE(review): the full expert_df is passed here, not the expert_sample_df slice
# computed above — confirm this is intended.
llm_decisions_df = llm_trades(
    engineered_df=engineered_sample_df,
    risk_profile=HIGH_RISK_PROFILE,
    objectives=HIGH_OBJECTIVES,
    persona=PERSONA,
    client=OPENAI_CLIENT,
    model=OPENAI_MODEL,
    expert_df=expert_df
)


In [None]:
# Show each distinct explanation produced for this window.
pprint(llm_decisions_df['explanation'].unique())

## Prompt Mining

In [None]:

def generate_sample_dates(dataframe, num_samples=15):
    """Return ``num_samples`` evenly spaced UTC timestamps spanning the frame's index."""
    index_utc = pd.to_datetime(dataframe.index, utc=True)
    first_stamp = index_utc.min()
    last_stamp = index_utc.max()
    return pd.date_range(start=first_stamp, end=last_stamp, periods=num_samples)

def generate_random_sample_dates(dataframe, num_samples=15, random_state=None):
    """Draw distinct random timestamps from the frame's index, converted to UTC.

    Args:
        dataframe: Frame whose index holds the candidate dates.
        num_samples: Number of dates requested. When the index holds fewer
            rows, every row is returned instead of raising.
        random_state: Optional seed for a reproducible draw. When ``None`` the
            global NumPy random state is used.

    Returns:
        A UTC ``DatetimeIndex`` of the sampled dates, in draw order; empty when
        the index is empty or ``num_samples`` is not positive.
    """
    timestamps = pd.to_datetime(dataframe.index, utc=True)
    sample_size = min(num_samples, len(timestamps))
    if sample_size <= 0:
        return pd.DatetimeIndex([], tz="UTC")

    # Sample positions rather than values so the result stays a tz-aware index.
    if random_state is None:
        positions = np.random.choice(len(timestamps), size=sample_size, replace=False)
    else:
        rng = np.random.default_rng(random_state)
        positions = rng.choice(len(timestamps), size=sample_size, replace=False)
    return timestamps[positions]

def process_samples_for_ticker(
    ticker, engineered_df, expert_df, risk_profile, objectives,
    persona, client, model, num_samples=5, prompt_frequency='Monthly'
):
    """Mine explanation/feature pairs for one ticker around random anchor dates.

    For each sampled anchor date a window of the engineered data is passed to
    ``llm_trades``; the explanation and features found on the last row of the
    result become one corpus entry.

    Args:
        ticker: Ticker symbol recorded on every entry.
        engineered_df: Date-indexed engineered data for the ticker.
        expert_df: Expert trades forwarded to ``llm_trades``.
        risk_profile: Forwarded to ``llm_trades``.
        objectives: Forwarded to ``llm_trades``.
        persona: Forwarded to ``llm_trades``.
        client: OpenAI client forwarded to ``llm_trades``.
        model: Model name forwarded to ``llm_trades``.
        num_samples: Number of anchor dates to draw.
        prompt_frequency: ``'Weekly'`` or ``'Monthly'``; also selects the window size.

    Returns:
        List of dicts with the keys ``ticker``, ``anchor_date``, ``context``,
        ``explanation`` and ``features``.

    Raises:
        ValueError: If ``prompt_frequency`` is not ``'Weekly'`` or ``'Monthly'``.
    """
    # (days before, days after) the anchor date for each prompting frequency.
    window_days = {'Monthly': (7, 37), 'Weekly': (1, 8)}
    if prompt_frequency not in window_days:
        raise ValueError("Invalid prompt_frequency. Choose 'Weekly' or 'Monthly'.")
    days_before, days_after = window_days[prompt_frequency]

    sample_dates = generate_random_sample_dates(engineered_df, num_samples)
    corpus = []

    for anchor_date in tqdm(sample_dates, desc=f"Processing {ticker}", leave=False):
        start_date = anchor_date - pd.Timedelta(days=days_before)
        end_date = anchor_date + pd.Timedelta(days=days_after)

        sliced_df = engineered_df.loc[start_date:end_date].copy()
        if anchor_date not in sliced_df.index:
            continue

        try:
            trade_strategy = llm_trades(
                engineered_df=sliced_df,
                risk_profile=risk_profile,
                objectives=objectives,
                persona=persona,
                client=client,
                model=model,
                prompt_frequency=prompt_frequency,
                expert_df=expert_df
            )
            # 'features' only exists once at least one prompt was issued.
            if trade_strategy is None or trade_strategy.empty or 'features' not in trade_strategy.columns:
                logging.info(f"No strategy generated for {ticker} around {anchor_date}; skipping.")
                continue

            last_row = trade_strategy.iloc[-1]
            explanation = last_row['explanation']
            features = last_row['features']
            # Missing values would end up as NaN in the YAML corpus and break
            # the downstream text processing, so such samples are dropped.
            if not isinstance(explanation, str) or not isinstance(features, str):
                logging.info(f"Incomplete strategy for {ticker} around {anchor_date}; skipping.")
                continue

            corpus.append({
                "ticker": ticker,
                "anchor_date": str(anchor_date),
                # NOTE(review): context_yaml is a notebook-level global built in an
                # earlier cell, not the context of this sample; llm_trades does not
                # return its prompt context, so every entry stores the same value.
                "context": context_yaml,
                "explanation": explanation,
                "features": features,
            })
        except Exception:
            # Best effort: one failing sample must not abort the whole mining run.
            logging.exception(f"Failed to build corpus entry for {ticker} at {anchor_date}")

    return corpus


def process_multiple_tickers(tickers,
                                risk_profile,
                                objectives,
                                persona,
                                client,
                                model,
                                num_samples=15,
                                high_risk=True,
                                startingDate=START_DATE,
                                endingDate=END_DATE):
    """Build the explanation corpus across several tickers.

    For each ticker the engineered parquet file under ``HISTORIC_PATH`` is
    loaded, expert trades are derived from that ticker's own data, both frames
    are restricted to ``[startingDate, endingDate]`` and handed to
    ``process_samples_for_ticker``.

    Args:
        tickers: Iterable of ticker symbols.
        risk_profile: Forwarded to ``process_samples_for_ticker``.
        objectives: Forwarded to ``process_samples_for_ticker``.
        persona: Forwarded to ``process_samples_for_ticker``.
        client: OpenAI client forwarded to ``process_samples_for_ticker``.
        model: Model name forwarded to ``process_samples_for_ticker``.
        num_samples: Number of anchor dates sampled per ticker.
        high_risk: Forwarded to ``expert_trades``.
        startingDate: Inclusive start of the date range (parsed as UTC).
        endingDate: Inclusive end of the date range (parsed as UTC).

    Returns:
        The concatenated list of corpus entries for all tickers.
    """
    start_date = pd.to_datetime(startingDate, utc=True)
    end_date = pd.to_datetime(endingDate, utc=True)

    full_corpus = []
    for ticker in tqdm(tickers):
        print(f"Processing Ticker: {ticker}")

        input_file = f"{HISTORIC_PATH}/engineered_{ticker}_data.parquet"
        engineered_df = pd.read_parquet(input_file)
        engineered_df.set_index('Date', inplace=True)
        # Derive the expert trades from this ticker's data; the notebook-level
        # stock_aug_data frame only holds the TARGET ticker.
        expert_df = expert_trades(engineered_df.copy(), high_risk=high_risk)

        engineered_df = engineered_df.loc[start_date:end_date]
        expert_df = expert_df.loc[start_date:end_date]

        ticker_corpus = process_samples_for_ticker(
            ticker=ticker,
            engineered_df=engineered_df,
            expert_df=expert_df,
            risk_profile=risk_profile,
            objectives=objectives,
            persona=persona,
            client=client,
            model=model,
            num_samples=num_samples
        )
        full_corpus.extend(ticker_corpus)
    return full_corpus

# Reuse the high-risk corpus from disk when present; otherwise mine it with the
# LLM and save it, so repeated notebook runs do not repeat the API calls.
corpus_file = "./risk_trade_strategy_corpus.yaml"
if os.path.exists(corpus_file):
    print(f"Corpus file '{corpus_file}' found. Reloading...")
    with open(corpus_file, "r") as f:
        corpus = yaml.safe_load(f)
else:
    print("Corpus file not found. Generating new corpus...")
    OPENAI_CLIENT = OpenAI(api_key=OPENAI_API_KEY)
    corpus = process_multiple_tickers(
        tickers= TICKERS,
        risk_profile=HIGH_RISK_PROFILE,
        objectives=HIGH_OBJECTIVES,
        persona=PERSONA,
        client=OPENAI_CLIENT,
        model=OPENAI_MODEL,
        num_samples=PROMPT_SAMPLES,
    )
    with open(corpus_file, "w") as f:
        yaml.dump(corpus, f, default_flow_style=False, allow_unicode=True, indent=2)
    print("Corpus saved successfully.")

print("Corpus ready for use.")

# Prompt Generation Through Writer-Judge


### Exemplars Selection

We need to find the most similar explanations. Since we are already using GPT-4, `text-embedding-3-small` or `text-embedding-ada-002` is a good choice of embedding model.

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

def load_corpus(file_path):
    """Parse the YAML corpus stored at ``file_path`` and return its contents."""
    with open(file_path, "r") as corpus_handle:
        parsed_corpus = yaml.safe_load(corpus_handle)
    return parsed_corpus

def get_or_compute_embeddings(explanations, client, model="text-embedding-3-small", cache_path="embeddings_cache.pkl"):
    """Return one embedding per explanation, using an on-disk cache when it matches.

    Args:
        explanations: List of texts to embed.
        client: OpenAI client exposing ``embeddings.create``.
        model: Embedding model name.
        cache_path: Pickle file used to store and reload the embeddings.

    Returns:
        List of embedding vectors, in the order of ``explanations``.
    """
    if os.path.exists(cache_path):
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # cache_path at files written by this notebook.
        with open(cache_path, "rb") as f:
            cached = pickle.load(f)
        # A cache written for a different corpus would misalign embeddings and
        # explanations, so it is only reused when the sizes agree.
        if len(cached) == len(explanations):
            return cached
        logging.warning(
            f"Ignoring stale embedding cache '{cache_path}': "
            f"{len(cached)} cached vs {len(explanations)} requested."
        )

    embeddings = []
    # The API is called in batches of 100 texts.
    for i in tqdm(range(0, len(explanations), 100), desc="Embedding"):
        batch = explanations[i:i+100]
        response = client.embeddings.create(input=batch, model=model)
        embeddings.extend(e.embedding for e in response.data)

    with open(cache_path, "wb") as f:
        pickle.dump(embeddings, f)

    return embeddings
def select_diverse_relevant_exemplars(explanations, embeddings, num_exemplars=10, seed_explanation=None):
    """Pick one representative explanation per K-Means cluster of the embeddings.

    Within each cluster the chosen explanation is the one whose embedding has
    the highest cosine similarity to the seed explanation's embedding when
    ``seed_explanation`` is given, otherwise to the cluster centroid.

    Args:
        explanations: Texts aligned index-by-index with ``embeddings``.
        embeddings: Embedding vectors of the explanations.
        num_exemplars: Desired number of exemplars; capped at the number of
            embeddings because K-Means cannot form more clusters than samples.
        seed_explanation: Optional explanation used as the similarity reference.

    Returns:
        List of selected explanations, at most ``num_exemplars`` long.

    Raises:
        ValueError: If ``seed_explanation`` is given but not in ``explanations``.
    """
    n_clusters = min(num_exemplars, len(embeddings))
    if n_clusters <= 0:
        return []

    # The seed embedding does not depend on the cluster, so look it up once.
    seed_embed = None
    if seed_explanation:
        seed_embed = embeddings[explanations.index(seed_explanation)]

    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(embeddings)
    cluster_labels = kmeans.labels_
    centroids = kmeans.cluster_centers_

    selected = []
    for i in range(n_clusters):
        cluster_indices = np.where(cluster_labels == i)[0]
        if cluster_indices.size == 0:
            # Can happen when there are fewer distinct points than clusters.
            continue
        cluster_embeds = [embeddings[j] for j in cluster_indices]

        reference = seed_embed if seed_embed is not None else centroids[i]
        sims = cosine_similarity([reference], cluster_embeds)[0]

        best_idx = cluster_indices[np.argmax(sims)]
        selected.append(explanations[best_idx])

    return selected


# Reload the high-risk corpus, embed its explanations and pick ten exemplars.
corpus = load_corpus("./risk_trade_strategy_corpus.yaml")
explanations = [entry["explanation"] for entry in corpus]
embeddings = get_or_compute_embeddings(explanations, OPENAI_CLIENT)
exemplars = select_diverse_relevant_exemplars(explanations, embeddings, num_exemplars=10)

pprint(exemplars)

In [None]:
def extract_selected_features(corpus, max_features=10):
    """Select the most frequently cited features, with one per required group first.

    Each corpus entry carries a comma-separated ``features`` string. The most
    common feature of every required group (matched by name prefix) is
    selected first; the remaining slots up to ``max_features`` are filled with
    the overall most common features not yet selected.

    Args:
        corpus: Iterable of dicts with a ``features`` entry.
        max_features: Target size of the selection. The per-group picks are
            always kept, even if they alone exceed this number.

    Returns:
        List of selected feature names.
    """
    from collections import Counter

    features = []
    for entry in corpus:
        raw_features = entry.get("features")
        # Entries without a usable feature string (for example NaN loaded from
        # the YAML corpus) are skipped instead of raising on `.split`.
        if not isinstance(raw_features, str):
            continue
        for name in raw_features.split(","):
            name = name.strip()
            if name:
                features.append(name)

    required_groups = [
        "Stock_Data", "Options_Data", "Macro_Data",
        "Economic_Data", "Fundamental_Data", "Technical_Analysis"
    ]
    selected = []
    used = set()
    for group in required_groups:
        group_feats = [f for f in features if f.startswith(group)]
        if group_feats:
            top_feat = Counter(group_feats).most_common(1)[0][0]
            selected.append(top_feat)
            used.add(top_feat)

    # Rank all features so that max_features is the only limit on the result.
    remaining = [f for f, _ in Counter(features).most_common() if f not in used]
    selected += remaining[:max(0, max_features - len(selected))]
    return selected

# Select and display the features cited most often across the corpus.
features = extract_selected_features(corpus)
pprint(features)

# High-Risk Strategies EDA

In [None]:
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK resources used by clean_text (stop-word list and WordNet).
nltk.download("stopwords")
nltk.download("wordnet")

# Number of words/n-grams kept in frequency tables, and the TF-IDF vocabulary size.
WORD_COUNT = 50
TOP_FEATURES = 50

def clean_text(text, custom_stop_words=None):
    """Normalize a text: lowercase, letters only, stop words removed, lemmatized.

    Args:
        text: Raw text.
        custom_stop_words: Optional extra words to drop in addition to the
            NLTK English stop-word list.

    Returns:
        The remaining lemmatized words joined by single spaces.
    """
    stop_words = set(stopwords.words("english"))
    if custom_stop_words:
        stop_words.update(custom_stop_words)
    lemmatizer = WordNetLemmatizer()

    # Keep only letters and whitespace; str.split() collapses whitespace runs.
    letters_only = re.sub(r"[^a-zA-Z\s]", "", text.lower())

    # Drop stop words before lemmatizing: the noun lemmatizer turns words such
    # as "was" or "has" into "wa" / "ha", which would then slip past the filter.
    content_words = [word for word in letters_only.split() if word not in stop_words]
    lemmas = [lemmatizer.lemmatize(word) for word in content_words]
    # Filter again so words whose lemma is a stop word are removed as well.
    return " ".join(lemma for lemma in lemmas if lemma not in stop_words)

def clean_text_data(text_list, custom_stop_words=None):
    """Run ``clean_text`` over every text and return the cleaned texts in order."""
    cleaned_texts = []
    for raw_text in text_list:
        cleaned_texts.append(clean_text(raw_text, custom_stop_words=custom_stop_words))
    return cleaned_texts


def extract_corpus_data(corpus):
    """Split the corpus into its explanations and a flat list of feature names.

    Each entry's comma-separated ``features`` string is split and every name
    is stripped of surrounding whitespace.
    """
    explanations = []
    features = []
    for entry in corpus:
        explanations.append(entry["explanation"])
    for entry in corpus:
        for feature_name in entry["features"].split(","):
            features.append(feature_name.strip())
    return explanations, features

def compute_tfidf(data, max_features=TOP_FEATURES):
    """Return a document-by-term DataFrame of TF-IDF scores for ``data``.

    The vocabulary is limited to ``max_features`` terms; the columns are the
    vectorizer's feature names.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    dense_scores = vectorizer.fit_transform(data).toarray()
    vocabulary = vectorizer.get_feature_names_out()
    return pd.DataFrame(dense_scores, columns=vocabulary)

def compute_word_frequencies(text_data):
    """Count whitespace-separated words across all texts and tabulate the top WORD_COUNT."""
    all_words = " ".join(text_data).split()
    top_words = Counter(all_words).most_common(WORD_COUNT)
    return pd.DataFrame(top_words, columns=["Word", "Count"])

def compute_ngrams(text_data, ngram_range=(2, 3), top_n=WORD_COUNT, custom_stop_words=None):
    """Return the ``top_n`` most frequent n-grams as a Series of total counts.

    The Series is indexed by n-gram and sorted by count in descending order.
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words=custom_stop_words)
    dense_counts = vectorizer.fit_transform(text_data).toarray()
    per_document = pd.DataFrame(dense_counts, columns=vectorizer.get_feature_names_out())
    totals = per_document.sum(axis=0)
    ranked = totals.sort_values(ascending=False)
    return ranked.head(top_n)

def save_and_plot(fig, file_prefix, title):
    """Write the figure to ./images as a PNG, show it, then close it."""
    underscored_title = title.replace(' ', '_')
    target_file = f"./images/{file_prefix}_{underscored_title}.png"
    os.makedirs("./images", exist_ok=True)
    fig.savefig(target_file)
    plt.show()
    plt.close(fig)

def generate_wordcloud(data, title, file_prefix):
    """Render a word cloud of all texts in ``data`` and hand it to ``save_and_plot``."""
    combined_text = " ".join(data)
    cloud_image = WordCloud(width=800, height=400).generate(combined_text)
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.imshow(cloud_image, interpolation="bilinear")
    ax.axis("off")
    ax.set_title(title)
    save_and_plot(fig, file_prefix, title)

def plot_top_features(features, title, xlabel, ylabel, file_prefix):
    """Plot the top-quartile items of ``features`` as a horizontal bar chart.

    ``features`` is a collection of (label, score) pairs. Labels starting
    with 'Next_' are dropped, and only rows whose score reaches the 75th
    percentile of the remaining scores are drawn.
    """
    table = pd.DataFrame(features, columns=[xlabel, ylabel])
    starts_with_next = table[xlabel].str.startswith('Next_')
    table = table[~starts_with_next]

    # Keep the rows at or above the 75th-percentile score.
    cutoff = table[ylabel].quantile(0.75)
    shown = table[table[ylabel] >= cutoff]

    # Grow the figure with the number of bars, never below 6 inches tall.
    figure_height = max(6, 0.3 * len(shown))
    fig, ax = plt.subplots(figsize=(10, figure_height))
    ax.barh(shown[xlabel], shown[ylabel], color="skyblue")
    ax.set_xlabel(ylabel)
    ax.set_ylabel(xlabel)
    ax.set_title(title)
    ax.invert_yaxis()
    fig.tight_layout()
    save_and_plot(fig, file_prefix, title)


def analyze_corpus_with_ngrams(file_path, file_prefix, overused_words=None):
    """Run the text EDA on a corpus file and save/show the resulting figures.

    Produces, in order: a word cloud of the cleaned explanations, a bar chart
    of feature counts, a bar chart of mean TF-IDF scores, and bar charts of
    the most frequent bigrams and trigrams.

    Args:
        file_path: Path of the YAML corpus to analyze.
        file_prefix: Prefix of the image files written by ``save_and_plot``.
        overused_words: Extra stop words; a built-in list is used when ``None``.
    """
    if overused_words is None:
        # Default list of words and phrases treated as stop words in the analysis.
        overused_words = ["stock", "potential", "suggesting", "suggests", "indicate", "indicates",
                          "explanation", "market", "gains", "quick", "expert", "objective", "feature",
                          "refined feature", "feature set", "refined set", "set emphasizes", "set",
                          "metrics", "metrics", "assess", "future prompt", "short", "long"]
    corpus = load_corpus(file_path)
    explanations, features = extract_corpus_data(corpus)
    clean_explanations = clean_text_data(explanations, custom_stop_words=overused_words)
    tfidf_scores = compute_tfidf(clean_explanations)
    feature_counts = Counter(features)
    explanation_bigrams = compute_ngrams(clean_explanations, ngram_range=(2, 2), custom_stop_words=overused_words)
    explanation_trigrams = compute_ngrams(clean_explanations, ngram_range=(3, 3), custom_stop_words=overused_words)
    generate_wordcloud(clean_explanations, "Explanation WordCloud", file_prefix)
    plot_top_features(feature_counts.most_common(WORD_COUNT), "Top Features", "Feature", "Count", file_prefix)
    # Rank words by their mean TF-IDF score across all explanations.
    plot_top_features(tfidf_scores.mean(axis=0).sort_values(ascending=False).head(WORD_COUNT).items(),
                      "Top Words in Explanations (TF-IDF)", "Word", "TF-IDF Score", file_prefix)

    for ngrams, title, color in [
        (explanation_bigrams, "Top Bigrams in Explanations", "purple"),
        (explanation_trigrams, "Top Trigrams in Explanations", "orange"),
    ]:
        # Convert to DataFrame
        ngram_df = pd.DataFrame({ "ngram": ngrams.index, "freq": ngrams.values })

        # Apply top quartile filter
        threshold = ngram_df["freq"].quantile(0.75)
        top_ngram_df = ngram_df[ngram_df["freq"] >= threshold]

        # Figure height grows with the number of bars, with a minimum of 6.
        fig, ax = plt.subplots(figsize=(10, max(6, 0.3 * len(top_ngram_df))))
        ax.barh(top_ngram_df["ngram"], top_ngram_df["freq"], color=color)
        ax.set_xlabel("Frequency")
        ax.set_ylabel("N-grams")
        ax.set_title(title)
        ax.invert_yaxis()
        fig.tight_layout()
        save_and_plot(fig, file_prefix, title)


# EDA of the corpus file currently bound to corpus_file; images use the "risk_version1" prefix.
analyze_corpus_with_ngrams(corpus_file, "risk_version1")

# Low-Risk Strategies EDA

All of the above was done for the high-risk configuration; now we do the same for the low-risk configuration.

In [None]:
# Rebuild the expert trades with high_risk=False; this rebinds the notebook-level expert_df.
expert_df = expert_trades(stock_aug_data.copy(), high_risk=False)

print(expert_df.columns)
expert_df.tail(3)

In [None]:
# Sample window for the low-risk run: June through July 2019.
sample_start_date = pd.Timestamp("2019-06-01 00:00:00+02:00")
sample_end_date = pd.Timestamp("2019-07-30 00:00:00+02:00")

engineered_sample_df = stock_aug_data.loc[sample_start_date:sample_end_date]
expert_sample_df = expert_df.loc[sample_start_date:sample_end_date]
engineered_sample_df.tail(3)

In [None]:
# Same prompting loop as before, now with the low-risk profile and objectives.
# NOTE(review): the full expert_df is passed, not the expert_sample_df slice — confirm this is intended.
llm_decisions_df = llm_trades(
    engineered_df=engineered_sample_df,
    risk_profile=LOW_RISK_PROFILE,
    objectives=LOW_OBJECTIVES,
    persona=PERSONA,
    client=OPENAI_CLIENT,
    model=OPENAI_MODEL,
    expert_df=expert_df,
)
print(llm_decisions_df['explanation'].unique())

In [None]:

# Reuse the low-risk corpus from disk when present; otherwise mine it with the
# LLM (high_risk=False) and save it.
corpus_file = "./norisk_trade_strategy_corpus.yaml"
if os.path.exists(corpus_file):
    print(f"Corpus file '{corpus_file}' found. Reloading...")
    with open(corpus_file, "r") as f:
        corpus = yaml.safe_load(f)
else:
    print("Corpus file not found. Generating new corpus...")
    OPENAI_CLIENT = OpenAI(api_key=OPENAI_API_KEY)

    corpus = process_multiple_tickers(
        tickers=TICKERS,
        risk_profile=LOW_RISK_PROFILE,
        objectives=LOW_OBJECTIVES,
        persona=PERSONA,
        client=OPENAI_CLIENT,
        model=OPENAI_MODEL,
        high_risk=False,
        num_samples=PROMPT_SAMPLES,
    )

    with open(corpus_file, "w") as f:
        yaml.dump(corpus, f, default_flow_style=False, allow_unicode=True, indent=2)

    print("Corpus saved successfully.")

print("Corpus ready for use.")

In [None]:
# EDA of the corpus file currently bound to corpus_file; images use the "norisk_version1" prefix.
analyze_corpus_with_ngrams(corpus_file, "norisk_version1")