OpenAI LLM Recommendations

In [6]:
import pandas as pd
import numpy as np
import sys
import os
import time
import re
import openai

In [10]:
openai.api_key = os.getenv("OPENAI_API_KEY")

In [2]:
labeled_balance_sheets = pd.read_csv("data/balance_sheets_with_labels.csv", dtype={"CIK": str})
labeled_income_statements = pd.read_csv("data/income_statements_with_labels.csv", dtype={"CIK": str})
labeled_cash_flow_statements = pd.read_csv("data/cash_flow_statements_with_labels.csv", dtype={"CIK": str})
ciks = set(labeled_balance_sheets["CIK"].unique()) | set(labeled_income_statements["CIK"].unique()) | set(labeled_cash_flow_statements["CIK"].unique())
ciks = pd.DataFrame(ciks, columns=["CIK"])

  labeled_income_statements = pd.read_csv("data/income_statements_with_labels.csv", dtype={"CIK": str})
  labeled_cash_flow_statements = pd.read_csv("data/cash_flow_statements_with_labels.csv", dtype={"CIK": str})


In [3]:
# To avoid hitting rate limits, internet connection issues, or other problems, I will only partly loop over the CIKs and concatenate the results later
# Split into 10 equal parts
fraction = (len(ciks) // 10)
ciks1 = ciks[:fraction]
ciks2 = ciks[fraction:2*fraction]
ciks3 = ciks[2*fraction:3*fraction]
ciks4 = ciks[3*fraction:4*fraction]
ciks5 = ciks[4*fraction:5*fraction]
ciks6 = ciks[5*fraction:6*fraction]
ciks7 = ciks[6*fraction:7*fraction]
ciks8 = ciks[7*fraction:8*fraction]
ciks9 = ciks[8*fraction:9*fraction]
ciks10 = ciks[9*fraction:]

# Check if all CIKs are included
len(ciks) == len(ciks1) + len(ciks2) + len(ciks3) + len(ciks4) + len(ciks5) + len(ciks6) + len(ciks7) + len(ciks8) + len(ciks9) + len(ciks10)

True

In [4]:
cik_sublist = ciks10[:2] # Example sublist for testing, can be adjusted as needed
cik_sublist

Unnamed: 0,CIK
1341,31791
1342,1786352


In [11]:
def get_llm_recommendations(cik_list, labeled_balance_sheets, labeled_income_statements, labeled_cash_flow_statements):
    recommendations = []
    for i, cik in enumerate(cik_list["CIK"]):
        # Progress message
        print(f"Processing CIK {cik} ({i+1}/{len(cik_list)})")
        
        # First filter dfs for input CIK
        balance_sheets = labeled_balance_sheets[labeled_balance_sheets["CIK"] == cik].copy()
        income_statements = labeled_income_statements[labeled_income_statements["CIK"] == cik].copy()
        cash_flow_statements = labeled_cash_flow_statements[labeled_cash_flow_statements["CIK"] == cik].copy()
        
        # Convert the date columns to datetime objects
        for df in [balance_sheets, income_statements, cash_flow_statements]:
            df["Report Date"] = pd.to_datetime(df["Date"])

        # Determine unique dates
        reporting_dates = pd.concat([
            balance_sheets["Report Date"],
            income_statements["Report Date"],
            cash_flow_statements["Report Date"]
        ]).unique()

        # Sort dates just to be safe
        reporting_dates = np.sort(reporting_dates)

        # In order to handle reports, that were filed slighty apart, a window of 10 days around a given reporting date is used
        window = pd.Timedelta(days=10)

        # Loop over reporting dates to obtain LLM ratings
        for date in reporting_dates:

            # Subset all financial statements for the given date +- window days
            bs = balance_sheets[(balance_sheets["Report Date"] >= date - window) & (balance_sheets["Report Date"] <= date + window)]
            is_ = income_statements[(income_statements["Report Date"] >= date - window) & (income_statements["Report Date"] <= date + window)]
            cf = cash_flow_statements[(cash_flow_statements["Report Date"] >= date - window) & (cash_flow_statements["Report Date"] <= date + window)]
        
            # If no reports are available for the given date, skip to next date
            if bs.empty or is_.empty or cf.empty:
                continue
            
            # Concatenate reports into a string with correct labels
            bs_str = "\n".join(bs.apply(lambda row: f"{row['position_label']}: {row['STD Balance Sheet All']}", axis=1).astype(str))
            is_str = "\n".join(is_.apply(lambda row: f"{row['position_label']}: {row['STD Income Statement All']}", axis=1).astype(str))
            cf_str = "\n".join(cf.apply(lambda row: f"{row['position_label']}: {row['STD Cash Flow All']}", axis=1).astype(str))

            # Final combined input
            financials_str = f"Balance Sheet:\n{bs_str}\n\nIncome Statement:\n{is_str}\n\nCash Flow Statement:\n{cf_str}"

            response = openai.chat.completions.create(
                model="gpt-4o",  
                messages=[
                    {"role": "system", "content": "You are an experienced, data-driven financial analyst, that provides concise and clear answers."},
                    {"role": "user", "content": f"""                           
                             Based on the following financial reports only, please provide an investment recommendation for the underlying company.
                             Balance Sheet: 
                             {bs_str}

                             Income Statement: 
                             {is_str}

                             Cash Flow Statement: 
                             {cf_str}

                             Provide your answer using only one of the following signals: 'strong buy', 'buy', 'hold', 'sell', or 'strong sell'."""}
                ],
                temperature=0.0, 
            )
            recommendation = response.choices[0].message.content.strip().lower()
            print(f"Recommendation for CIK {cik} on {date.date()}: {recommendation}")
            recommendations.append({
                "CIK": cik,
                "Date": date.date(),
                "Recommendation": recommendation
            })


In [14]:
response = openai.chat.completions.create(
    model="gpt-4o",  
    messages=[
        {"role": "system", "content": "You are an experienced, data-driven financial analyst, that provides concise and clear answers."},
        {"role": "user", "content": f"What is the CAPM?"}
    ],
    temperature = 0,
)

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}