### LLM analysis of financial statements

In this notebook, I preprocess the downloaded reports as far as required in order to feed them into the LLM to obtain a buy/sell recommendation.

In [1]:
import pandas as pd
import numpy as np 
from google import genai
from google.genai import types
import json
import re
from tqdm import tqdm
import time
from google.genai.errors import ServerError  


In [2]:
# Import Gemini API key
with open("../proton_google_api_key.txt", "r") as f:
    key = f.read().strip()    

# Initialize the Gemini client with the API key
client = genai.Client(api_key = key)

In [3]:
# Read in company names to look up buy/sell recommendations for
sp400_companies = pd.read_csv("../data/sp400_companies.csv", dtype={"CIK": str})
sp500_companies = pd.read_csv("../data/sp500_companies.csv", dtype={"CIK": str})
sp600_companies = pd.read_csv("../data/sp600_companies.csv", dtype={"CIK": str})

# Combine all CIKs into a single list
sp400_ciks = sp400_companies["CIK"].tolist()
sp500_ciks = sp500_companies["CIK"].tolist()
sp600_ciks = sp600_companies["CIK"].tolist()
ciks = sp400_ciks + sp500_ciks + sp600_ciks

# Read in RICs to match with CIKs
rics = pd.read_csv("data/rics.csv", dtype={"Instrument": str})
# Ranem Instrument to CIK for easier merging
rics.rename(columns={"Instrument": "CIK"}, inplace=True)

FileNotFoundError: [Errno 2] No such file or directory: 'data/rics.csv'

- Import and further process balance sheet df

In [None]:
first_balance_sheets = pd.read_csv("../data/balance_sheets.csv", dtype={"CIK": str})
missing_balance_sheets = pd.read_csv("../data/missing_balance_sheets.csv", dtype={"CIK": str})
balance_sheets = pd.concat([first_balance_sheets, missing_balance_sheets], ignore_index=True)
# Delete entries where STD Balance Sheet All and FCC Item Name are missing
balance_sheets = balance_sheets.dropna(subset=["STD Balance Sheet All", "FCC Item Name"])
balance_sheets.head()

Unnamed: 0,Date,STD Balance Sheet All,FCC Item Name,RIC,Statement
4,2016-06-30,332000000.0,TR.F.CashSTInvst,AA.N,balance_sheet
5,2016-06-30,332000000.0,TR.F.CashCashEquiv,AA.N,balance_sheet
6,2016-06-30,74000000.0,TR.F.DerivFinInstrHedgeST,AA.N,balance_sheet
7,2016-06-30,607000000.0,TR.F.LoansRcvblNetST,AA.N,balance_sheet
8,2016-06-30,426000000.0,TR.F.TradeAcctTradeNotesRcvblNet,AA.N,balance_sheet


In [5]:
# Adjust Item Name to give a clearer indication of the position
unique_balance_sheet_positions = balance_sheets["FCC Item Name"].unique()

In this cell, I employ Gemini to return actual official balance sheet position labels for the Thomson Reuters abbreviations contained in financial statements downloaded via the LSEG Data Library for Python.

In [None]:
balance_sheet_labels = client.models.generate_content(
    model="gemini-2.5-flash",
    config=types.GenerateContentConfig(
        temperature=0.1,
        system_instruction="You are a financial analyst, that provides concise and accurate answers.",
        thinking_config=types.ThinkingConfig(thinking_budget=0),# Disables thinking
    ),
    contents=[f"""The following is a list of abbreviations used by Thomsom Reuters in their reporting database. For every given abbreviation,
              please provide the actual financial position this refers to in a balance sheet. Example: TR.F.TotAssets: Total Assets.
              Here are the abbreviations: {unique_balance_sheet_positions.tolist()} Provide your answer in a dictionary style format.
              """],  
)

In [None]:
# Regex to extract JSON from the response
# First I extract the JSON string from the response text
json_str = re.search(r'{.*}', balance_sheet_labels.text, re.DOTALL).group()
# Then I parse the JSON string into a Python dictionary
data = json.loads(json_str)
# Save the dictionary to a file
with open("../data/balance_sheet_labels.json", "w") as f:
    json.dump(data, f, indent=4)  # indent makes JSON-format more readable

In [16]:
# Finally, this dictionary can be used to map the abbreviations to the actual financial positions in the balance sheets.
balance_sheets["position_label"] = balance_sheets["FCC Item Name"].map(data)

In [17]:
# Merge on RIC to include CIK and RIC in the balance sheets
balance_sheets = balance_sheets.merge(rics, left_on="RIC", right_on="RIC", how="left")
balance_sheets.head(10)

Unnamed: 0,Date,STD Balance Sheet All,FCC Item Name,RIC,Statement,position_label,CIK
0,2016-06-30,332000000.0,TR.F.CashSTInvst,AA.N,balance_sheet,Cash and Short-Term Investments,1675149
1,2016-06-30,332000000.0,TR.F.CashCashEquiv,AA.N,balance_sheet,Cash and Cash Equivalents,1675149
2,2016-06-30,74000000.0,TR.F.DerivFinInstrHedgeST,AA.N,balance_sheet,"Derivative Financial Instruments, Hedging, Sho...",1675149
3,2016-06-30,607000000.0,TR.F.LoansRcvblNetST,AA.N,balance_sheet,"Loans and Receivables, Net, Short-Term",1675149
4,2016-06-30,426000000.0,TR.F.TradeAcctTradeNotesRcvblNet,AA.N,balance_sheet,"Trade Accounts and Trade Notes Receivable, Net",1675149
5,2016-06-30,181000000.0,TR.F.RcvblOthTot,AA.N,balance_sheet,"Receivables, Other, Total",1675149
6,2016-06-30,1166000000.0,TR.F.InvntTot,AA.N,balance_sheet,"Inventory, Total",1675149
7,2016-06-30,835000000.0,TR.F.InvntRawMaterial,AA.N,balance_sheet,"Inventory, Raw Material",1675149
8,2016-06-30,176000000.0,TR.F.InvntWorkInProg,AA.N,balance_sheet,"Inventory, Work In Progress",1675149
9,2016-06-30,155000000.0,TR.F.InvntFinGoods,AA.N,balance_sheet,"Inventory, Finished Goods",1675149


In [None]:
# Save to CSV
balance_sheets.to_csv("../data/balance_sheets_with_labels.csv", index=False)

---
- Import and further process Cash Flow statement df

In [None]:
first_cash_flow_statements = pd.read_csv("../data/cash_flow_statements.csv", dtype={"CIK": str})
missing_cash_flow_statements = pd.read_csv("../data/missing_cash_flow_statements.csv", dtype={"CIK": str})
cash_flow_statements = pd.concat([first_cash_flow_statements, missing_cash_flow_statements], ignore_index=True)
cash_flow_statements.head()

Unnamed: 0,Date,STD Cash Flow All,FCC Item Name,RIC,statement,Statement
0,2011-09-30,,,AA.N,cashflow,
1,2012-09-30,,,AA.N,cashflow,
2,2013-09-30,,,AA.N,cashflow,
3,2014-09-30,,,AA.N,cashflow,
4,2016-06-30,-19000000.0,TR.F.ProfLossStartingLineCF,AA.N,cashflow,


In [5]:
# Fix wrong column names
cash_flow_statements = cash_flow_statements.drop(columns = "Statement")
cash_flow_statements.rename(columns={"statement": "Statement"}, inplace=True)
# Delete entries where both STD Cash Flow All and FCC Item Name is NaN
cash_flow_statements = cash_flow_statements.dropna(subset=["STD Cash Flow All", "FCC Item Name"], how="all")
cash_flow_statements.head()

Unnamed: 0,Date,STD Cash Flow All,FCC Item Name,RIC,Statement
4,2016-06-30,-19000000.0,TR.F.ProfLossStartingLineCF,AA.N,cashflow
5,2016-06-30,245000000.0,TR.F.NonCashItemsReconcAdjCF,AA.N,cashflow
6,2016-06-30,16000000.0,TR.F.EqIncLossInNetEarnCF,AA.N,cashflow
7,2016-06-30,103000000.0,TR.F.OthNonCashItemsReconcAdjCF,AA.N,cashflow
8,2016-06-30,178000000.0,TR.F.DeprDeplAmortInclImpairCF,AA.N,cashflow


In [6]:
# Get unique cash flow positions
unique_cash_flow_positions = cash_flow_statements["FCC Item Name"].unique()

- As before, Gemini is employed to infer the correct cash flow labels from the FCC Item Name Codes

In [9]:
cash_flow_labels = client.models.generate_content(
    model="gemini-2.5-flash",
    config=types.GenerateContentConfig(
        temperature=0.1,
        system_instruction="You are a financial analyst, that provides concise and accurate answers.",
        thinking_config=types.ThinkingConfig(thinking_budget=0),# Disables thinking
    ),
    contents=[f"""The following is a list of abbreviations used by Thomsom Reuters in their reporting database. For every given abbreviation,
              please provide the actual financial position this refers to in a cash flow statement. Example: TR.F.TotAssets: Total Assets.
              Here are the abbreviations: {unique_cash_flow_positions.tolist()} Provide your answer in a dictionary style format.
              """],  
)

In [10]:
# Regex to extract JSON from the response
# First I extract the JSON string from the response text
json_str = re.search(r'{.*}', cash_flow_labels.text, re.DOTALL).group()
# Then I parse the JSON string into a Python dictionary
data = json.loads(json_str)
# Save the dictionary to a file
with open("data/cash_flow_labels.json", "w") as f:
    json.dump(data, f, indent=4)  

In [11]:
# Map dictionary to cash flow statements
cash_flow_statements["position_label"] = cash_flow_statements["FCC Item Name"].map(data)

In [12]:
# Merge on RIC to include CIK and RIC in the cash flow statements
cash_flow_statements = cash_flow_statements.merge(rics, left_on="RIC", right_on="RIC", how="left")
cash_flow_statements.head(10)

Unnamed: 0,Date,STD Cash Flow All,FCC Item Name,RIC,Statement,position_label,CIK
0,2016-06-30,-19000000.0,TR.F.ProfLossStartingLineCF,AA.N,cashflow,Profit/Loss Starting Line (Cash Flow),1675149
1,2016-06-30,245000000.0,TR.F.NonCashItemsReconcAdjCF,AA.N,cashflow,Non-Cash Items Reconciliation Adjustments (Cas...,1675149
2,2016-06-30,16000000.0,TR.F.EqIncLossInNetEarnCF,AA.N,cashflow,Equity Income/Loss in Net Earnings (Cash Flow),1675149
3,2016-06-30,103000000.0,TR.F.OthNonCashItemsReconcAdjCF,AA.N,cashflow,Other Non-Cash Items Reconciliation Adjustment...,1675149
4,2016-06-30,178000000.0,TR.F.DeprDeplAmortInclImpairCF,AA.N,cashflow,"Depreciation, Depletion, Amortization, Includi...",1675149
5,2016-06-30,178000000.0,TR.F.DeprDeplPPECF,AA.N,cashflow,"Depreciation, Depletion of Property, Plant, an...",1675149
6,2016-06-30,-28000000.0,TR.F.DefIncTaxIncTaxCreditsCF,AA.N,cashflow,Deferred Income Tax and Income Tax Credits (Ca...,1675149
7,2016-06-30,-34000000.0,TR.F.AssetsSaleGLCF,AA.N,cashflow,Assets Sale Gain/Loss (Cash Flow),1675149
8,2016-06-30,10000000.0,TR.F.ShrBasedPaymtCF,AA.N,cashflow,Share-Based Payment (Cash Flow),1675149
9,2016-06-30,226000000.0,TR.F.CashFlowOpBefChgInWkgCap,AA.N,cashflow,Cash Flow from Operations Before Changes in Wo...,1675149


In [None]:
# Save to CSV
cash_flow_statements.to_csv("../data/cash_flow_statements_with_labels.csv", index=False)

---
- Import and further process income statements

In [None]:
first_income_statements = pd.read_csv("../data/income_statements.csv", dtype={"CIK": str})
missing_income_statements = pd.read_csv("../data/missing_income_statements.csv", dtype={"CIK": str})
income_statements = pd.concat([first_income_statements, missing_income_statements], ignore_index=True)
income_statements.head()

Unnamed: 0,Date,STD Income Statement All,FCC Item Name,RIC,statement,Statement
0,2011-09-30,,,AA.N,income_statement,
1,2012-09-30,,,AA.N,income_statement,
2,2013-09-30,,,AA.N,income_statement,
3,2014-09-30,,,AA.N,income_statement,
4,2016-06-30,2323000000.0,TR.F.RevGoodsSrvc,AA.N,income_statement,


In [23]:
# Drop observations, where both STD Income Statement All and FCC Item Name is NaN
income_statements = income_statements.dropna(subset=["STD Income Statement All", "FCC Item Name"], how="all")
# Get rid of Statement column, which is result of wrong spelling
income_statements = income_statements.drop(columns = "Statement")

In [17]:
# Unique positions in income statements
unique_income_positions = income_statements["FCC Item Name"].unique()

- One last time, Gemini is employed to map the FCC Item Name Codes to more official labels used in Cash Flow statements

In [18]:
income_statement_labels = client.models.generate_content(
    model="gemini-2.5-flash",
    config=types.GenerateContentConfig(
        temperature=0.1,
        system_instruction="You are a financial analyst, that provides concise and accurate answers.",
        thinking_config=types.ThinkingConfig(thinking_budget=0),# Disables thinking
    ),
    contents=[f"""The following is a list of abbreviations used by Thomsom Reuters in their reporting database. For every given abbreviation,
              please provide the actual financial position this refers to in an income statement. Example: TR.F.TotAssets: Total Assets.
              Here are the abbreviations: {unique_income_positions.tolist()} Provide your answer in a dictionary style format.
              """],  
)

In [None]:
# Regex to extract JSON from the response
# First I extract the JSON string from the response text
json_str = re.search(r'{.*}', income_statement_labels.text, re.DOTALL).group()
# Then I parse the JSON string into a Python dictionary
data = json.loads(json_str)
# Save the dictionary to a file
with open("../data/income_statement_labels.json", "w") as f:   
    json.dump(data, f, indent=4)

In [24]:
# Map dictionary to cash flow statements
income_statements["position_label"] = income_statements["FCC Item Name"].map(data)
income_statements.head()

Unnamed: 0,Date,STD Income Statement All,FCC Item Name,RIC,statement,position_label
4,2016-06-30,2323000000.0,TR.F.RevGoodsSrvc,AA.N,income_statement,Revenue from Goods and Services
5,2016-06-30,2323000000.0,TR.F.SalesOfGoodsSrvcNetUnclassif,AA.N,income_statement,"Sales of Goods and Services, Net, Unclassified"
6,2016-06-30,2323000000.0,TR.F.TotRevenue,AA.N,income_statement,Total Revenue
7,2016-06-30,2119000000.0,TR.F.CostOfOpRev,AA.N,income_statement,Cost of Operating Revenue
8,2016-06-30,2119000000.0,TR.F.COGSTot,AA.N,income_statement,"Cost of Goods Sold, Total"


In [25]:
# Merge on RIC to include CIK and RIC in the cash flow statements
income_statements = income_statements.merge(rics, left_on="RIC", right_on="RIC", how="left")
income_statements.head(10)

Unnamed: 0,Date,STD Income Statement All,FCC Item Name,RIC,statement,position_label,CIK
0,2016-06-30,2323000000.0,TR.F.RevGoodsSrvc,AA.N,income_statement,Revenue from Goods and Services,1675149
1,2016-06-30,2323000000.0,TR.F.SalesOfGoodsSrvcNetUnclassif,AA.N,income_statement,"Sales of Goods and Services, Net, Unclassified",1675149
2,2016-06-30,2323000000.0,TR.F.TotRevenue,AA.N,income_statement,Total Revenue,1675149
3,2016-06-30,2119000000.0,TR.F.CostOfOpRev,AA.N,income_statement,Cost of Operating Revenue,1675149
4,2016-06-30,2119000000.0,TR.F.COGSTot,AA.N,income_statement,"Cost of Goods Sold, Total",1675149
5,2016-06-30,1941000000.0,TR.F.COGSUnclassif,AA.N,income_statement,"Cost of Goods Sold, Unclassified",1675149
6,2016-06-30,178000000.0,TR.F.DeprInCOGS,AA.N,income_statement,Depreciation in Cost of Goods Sold,1675149
7,2016-06-30,204000000.0,TR.F.GrossProfIndPropTot,AA.N,income_statement,"Gross Profit, Industrial Property, Total",1675149
8,2016-06-30,97000000.0,TR.F.SGATot,AA.N,income_statement,"Selling, General and Administrative Expenses, ...",1675149
9,2016-06-30,90000000.0,TR.F.SGAUnclassif,AA.N,income_statement,"Selling, General and Administrative Expenses, ...",1675149


In [None]:
# Save to CSV
income_statements.to_csv("../data/income_statements_with_labels.csv", index=False)

---

- Function to fetch most recent company filings for a given CIK and Date

In [3]:
labeled_balance_sheets = pd.read_csv("../data/balance_sheets_with_labels.csv", dtype={"CIK": str})
labeled_income_statements = pd.read_csv("../data/income_statements_with_labels.csv", dtype={"CIK": str})
labeled_cash_flow_statements = pd.read_csv("../data/cash_flow_statements_with_labels.csv", dtype={"CIK": str})

  labeled_income_statements = pd.read_csv("../data/income_statements_with_labels.csv", dtype={"CIK": str})
  labeled_cash_flow_statements = pd.read_csv("../data/cash_flow_statements_with_labels.csv", dtype={"CIK": str})


In [4]:
def get_llm_ratings(cik: str, balance_sheets = None, income_statements = None, cash_flow_statements = None):
    
    """
    Function that returns a DataFrame with LLM ratings for a given CIK.
    For every reporting date, the function fetches the most recent financial statements, i.e.
    - Balance Sheet
    - Cash Flow Statement
    - Income Statement
    and calls the LLM to get a buy/sell/hold recommendation. In order to avoid issues with reports that were filed slightly apart,
    a window of 10 days around a given reporting date is used. This helps to ensure that the LLM has access to all relevant financial information for a given reporting date.
    Furthermore, reports that were filed slightly apart will not lead to recommendations that are based on partial information only and will also not cause multiple
    recommendations that only lie within the window of 10 days around a given reporting date.
    
    Parameters: 
    cik: str, CIK of the company (Can be looked up on the SEC website)
    balance_sheets: Balance Sheets DataFrame with columns: STD Balance Sheet All, FCC Item Name, CIK and Date (among others)
    income_statements: Income Statements DataFrame with columns: STD Income Statement All, FCC Item Name, CIK and Date (among others)
    cash_flow_statements: Cash Flow Statements DataFrame with columns: STD Cash Flow All, FCC Item Name, CIK and Date (among others)
    """
    
    # First filter dfs for input CIK
    balance_sheets = balance_sheets[balance_sheets["CIK"] == cik].copy()
    income_statements = income_statements[income_statements["CIK"] == cik].copy()
    cash_flow_statements = cash_flow_statements[cash_flow_statements["CIK"] == cik].copy()
    
    # Convert the date columns to datetime objects
    for df in [balance_sheets, income_statements, cash_flow_statements]:
        df["Report Date"] = pd.to_datetime(df["Date"])

    # Determine unique dates
    reporting_dates = pd.concat([
        balance_sheets["Report Date"],
        income_statements["Report Date"],
        cash_flow_statements["Report Date"]
    ]).unique()

    # Sort dates just to be safe
    reporting_dates = np.sort(reporting_dates)

    # In order to handle reports, that were filed slighty apart, a window of 10 days around a given reporting date is used
    window = pd.Timedelta(days=10)

    # Loop over reporting dates to obtain LLM ratings
    llm_ratings = []
    for date in reporting_dates:

        # Subset all financial statements for the given dates +- window days
        # Current quarter
        bs = balance_sheets[(balance_sheets["Report Date"] >= date - window) & (balance_sheets["Report Date"] <= date + window)]
        is_ = income_statements[(income_statements["Report Date"] >= date - window) & (income_statements["Report Date"] <= date + window)]
        cf = cash_flow_statements[(cash_flow_statements["Report Date"] >= date - window) & (cash_flow_statements["Report Date"] <= date + window)]
        
        # Concatenate reports into a string with correct labels
        bs_str = "\n".join(bs.apply(lambda row: f"{row['position_label']}: {row['STD Balance Sheet All']}", axis=1).astype(str))
        is_str = "\n".join(is_.apply(lambda row: f"{row['position_label']}: {row['STD Income Statement All']}", axis=1).astype(str))
        cf_str = "\n".join(cf.apply(lambda row: f"{row['position_label']}: {row['STD Cash Flow All']}", axis=1).astype(str))


        # Call the LLM to get the rating
        response = client.models.generate_content(
            model="gemini-2.5-flash-lite", # "gemini-2.5-flash"
            config=types.GenerateContentConfig(
                temperature=0, # Deterministic ouput
                system_instruction="""You are an experienced, data-driven financial analyst, that provides concise and accurate answers.""",
                
                thinking_config=types.ThinkingConfig(thinking_budget=0),# Disables thinking, but only required for Gemini 2.5
            ),
            
            contents=[f"""
            Based on the following financial reports only, please provide an investment recommendation for the underlying company.
                      
            Balance Sheet: 
            {bs_str}

            Income Statement: 
            {is_str}

            Cash Flow Statement: 
            {cf_str}

            Provide your answer using only one of the following signals: 'strong buy', 'buy', 'hold', 'sell', or 'strong sell'.
            """]
        )

        # Extract rating from the response
        rating = response.text.strip().lower()
        llm_ratings.append({
            "cik": str(cik),  # Ensure CIK is a string
            "date": date,
            "rating": rating
        })
    
    # Convert the list of dictionaries to a DataFrame
    llm_ratings_df = pd.DataFrame(llm_ratings)

    # Convert Report Date to datetime
    llm_ratings_df["Report Date"] = pd.to_datetime(llm_ratings_df["Report Date"]).dt.date

    # Sort by Report Date
    llm_ratings_df.sort_values(by="Report Date", inplace=True)

    # Reset index
    llm_ratings_df.reset_index(drop=True, inplace=True)
    
    return llm_ratings_df

In [5]:
def create_requests(cik: str, balance_sheets = None, income_statements = None, cash_flow_statements = None):
    
    """
    Function that returns a DataFrame with LLM ratings for a given CIK.
    For every reporting date, the function fetches the most recent financial statements, i.e.
    - Balance Sheet
    - Cash Flow Statement
    - Income Statement
    and calls the LLM to get a buy/sell/hold recommendation. In order to avoid issues with reports that were filed slightly apart,
    a window of 10 days around a given reporting date is used. This helps to ensure that the LLM has access to all relevant financial information for a given reporting date.
    Furthermore, reports that were filed slightly apart will not lead to recommendations that are based on partial information only and will also not cause multiple
    recommendations that only lie within the window of 10 days around a given reporting date.
    
    Parameters: 
    cik: str, CIK of the company (Can be looked up on the SEC website)
    balance_sheets: Balance Sheets DataFrame with columns: STD Balance Sheet All, FCC Item Name, CIK and Date (among others)
    income_statements: Income Statements DataFrame with columns: STD Income Statement All, FCC Item Name, CIK and Date (among others)
    cash_flow_statements: Cash Flow Statements DataFrame with columns: STD Cash Flow All, FCC Item Name, CIK and Date (among others)
    """
    
    # First filter dfs for input CIK
    balance_sheets = balance_sheets[balance_sheets["CIK"] == cik].copy()
    income_statements = income_statements[income_statements["CIK"] == cik].copy()
    cash_flow_statements = cash_flow_statements[cash_flow_statements["CIK"] == cik].copy()
    
    # Convert the date columns to datetime objects
    for df in [balance_sheets, income_statements, cash_flow_statements]:
        df["Report Date"] = pd.to_datetime(df["Date"])

    # Determine unique dates
    reporting_dates = pd.concat([
        balance_sheets["Report Date"],
        income_statements["Report Date"],
        cash_flow_statements["Report Date"]
    ]).unique()

    # Sort dates just to be safe
    reporting_dates = np.sort(reporting_dates)

    # In order to handle reports, that were filed slighty apart, a window of 10 days around a given reporting date is used
    window = pd.Timedelta(days=10)

    # Loop over reporting dates to obtain LLM ratings
    requests = []
    for i, date in enumerate(reporting_dates):


        # Determine current date
        q0_date = date
        # Determine date of previous quarter
        qminus1_date = date - pd.DateOffset(months=3)
        # Determine date of q0 -2 
        qminus2_date = date - pd.DateOffset(months=6)
        # Determine date of q0 -3
        qminus3_date = date - pd.DateOffset(months=9)
        print(f"Processing date {i+1}/{len(reporting_dates)}: {date} (Q0: {q0_date}, Q-1: {qminus1_date}, Q-2: {qminus2_date}, Q-3: {qminus3_date})")
        


        # Subset all financial statements for the given dates +- window days
        # Current quarter
        bs = balance_sheets[
            (balance_sheets["Report Date"] >= date - window) &
            (balance_sheets["Report Date"] <= date + window)
        ]
        is_0 = income_statements[
            (income_statements["Report Date"] >= date - window) &
            (income_statements["Report Date"] <= date + window)
        ]
        cf_0 = cash_flow_statements[
            (cash_flow_statements["Report Date"] >= date - window) &
            (cash_flow_statements["Report Date"] <= date + window)
        ]

        # Previous quarter (Q-1)
        is_qminus1 = income_statements[
            (income_statements["Report Date"] >= qminus1_date - window) &
            (income_statements["Report Date"] <= qminus1_date + window)
        ]
        cf_qminus1 = cash_flow_statements[
            (cash_flow_statements["Report Date"] >= qminus1_date - window) &
            (cash_flow_statements["Report Date"] <= qminus1_date + window)
        ]

        # Two quarters ago (Q-2)
        is_qminus2 = income_statements[
            (income_statements["Report Date"] >= qminus2_date - window) &
            (income_statements["Report Date"] <= qminus2_date + window)
        ]
        cf_qminus2 = cash_flow_statements[
            (cash_flow_statements["Report Date"] >= qminus2_date - window) &
            (cash_flow_statements["Report Date"] <= qminus2_date + window)
        ]

        # Three quarters ago (Q-3)
        is_qminus3 = income_statements[
            (income_statements["Report Date"] >= qminus3_date - window) &
            (income_statements["Report Date"] <= qminus3_date + window)
        ]
        cf_qminus3 = cash_flow_statements[
            (cash_flow_statements["Report Date"] >= qminus3_date - window) &
            (cash_flow_statements["Report Date"] <= qminus3_date + window)
        ]

        # If no reports are available for the given date, skip to next date
        if bs.empty or is_0.empty or cf_0.empty:
            continue
        
        # Concatenate reports into strings with correct labels
        bs_str = "\n".join(bs.apply(lambda row: f"{row['position_label']}: {row['STD Balance Sheet All']}", axis=1).astype(str))
        is_str = "\n".join(is_0.apply(lambda row: f"{row['position_label']}: {row['STD Income Statement All']}", axis=1).astype(str))
        cf_str = "\n".join(cf_0.apply(lambda row: f"{row['position_label']}: {row['STD Cash Flow All']}", axis=1).astype(str))

        # Append previous quarters if available — even if just one of IS or CF is present
        if not is_qminus1.empty:
            is_qminus1_str = "\n".join(is_qminus1.apply(lambda row: f"{row['position_label']}: {row['STD Income Statement All']}", axis=1).astype(str))
            is_str += f"\n\nIncome Statement from previous quarter:\n{is_qminus1_str}"
        if not cf_qminus1.empty:
            cf_qminus1_str = "\n".join(cf_qminus1.apply(lambda row: f"{row['position_label']}: {row['STD Cash Flow All']}", axis=1).astype(str))
            cf_str += f"\n\nCash Flow Statement from previous quarter:\n{cf_qminus1_str}"

        if not is_qminus2.empty:
            is_qminus2_str = "\n".join(is_qminus2.apply(lambda row: f"{row['position_label']}: {row['STD Income Statement All']}", axis=1).astype(str))
            is_str += f"\n\nIncome Statement from two quarters ago:\n{is_qminus2_str}"
        if not cf_qminus2.empty:
            cf_qminus2_str = "\n".join(cf_qminus2.apply(lambda row: f"{row['position_label']}: {row['STD Cash Flow All']}", axis=1).astype(str))
            cf_str += f"\n\nCash Flow Statement from two quarters ago:\n{cf_qminus2_str}"

        if not is_qminus3.empty:
            is_qminus3_str = "\n".join(is_qminus3.apply(lambda row: f"{row['position_label']}: {row['STD Income Statement All']}", axis=1).astype(str))
            is_str += f"\n\nIncome Statement from three quarters ago:\n{is_qminus3_str}"
        if not cf_qminus3.empty:
            cf_qminus3_str = "\n".join(cf_qminus3.apply(lambda row: f"{row['position_label']}: {row['STD Cash Flow All']}", axis=1).astype(str))
            cf_str += f"\n\nCash Flow Statement from three quarters ago:\n{cf_qminus3_str}"
            
        contents=[f"""
        Based on the following financial reports only, please provide an investment recommendation for the underlying company.
                    
        Balance Sheet: 
        {bs_str}

        Income Statement: 
        {is_str}     

        Cash Flow Statement: 
        {cf_str}

        Provide your answer using only one of the following signals: 'strong buy', 'buy', 'hold', 'sell', or 'strong sell'.
        """]
        requests.append({
            "cik": str(cik),  # Ensure CIK is a string
            "date": date,
            "contents": contents
        })


    
    return requests

In [6]:
def get_llm_ratings2(cik: str, balance_sheets = None, income_statements = None, cash_flow_statements = None):
    
    """
    Function that returns a DataFrame with LLM ratings for a given CIK.
    For every reporting date, the function fetches the most recent financial statements, i.e.
    - Balance Sheet
    - Cash Flow Statement
    - Income Statement
    and calls the LLM to get a buy/sell/hold recommendation. In order to avoid issues with reports that were filed slightly apart,
    a window of 10 days around a given reporting date is used. This helps to ensure that the LLM has access to all relevant financial information for a given reporting date.
    Furthermore, reports that were filed slightly apart will not lead to recommendations that are based on partial information only and will also not cause multiple
    recommendations that only lie within the window of 10 days around a given reporting date.
    
    Parameters: 
    cik: str, CIK of the company (Can be looked up on the SEC website)
    balance_sheets: Balance Sheets DataFrame with columns: STD Balance Sheet All, FCC Item Name, CIK and Date (among others)
    income_statements: Income Statements DataFrame with columns: STD Income Statement All, FCC Item Name, CIK and Date (among others)
    cash_flow_statements: Cash Flow Statements DataFrame with columns: STD Cash Flow All, FCC Item Name, CIK and Date (among others)
    """
    
    # First filter dfs for input CIK
    balance_sheets = balance_sheets[balance_sheets["CIK"] == cik].copy()
    income_statements = income_statements[income_statements["CIK"] == cik].copy()
    cash_flow_statements = cash_flow_statements[cash_flow_statements["CIK"] == cik].copy()
    
    # Convert the date columns to datetime objects
    for df in [balance_sheets, income_statements, cash_flow_statements]:
        df["Report Date"] = pd.to_datetime(df["Date"])

    # Determine unique dates
    reporting_dates = pd.concat([
        balance_sheets["Report Date"],
        income_statements["Report Date"],
        cash_flow_statements["Report Date"]
    ]).unique()

    # Sort dates just to be safe
    reporting_dates = np.sort(reporting_dates)

    # In order to handle reports, that were filed slighty apart, a window of 10 days around a given reporting date is used
    window = pd.Timedelta(days=10)

    # Loop over reporting dates to obtain LLM ratings
    llm_ratings = []
    for i, date in enumerate(reporting_dates):


        # Determine current date
        q0_date = date
        # Determine date of previous quarter
        qminus1_date = date - pd.DateOffset(months=3)
        # Determine date of q0 -2 
        qminus2_date = date - pd.DateOffset(months=6)
        # Determine date of q0 -3
        qminus3_date = date - pd.DateOffset(months=9)        


        # Subset all financial statements for the given dates +- window days
        # Current quarter
        bs = balance_sheets[
            (balance_sheets["Report Date"] >= date - window) &
            (balance_sheets["Report Date"] <= date + window)
        ]
        is_0 = income_statements[
            (income_statements["Report Date"] >= date - window) &
            (income_statements["Report Date"] <= date + window)
        ]
        cf_0 = cash_flow_statements[
            (cash_flow_statements["Report Date"] >= date - window) &
            (cash_flow_statements["Report Date"] <= date + window)
        ]

        # Previous quarter (Q-1)
        is_qminus1 = income_statements[
            (income_statements["Report Date"] >= qminus1_date - window) &
            (income_statements["Report Date"] <= qminus1_date + window)
        ]
        cf_qminus1 = cash_flow_statements[
            (cash_flow_statements["Report Date"] >= qminus1_date - window) &
            (cash_flow_statements["Report Date"] <= qminus1_date + window)
        ]

        # Two quarters ago (Q-2)
        is_qminus2 = income_statements[
            (income_statements["Report Date"] >= qminus2_date - window) &
            (income_statements["Report Date"] <= qminus2_date + window)
        ]
        cf_qminus2 = cash_flow_statements[
            (cash_flow_statements["Report Date"] >= qminus2_date - window) &
            (cash_flow_statements["Report Date"] <= qminus2_date + window)
        ]

        # Three quarters ago (Q-3)
        is_qminus3 = income_statements[
            (income_statements["Report Date"] >= qminus3_date - window) &
            (income_statements["Report Date"] <= qminus3_date + window)
        ]
        cf_qminus3 = cash_flow_statements[
            (cash_flow_statements["Report Date"] >= qminus3_date - window) &
            (cash_flow_statements["Report Date"] <= qminus3_date + window)
        ]

        # If no reports are available for the given date, skip to next date
        if bs.empty or is_0.empty or cf_0.empty:
            continue
        
        # Concatenate reports into strings with correct labels
        bs_str = "\n".join(bs.apply(lambda row: f"{row['position_label']}: {row['STD Balance Sheet All']}", axis=1).astype(str))
        is_str = "\n".join(is_0.apply(lambda row: f"{row['position_label']}: {row['STD Income Statement All']}", axis=1).astype(str))
        cf_str = "\n".join(cf_0.apply(lambda row: f"{row['position_label']}: {row['STD Cash Flow All']}", axis=1).astype(str))

        # Append previous quarters if available — even if just one of IS or CF is present
        if not is_qminus1.empty:
            is_qminus1_str = "\n".join(is_qminus1.apply(lambda row: f"{row['position_label']}: {row['STD Income Statement All']}", axis=1).astype(str))
            is_str += f"\n\nIncome Statement from previous quarter:\n{is_qminus1_str}"
        if not cf_qminus1.empty:
            cf_qminus1_str = "\n".join(cf_qminus1.apply(lambda row: f"{row['position_label']}: {row['STD Cash Flow All']}", axis=1).astype(str))
            cf_str += f"\n\nCash Flow Statement from previous quarter:\n{cf_qminus1_str}"

        if not is_qminus2.empty:
            is_qminus2_str = "\n".join(is_qminus2.apply(lambda row: f"{row['position_label']}: {row['STD Income Statement All']}", axis=1).astype(str))
            is_str += f"\n\nIncome Statement from two quarters ago:\n{is_qminus2_str}"
        if not cf_qminus2.empty:
            cf_qminus2_str = "\n".join(cf_qminus2.apply(lambda row: f"{row['position_label']}: {row['STD Cash Flow All']}", axis=1).astype(str))
            cf_str += f"\n\nCash Flow Statement from two quarters ago:\n{cf_qminus2_str}"

        if not is_qminus3.empty:
            is_qminus3_str = "\n".join(is_qminus3.apply(lambda row: f"{row['position_label']}: {row['STD Income Statement All']}", axis=1).astype(str))
            is_str += f"\n\nIncome Statement from three quarters ago:\n{is_qminus3_str}"
        if not cf_qminus3.empty:
            cf_qminus3_str = "\n".join(cf_qminus3.apply(lambda row: f"{row['position_label']}: {row['STD Cash Flow All']}", axis=1).astype(str))
            cf_str += f"\n\nCash Flow Statement from three quarters ago:\n{cf_qminus3_str}"
            

        # Call the LLM to get the rating
        response = client.models.generate_content(
           # model="gemini-2.5-flash", 
            model="gemini-2.5-flash-lite",
            config=types.GenerateContentConfig(
                temperature=0, # Deterministic ouput
                system_instruction="""You are an experienced, data-driven financial analyst, that provides concise and accurate answers.""",
                
                thinking_config=types.ThinkingConfig(thinking_budget=0),# Disables thinking, but only required for Gemini 2.5
            ),
            
            contents=[f"""
            Based on the following financial reports only, please provide an investment recommendation for the underlying company.
                      
            Balance Sheet: 
            {bs_str}

            Income Statement: 
            {is_str}

            Cash Flow Statement: 
            {cf_str}

            Provide your answer using only one of the following signals: 'buy', 'hold' or 'sell'.
            """]
        )

        # Extract rating from the response
        rating = response.text.strip().lower()
        llm_ratings.append({
            "cik": str(cik), 
            "date": date,
            "rating": rating
        })

    # If no ratings were generated, return None
    if not llm_ratings:
        return None
    
    # Convert the list of dictionaries to a DataFrame
    llm_ratings_df = pd.DataFrame(llm_ratings)

    llm_ratings_df["date"] = pd.to_datetime(llm_ratings_df["date"]).dt.date

    # Sort by Report Date
    llm_ratings_df.sort_values(by="date", inplace=True)

    # Reset index
    llm_ratings_df.reset_index(drop=True, inplace=True)
    
    return llm_ratings_df

In [7]:
# Function to get LLM ratings for all CIKs
import pandas as pd
import time
from tqdm import tqdm

def llm_ratings_loop(
    cik_list,
    balance_sheets,
    income_statements,
    cash_flow_statements,
    output_path_ratings="../data/ciklist1_ratings_with_previous_quarters.csv",
    output_path_failed="../data/failed_ciks1.csv",
    retries=5,
    retry_delay=30
):
    """
    Process a list of CIKs to retrieve LLM ratings with retry logic on server errors.

    Args:
        cik_list (pd.DataFrame): DataFrame with a "CIK" column.
        balance_sheets (dict or DataFrame): Balance sheet data.
        income_statements (dict or DataFrame): Income statement data.
        cash_flow_statements (dict or DataFrame): Cash flow data.
        output_path_ratings (str): File path to save ratings CSV.
        output_path_failed (str): File path to save failed CIKs.
        retries (int): Number of retry attempts on server error.
        retry_delay (int): Seconds to wait between retries.

    Returns:
        pd.DataFrame: Combined ratings DataFrame.
    """
    list_ratings = []
    failed_ciks = []
    progress_bar = tqdm(cik_list["CIK"], desc="Processing CIKs")

    for i, cik in enumerate(progress_bar):
        progress_bar.set_description(
            f"Processing CIK {i+1}/{len(cik_list)}: {cik} | Time: {pd.Timestamp.now().strftime('%H:%M:%S')}"
        )

        for attempt in range(retries):
            try:
                ratings = get_llm_ratings2(
                    cik,
                    balance_sheets,
                    income_statements,
                    cash_flow_statements
                )
                if ratings is not None:
                    list_ratings.append(ratings)
                break  # success, exit retry loop which starts at for attempt in range(retries)

            except ServerError as e:
                print(f"ServerError for CIK {cik} (Attempt {attempt + 1}/{retries}): {e}")
                if attempt < retries - 1:
                    time.sleep(retry_delay)
                else: # This else statement basically only runs if all retries failed i.e. the inner loop is completed, hence it starts with the next CIK
                    failed_ciks.append(cik)

    # Save results to CSV
    cik_ratings_df = pd.concat(list_ratings, ignore_index=True)
    cik_ratings_df.to_csv(output_path_ratings, index=False)

    # Save failed CIKs to CSV
    if failed_ciks:
        pd.Series(failed_ciks).to_csv(output_path_failed, index=False)

    return cik_ratings_df

- Finally, I can use this function to loop over all company CIKs 

In [8]:
# First determine unique CIKs
ciks = set(labeled_balance_sheets["CIK"].unique()) | set(labeled_income_statements["CIK"].unique()) | set(labeled_cash_flow_statements["CIK"].unique())

# Convert to DataFrame to make slicing into equal parts easier
ciks = pd.DataFrame(ciks, columns=["CIK"])

---
Testing the function

In [13]:
ciks1 = ciks[:100] 
ciks2 = ciks[100:200] 
ciks3 = ciks[200:300] 
ciks4 = ciks[300:400] 
ciks5 = ciks[400:500] 
ciks6 = ciks[500:600] 
ciks7 = ciks[600:700] 
ciks8 = ciks[700:800] 
ciks9 = ciks[800:900] 
ciks10 = ciks[900:1000] 
ciks11 = ciks[1000:1100] 
ciks12 = ciks[1100:1200] 
ciks13 = ciks[1200:1300] 
ciks14 = ciks[1300:1400] 
ciks15 = ciks[1400:1500] 

In [None]:
ciks1_ratings = []
progress_bar = tqdm(ciks1["CIK"], desc="Processing CIKs")

for i, cik in enumerate(progress_bar):
    # Progress message
    progress_bar.set_description(f"Processing CIK {i+1}/{len(ciks1)}: {cik} | Time: {pd.Timestamp.now().strftime('%H:%M:%S')}")
    # Get LLM ratings for the given CIK
    ratings = get_llm_ratings(cik, labeled_balance_sheets, labeled_income_statements, labeled_cash_flow_statements)
    ciks1_ratings.append(ratings)
# Concatenate all ratings into a single DataFrame
ciks_test_ratings_df = pd.concat(ciks1_ratings, ignore_index=True)
# Save to CSV
ciks_test_ratings_df.to_csv("../data/ciks1_ratings.csv", index=False)

Processing CIK 100/100: 0001396009 | Time: 18:19:23: 100%|██████████| 100/100 [1:14:11<00:00, 44.52s/it]


In [15]:
ciks2_ratings = []
progress_bar = tqdm(ciks2["CIK"], desc="Processing CIKs")

for i, cik in enumerate(progress_bar):
    # Progress message
    progress_bar.set_description(f"Processing CIK {i+1}/{len(ciks2)}: {cik} | Time: {pd.Timestamp.now().strftime('%H:%M:%S')}")
    # Get LLM ratings for the given CIK
    ratings = get_llm_ratings(cik, labeled_balance_sheets, labeled_income_statements, labeled_cash_flow_statements)
    ciks2_ratings.append(ratings)
# Concatenate all ratings into a single DataFrame
ciks2_ratings_df = pd.concat(ciks2_ratings, ignore_index=True)
# Save to CSV
ciks2_ratings_df.to_csv("../data/ciks2_ratings.csv", index=False)

Processing CIK 100/100: 0001094831 | Time: 16:25:34: 100%|██████████| 100/100 [1:02:45<00:00, 37.65s/it]


In [None]:
ciks3_ratings = []
progress_bar = tqdm(ciks3["CIK"], desc="Processing CIKs")

for i, cik in enumerate(progress_bar):
    # Progress message
    progress_bar.set_description(f"Processing CIK {i+1}/{len(ciks3)}: {cik} | Time: {pd.Timestamp.now().strftime('%H:%M:%S')}")
    # Get LLM ratings for the given CIK
    ratings = get_llm_ratings(cik, labeled_balance_sheets, labeled_income_statements, labeled_cash_flow_statements)
    ciks3_ratings.append(ratings)
# Concatenate all ratings into a single DataFrame
ciks3_ratings_df = pd.concat(ciks3_ratings, ignore_index=True)
# Save to CSV
ciks3_ratings_df.to_csv("../data/ciks3_ratings.csv", index=False)

In [8]:
ciks4_ratings = []
progress_bar = tqdm(ciks4["CIK"], desc="Processing CIKs")

for i, cik in enumerate(progress_bar):
    # Progress message
    progress_bar.set_description(f"Processing CIK {i+1}/{len(ciks4)}: {cik} | Time: {pd.Timestamp.now().strftime('%H:%M:%S')}")
    # Get LLM ratings for the given CIK
    ratings = get_llm_ratings(cik, labeled_balance_sheets, labeled_income_statements, labeled_cash_flow_statements)
    ciks4_ratings.append(ratings)
# Concatenate all ratings into a single DataFrame
ciks4_ratings_df = pd.concat(ciks4_ratings, ignore_index=True)
# Save to CSV
ciks4_ratings_df.to_csv("../data/ciks4_ratings.csv", index=False)

Processing CIK 100/100: 0000890926 | Time: 17:44:45: 100%|██████████| 100/100 [1:01:25<00:00, 36.85s/it]


In [8]:
ciks5_ratings = []
progress_bar = tqdm(ciks5["CIK"], desc="Processing CIKs")

for i, cik in enumerate(progress_bar):
    # Progress message
    progress_bar.set_description(f"Processing CIK {i+1}/{len(ciks5)}: {cik} | Time: {pd.Timestamp.now().strftime('%H:%M:%S')}")
    # Get LLM ratings for the given CIK
    ratings = get_llm_ratings(cik, labeled_balance_sheets, labeled_income_statements, labeled_cash_flow_statements)
    ciks5_ratings.append(ratings)
# Concatenate all ratings into a single DataFrame
ciks5_ratings_df = pd.concat(ciks5_ratings, ignore_index=True)
# Save to CSV
ciks5_ratings_df.to_csv("../data/ciks5_ratings.csv", index=False)

Processing CIK 100/100: 0001669811 | Time: 16:04:57: 100%|██████████| 100/100 [1:06:02<00:00, 39.63s/it]


In [10]:
ciks6_ratings = []
progress_bar = tqdm(ciks6["CIK"], desc="Processing CIKs")

for i, cik in enumerate(progress_bar):
    # Progress message
    progress_bar.set_description(f"Processing CIK {i+1}/{len(ciks6)}: {cik} | Time: {pd.Timestamp.now().strftime('%H:%M:%S')}")
    # Get LLM ratings for the given CIK
    ratings = get_llm_ratings(cik, labeled_balance_sheets, labeled_income_statements, labeled_cash_flow_statements)
    ciks6_ratings.append(ratings)
# Concatenate all ratings into a single DataFrame
ciks6_ratings_df = pd.concat(ciks6_ratings, ignore_index=True)
# Save to CSV
ciks6_ratings_df.to_csv("../data/ciks6_ratings.csv", index=False)

Processing CIK 100/100: 0001090872 | Time: 20:48:09: 100%|██████████| 100/100 [1:37:41<00:00, 58.61s/it]


---

## Updated function that includes Cash Flow and Income Statements from previous quarters

In [9]:
cik_list1 = ciks[:500]
cik_list2 = ciks[500:1000]
cik_list3 = ciks[1000:]
len(cik_list1) + len(cik_list2) + len(cik_list3) == len(ciks)

True

In [23]:
list1_ratings = []
failed_ciks = []
progress_bar = tqdm(cik_list1["CIK"], desc="Processing CIKs")

for i, cik in enumerate(progress_bar):
    progress_bar.set_description(
        f"Processing CIK {i+1}/{len(cik_list1)}: {cik} | Time: {pd.Timestamp.now().strftime('%H:%M:%S')}"
    )
    
    try:
        ratings = get_llm_ratings2(
            cik,
            labeled_balance_sheets,
            labeled_income_statements,
            labeled_cash_flow_statements
        )
        if ratings is not None:
            list1_ratings.append(ratings)

    except ServerError as e:
        print(f"ServerError for CIK {cik}: {e}")
        failed_ciks.append(cik)
        time.sleep(1)  # In case errors are caused by rate limiting or something..

# Concatenate all ratings into a single DataFrame
cik_ratings_df = pd.concat(list1_ratings, ignore_index=True)

# Save to CSV
cik_ratings_df.to_csv("../data/ciklist1_ratings_with_previous_quarters.csv", index=False)

# Optionally save failed CIKs for later retry
if failed_ciks:
    pd.Series(failed_ciks).to_csv("../data/failed_ciks1.csv", index=False)


Processing CIK 29/500: 0000080172 | Time: 17:03:34:   6%|▌         | 28/500 [22:31<7:02:51, 53.75s/it]

ServerError for CIK 0000080172: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 43/500: 0000907242 | Time: 17:14:33:   8%|▊         | 42/500 [33:29<6:14:02, 49.00s/it]

ServerError for CIK 0000907242: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 47/500: 0000745732 | Time: 17:17:50:   9%|▉         | 46/500 [36:46<6:14:34, 49.50s/it]

ServerError for CIK 0000745732: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 55/500: 0000885725 | Time: 17:24:09:  11%|█         | 54/500 [43:06<6:17:43, 50.82s/it]

ServerError for CIK 0000885725: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 59/500: 0000793733 | Time: 17:26:00:  12%|█▏        | 58/500 [44:57<4:20:41, 35.39s/it]

ServerError for CIK 0000793733: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 72/500: 0001021635 | Time: 17:33:11:  14%|█▍        | 71/500 [52:07<3:16:21, 27.46s/it]

ServerError for CIK 0001021635: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 80/500: 0000886163 | Time: 17:39:42:  16%|█▌        | 79/500 [58:39<4:28:21, 38.25s/it]

ServerError for CIK 0000886163: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 81/500: 0000006281 | Time: 17:40:13:  16%|█▌        | 80/500 [59:09<4:11:23, 35.91s/it]

ServerError for CIK 0000006281: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 107/500: 0001405495 | Time: 18:01:12:  21%|██        | 106/500 [1:20:08<5:19:49, 48.70s/it]

ServerError for CIK 0001405495: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 175/500: 0000066570 | Time: 18:50:23:  35%|███▍      | 174/500 [2:09:19<4:33:27, 50.33s/it]

ServerError for CIK 0000066570: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 177/500: 0000316709 | Time: 18:50:46:  35%|███▌      | 176/500 [2:09:43<2:40:26, 29.71s/it]

ServerError for CIK 0000316709: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 178/500: 0001306830 | Time: 18:50:52:  35%|███▌      | 177/500 [2:09:49<2:01:38, 22.60s/it]

ServerError for CIK 0001306830: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 203/500: 0000873303 | Time: 19:08:23:  40%|████      | 202/500 [2:27:19<4:04:40, 49.26s/it]

ServerError for CIK 0000873303: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 211/500: 0000856982 | Time: 19:13:17:  42%|████▏     | 210/500 [2:32:14<2:52:29, 35.69s/it]

ServerError for CIK 0000856982: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 218/500: 0001050797 | Time: 19:18:08:  43%|████▎     | 217/500 [2:37:04<3:34:44, 45.53s/it]

ServerError for CIK 0001050797: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 231/500: 0000773840 | Time: 19:30:00:  46%|████▌     | 230/500 [2:48:56<3:38:58, 48.66s/it]

ServerError for CIK 0000773840: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 257/500: 0001158449 | Time: 19:48:59:  51%|█████     | 256/500 [3:07:55<3:20:16, 49.25s/it]

ServerError for CIK 0001158449: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 306/500: 0000019745 | Time: 20:24:44:  61%|██████    | 305/500 [3:43:41<2:12:02, 40.63s/it]

ServerError for CIK 0000019745: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 307/500: 0000913760 | Time: 20:24:56:  61%|██████    | 306/500 [3:43:52<1:43:14, 31.93s/it]

ServerError for CIK 0000913760: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 310/500: 0000775158 | Time: 20:26:23:  62%|██████▏   | 309/500 [3:45:20<1:45:24, 33.12s/it]

ServerError for CIK 0000775158: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 323/500: 0001035267 | Time: 20:35:24:  64%|██████▍   | 322/500 [3:54:20<2:21:17, 47.62s/it]

ServerError for CIK 0001035267: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 331/500: 0000037785 | Time: 20:41:23:  66%|██████▌   | 330/500 [4:00:19<2:10:28, 46.05s/it]

ServerError for CIK 0000037785: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 334/500: 0000884887 | Time: 20:43:23:  67%|██████▋   | 333/500 [4:02:19<1:54:40, 41.20s/it]

ServerError for CIK 0000884887: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 381/500: 0000875320 | Time: 21:20:05:  76%|███████▌  | 380/500 [4:39:02<1:33:04, 46.54s/it]

ServerError for CIK 0000875320: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 394/500: 0000921738 | Time: 21:30:51:  79%|███████▊  | 393/500 [4:49:47<1:12:36, 40.72s/it]

ServerError for CIK 0000921738: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 413/500: 0001326380 | Time: 21:43:54:  82%|████████▏ | 412/500 [5:02:50<52:26, 35.76s/it]  

ServerError for CIK 0001326380: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 432/500: 0001048695 | Time: 21:57:15:  86%|████████▌ | 431/500 [5:16:12<50:12, 43.65s/it]  

ServerError for CIK 0001048695: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 464/500: 0001237831 | Time: 22:20:35:  93%|█████████▎| 463/500 [5:39:32<28:23, 46.04s/it]

ServerError for CIK 0001237831: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 471/500: 0000903129 | Time: 22:25:58:  94%|█████████▍| 470/500 [5:44:55<25:45, 51.50s/it]

ServerError for CIK 0000903129: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 476/500: 0000059527 | Time: 22:28:26:  95%|█████████▌| 475/500 [5:47:22<15:41, 37.66s/it]

ServerError for CIK 0000059527: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 480/500: 0000723612 | Time: 22:30:44:  96%|█████████▌| 479/500 [5:49:40<11:45, 33.58s/it]

ServerError for CIK 0000723612: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 487/500: 0000072971 | Time: 22:35:35:  97%|█████████▋| 486/500 [5:54:31<09:51, 42.25s/it]

ServerError for CIK 0000072971: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 496/500: 0001046568 | Time: 22:41:09:  99%|█████████▉| 495/500 [6:00:05<02:55, 35.13s/it]

ServerError for CIK 0001046568: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 500/500: 0000890926 | Time: 22:44:01: 100%|█████████▉| 499/500 [6:02:57<00:43, 43.91s/it]

ServerError for CIK 0000890926: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 500/500: 0000890926 | Time: 22:44:01: 100%|██████████| 500/500 [6:03:02<00:00, 43.57s/it]


In [5]:
ratings = pd.read_csv("../data/ciklist1_ratings_with_previous_quarters.csv", dtype={"CIK": str})
ratings.head()

Unnamed: 0,cik,date,rating
0,1576018,2013-03-31,buy
1,1576018,2013-06-30,hold
2,1576018,2013-09-30,buy
3,1576018,2013-12-31,hold
4,1576018,2014-03-31,hold


In [13]:
ratings2 = llm_ratings_loop(
    cik_list=cik_list2,
    balance_sheets=labeled_balance_sheets,
    income_statements=labeled_income_statements,
    cash_flow_statements=labeled_cash_flow_statements,
    output_path_ratings="../data/ciklist2_ratings_with_previous_quarters.csv",
    output_path_failed="../data/failed_ciks2.csv",
    retries=5,
    retry_delay=30)

Processing CIK 8/500: 0000949157 | Time: 11:32:08:   1%|▏         | 7/500 [05:49<7:21:43, 53.76s/it]

ServerError for CIK 0000949157 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 35/500: 0000040533 | Time: 11:56:55:   7%|▋         | 34/500 [30:35<6:31:01, 50.35s/it] 

ServerError for CIK 0000040533 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 65/500: 0000854775 | Time: 12:23:40:  13%|█▎        | 64/500 [57:21<5:16:49, 43.60s/it]

ServerError for CIK 0000854775 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 133/500: 0001034670 | Time: 13:22:59:  26%|██▋       | 132/500 [1:56:40<4:18:17, 42.11s/it]

ServerError for CIK 0001034670 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 200/500: 0000720005 | Time: 14:18:42:  40%|███▉      | 199/500 [2:52:23<3:47:04, 45.26s/it]

ServerError for CIK 0000720005 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 257/500: 0000731012 | Time: 15:02:28:  51%|█████     | 256/500 [3:36:09<3:26:18, 50.73s/it]

ServerError for CIK 0000731012 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 273/500: 0001652044 | Time: 15:17:26:  54%|█████▍    | 272/500 [3:51:07<3:10:37, 50.16s/it]

ServerError for CIK 0001652044 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 275/500: 0000708955 | Time: 15:20:03:  55%|█████▍    | 274/500 [3:53:44<4:00:20, 63.81s/it]

ServerError for CIK 0000708955 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 282/500: 0000822818 | Time: 15:27:38:  56%|█████▌    | 281/500 [4:01:19<2:43:54, 44.90s/it]

ServerError for CIK 0000822818 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 298/500: 0001004980 | Time: 15:46:27:  59%|█████▉    | 297/500 [4:20:08<2:30:55, 44.61s/it]

ServerError for CIK 0001004980 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 304/500: 0000009092 | Time: 15:54:59:  61%|██████    | 303/500 [4:28:40<3:57:00, 72.18s/it]

ServerError for CIK 0000009092 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 352/500: 0000883948 | Time: 16:40:07:  70%|███████   | 351/500 [5:13:48<2:24:21, 58.13s/it]

ServerError for CIK 0000883948 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 359/500: 0000104889 | Time: 16:45:31:  72%|███████▏  | 358/500 [5:19:12<1:21:38, 34.50s/it]

ServerError for CIK 0000104889 (Attempt 1/5): 502 Bad Gateway. {'message': '<!DOCTYPE html>\n<html lang=en>\n  <meta charset=utf-8>\n  <meta name=viewport content="initial-scale=1, minimum-scale=1, width=device-width">\n  <title>Error 502 (Server Error)!!1</title>\n  <style>\n    *{margin:0;padding:0}html,code{font:15px/22px arial,sans-serif}html{background:#fff;color:#222;padding:15px}body{margin:7% auto 0;max-width:390px;min-height:180px;padding:30px 0 15px}* > body{background:url(//www.google.com/images/errors/robot.png) 100% 5px no-repeat;padding-right:205px}p{margin:11px 0 22px;overflow:hidden}ins{color:#777;text-decoration:none}a img{border:0}@media screen and (max-width:772px){body{background:none;margin-top:0;max-width:none;padding-right:0}}#logo{background:url(//www.google.com/images/branding/googlelogo/1x/googlelogo_color_150x54dp.png) no-repeat;margin-left:-5px}@media only screen and (min-resolution:192dpi){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/

Processing CIK 366/500: 0000719220 | Time: 16:53:56:  73%|███████▎  | 365/500 [5:27:37<1:54:19, 50.81s/it]

ServerError for CIK 0000719220 (Attempt 1/5): 502 Bad Gateway. {'message': '<!DOCTYPE html>\n<html lang=en>\n  <meta charset=utf-8>\n  <meta name=viewport content="initial-scale=1, minimum-scale=1, width=device-width">\n  <title>Error 502 (Server Error)!!1</title>\n  <style>\n    *{margin:0;padding:0}html,code{font:15px/22px arial,sans-serif}html{background:#fff;color:#222;padding:15px}body{margin:7% auto 0;max-width:390px;min-height:180px;padding:30px 0 15px}* > body{background:url(//www.google.com/images/errors/robot.png) 100% 5px no-repeat;padding-right:205px}p{margin:11px 0 22px;overflow:hidden}ins{color:#777;text-decoration:none}a img{border:0}@media screen and (max-width:772px){body{background:none;margin-top:0;max-width:none;padding-right:0}}#logo{background:url(//www.google.com/images/branding/googlelogo/1x/googlelogo_color_150x54dp.png) no-repeat;margin-left:-5px}@media only screen and (min-resolution:192dpi){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/

Processing CIK 376/500: 0001473844 | Time: 17:03:24:  75%|███████▌  | 375/500 [5:37:05<1:27:17, 41.90s/it]

ServerError for CIK 0001473844 (Attempt 1/5): 502 Bad Gateway. {'message': '<!DOCTYPE html>\n<html lang=en>\n  <meta charset=utf-8>\n  <meta name=viewport content="initial-scale=1, minimum-scale=1, width=device-width">\n  <title>Error 502 (Server Error)!!1</title>\n  <style>\n    *{margin:0;padding:0}html,code{font:15px/22px arial,sans-serif}html{background:#fff;color:#222;padding:15px}body{margin:7% auto 0;max-width:390px;min-height:180px;padding:30px 0 15px}* > body{background:url(//www.google.com/images/errors/robot.png) 100% 5px no-repeat;padding-right:205px}p{margin:11px 0 22px;overflow:hidden}ins{color:#777;text-decoration:none}a img{border:0}@media screen and (max-width:772px){body{background:none;margin-top:0;max-width:none;padding-right:0}}#logo{background:url(//www.google.com/images/branding/googlelogo/1x/googlelogo_color_150x54dp.png) no-repeat;margin-left:-5px}@media only screen and (min-resolution:192dpi){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/

Processing CIK 385/500: 0000907242 | Time: 17:10:59:  77%|███████▋  | 384/500 [5:44:40<1:31:33, 47.36s/it]

ServerError for CIK 0000907242 (Attempt 1/5): 502 Bad Gateway. {'message': '<!DOCTYPE html>\n<html lang=en>\n  <meta charset=utf-8>\n  <meta name=viewport content="initial-scale=1, minimum-scale=1, width=device-width">\n  <title>Error 502 (Server Error)!!1</title>\n  <style>\n    *{margin:0;padding:0}html,code{font:15px/22px arial,sans-serif}html{background:#fff;color:#222;padding:15px}body{margin:7% auto 0;max-width:390px;min-height:180px;padding:30px 0 15px}* > body{background:url(//www.google.com/images/errors/robot.png) 100% 5px no-repeat;padding-right:205px}p{margin:11px 0 22px;overflow:hidden}ins{color:#777;text-decoration:none}a img{border:0}@media screen and (max-width:772px){body{background:none;margin-top:0;max-width:none;padding-right:0}}#logo{background:url(//www.google.com/images/branding/googlelogo/1x/googlelogo_color_150x54dp.png) no-repeat;margin-left:-5px}@media only screen and (min-resolution:192dpi){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/

Processing CIK 386/500: 0001001316 | Time: 17:13:00:  77%|███████▋  | 385/500 [5:46:41<2:13:24, 69.60s/it]

ServerError for CIK 0001001316 (Attempt 1/5): 502 Bad Gateway. {'message': '<!DOCTYPE html>\n<html lang=en>\n  <meta charset=utf-8>\n  <meta name=viewport content="initial-scale=1, minimum-scale=1, width=device-width">\n  <title>Error 502 (Server Error)!!1</title>\n  <style>\n    *{margin:0;padding:0}html,code{font:15px/22px arial,sans-serif}html{background:#fff;color:#222;padding:15px}body{margin:7% auto 0;max-width:390px;min-height:180px;padding:30px 0 15px}* > body{background:url(//www.google.com/images/errors/robot.png) 100% 5px no-repeat;padding-right:205px}p{margin:11px 0 22px;overflow:hidden}ins{color:#777;text-decoration:none}a img{border:0}@media screen and (max-width:772px){body{background:none;margin-top:0;max-width:none;padding-right:0}}#logo{background:url(//www.google.com/images/branding/googlelogo/1x/googlelogo_color_150x54dp.png) no-repeat;margin-left:-5px}@media only screen and (min-resolution:192dpi){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/

Processing CIK 406/500: 0000354950 | Time: 17:30:50:  81%|████████  | 405/500 [6:04:31<1:26:15, 54.48s/it]

ServerError for CIK 0000354950 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 417/500: 0000098677 | Time: 17:41:27:  83%|████████▎ | 416/500 [6:15:08<1:18:16, 55.91s/it]

ServerError for CIK 0000098677 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 418/500: 0000109198 | Time: 17:43:20:  83%|████████▎ | 417/500 [6:17:01<1:41:09, 73.13s/it]

ServerError for CIK 0000109198 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 495/500: 0000003453 | Time: 18:46:13:  99%|█████████▉| 494/500 [7:19:54<04:26, 44.34s/it]  

ServerError for CIK 0000003453 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 500/500: 0000887343 | Time: 18:51:03: 100%|██████████| 500/500 [7:25:44<00:00, 53.49s/it]


In [10]:
ratings3 = llm_ratings_loop(
    cik_list=cik_list3,
    balance_sheets=labeled_balance_sheets,
    income_statements=labeled_income_statements,
    cash_flow_statements=labeled_cash_flow_statements,
    output_path_ratings="../data/ciklist3_ratings_with_previous_quarters.csv",
    output_path_failed="../data/failed_ciks3.csv",
    retries=5,
    retry_delay=30)

Processing CIK 48/491: 0000058492 | Time: 14:42:36:  10%|▉         | 47/491 [36:03<5:50:44, 47.40s/it]

ServerError for CIK 0000058492 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 98/491: 0001519751 | Time: 15:24:16:  20%|█▉        | 97/491 [1:17:42<6:10:15, 56.38s/it]

ServerError for CIK 0001519751 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 116/491: 0000042582 | Time: 15:42:16:  23%|██▎       | 115/491 [1:35:43<6:20:24, 60.70s/it]

ServerError for CIK 0000042582 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 127/491: 0000318154 | Time: 15:53:39:  26%|██▌       | 126/491 [1:47:06<5:48:00, 57.21s/it]

ServerError for CIK 0000318154 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 156/491: 0001021860 | Time: 16:22:12:  32%|███▏      | 155/491 [2:15:38<5:07:23, 54.89s/it]

ServerError for CIK 0001021860 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}
ServerError for CIK 0001021860 (Attempt 2/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}
ServerError for CIK 0001021860 (Attempt 3/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}
ServerError for CIK 0001021860 (Attempt 4/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 157/491: 0001336917 | Time: 16:26:39:  32%|███▏      | 156/491 [2:20:06<11:02:57, 118.74s/it]

ServerError for CIK 0001021860 (Attempt 5/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}
ServerError for CIK 0001336917 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}
ServerError for CIK 0001336917 (Attempt 2/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 158/491: 0000049071 | Time: 16:29:07:  32%|███▏      | 157/491 [2:22:34<11:50:11, 127.58s/it]

ServerError for CIK 0000049071 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}
ServerError for CIK 0000049071 (Attempt 2/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}
ServerError for CIK 0000049071 (Attempt 3/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 159/491: 0000354647 | Time: 16:34:53:  32%|███▏      | 158/491 [2:28:20<17:51:22, 193.04s/it]

ServerError for CIK 0000354647 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 160/491: 0000103379 | Time: 16:36:50:  32%|███▏      | 159/491 [2:30:16<15:40:46, 170.02s/it]

ServerError for CIK 0000103379 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}
ServerError for CIK 0000103379 (Attempt 2/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 161/491: 0000046250 | Time: 16:40:15:  33%|███▎      | 160/491 [2:33:42<16:37:07, 180.75s/it]

ServerError for CIK 0000046250 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 163/491: 0000103730 | Time: 16:43:48:  33%|███▎      | 162/491 [2:37:15<12:51:11, 140.64s/it]

ServerError for CIK 0000103730 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 169/491: 0001057706 | Time: 16:51:21:  34%|███▍      | 168/491 [2:44:48<6:39:47, 74.27s/it]  

ServerError for CIK 0001057706 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 177/491: 0000818479 | Time: 16:59:21:  36%|███▌      | 176/491 [2:52:48<4:58:01, 56.77s/it]

ServerError for CIK 0000818479 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 252/491: 0000005513 | Time: 17:54:04:  51%|█████     | 251/491 [3:47:31<2:05:28, 31.37s/it]

ServerError for CIK 0000005513 (Attempt 1/5): 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}


Processing CIK 491/491: 0001792789 | Time: 20:43:24: 100%|██████████| 491/491 [6:37:02<00:00, 48.52s/it]  


---


# Batched API requests

Since the number of individual requests that would be sent using the previously written function would exceed the daily request limit by far, batch mode will be used.

In [9]:
def create_llm_requests(cik_list, labeled_balance_sheets, labeled_income_statements, labeled_cash_flow_statements):
    requests_data = []
    for i, cik in enumerate(cik_list["CIK"]):
        # Progress message
        print(f"Processing CIK {cik} ({i+1}/{len(cik_list)})")
        
        # First filter dfs for input CIK
        balance_sheets = labeled_balance_sheets[labeled_balance_sheets["CIK"] == cik].copy()
        income_statements = labeled_income_statements[labeled_income_statements["CIK"] == cik].copy()
        cash_flow_statements = labeled_cash_flow_statements[labeled_cash_flow_statements["CIK"] == cik].copy()
        
        # Convert the date columns to datetime objects
        for df in [balance_sheets, income_statements, cash_flow_statements]:
            df["Report Date"] = pd.to_datetime(df["Date"])

        # Determine unique dates
        reporting_dates = pd.concat([
            balance_sheets["Report Date"],
            income_statements["Report Date"],
            cash_flow_statements["Report Date"]
        ]).unique()

        # Sort dates just to be safe
        reporting_dates = np.sort(reporting_dates)

        # In order to handle reports, that were filed slighty apart, a window of 10 days around a given reporting date is used
        window = pd.Timedelta(days=10)

        # Loop over reporting dates to obtain LLM ratings
        for date in reporting_dates:

            # Subset all financial statements for the given date +- window days
            bs = balance_sheets[(balance_sheets["Report Date"] >= date - window) & (balance_sheets["Report Date"] <= date + window)]
            is_ = income_statements[(income_statements["Report Date"] >= date - window) & (income_statements["Report Date"] <= date + window)]
            cf = cash_flow_statements[(cash_flow_statements["Report Date"] >= date - window) & (cash_flow_statements["Report Date"] <= date + window)]
        
            # If no reports are available for the given date, skip to next date
            if bs.empty or is_.empty or cf.empty:
                continue
            
            # Concatenate reports into a string with correct labels
            bs_str = "\n".join(bs.apply(lambda row: f"{row['position_label']}: {row['STD Balance Sheet All']}", axis=1).astype(str))
            is_str = "\n".join(is_.apply(lambda row: f"{row['position_label']}: {row['STD Income Statement All']}", axis=1).astype(str))
            cf_str = "\n".join(cf.apply(lambda row: f"{row['position_label']}: {row['STD Cash Flow All']}", axis=1).astype(str))

            # Construct the request for batch processing
            requests_data.append({
                "key": f"request_{i}_{cik}_{pd.Timestamp(date).strftime('%d_%m_%Y')}",
                "request": {
                    "contents": [{
                        "parts": [{
                            "text": f"""
                            Based on the following financial reports only, please provide an investment recommendation for the underlying company.

                            Balance Sheet: 
                            {bs_str}

                            Income Statement: 
                            {is_str}

                            Cash Flow Statement: 
                            {cf_str}

                            Provide your answer using only one of the following signals: 'strong buy', 'buy', 'hold', 'sell', or 'strong sell'.
                            """.strip()  # strip removes leading and trailing whitespace
                        }]
                    }],
                    "generation_config": {"temperature": 0,
                                          "thinking_config":{"thinking_budget": 0}
                    },
                    "system_instruction": {
                        "parts": [{"text": "You are an experienced, data-driven financial analyst, that provides concise and accurate answers."}]
                    }
                }
            })

                     

                    

                      # Disables thinking, but only required for Gemini 2.5
            
    return requests_data


In [10]:
cik_sublist = ciks1[:100]
ciks1_requests = create_llm_requests(cik_sublist, labeled_balance_sheets, labeled_income_statements, labeled_cash_flow_statements)

Processing CIK 0001996862 (1/100)
Processing CIK 0001049502 (2/100)
Processing CIK 0001071739 (3/100)
Processing CIK 0001164863 (4/100)
Processing CIK 0000717605 (5/100)
Processing CIK 0000008947 (6/100)
Processing CIK 0001341439 (7/100)
Processing CIK 0001955520 (8/100)
Processing CIK 0000775158 (9/100)
Processing CIK 0001166003 (10/100)
Processing CIK 0001360901 (11/100)
Processing CIK 0000822416 (12/100)
Processing CIK 0000066740 (13/100)
Processing CIK 0000089089 (14/100)
Processing CIK 0000889331 (15/100)
Processing CIK 0000876427 (16/100)
Processing CIK 0000104894 (17/100)
Processing CIK 0000788784 (18/100)
Processing CIK 0000038777 (19/100)
Processing CIK 0000805676 (20/100)
Processing CIK 0000089439 (21/100)
Processing CIK 0001025378 (22/100)
Processing CIK 0001532961 (23/100)
Processing CIK 0001845815 (24/100)
Processing CIK 0000913142 (25/100)
Processing CIK 0001569650 (26/100)
Processing CIK 0000882184 (27/100)
Processing CIK 0000106640 (28/100)
Processing CIK 0001095565 (29

In [81]:
# Create a sample JSONL file
with open("my-batch-requests.jsonl", "w") as f:
    for req in ciks1_requests:
        f.write(json.dumps(req) + "\n")


In [12]:
# Read in file to check if it was created correctly
with open("../data/my-batch-requests.jsonl", "r") as f:
    requests = [json.loads(line) for line in f]

In [83]:
uploaded_file = client.files.upload(
    file='my-batch-requests.jsonl',
    config=types.UploadFileConfig(display_name='my-batch-requests', mime_type='jsonl')
)

In [84]:
print(f"Uploaded file: {uploaded_file.name}")

Uploaded file: files/14jmhiosat1d


In [14]:
# Assumes `uploaded_file` is the file object from the previous step
file_batch_job = client.batches.create(
    model="gemini-2.5-flash",
    src=uploaded_file.name,
    config={
        'display_name': "file-upload-job-1",
    },
)

print(f"Created batch job: {file_batch_job.name}")


NameError: name 'uploaded_file' is not defined

In [66]:
# Use the name of the job you want to check
# e.g., inline_batch_job.name from the previous step
job_name = file_batch_job.name  # (e.g. 'batches/your-batch-id')
batch_job = client.batches.get(name=job_name)

completed_states = set([
    'JOB_STATE_SUCCEEDED',
    'JOB_STATE_FAILED',
    'JOB_STATE_CANCELLED',
])

print(f"Polling status for job: {job_name}")
batch_job = client.batches.get(name=job_name) # Initial get
while batch_job.state.name not in completed_states:
  print(f"Current state: {batch_job.state.name}")
  time.sleep(15) # Wait for 30 seconds before polling again
  batch_job = client.batches.get(name=job_name)

print(f"Job finished with state: {batch_job.state.name}")
if batch_job.state.name == 'JOB_STATE_FAILED':
    print(f"Error: {batch_job.error}")


Polling status for job: batches/omc1mqgq4vbwmhbe01uzo2oxn09ai4rlc5ok
Job finished with state: JOB_STATE_SUCCEEDED


In [67]:
import json

# Use the name of the job you want to check
# e.g., inline_batch_job.name from the previous step
job_name = file_batch_job.name
batch_job = client.batches.get(name=job_name)

if batch_job.state.name == 'JOB_STATE_SUCCEEDED':

    # If batch job was created with a file
    if batch_job.dest and batch_job.dest.file_name:
        # Results are in a file
        result_file_name = batch_job.dest.file_name
        print(f"Results are in file: {result_file_name}")

        print("Downloading result file content...")
        file_content = client.files.download(file=result_file_name)
        # Process file_content (bytes) as needed
        print(file_content.decode('utf-8'))

    # If batch job was created with inline request
    elif batch_job.dest and batch_job.dest.inlined_responses:
        # Results are inline
        print("Results are inline:")
        for i, inline_response in enumerate(batch_job.dest.inlined_responses):
            print(f"Response {i+1}:")
            if inline_response.response:
                # Accessing response, structure may vary.
                try:
                    print(inline_response.response.text)
                except AttributeError:
                    print(inline_response.response) # Fallback
            elif inline_response.error:
                print(f"Error: {inline_response.error}")
    else:
        print("No results found (neither file nor inline).")
else:
    print(f"Job did not succeed. Final state: {batch_job.state.name}")
    if batch_job.error:
        print(f"Error: {batch_job.error}")


Results are in file: files/batch-omc1mqgq4vbwmhbe01uzo2oxn09ai4rlc5ok
Downloading result file content...
{"response":{"candidates":[{"content":{"parts":[{"text":"Hold"}],"role":"model"},"index":0,"finishReason":"STOP"}],"usageMetadata":{"candidatesTokenCount":1,"totalTokenCount":3960,"promptTokensDetails":[{"modality":"TEXT","tokenCount":3959}],"promptTokenCount":3959},"responseId":"6eN4aIvLOvPB1MkPq4XgyQE","modelVersion":"gemini-2.5-flash"},"key":"request_0_0001295401_30_06_2004"}
{"response":{"candidates":[{"finishReason":"STOP","content":{"parts":[{"text":"Hold"}],"role":"model"},"index":0}],"responseId":"6eN4aOadJO-n1MkPs6OcsAc","usageMetadata":{"promptTokensDetails":[{"modality":"TEXT","tokenCount":4096}],"candidatesTokenCount":1,"totalTokenCount":4097,"promptTokenCount":4096},"modelVersion":"gemini-2.5-flash"},"key":"request_0_0001295401_30_09_2004"}



check differences between temperature and thinking budget, but parameters seem to be working

----


Testing