In [2]:
import os
import csv
import numpy as np
import openai
import pandas as pd
from datetime import datetime
from pathlib import Path
from io import StringIO
import re

openai.api_key = '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'

In [3]:
"""
    Process and load csv's
"""
def process_one_csv(path: Path, n_keep: int = 10) -> pd.DataFrame:
    # Load csv keep column ids as strings
    df = pd.read_csv(
        path,
        dtype={"Topic_Clusters": str, "Topic_Sub_Clusters": str},
    )

    # All columns to the right of the first two are probabilities
    value_cols = df.columns[3:]
    df[value_cols] = df[value_cols].apply(pd.to_numeric, errors="coerce")

    # Build new columns as lists
    topic_ids, sub_ids, cluster_ids, topics_ls = [], [], [], []

    for _, row in df.iterrows():
        topic_ids.append(_)
        numeric_row = row[value_cols].astype(float)

        top = numeric_row.nlargest(n_keep)
        topics_ls.append([[term, round(float(weight), 6)] for term, weight in top.items()])

        # Normalize Subcluster ids
        raw = str(row["Topic_Sub_Clusters"])
        try:
            sub_ids.append(int(raw.split("_")[1]))
        except (IndexError, ValueError):
            sub_ids.append(-1)
        cluster_ids.append(row["Topic_Clusters"])

    # Form final iter dataframe
    new_cols = pd.DataFrame(
        {
            "file_name": os.path.basename(path)
            , "topic_id": topic_ids
            , "subcluster_id": sub_ids
            , "cluster_id": cluster_ids
            , "topics": topics_ls
        },
        index=df.index,
    )

    result = pd.concat([new_cols, df[value_cols]], axis=1)
    return result

INPUT_DIR = Path("input")
OUTPUT_DIR = Path("output")
OUTPUT_DIR.mkdir(exist_ok=True)

# Mainloop for extracting datasets from the input folder
result_matrix = []

for csv_file in INPUT_DIR.glob("*.csv"):
    inner_df = process_one_csv(csv_file, n_keep=5)
    result_matrix.append([csv_file.name, inner_df])

# Create a dataframe of file names, nested with their content
results_df = pd.DataFrame(result_matrix, columns=["file_name", "inner_df"])

# Preview header daatframe
print("\n===================Summary===================")
display(results_df[["file_name"]]) # list filenames

# Access first iteration of dataframe
first_inner = results_df.loc[0, "inner_df"]
print("\nFirst inner frame:")
display(first_inner.head())




Unnamed: 0,file_name
0,LDA_100_topic_word_norm_with_4_with_subcluster...



First inner frame:


Unnamed: 0,file_name,topic_id,subcluster_id,cluster_id,topics,absolutely,acceleration,accounting,accretive,acquire,...,year continue,year expect,year go,year look,year quarter,year see,year would,year year,yield,york
0,LDA_100_topic_word_norm_with_4_with_subcluster...,0,2,0,"[[organic, 0.230976], [organic growth, 0.09574...",0.001398,0.001732,0.001052,0.00195,0.007201,...,0.000886,0.00065,0.0006,0.000764,0.000271,0.001249,0.00083,0.000386,7e-06,2.185418e-07
1,LDA_100_topic_word_norm_with_4_with_subcluster...,1,0,0,"[[aircraft, 0.061866], [united, 0.021051], [fl...",0.002064,0.000154,0.00053,0.000455,0.000113,...,0.000904,0.001611,0.00126,0.000902,3.8e-05,0.000725,0.000806,0.001476,0.00291,0.0002362675
2,LDA_100_topic_word_norm_with_4_with_subcluster...,2,0,0,"[[plant, 0.064305], [packaging, 0.036433], [to...",0.000478,4e-05,0.000118,0.00019,0.000659,...,0.000515,0.001234,0.000827,0.000812,0.00019,0.001275,0.000445,0.000632,0.000452,5.571611e-08
3,LDA_100_topic_word_norm_with_4_with_subcluster...,3,1,0,"[[gross margin, 0.114848], [shipment, 0.040261...",0.000829,0.000617,0.00019,0.000135,0.000577,...,0.000257,0.001337,0.000435,0.000246,0.00038,0.000895,0.000483,0.001292,1.7e-05,6.201966e-08
4,LDA_100_topic_word_norm_with_4_with_subcluster...,4,0,0,"[[gas, 0.120051], [energy, 0.070837], [natural...",0.00052,2.4e-05,0.000167,0.000595,0.001406,...,0.000352,0.000527,0.000313,0.000459,0.000327,0.001072,0.000574,0.00051,0.000819,3.363482e-05


In [6]:
"""
    Stores ChatGPT relabeling function. Takes a prompt and returns the result
"""
def get_chatgpt_response(prompt, model="gpt-4.1"):
    try:
        completion = openai.chat.completions.create(model=model,messages=[{"role": "user", "content": prompt}])
        return completion.choices[0].message.content
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [7]:
"""
    Send initial prompt call for taking [(topic, weights), ...] to derive base level meanings
"""
base_prompt = (
    "In the context of an earnings call, the list of topics are relevant, with their relevancy nested in their decimal number weight"
    "Provide me back a 1-3 word description that summarizes them and use financial, economic, accounting language"
    "DO NOT RETURN ANYTHING OTHER THAN THE 1-3 WORD RESPONSE, THE SAKE OF THE WORLD DEPENDS ON IT"
    "\n\n"
)

for file_rec in results_df.itertuples(index=False):
    file_name = file_rec.file_name
    inner_df = file_rec.inner_df

    print(f"\n===================Processing rows from {file_name}===================")

    # Iterate through each loop of the inner dataframe
    for idx, row in inner_df.iterrows():
        subcluster_id = row["subcluster_id"]
        cluster_id = row["cluster_id"]
        topics = row["topics"]

        # Preview results as they are built
        print(f"row {idx:>4}: sub={subcluster_id}, clust={cluster_id}, "f"top term={topics[0][0]} (wt={topics[0][1]})", topics)
        
        prompt = base_prompt + f"Chunk of words:\n{topics}\n"
        caller = get_chatgpt_response(prompt, model="gpt-4.1")
        inner_df.at[idx, "topic_name"] = caller.strip()

print("\n===================Summary===================")
display(results_df[["file_name"]])

first_inner = results_df.loc[0, "inner_df"]
print("\nPreview frame results:")
display(first_inner.head())


row    0: sub=2, clust=0, top term=organic (wt=0.230976) [['organic', 0.230976], ['organic growth', 0.095748], ['organic revenue', 0.038918], ['services', 0.012626], ['organically', 0.012216]]
row    1: sub=0, clust=0, top term=aircraft (wt=0.061866) [['aircraft', 0.061866], ['united', 0.021051], ['fleet', 0.020797], ['delivery', 0.020031], ['international', 0.015799]]
row    2: sub=0, clust=0, top term=plant (wt=0.064305) [['plant', 0.064305], ['packaging', 0.036433], ['ton', 0.03458], ['export', 0.027069], ['brazil', 0.023463]]
row    3: sub=1, clust=0, top term=gross margin (wt=0.114848) [['gross margin', 0.114848], ['shipment', 0.040261], ['new product', 0.024856], ['ship', 0.023289], ['ramp', 0.022372]]
row    4: sub=0, clust=0, top term=gas (wt=0.120051) [['gas', 0.120051], ['energy', 0.070837], ['natural', 0.067073], ['natural gas', 0.047968], ['plant', 0.019523]]
row    5: sub=0, clust=0, top term=fuel (wt=0.273163) [['fuel', 0.273163], ['stuff', 0.018555], ['card', 0.013833],

Unnamed: 0,file_name
0,LDA_100_topic_word_norm_with_4_with_subcluster...



Preview frame results:


Unnamed: 0,file_name,topic_id,subcluster_id,cluster_id,topics,absolutely,acceleration,accounting,accretive,acquire,...,year expect,year go,year look,year quarter,year see,year would,year year,yield,york,topic_name
0,LDA_100_topic_word_norm_with_4_with_subcluster...,0,2,0,"[[organic, 0.230976], [organic growth, 0.09574...",0.001398,0.001732,0.001052,0.00195,0.007201,...,0.00065,0.0006,0.000764,0.000271,0.001249,0.00083,0.000386,7e-06,2.185418e-07,Organic Growth
1,LDA_100_topic_word_norm_with_4_with_subcluster...,1,0,0,"[[aircraft, 0.061866], [united, 0.021051], [fl...",0.002064,0.000154,0.00053,0.000455,0.000113,...,0.001611,0.00126,0.000902,3.8e-05,0.000725,0.000806,0.001476,0.00291,0.0002362675,Fleet Expansion
2,LDA_100_topic_word_norm_with_4_with_subcluster...,2,0,0,"[[plant, 0.064305], [packaging, 0.036433], [to...",0.000478,4e-05,0.000118,0.00019,0.000659,...,0.001234,0.000827,0.000812,0.00019,0.001275,0.000445,0.000632,0.000452,5.571611e-08,Production Volume
3,LDA_100_topic_word_norm_with_4_with_subcluster...,3,1,0,"[[gross margin, 0.114848], [shipment, 0.040261...",0.000829,0.000617,0.00019,0.000135,0.000577,...,0.001337,0.000435,0.000246,0.00038,0.000895,0.000483,0.001292,1.7e-05,6.201966e-08,Gross Margin
4,LDA_100_topic_word_norm_with_4_with_subcluster...,4,0,0,"[[gas, 0.120051], [energy, 0.070837], [natural...",0.00052,2.4e-05,0.000167,0.000595,0.001406,...,0.000527,0.000313,0.000459,0.000327,0.001072,0.000574,0.00051,0.000819,3.363482e-05,Natural Gas Operations


In [8]:
"""
    Run subcluster and clusting action through GPT. More precise prompt due to data requirements of maintaining keys
"""

for rec in results_df.itertuples(index=False):
    file_name  = rec.file_name
    inner_df = rec.inner_df.copy()

    # Idea for chatgpt to keep key strings separate from learning strings
    inner_df["key1"] = "$" + inner_df["topic_id"].astype(str) + "$"
    inner_df["key2"] = "$" + inner_df["topic_name"] + "$"

    # Select sending columns
    send_it = inner_df[[
        "file_name","key1", "key2", "topic_id", "subcluster_id","cluster_id", "topic_name"
    ]]

    def dataframe_to_markdown(df: pd.DataFrame, include_index: bool = False) -> str:
        # Convert dataframe to markdown for improving GPT's structure understanding
        if include_index:
            df = df.reset_index()
        columns = df.columns.tolist()
        headings = "| " + " | ".join(columns) + " |"
        deliminator = "| " + " | ".join("---" for _ in columns) + " |"
        body_lines = []
        for row in df.itertuples(index=False, name=None):
            cells = [str(cell) for cell in row]
            body_lines.append("| " + " | ".join(cells) + " |")
        return "\n".join([headings, deliminator] + body_lines)
    
    readable = dataframe_to_markdown(send_it)

    base_prompt = """
        CONTEXT:
        Every quarter, S&P 500 companies hold earnings calls to discuss financial results.
        We’ve extracted topic vectors (5 weighted terms each) clustered into 
        topic_id → subcluster_id → cluster_id.
        topic_name → subcluster_name → cluster_name.

        YOUR TASK:
        1. You will receive a single row as a CSV with these columns:
        • file_name
        • key1
        • key2
        • topic_id
        • topic_name
        • subcluster_id
        • topics (5 term:weight pairs)

        2. ADD TWO NEW COLUMNS:
        • subcluster_name
        • cluster_name

        3. SUBCLUSTER_NAME should be a summary of the topic_name that can be grouped underneath it

        4. CLUSTER_NAME MUST BE CHOSEN FROM:
            Economic & Financial
            Technological & Digital
            Environmental & Resource
            Social & Consumer
            Corporate Strategy & Execution
            – Try to assign at least two different clusters per file.
            – each SUBCLUSTER_NAME must be grouped into one CLUSTER_NAME
            
        TRY TO HAVE NO MORE THAN 15 UNIQUE SUBCLUSTER_NAME in the output

        4. YOU MUST NOT MODIFY:
        file_name, key1, key2, topic_id, topic_name
        – They must exactly match the input row, in the same order.

        5. OUTPUT:
        Return **only** a CSV (no extra text) with **all** 100 rows,
        and these columns in this order:
        file_name, key1, key2, topic_id, topic_name, 
        subcluster_id, subcluster_name, cluster_name, cluster_id

        !!! BEFORE RETURNING:
        • VERIFY that file_name, key1 and key2 are unchanged for every row.
        • DO NOT include any leading/trailing comments—CSV ONLY.
    """

    prompt = base_prompt + f"\n\nDataframe to analyze:\n{readable}\n"
    print("================PROMPT================")
    display(prompt)
    caller = get_chatgpt_response(prompt, model="gpt-4.1")
    print("================RESPONSE================")
    display(caller)
    csv_text = caller.strip()
    
    # Execute post-processing
    df_response = pd.read_csv(StringIO(csv_text))
    inner_df = df_response
    print(f"Processed {file_name}, got columns: {inner_df.columns.tolist()}")



'\n        CONTEXT:\n        Every quarter, S&P\xa0500 companies hold earnings calls to discuss financial results.\n        We’ve extracted topic vectors (5 weighted terms each) clustered into \n        topic_id → subcluster_id → cluster_id.\n        topic_name → subcluster_name → cluster_name.\n\n        YOUR TASK:\n        1. You will receive a single row as a CSV with these columns:\n        • file_name\n        • key1\n        • key2\n        • topic_id\n        • topic_name\n        • subcluster_id\n        • topics (5 term:weight pairs)\n\n        2. ADD TWO NEW COLUMNS:\n        • subcluster_name\n        • cluster_name\n\n        3. SUBCLUSTER_NAME should be a summary of the topic_name that can be grouped underneath it\n\n        4. CLUSTER_NAME MUST BE CHOSEN FROM:\n            Economic & Financial\n            Technological & Digital\n            Environmental & Resource\n            Social & Consumer\n            Corporate Strategy & Execution\n            –\xa0Try to assign



'file_name,key1,key2,topic_id,topic_name,subcluster_id,subcluster_name,cluster_name,cluster_id\nLDA_100_topic_word_norm_with_4_with_subclusters.csv,$0$,$Organic Growth$,0,Organic Growth,2,Growth,Corporate Strategy & Execution,0\nLDA_100_topic_word_norm_with_4_with_subclusters.csv,$1$,$Fleet Expansion$,1,Fleet Expansion,0,Asset & Operations,Corporate Strategy & Execution,0\nLDA_100_topic_word_norm_with_4_with_subclusters.csv,$2$,$Production Volume$,2,Production Volume,0,Asset & Operations,Corporate Strategy & Execution,0\nLDA_100_topic_word_norm_with_4_with_subclusters.csv,$3$,$Gross Margin$,3,Gross Margin,1,Profitability Metrics,Economic & Financial,0\nLDA_100_topic_word_norm_with_4_with_subclusters.csv,$4$,$Natural Gas Operations$,4,Natural Gas Operations,0,Energy & Utilities,Environmental & Resource,0\nLDA_100_topic_word_norm_with_4_with_subclusters.csv,$5$,$Fuel Costs$,5,Fuel Costs,0,Energy & Utilities,Environmental & Resource,0\nLDA_100_topic_word_norm_with_4_with_subclusters.csv,$

Processed LDA_100_topic_word_norm_with_4_with_subclusters.csv, got columns: ['file_name', 'key1', 'key2', 'topic_id', 'topic_name', 'subcluster_id', 'subcluster_name', 'cluster_name', 'cluster_id']


In [9]:
"""
  Read and clean export data
"""
csv_text = re.sub(r"^```csv\s*", "", caller) # drop the opening fence
csv_text = re.sub(r"\s*```$", "",  csv_text) # drop the closing fence
csv_text = csv_text.strip() # strip any leading/trailing whitespace
df_response = pd.read_csv(StringIO(csv_text)) # convert to a pandas DataFrame

df_response = df_response.drop(columns=["cluster_id", "subcluster_id"]) # Drop old columns

# Redindex cluster_id alphabetically
cluster_names = sorted(df_response["cluster_name"].unique())
cluster_id_map = {name: i for i, name in enumerate(cluster_names)}
df_response["cluster_id"] = df_response["cluster_name"].map(cluster_id_map)

# Reindex subcluster_id within cluster_id alphabetically
df_response["subcluster_id"] = (
    df_response
        .groupby("cluster_name")["subcluster_name"]
        .transform(lambda s: pd.Categorical(s, categories=sorted(s.unique())).codes)
)

# Preview cleaning
print("===========RESPONSE_DF===========")
display(df_response)

print("===========ORIGINAL_DF===========")
df_original = rec.inner_df.copy()
display(df_original)



Unnamed: 0,file_name,key1,key2,topic_id,topic_name,subcluster_name,cluster_name,cluster_id,subcluster_id
0,LDA_100_topic_word_norm_with_4_with_subcluster...,$0$,$Organic Growth$,0,Organic Growth,Growth,Corporate Strategy & Execution,0,2
1,LDA_100_topic_word_norm_with_4_with_subcluster...,$1$,$Fleet Expansion$,1,Fleet Expansion,Asset & Operations,Corporate Strategy & Execution,0,0
2,LDA_100_topic_word_norm_with_4_with_subcluster...,$2$,$Production Volume$,2,Production Volume,Asset & Operations,Corporate Strategy & Execution,0,0
3,LDA_100_topic_word_norm_with_4_with_subcluster...,$3$,$Gross Margin$,3,Gross Margin,Profitability Metrics,Economic & Financial,1,5
4,LDA_100_topic_word_norm_with_4_with_subcluster...,$4$,$Natural Gas Operations$,4,Natural Gas Operations,Energy & Utilities,Environmental & Resource,2,0
...,...,...,...,...,...,...,...,...,...
95,LDA_100_topic_word_norm_with_4_with_subcluster...,$95$,$Retail Channel Mix$,95,Retail Channel Mix,Brand & Channel,Social & Consumer,3,0
96,LDA_100_topic_word_norm_with_4_with_subcluster...,$96$,$Brand Performance$,96,Brand Performance,Brand & Channel,Social & Consumer,3,0
97,LDA_100_topic_word_norm_with_4_with_subcluster...,$97$,$Revenue Performance$,97,Revenue Performance,Brand & Channel,Social & Consumer,3,0
98,LDA_100_topic_word_norm_with_4_with_subcluster...,$98$,$Fiscal Results$,98,Fiscal Results,Financial Results,Economic & Financial,1,1




Unnamed: 0,file_name,topic_id,subcluster_id,cluster_id,topics,absolutely,acceleration,accounting,accretive,acquire,...,year expect,year go,year look,year quarter,year see,year would,year year,yield,york,topic_name
0,LDA_100_topic_word_norm_with_4_with_subcluster...,0,2,0,"[[organic, 0.230976], [organic growth, 0.09574...",0.001398,0.001732,0.001052,0.001950,0.007201,...,0.000650,0.000600,0.000764,0.000271,0.001249,0.000830,0.000386,6.670445e-06,2.185418e-07,Organic Growth
1,LDA_100_topic_word_norm_with_4_with_subcluster...,1,0,0,"[[aircraft, 0.061866], [united, 0.021051], [fl...",0.002064,0.000154,0.000530,0.000455,0.000113,...,0.001611,0.001260,0.000902,0.000038,0.000725,0.000806,0.001476,2.910178e-03,2.362675e-04,Fleet Expansion
2,LDA_100_topic_word_norm_with_4_with_subcluster...,2,0,0,"[[plant, 0.064305], [packaging, 0.036433], [to...",0.000478,0.000040,0.000118,0.000190,0.000659,...,0.001234,0.000827,0.000812,0.000190,0.001275,0.000445,0.000632,4.517502e-04,5.571611e-08,Production Volume
3,LDA_100_topic_word_norm_with_4_with_subcluster...,3,1,0,"[[gross margin, 0.114848], [shipment, 0.040261...",0.000829,0.000617,0.000190,0.000135,0.000577,...,0.001337,0.000435,0.000246,0.000380,0.000895,0.000483,0.001292,1.671633e-05,6.201966e-08,Gross Margin
4,LDA_100_topic_word_norm_with_4_with_subcluster...,4,0,0,"[[gas, 0.120051], [energy, 0.070837], [natural...",0.000520,0.000024,0.000167,0.000595,0.001406,...,0.000527,0.000313,0.000459,0.000327,0.001072,0.000574,0.000510,8.193279e-04,3.363482e-05,Natural Gas Operations
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,LDA_100_topic_word_norm_with_4_with_subcluster...,95,1,3,"[[brand, 0.088151], [retail, 0.072138], [chann...",0.001050,0.001707,0.000155,0.000574,0.000308,...,0.001324,0.000247,0.000535,0.000217,0.000774,0.000089,0.000200,5.192086e-08,6.963468e-04,Retail Channel Mix
96,LDA_100_topic_word_norm_with_4_with_subcluster...,96,1,3,"[[brand, 0.216955], [innovation, 0.027693], [m...",0.000740,0.001690,0.001194,0.000557,0.000911,...,0.000946,0.000244,0.000605,0.000056,0.000478,0.000598,0.000468,1.127390e-06,1.396094e-06,Brand Performance
97,LDA_100_topic_word_norm_with_4_with_subcluster...,97,2,3,"[[net sale, 0.100041], [market share, 0.038637...",0.000360,0.001930,0.000739,0.000223,0.000803,...,0.000797,0.000312,0.000180,0.000273,0.000317,0.000459,0.000121,7.275657e-08,7.504904e-08,Revenue Performance
98,LDA_100_topic_word_norm_with_4_with_subcluster...,98,0,3,"[[fiscal, 0.073283], [profit, 0.053042], [gros...",0.000737,0.001517,0.000027,0.000991,0.000695,...,0.001601,0.000588,0.000407,0.001968,0.000400,0.001446,0.000189,4.805660e-05,8.932758e-08,Fiscal Results


In [10]:
"""
    Reorder and export output
"""

BASE_OUT = Path("output")
ts_folder = BASE_OUT / datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
ts_folder.mkdir(parents=True, exist_ok=True)
print("Saving files to →", ts_folder)

INPUT_DIR = Path("input")
ORDER_DIR = Path("order")

inner_df = rec.inner_df
file_name = rec.file_name

prep = df_original.rename(columns={
        'topic_name': 'topic_name_orig'
        , 'subcluster_id': 'subcluster_id_orig'
        , 'cluster_id': 'cluster_id_orig'}
    ).copy()

keys = ["topic_id"]
vals = [ "topic_name", "subcluster_id", "subcluster_name", "cluster_id", "cluster_name"]
df_merged = prep.merge(
    df_response[keys + vals],
    on=keys,
    how="left",
)

display(df_merged)

# Drop and reorder columns
export = df_merged.drop(columns=["file_name"], errors="ignore")
front_cols = [
    "topic_id",
    "topic_name",
    "topic_name_orig",
    "topics",
    "subcluster_id",
    "subcluster_id_orig",
    "subcluster_name",
    "cluster_id",
    "cluster_id_orig",
    "cluster_name",
]
front_cols = [c for c in front_cols if c in export.columns]
export = export[front_cols + [c for c in export.columns if c not in front_cols]]

display(export)

# Apply ordering from ordering file to regain initial topic importance
stem_prefix = Path(file_name).stem[:7]
order_file = next(f for f in ORDER_DIR.glob("*.csv") if f.stem.startswith(stem_prefix))

# Merge in order from Absolutely column's weights
order_df = pd.read_csv(order_file, usecols=["absolutely"])
order_df["__row_rank"] = range(len(order_df))
export = (
    export
        .merge(order_df, on="absolutely", how="left")
        .sort_values("__row_rank", kind="mergesort")
        .drop(columns="__row_rank")
        .reset_index(drop=True)
)

display(export)

# Reset topic_id to match new order
export["topic_id"] = export.index
columns = ["topic_id"] + [c for c in export.columns if c != "topic_id"]
export = export[columns]

display(export)

# Save to outputs/timestamp
out_path = ts_folder / Path(file_name).name
export.to_csv(out_path, index=False)
print(f"✓ saved {out_path}")

Saving files to → output/2025-04-20_17-51-22


Unnamed: 0,file_name,topic_id,subcluster_id_orig,cluster_id_orig,topics,absolutely,acceleration,accounting,accretive,acquire,...,year would,year year,yield,york,topic_name_orig,topic_name,subcluster_id,subcluster_name,cluster_id,cluster_name
0,LDA_100_topic_word_norm_with_4_with_subcluster...,0,2,0,"[[organic, 0.230976], [organic growth, 0.09574...",0.001398,0.001732,0.001052,0.001950,0.007201,...,0.000830,0.000386,6.670445e-06,2.185418e-07,Organic Growth,Organic Growth,2,Growth,0,Corporate Strategy & Execution
1,LDA_100_topic_word_norm_with_4_with_subcluster...,1,0,0,"[[aircraft, 0.061866], [united, 0.021051], [fl...",0.002064,0.000154,0.000530,0.000455,0.000113,...,0.000806,0.001476,2.910178e-03,2.362675e-04,Fleet Expansion,Fleet Expansion,0,Asset & Operations,0,Corporate Strategy & Execution
2,LDA_100_topic_word_norm_with_4_with_subcluster...,2,0,0,"[[plant, 0.064305], [packaging, 0.036433], [to...",0.000478,0.000040,0.000118,0.000190,0.000659,...,0.000445,0.000632,4.517502e-04,5.571611e-08,Production Volume,Production Volume,0,Asset & Operations,0,Corporate Strategy & Execution
3,LDA_100_topic_word_norm_with_4_with_subcluster...,3,1,0,"[[gross margin, 0.114848], [shipment, 0.040261...",0.000829,0.000617,0.000190,0.000135,0.000577,...,0.000483,0.001292,1.671633e-05,6.201966e-08,Gross Margin,Gross Margin,5,Profitability Metrics,1,Economic & Financial
4,LDA_100_topic_word_norm_with_4_with_subcluster...,4,0,0,"[[gas, 0.120051], [energy, 0.070837], [natural...",0.000520,0.000024,0.000167,0.000595,0.001406,...,0.000574,0.000510,8.193279e-04,3.363482e-05,Natural Gas Operations,Natural Gas Operations,0,Energy & Utilities,2,Environmental & Resource
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,LDA_100_topic_word_norm_with_4_with_subcluster...,95,1,3,"[[brand, 0.088151], [retail, 0.072138], [chann...",0.001050,0.001707,0.000155,0.000574,0.000308,...,0.000089,0.000200,5.192086e-08,6.963468e-04,Retail Channel Mix,Retail Channel Mix,0,Brand & Channel,3,Social & Consumer
96,LDA_100_topic_word_norm_with_4_with_subcluster...,96,1,3,"[[brand, 0.216955], [innovation, 0.027693], [m...",0.000740,0.001690,0.001194,0.000557,0.000911,...,0.000598,0.000468,1.127390e-06,1.396094e-06,Brand Performance,Brand Performance,0,Brand & Channel,3,Social & Consumer
97,LDA_100_topic_word_norm_with_4_with_subcluster...,97,2,3,"[[net sale, 0.100041], [market share, 0.038637...",0.000360,0.001930,0.000739,0.000223,0.000803,...,0.000459,0.000121,7.275657e-08,7.504904e-08,Revenue Performance,Revenue Performance,0,Brand & Channel,3,Social & Consumer
98,LDA_100_topic_word_norm_with_4_with_subcluster...,98,0,3,"[[fiscal, 0.073283], [profit, 0.053042], [gros...",0.000737,0.001517,0.000027,0.000991,0.000695,...,0.001446,0.000189,4.805660e-05,8.932758e-08,Fiscal Results,Fiscal Results,1,Financial Results,1,Economic & Financial


Unnamed: 0,topic_id,topic_name,topic_name_orig,topics,subcluster_id,subcluster_id_orig,subcluster_name,cluster_id,cluster_id_orig,cluster_name,...,year continue,year expect,year go,year look,year quarter,year see,year would,year year,yield,york
0,0,Organic Growth,Organic Growth,"[[organic, 0.230976], [organic growth, 0.09574...",2,2,Growth,0,0,Corporate Strategy & Execution,...,0.000886,0.000650,0.000600,0.000764,0.000271,0.001249,0.000830,0.000386,6.670445e-06,2.185418e-07
1,1,Fleet Expansion,Fleet Expansion,"[[aircraft, 0.061866], [united, 0.021051], [fl...",0,0,Asset & Operations,0,0,Corporate Strategy & Execution,...,0.000904,0.001611,0.001260,0.000902,0.000038,0.000725,0.000806,0.001476,2.910178e-03,2.362675e-04
2,2,Production Volume,Production Volume,"[[plant, 0.064305], [packaging, 0.036433], [to...",0,0,Asset & Operations,0,0,Corporate Strategy & Execution,...,0.000515,0.001234,0.000827,0.000812,0.000190,0.001275,0.000445,0.000632,4.517502e-04,5.571611e-08
3,3,Gross Margin,Gross Margin,"[[gross margin, 0.114848], [shipment, 0.040261...",5,1,Profitability Metrics,1,0,Economic & Financial,...,0.000257,0.001337,0.000435,0.000246,0.000380,0.000895,0.000483,0.001292,1.671633e-05,6.201966e-08
4,4,Natural Gas Operations,Natural Gas Operations,"[[gas, 0.120051], [energy, 0.070837], [natural...",0,0,Energy & Utilities,2,0,Environmental & Resource,...,0.000352,0.000527,0.000313,0.000459,0.000327,0.001072,0.000574,0.000510,8.193279e-04,3.363482e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,Retail Channel Mix,Retail Channel Mix,"[[brand, 0.088151], [retail, 0.072138], [chann...",0,1,Brand & Channel,3,3,Social & Consumer,...,0.001418,0.001324,0.000247,0.000535,0.000217,0.000774,0.000089,0.000200,5.192086e-08,6.963468e-04
96,96,Brand Performance,Brand Performance,"[[brand, 0.216955], [innovation, 0.027693], [m...",0,1,Brand & Channel,3,3,Social & Consumer,...,0.000881,0.000946,0.000244,0.000605,0.000056,0.000478,0.000598,0.000468,1.127390e-06,1.396094e-06
97,97,Revenue Performance,Revenue Performance,"[[net sale, 0.100041], [market share, 0.038637...",0,2,Brand & Channel,3,3,Social & Consumer,...,0.000774,0.000797,0.000312,0.000180,0.000273,0.000317,0.000459,0.000121,7.275657e-08,7.504904e-08
98,98,Fiscal Results,Fiscal Results,"[[fiscal, 0.073283], [profit, 0.053042], [gros...",1,0,Financial Results,1,3,Economic & Financial,...,0.001527,0.001601,0.000588,0.000407,0.001968,0.000400,0.001446,0.000189,4.805660e-05,8.932758e-08


Unnamed: 0,topic_id,topic_name,topic_name_orig,topics,subcluster_id,subcluster_id_orig,subcluster_name,cluster_id,cluster_id_orig,cluster_name,...,year continue,year expect,year go,year look,year quarter,year see,year would,year year,yield,york
0,29,Store Performance,Store Performance,"[[store, 0.200999], [comp, 0.054356], [online,...",3,0,Retail & Consumer,3,1,Social & Consumer,...,0.001149,0.001418,0.000550,0.000567,0.001182,0.000764,0.000660,0.001109,2.840055e-08,3.382235e-05
1,30,Financial Instruments,Financial Instruments,"[[tool, 0.16287], [new product, 0.022916], [cu...",2,0,Financing & Risk,1,1,Economic & Financial,...,0.000066,0.000079,0.000070,0.000290,0.001018,0.000137,0.000156,0.000209,2.292114e-03,1.288875e-07
2,0,Organic Growth,Organic Growth,"[[organic, 0.230976], [organic growth, 0.09574...",2,2,Growth,0,0,Corporate Strategy & Execution,...,0.000886,0.000650,0.000600,0.000764,0.000271,0.001249,0.000830,0.000386,6.670445e-06,2.185418e-07
3,31,Capital Markets,Capital Markets,"[[senior, 0.19277], [senior vice, 0.111769], [...",2,1,Financing & Risk,1,1,Economic & Financial,...,0.000211,0.000588,0.000418,0.000412,0.000011,0.000838,0.000556,0.002503,6.918441e-05,1.129268e-07
4,95,Retail Channel Mix,Retail Channel Mix,"[[brand, 0.088151], [retail, 0.072138], [chann...",0,1,Brand & Channel,3,3,Social & Consumer,...,0.001418,0.001324,0.000247,0.000535,0.000217,0.000774,0.000089,0.000200,5.192086e-08,6.963468e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,88,Express Shipping,Express Shipping,"[[express, 0.147459], [ground, 0.098676], [pac...",0,0,Asset & Operations,0,1,Corporate Strategy & Execution,...,0.000189,0.000194,0.000310,0.000414,0.000049,0.000533,0.000346,0.001974,1.221889e-02,3.591596e-06
96,89,Product Offerings,Product Offerings,"[[solutions, 0.142157], [mortgage, 0.04265], [...",3,0,Retail & Consumer,3,1,Social & Consumer,...,0.000951,0.002408,0.000458,0.000078,0.000287,0.000665,0.000289,0.000203,2.238873e-05,1.070367e-07
97,28,Revenue Outlook,Revenue Outlook,"[[come line, 0.086321], [organization, 0.00724...",6,0,Revenue Trends,1,0,Economic & Financial,...,0.001809,0.002114,0.001152,0.001478,0.000161,0.002602,0.002121,0.001228,1.060189e-04,2.889729e-08
98,94,Client Retention,Client Retention,"[[client, 0.22997], [retention, 0.018155], [ne...",0,0,Client & Fund Services,1,2,Economic & Financial,...,0.001031,0.001141,0.000474,0.000618,0.000462,0.000903,0.000779,0.000701,7.484136e-05,9.500245e-06


Unnamed: 0,topic_id,topic_name,topic_name_orig,topics,subcluster_id,subcluster_id_orig,subcluster_name,cluster_id,cluster_id_orig,cluster_name,...,year continue,year expect,year go,year look,year quarter,year see,year would,year year,yield,york
0,0,Store Performance,Store Performance,"[[store, 0.200999], [comp, 0.054356], [online,...",3,0,Retail & Consumer,3,1,Social & Consumer,...,0.001149,0.001418,0.000550,0.000567,0.001182,0.000764,0.000660,0.001109,2.840055e-08,3.382235e-05
1,1,Financial Instruments,Financial Instruments,"[[tool, 0.16287], [new product, 0.022916], [cu...",2,0,Financing & Risk,1,1,Economic & Financial,...,0.000066,0.000079,0.000070,0.000290,0.001018,0.000137,0.000156,0.000209,2.292114e-03,1.288875e-07
2,2,Organic Growth,Organic Growth,"[[organic, 0.230976], [organic growth, 0.09574...",2,2,Growth,0,0,Corporate Strategy & Execution,...,0.000886,0.000650,0.000600,0.000764,0.000271,0.001249,0.000830,0.000386,6.670445e-06,2.185418e-07
3,3,Capital Markets,Capital Markets,"[[senior, 0.19277], [senior vice, 0.111769], [...",2,1,Financing & Risk,1,1,Economic & Financial,...,0.000211,0.000588,0.000418,0.000412,0.000011,0.000838,0.000556,0.002503,6.918441e-05,1.129268e-07
4,4,Retail Channel Mix,Retail Channel Mix,"[[brand, 0.088151], [retail, 0.072138], [chann...",0,1,Brand & Channel,3,3,Social & Consumer,...,0.001418,0.001324,0.000247,0.000535,0.000217,0.000774,0.000089,0.000200,5.192086e-08,6.963468e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,Express Shipping,Express Shipping,"[[express, 0.147459], [ground, 0.098676], [pac...",0,0,Asset & Operations,0,1,Corporate Strategy & Execution,...,0.000189,0.000194,0.000310,0.000414,0.000049,0.000533,0.000346,0.001974,1.221889e-02,3.591596e-06
96,96,Product Offerings,Product Offerings,"[[solutions, 0.142157], [mortgage, 0.04265], [...",3,0,Retail & Consumer,3,1,Social & Consumer,...,0.000951,0.002408,0.000458,0.000078,0.000287,0.000665,0.000289,0.000203,2.238873e-05,1.070367e-07
97,97,Revenue Outlook,Revenue Outlook,"[[come line, 0.086321], [organization, 0.00724...",6,0,Revenue Trends,1,0,Economic & Financial,...,0.001809,0.002114,0.001152,0.001478,0.000161,0.002602,0.002121,0.001228,1.060189e-04,2.889729e-08
98,98,Client Retention,Client Retention,"[[client, 0.22997], [retention, 0.018155], [ne...",0,0,Client & Fund Services,1,2,Economic & Financial,...,0.001031,0.001141,0.000474,0.000618,0.000462,0.000903,0.000779,0.000701,7.484136e-05,9.500245e-06


✓ saved output/2025-04-20_17-51-22/LDA_100_topic_word_norm_with_4_with_subclusters.csv
