# Preliminaries

In [2]:
# Python version
import sys
print(sys.version)

3.8.20 (default, Oct  3 2024, 15:24:27) 
[GCC 11.2.0]


In [3]:
# import all modules
import pandas as pd
import numpy as np
import json
import time
import regex as re
import random
import spacy
import hdbscan
import pickle
import dill
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.decomposition import NMF
from tmtoolkit.topicmod.evaluate import metric_coherence_gensim
from nltk.corpus import stopwords
from tqdm import tqdm
from names_dataset import NameDataset
from bertopic import BERTopic
from umap import UMAP
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
from sklearn.metrics import silhouette_score
from IPython.core.magic import register_cell_magic

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# # uncomment if not already donwloaded
# nltk.download('stopwords')
# nltk.download('punkt_tab')

In [5]:
# Set random seed 
seed = 1

# Set global random seed  
random.seed(seed)
np.random.seed(seed)

# Note: local random seed set below in
# - sampling
# - LDA
# - UMAP

In [6]:
# Define function
@register_cell_magic
def skip(line, cell):
    '''
    Skip the cell
    '''
    return

# Data Preparation

## Load and transform data (JSON to pandas df)

Note: Currently, we use for each earnings call the full transcripts. Alternatively, we could use for each earnings call the transcript_splits (statements by individual speakers). The latter would require changes below under "flatten nested data structure"; the resulting df should be kept equivalent to the current version with the exception of having more rows (transcript_splits instead of full transcripts) to work with subsequent code. However, unlike in the current version, adjustments to accomodate differences in interpretations (topic per earnigns call transcript vs topic per transcript_split/ speaker) might be necessary.  

In [7]:
# load data from JSON files
path = './'
file = 'full_combined.json'

with open(path + file, 'r') as f:
    data = json.load(f)

In [8]:
# flatten nested data structure
records = []
for year in data.keys(): 
    for quarter in data[year].keys():
        for report in data[year][quarter]:
            records.append({
                'company name': report['company name'],
                'ticker': report['ticker'],
                'sector': report['sector'],
                'industry': report['industry'],
                'year': year,
                'quarter': quarter,
                'date': report['transcript']['date'],
                'text': report['transcript']['transcript']
            })

In [9]:
# create df
df = pd.DataFrame(records)

## Exploratory data analysis

In [10]:
# View first entries
df.head()

Unnamed: 0,company name,ticker,sector,industry,year,quarter,date,text
0,Agilent Technologies,A,Health Care,Life Sciences Tools & Services,2014,Q1,2014-02-13,Executives: Bill Sullivan - President and CEO ...
1,Apple Inc.,AAPL,Information Technology,"Technology Hardware, Storage & Peripherals",2014,Q1,2014-01-27,Executives: Tim Cook - CEO Peter Oppenheimer -...
2,AbbVie,ABBV,Health Care,Biotechnology,2014,Q1,2014-04-25,"Executives: Richard Gonzalez – Chairman, Chief..."
3,AmerisourceBergen Corp,ABC,Health Care,Health Care Distributors,2014,Q1,2014-04-24,"Operator: Greetings, and welcome to the CoreSi..."
4,Abbott Laboratories,ABT,Health Care,Health Care Equipment,2014,Q1,2014-04-16,Operator: Good morning and thank you for stand...


In [11]:
# View last entries
df.tail()

Unnamed: 0,company name,ticker,sector,industry,year,quarter,date,text
20518,ExxonMobil,XOM,Energy,Integrated Oil & Gas,2024,Q4,2025-01-31,"Jim Chapman: Good morning, everyone. Welcome t..."
20519,Yum! Brands,YUM,Consumer Discretionary,Restaurants,2024,Q4,2025-02-06,"Operator: Welcome, everyone, to the Yum! Brand..."
20520,Zebra Technologies,ZBRA,Information Technology,Electronic Equipment & Instruments,2024,Q4,2025-02-13,Operator: Good day. And welcome to the Fourth ...
20521,Zoetis,ZTS,Health Care,Pharmaceuticals,2024,Q4,2025-02-13,Operator: Welcome to the Fourth Quarter and Fu...
20522,Zimmer Biomet,ZBH,Health Care,Health Care Equipment,2024,Q4,2025-02-06,"Operator: Good morning, ladies and gentlemen, ..."


In [12]:
# Check the shape
df.shape

(20523, 8)

In [13]:
# Check the info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20523 entries, 0 to 20522
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   company name  20523 non-null  object
 1   ticker        20523 non-null  object
 2   sector        20523 non-null  object
 3   industry      20523 non-null  object
 4   year          20523 non-null  object
 5   quarter       20523 non-null  object
 6   date          20523 non-null  object
 7   text          20523 non-null  object
dtypes: object(8)
memory usage: 1.3+ MB


In [14]:
# Check the descriptive statistics
df.describe().T

Unnamed: 0,count,unique,top,freq
company name,20523,636,Danaher Corporation,83
ticker,20523,643,A,44
sector,20523,11,Industrials,2900
industry,20523,165,Health Care Equipment,606
year,20523,11,2023,1976
quarter,20523,4,Q4,5158
date,20523,2213,2023-04-27,75
text,20523,20360,"Executives: Carrie Gillard - Under Armour, Inc...",2


In [15]:
# print part of an example transcript
print(df.iloc[0, ]["text"][0:5000])

Executives: Bill Sullivan - President and CEO Ron Nersesian - CEO, Keysight Technologies  Didier Hirsch - SVP, CFO Mike McMullen - President, Chemical Analysis Group Fred Strohmeier - President of Life Sciences and Diagnostics Group Neil Dougherty - CFO, Keysight Guy Séné - SVP of R&D and Sales  Alicia Rodriguez - VP, Investor Relations
Analysts: Tycho Peterson - JPMorgan Brandon Couillard - Jefferies Paul Knight - Janney Capital Isaac Ro - Goldman Sachs Ross Muken - ISI Group Tim Evans - Wells Fargo Securities Derik de Bruin - Bank of America Merrill Lynch Jon Groberg - Macquarie Capital Patrick Newton - Stifel Nicolaus Doug Schenkel - Cowen & Company Dan Arias - UBS Bryan Kipp - Janney Capital Markets
Operator: At this time, I would like to welcome everyone to Q1 ’14 Agilent Technologies Incorporated earnings conference call. [Operator instructions.] Alicia Rodriguez, you may begin your conference. 
Alicia Rodriguez: Thank you, operator, and thank you and welcome everyone to Agilent’

In [16]:
# sector distribution
df["sector"].value_counts()

sector
Industrials               2900
Financials                2883
Health Care               2566
Information Technology    2513
Consumer Discretionary    2394
Consumer Staples          1555
Real Estate               1262
Utilities                 1172
Materials                 1151
Energy                    1111
Communication Services    1016
Name: count, dtype: int64

In [17]:
# year distribution
df["year"].value_counts().sort_index()

year
2014    1622
2015    1679
2016    1760
2017    1820
2018    1870
2019    1934
2020    1964
2021    1967
2022    1973
2023    1976
2024    1958
Name: count, dtype: int64

## Optionally: Choose a random sample

In [18]:
# # uncomment for the final analysis
# # select a sample for development purposes
# sample_size = 2000
# df = df.sample(sample_size, random_state=seed)

# # check the shape
# df.shape

# Topic modelling: Fundamentals

Note: Preprocessing includes (1) cleaning (with sentence- and word-level tokenization) and (2) feature extraction (creating a numerical representation of the text, i.e. a document-term matrix or DTM for short). NMF and LDA require both but differ in the best feature engineering; BERTopic internalizes feature engineering (no DTM as input needed) and can handle uncleaned and cleaned text.

## Preprocessing 1: Cleaning

#### Replace financial numbers with the word "fin_num"

Note: Financial numbers are replaced with "fin_num" to normalize numeric expressions, reduce noise, and retain meaningful signals for modeling.

In [19]:
# define function
def substitute_financial_numbers(string):
    '''
    Substitues financial numbers by "fin_num" in a string
    '''
    sub_string = re.sub(
        r"\$\s?[0-9.,']+(?:\s?(?:million|billion|thousand))?|[0-9.,']+%",
        " fin_num ", 
        string) 
    return sub_string

In [20]:
# Apply function to df
df["text_clean"] = df["text"].apply(substitute_financial_numbers)

#### Filter words and sentences

Note: We require words to contain only alphabetic characters, be at least three characters long, and to be not first names (F); exceptions are "AI", "US", "fin_num". In addition, we require sentences to contain fewer than 50% occurrences of the word "fin_num" and to be at least five words long. (F) Footnote: First results showed that the DTM and the resulting topic-word-matrices contain many first names; filtering by NER is too slow and only somewhat effective; therefore, we choose a simpler and more effective approach here.  

In [21]:
# define function
def filter_words(string, common_names, word_length=3, exceptions=None):
    '''
    Filters words in a string
    - Req. 1: word is alpha (excl. numbers and special characters)
    - Req. 2: word has a certain length (default: 3 characters) 
    - Req. 3: word is not a first name
    - Exceptions: list of words exempt from the req (e.g., "AI") (default: None)
    '''
    if exceptions is None:
        exceptions = []
    keep_words = []
    for word in word_tokenize(string):
        if ((word.isalpha() 
             and len(word) >= word_length 
             and word.lower() not in common_names) 
            or word in exceptions):
            keep_words.append(word)
    return " ".join(keep_words)

# test the function
test_string = '''Artificial intelligence, or short AI, boosts return 
fin_num by a factor of 10. However, Jim not 100%. Revenue fin_num fin_num fin_num.'''

print("Test:", filter_words(test_string, common_names=["Jim"], exceptions=["AI", "fin_num"]))

Test: Artificial intelligence short AI boosts return fin_num factor However Jim not Revenue fin_num fin_num fin_num


In [22]:
# define function
def filter_words_sentences(string, common_names, word_length=3, exceptions=None, 
                           sent_length=5, fin_num_th=0.5):
    '''
    Filters words (by call of filter_words) and sentences in a string
    - Req. 1, 2, 3 and exceptions: see function filter_words
    - Req. 4: sentence has certain length (default: 5 words)
    - Req. 5: sentence has less than certain percentage of word "fin_num" 
      in it default: 0.5)
    '''
    keep_sents = []
    for sent in sent_tokenize(string):
        words = filter_words(sent, common_names, word_length, exceptions).split() # incl. word_tokenize
        if (len(words) >= sent_length 
            and words.count("fin_num")/len(words) <= fin_num_th):
            keep_sents.append(" ".join(words))
    return " ".join(keep_sents)

# test the function:
print("Test:", filter_words_sentences(test_string, common_names=["Jim"], exceptions=["AI", "fin_num"]))

Test: Artificial intelligence short AI boosts return fin_num factor


In [23]:
# Apply function to df

# define common names (US)
number_names = 1000
nd = NameDataset()
top_male = nd.get_top_names(n=number_names, gender='Male', country_alpha2='US')['US']['M']
top_female = nd.get_top_names(n=number_names, gender='Female', country_alpha2='US')['US']['F']
common_names = set(name.lower() for name in top_male + top_female)

# define exceptions
exceptions = ["AI", "US", "fin_num"]

# call function
df["text_clean"] = df["text"].apply(
    filter_words_sentences,
    word_length=3,
    common_names=common_names,
    exceptions=exceptions, 
    sent_length=5, 
    fin_num_th=0.5)

In [24]:
# View first entries
df.head()

Unnamed: 0,company name,ticker,sector,industry,year,quarter,date,text,text_clean
0,Agilent Technologies,A,Health Care,Life Sciences Tools & Services,2014,Q1,2014-02-13,Executives: Bill Sullivan - President and CEO ...,Executives Sullivan President and CEO Nersesia...
1,Apple Inc.,AAPL,Information Technology,"Technology Hardware, Storage & Peripherals",2014,Q1,2014-01-27,Executives: Tim Cook - CEO Peter Oppenheimer -...,Executives Cook CEO Oppenheimer SVP CFO Luca M...
2,AbbVie,ABBV,Health Care,Biotechnology,2014,Q1,2014-04-25,"Executives: Richard Gonzalez – Chairman, Chief...",Executives Chairman Chief Executive Officer Ex...
3,AmerisourceBergen Corp,ABC,Health Care,Health Care Distributors,2014,Q1,2014-04-24,"Operator: Greetings, and welcome to the CoreSi...",Operator Greetings and welcome the CoreSite Re...
4,Abbott Laboratories,ABT,Health Care,Health Care Equipment,2014,Q1,2014-04-16,Operator: Good morning and thank you for stand...,Operator Good morning and thank you for standi...


In [25]:
# print part of a cleaned example transcript
print(df.iloc[0, ]["text_clean"][0:5000])

Executives Sullivan President and CEO Nersesian CEO Keysight Technologies Didier Hirsch SVP CFO McMullen President Chemical Analysis Group Strohmeier President Life Sciences and Diagnostics Group Dougherty CFO Keysight Séné SVP and Sales Investor Relations Analysts Tycho Peterson JPMorgan Couillard Jefferies Knight Janney Capital Goldman Sachs Muken ISI Group Evans Wells Fargo Securities Derik Bruin Bank Merrill Lynch Groberg Macquarie Capital Newton Stifel Nicolaus Schenkel Cowen Company Arias UBS Kipp Janney Capital Markets Operator this time would like welcome everyone Agilent Technologies Incorporated earnings conference call Thank you operator and thank you and welcome everyone Agilent first quarter conference call for fiscal year With are Sullivan Agilent President and CEO Nersesian CEO Keysight Technologies and Didier Hirsch Agilent Senior Vice President and CFO Joining the after Didier comments the presidents our chemical analysis and life sciences and diagnostics groups McMull

#### Lemmatization

Note: We lemmatize words. Lemmatizing converts words to their base forms, reducing the inflectional variability in your texts. This is only hedlpful for LDA and NMF, as BERTopic can deal with this. Footnote: Earlier attempts showed that diffreent forms of the same word (e.g., singular and plural) showed up in the topic.

In [26]:
# Define function: 
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(string):
    """
    Lemmatizes the string
    """
    doc = nlp(string)
    return " ".join(token.lemma_ for token in doc)

In [27]:
# Appply the function
tqdm.pandas()
df["text_clean_lemma"] = df["text_clean"].progress_apply(lemmatize_text)

100%|██████████| 20523/20523 [3:54:55<00:00,  1.46it/s]  


In [28]:
# View first entries
df.head()

Unnamed: 0,company name,ticker,sector,industry,year,quarter,date,text,text_clean,text_clean_lemma
0,Agilent Technologies,A,Health Care,Life Sciences Tools & Services,2014,Q1,2014-02-13,Executives: Bill Sullivan - President and CEO ...,Executives Sullivan President and CEO Nersesia...,executive Sullivan President and CEO Nersesian...
1,Apple Inc.,AAPL,Information Technology,"Technology Hardware, Storage & Peripherals",2014,Q1,2014-01-27,Executives: Tim Cook - CEO Peter Oppenheimer -...,Executives Cook CEO Oppenheimer SVP CFO Luca M...,executive Cook CEO Oppenheimer SVP CFO Luca Ma...
2,AbbVie,ABBV,Health Care,Biotechnology,2014,Q1,2014-04-25,"Executives: Richard Gonzalez – Chairman, Chief...",Executives Chairman Chief Executive Officer Ex...,executive Chairman Chief Executive Officer Exe...
3,AmerisourceBergen Corp,ABC,Health Care,Health Care Distributors,2014,Q1,2014-04-24,"Operator: Greetings, and welcome to the CoreSi...",Operator Greetings and welcome the CoreSite Re...,Operator greeting and welcome the CoreSite Rea...
4,Abbott Laboratories,ABT,Health Care,Health Care Equipment,2014,Q1,2014-04-16,Operator: Good morning and thank you for stand...,Operator Good morning and thank you for standi...,operator good morning and thank you for stand ...


In [29]:
# print part of a cleaned example transcript
print(df.iloc[0, ]["text_clean_lemma"][0:5000])

executive Sullivan President and CEO Nersesian CEO Keysight Technologies Didier Hirsch SVP CFO McMullen President Chemical Analysis Group Strohmeier President Life Sciences and Diagnostics Group Dougherty CFO Keysight Séné SVP and Sales Investor Relations Analysts Tycho Peterson JPMorgan Couillard Jefferies Knight Janney Capital Goldman Sachs Muken ISI Group Evans Wells Fargo Securities Derik Bruin Bank Merrill Lynch Groberg Macquarie Capital Newton Stifel Nicolaus Schenkel Cowen Company Arias UBS Kipp Janney Capital Markets Operator this time would like welcome everyone Agilent Technologies incorporated earning conference call thank you operator and thank you and welcome everyone Agilent first quarter conference call for fiscal year with be Sullivan Agilent President and CEO Nersesian CEO Keysight Technologies and Didier Hirsch Agilent Senior Vice President and CFO join the after Didier comment the president our chemical analysis and life science and diagnostic group McMullen and Stro

#### Optionally: Save or load preprocessed df

In [30]:
# To save:
df.to_csv("./df.csv.gz",index=False)

In [31]:
# # To load:
# df = pd.read_csv("./df.csv.gz")

# # Check for NaNs introduced through saving: 
# problem_rows = df[~df["text_clean_lemma"].apply(lambda x: isinstance(x, str))]
# print(f"Problem rows: {len(problem_rows)}")
# print(problem_rows.head())

# # Drop NaN rows intriduced through saving:
# df = df[df["text_clean_lemma"].notna()]

## Preprocessing 2: Feature extraction/ Document-term matrix (DTM) 

Note: There are two options for DTMs: (1) DTM-TF, which includes simple word counts (term frequencies) for each document, and (2) DTM-TF-IDF, which weights terms by their inverse frequency in the corpus (term frequency–inverse document frequency). LDA requires DTM-TF, while NMF works best with DTM-TF-IDF. (BERTopic requires neither.) For both DTMs, we are case-insensitive (convert all text to lowercase), consider unigrams (single words) and bigrams (expressions consisting of two words), remove stopwords (i.e., common words that typically do not influence meaning), exclude words that appear in more than 50% of documents, and restrict the vocabulary to the 1,000 most frequent words. 

#### DTM-TF

In [32]:
# load stop words
stops = set(stopwords.words("english"))

# Create DTM-TF
start = time.time()

vec_tf = CountVectorizer(
    analyzer="word", # Tokenize text at the word level
    tokenizer=lambda x: x.split(), # simple split bc text pre-tokenized (see preprocess.)
    token_pattern=None, # disable regex-based tokeniz. bc text pre-tokenized (see preprocess.)
    lowercase=True, # convert tokens to lowercase
    stop_words=list(stops), # exclude stop words
    ngram_range=(1, 2), # allow unigrams and bigrams
    max_df=0.5, # exclude tokens appearing in >50% of docs
    max_features=1000) # limit vocabulart to 1000 most common tokens

dtm_tf = vec_tf.fit_transform(df["text_clean_lemma"])
vocab_tf = vec_tf.get_feature_names_out()

end = time.time()
print(f"time elapsed (seconds): {end - start}")

time elapsed (seconds): 182.61599397659302


In [33]:
# sanity check:
display(dtm_tf.todense())
dtm_tf.shape

matrix([[ 1,  2,  0, ..., 12,  1,  1],
        [ 0,  0,  1, ..., 22,  0,  0],
        [ 1,  0,  0, ...,  1,  0,  0],
        ...,
        [ 2,  2,  0, ...,  0,  0,  0],
        [ 0,  2,  1, ...,  0,  0,  0],
        [ 0,  5,  0, ...,  1,  0,  1]])

(20523, 1000)

#### DTM-TF-IDF

In [34]:
# load stop words
stops = set(stopwords.words("english"))

# Create DTM-TF
start = time.time()

vec_tfidf = TfidfVectorizer(
    analyzer="word", # Tokenize text at the word level
    tokenizer=lambda x: x.split(), # simple split bc text pre-tokenized (see preprocess.)
    token_pattern=None, # disable regex-based tokeniz. bc text pre-tokenized (see preprocess.)
    lowercase=True, # convert tokens to lowercase
    stop_words=list(stops), # exclude stop words
    ngram_range=(1, 2), # allow unigrams and bigrams
    max_df=0.5, # exclude tokens appearing in >50% of docs
    max_features=1000) # limit vocabulart to 1000 most common tokens

dtm_tfidf = vec_tfidf.fit_transform(df["text_clean_lemma"])
vocab_tfidf = vec_tfidf.get_feature_names_out()

end = time.time()
print(f"time elapsed (seconds): {end - start}")

time elapsed (seconds): 187.48503375053406


In [35]:
# sanity check:
display(dtm_tfidf.todense())
dtm_tfidf.shape

matrix([[0.01442843, 0.02887781, 0.        , ..., 0.17916472, 0.01370502,
         0.01839409],
        [0.        , 0.        , 0.01282105, ..., 0.26917476, 0.        ,
         0.        ],
        [0.00834295, 0.        , 0.        , ..., 0.00863319, 0.        ,
         0.        ],
        ...,
        [0.0193613 , 0.01937535, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.02146574, 0.0116296 , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.08084033, 0.        , ..., 0.01671841, 0.        ,
         0.02059692]])

(20523, 1000)

## LDA, NMF and BERTopic models

### Theoretical background 

#### LDA notes

We use topic_word_prior = 0.01, doc_topic_prior = 50/n; these priors are not the default but often recommended in the literature; the model is stochastic

Interpretation of output:
- Document-topic matrix: Each row represents a document (e.g., a transcript of an earnings call), and each column represents a topic. The matrix can be interpreted as a topic distribution for each document, i.e., the proportion or probability that a document is associated with a given topic (for each row: sum over columns = 1).
- Topic-word matrix: Each row corresponds to a topic, and each column corresponds to a word from the vocabulary in the DTM. Originally, the values (from model.components_) contain pseudocounts, i.e. estimated number of times word j assigned to topic i. After normalization (m_topic_word = m_doc_topic / m_doc_topic(axis=1, keepdims=True), the can be viewed as word distribution for each topic, i.e., the probability that a given word appears in a topic (for each row: sum over columns = 1).

#### NMF notes

The model is deterministic.

Interpretiation of the output: 
- Document-topic matrix: Each row represents a document (transcript of an earnings call), and each column represents a topic. Originally, the values (from model.fit_transform) reflect the strength or weight of each topic in the corresponding document — that is, how much the document loads onto each topic. After normalization (m_doc_topic / m_doc_topic.sum(axis=1, keepdims=True)), the matrix can be interpreted as a topic distribution for each document, i.e., the proportion or probability that a document is associated with a given topic (for each row: sum over columns = 1).

- Topic-word matrix: Each row corresponds to a topic, and each column corresponds to a word from the vocabulary in the DTM. Originally, the values (from model.components_) represent the strength or weight of association between word j and topic i. After normalization (model.components_ / model.components_.sum(axis=1, keepdims=True)), the matrix can be interpreted as a word distribution for each topic, i.e., the probability that a given word appears in a topic (for each row: sum over columns = 1).


#### BERTOpic notes

Default BERTopic incolves the following steps (source: GPT):

1. Embedding Documents – Using a transformer model on each document to convert its raw text into dense vector representations that capture the semantic meaning 
2. Reducing Dimensionality – Use UMAP to compress the high-dimensional embeddings into a lower-dimensional space while preserving their structural relationships
3. Clustering Reduced Embeddings into Topics – Apply HDBSCAN to group similar document embeddings, where each cluster represents a topic
4. Tokenization of Topics – For each cluster, tokenize the original documents to extract words and phrases that occur frequently
5. Weight Tokens – Use TF-IDF weighting to score and rank tokens by their importance within each topic
6. Represent Topics – Summarize each topic by selecting the top-weighted tokens

Clustering:

"BERTopic approaches topic modeling as a cluster task and attempts to cluster semantically similar documents to extract common topics. A disadvantage of using such a method is that [in the default version] each document is assigned to a single cluster and therefore also a single topic. In practice, however documents may conatin." (Grootendorst 2024) That is, BERTopic is more a topic clustering (assign 1 topic) than topic modelling approach (assign multiple topics).
To assign multiple topics to documents there are several metods using BERTopic (Grootendorst 2024):
1. Applying BERTopic on parts of the documents (e.g., sentences).
2. Use a cluster model that can perform soft clustering like HDBSCAN.
3. Use .approximate_distrbution: "each document is split into tokens according to the provided tokenizer in the CountVectorizer. Then, a sliding window is applied on each document creating subsets of the document".

Test with the default version have shown: 

- Assigns only one topic to each document (see above for the reason 
- Applying the model on raw data leads to non-sensicel results.
- The number of topics (around 20) and words per topic (around 20) are small. 

To use BERTopic in alingment with LDA and NMF, we therefore use a custoized BERTopic model:

1. Multiple topics per document: Option 1: Use HDSBCAN (default), which does soft-clustering, and use calculate_probabilities=True (non-default). Option 2: Use k-means (non-default), which does hard clustering, and use .approximate_distrbution.
2. Increase number of topics: In HDBSCAN (option 1) the number of clusters/ topics can not be set but indirectly controlled by min_cluster_size; the default is 15, so chose a lower number to increase the number of topics. In k-means we can directly set the number of clusters/ topics to 100. https://maartengr.github.io/BERTopic/getting_started/best_practices/best_practices.html#controlling-number-of-topics, https://maartengr.github.io/BERTopic/getting_started/clustering/clustering.html#k-means
3. Increase the words per topic: Use CountVectorizer's parameter max_features to set the number of words per topic. 
4. Follow LDA and NMF as closely as possible in all other regards: Use the preprocessed data, excl. stopwords, lowercase, exclude to frequent words (max_df=0.5).
5. Document-topic and topic-word matrices: Created in analogy to LDA and NMF: https://maartengr.github.io/BERTopic/faq.html#how-do-i-calculate-the-probabilities-of-all-topics-in-a-document, https://maartengr.github.io/BERTopic/getting_started/distribution/distribution.html#example, https://maartengr.github.io/BERTopic/getting_started/tips_and_tricks/tips_and_tricks.html#topic-term-matrix, 

Randomness:

BERTopic is a stochastic model. But its not easy to random seeds in the default version. Thus, we only do it for the final customized version.


Interpretation of the output:

- Document-Topic Matrix: Each row represents an earnings call transcript, and each column represents a topic. Originally, the values reflect the strength or weight (based on clustering and the underlying TF-IDF scores) of each topic for that document. After normalization, each row sums to 1, and you can interpret the entries as the probability or proportion of the document that is associated with each topic.

- Topic-Word Matrix (BERTopic): Each row corresponds to a topic and each column to a word in the vocabulary. Initially, the values indicate the relative importance of a word within the topic (often derived from TF-IDF weights). Normalizing each row so that it sums to 1 lets you interpret the entries as the probability of a given word appearing in that topic.

- Note that BERTopic reserves the label –1 for documents that are considered outliers and are not confidently assigned to any topic.

### Helper function

In [36]:
# Define function
def normalize_matrix(df_matrix):
    '''
    Normalizes a document-topic or topic-word matrix, i.e. makes the row_sum = 1,
    i.e. makes the document-topic matrix interpretable as a topic distribution for 
    each document and the the topic-word matrix as a word distribution for each topic
    
    Paramters:
    - df_matrix: pd.DataFrame, non-normalized document-topic or topic-word matrix

    Output:
    - pd.DataFrame, normalized document-topic or topic-word matrix
    '''
    return df_matrix.div(df_matrix.sum(axis=1), axis=0)

In [37]:
# Define function
def get_topn_words(df_topic_word, number_top_words):
    """
    Get top n words for each topic from topic-word matrix.

    Parameters:
    - df_topic_word : pd.DataFrame, topic-word matrix 
    - number_top_words : int, number of top words to extract

    Output:
    - list of list: list containing top n words for each topic
    """
    top_words_per_topic = []
    for i in range(df_topic_word.shape[0]):
        top_words = df_topic_word.iloc[i].nlargest(number_top_words).index.tolist()
        top_words_per_topic.append(top_words)
    return top_words_per_topic

### Functions for the models (incl. Parameterisation)

#### LDA function

In [38]:
# Define function
def lda_fun(dtm, vocab, number_topics, seed):
    '''
    Fits LDA model and returns model, document-topic and topic-word matrices
    
    Parameters:
    - dtm: np.ndarray, document-term matrix
    - number_topics: int, number of topics
    - vocab: list of str, feature names/ dtm column names (i.e., the words)
    - seed: int, random seed for reproducibility

    Outputs:
    - lda: fitted LDA model
    - df_doc_topic: pd.DataFrame, document-topic distr. matrix (docs x topics)
    - df_topic_word: pd.DataFrame, topic-word distr. matrix (topics x words)

    Notes:
    - output matrices are already normalized (for each row: sum over columns = 1)
    '''
    # define model
    lda = LDA(
        n_components=number_topics,
        doc_topic_prior=min(50 / number_topics, 1),
        topic_word_prior=0.01,
        n_jobs=-1,
        random_state=seed)
    
    # Fit model and get document-topic-matrix
    m_doc_topic = lda.fit_transform(dtm)
    
    # Get topic-word-matrix
    m_topic_word = lda.components_

    # Transform document-topic-matrix and topic-word-matrix to df
    df_doc_topic = pd.DataFrame(
        m_doc_topic, 
        columns=[f"topic_{i}" for i in range(lda.n_components)])

    df_topic_word = pd.DataFrame(
        m_topic_word, 
        columns=vocab)
    
    return {"model": lda, 
            "df_doc_topic": df_doc_topic, 
            "df_topic_word": df_topic_word}

#### NMF function

In [39]:
# Define function
def nmf_fun(dtm, vocab, number_topics):
    '''
    Fits a NMF model and returns model, document-topic and topic-word matrices
    
    Parameters:
    - dtm: np.ndarray, document-term matrix
    - number_topics: int, number of topics
    - vocab: list of str, feature names/ dtm column names (i.e., the words)

    Outputs:
    - nmf: fitted LDA model
    - df_doc_topic: pd.DataFrame, ocument-topic distr. matrix (docs x topics)
    - df_topic_word: pd.DataFrame, topic-word distr. matrix (topics x words)

    Notes:
    - output matrices are not yet normalized (row sums != 1)
    - nmf is deterinistic, i.e. reproducable without seed
    '''
    # Define model
    nmf = NMF(
        n_components=number_topics)
    
    # Fit model and get document-topic-matrix
    m_doc_topic = nmf.fit_transform(dtm)
    
    # Get topic-word-matrix
    m_topic_word = nmf.components_

    # Transform document-topic-matrix and topic-word-matrix to df
    df_doc_topic = pd.DataFrame(
        m_doc_topic, 
        columns=[f"topic_{i}" for i in range(nmf.n_components)])

    df_topic_word = pd.DataFrame(
        m_topic_word, 
        columns=vocab)
    
    return {"model": nmf,
            "df_doc_topic": df_doc_topic, 
            "df_topic_word": df_topic_word}

#### BERTopic function

In [40]:
# Define function
def bertopic_fun(docs, number_topics, seed):
    '''
    Fits a BERTopic model and returns model, document-topic and topic-word matrices

    Parameters:
    - docs: list of str, input documents
    - number_topics: int, number of topics
    - seed: int, random seed for reproducibility

    Outputs:
    - bert: fitted BERTopic model
    - df_doc_topic: pd.DataFrame, document-topic distribution (n_docs x n_topics)
    - df_topic_word: pd.DataFrame, topic-word distribution (n_topics x words)

    Notes:
    - Output matrices are not yet normalized (row sums ≠ 1)
    '''
    # Define model
    custom_vectorizer = CountVectorizer(
        stop_words="english", 
        lowercase=True, 
        max_df=0.5, 
        max_features=1000)
    
    umap_model = UMAP( 
        n_neighbors=15,  
        n_components=5, # increases number of dim in dimenson reduction
        min_dist=0.1, 
        metric='cosine',
        random_state=seed) 
    
    kmeans_model = KMeans(
        n_clusters=number_topics)
    
    # Create BERTopic model
    bert = BERTopic(
        # embedding_model=finbert,
        vectorizer_model=custom_vectorizer,
        umap_model=umap_model,
        hdbscan_model=kmeans_model,  #yYes, pass KMeans here!
        verbose=True)
    
    # Fit the model
    topics = bert.fit_transform(docs)

    # Get topic-word-matrix. 
    # Note: Approximates document-topic distribution 
    m_doc_topic, _ = bert.approximate_distribution(docs) 
    df_doc_topic = pd.DataFrame(
        m_doc_topic,
        columns=[f"topic_{i}" for i in range(m_doc_topic.shape[1])])

    # Topic-word matrix
    words = bert.vectorizer_model.get_feature_names_out()
    df_topic_word = pd.DataFrame(
        bert.c_tf_idf_.todense(),
        columns=words) 
    return {"model": bert,
            "df_doc_topic": df_doc_topic, 
            "df_topic_word": df_topic_word}

### Functions for evaluating models

#### umass

Note: The umass coherence score compares the co-occurrence probability of word pairs in the same topic with the probability of these word pairs occurring by chance. The score usually ranges from negative values to zero; numbers closer to zero mean better coherence/ more meaningful topics. Generally, scores around -1 or higher (closer to zero) are considered pretty good

In [41]:
# Define function
def compute_umass(df_topic_word, dtm, vocab, number_top_words=5):
    """
    Compute mean u_mass coherence using metric_coherence_gensim.
    Parameters:
    - m_topic_word : pd.DataFrame, topic-word matrix
    - dtm : np.ndarray, document-term matrix.
    - vocab: list of str, feature names/ dtm column names (i.e., the words)
    - number_top_words : int, Number of top words per topic.

    Oputput:
    - umass: float, mean umass coherence score
    - does not require normalized topic-word matrix
    """
    scores = metric_coherence_gensim(
        measure="u_mass",
        top_n=number_top_words,
        topic_word_distrib=df_topic_word.to_numpy(),
        dtm=dtm,
        vocab=vocab,
        texts=None)
    
    return np.mean(scores)

#### Topic diversity

Note: Topic Diversity measures the ratio of unique top words across all topics to the total number of top words. Scores range from 0 to 1; values closer to 1 indicate that topics share fewer words (i.e. are more distinct and diverse).

In [42]:
# Define function
def compute_topic_diversity(df_topic_word, number_top_words=5):
    """
    Compute topic diversity
 
    Parameters:
    - df_topic_word : pd.DataFrame, topic-word matrix
    - number_top_words : int, number of top words per topic.

    Output:
    - topic diversity: float, topic diversity score

    Notes:
    - topic diversity = (num. of unique top words across topics) / (total num. of top words)
    - does not require normalized topic-word matrix
    """
    # use helper fun to get top words for each topic as list of list
    top_words = get_topn_words(df_topic_word, number_top_words) 

    # Flatten all top words across topics and make list unique
    top_words_flattend = []
    for topic in top_words:         
        for word in topic:          
            top_words_flattend.append(word)
    
    top_words_unique = set(top_words_flattend)  

    # calc and return topic diversity
    return len(top_words_unique) / (len(top_words) * number_top_words)

#### Intruder analysis

In [43]:
# Define the functions
def intruder_analysis(df_topic_word, vocab, number_top_words=5, seed=1):
    """
    Generate the top words and a random intruder word for each topic in a string

    Parameters:
    - df_topic_word : pd.DataFrame, topic-word matrix
    - vocab: list of str, feature names/ dtm column names (i.e., the words)
    - number_top_words : int, Number of top words per topic
    - seed: int, random seed for reproducibility

    Ouptput:
    - str, formatted topic + intruder lines for each topic    
    """
    # Set random seed
    random.seed(seed)

    # Use helper function to get top words per topic
    top_words = get_topn_words(df_topic_word, number_top_words)

    # Flatten all top words across topics and make list unique
    top_words_flattend = []
    for topic in top_words:
        for word in topic:
            top_words_flattend.append(word)

    top_words_unique = set(top_words_flattend)

    # collect results as strings
    output_lines = []

    for i, topic_words in enumerate(top_words):
        intr_candidates = [w for w in vocab if w not in topic_words and w not in top_words_unique]
        intr_word = random.choice(intr_candidates) if intr_candidates else None
        line = f"Topic {i} words: {'|'.join(topic_words)} | Intruder: {intr_word}"
        output_lines.append(line)

    return "\n".join(output_lines)


### Calculate and display results

#### Loop (incl. possibility for tuning topic numbers)

Implementation notes: (1)  dtm_tf and dtm_tfidf are unequal while vocab_tf and vocab_tfidf are equal. (2) For computing umass, one should use the same raw count-based DTM (dtm_tf) incl. the corresponding vocabulary vocab_tf for LDA, NMF, and BERTopic. (3) For intruder analysis, one should use the same vorabluary (vocab_tf) for LDA, NMF, and BERTopic. (4) In lda_fun and mnf_fun function calls we use vobab_tf and vocab_tfidf (eventhough they are identical) because of consistency with the parameters dtm_tf and dtm_tfidf (which are not identical)


In [44]:
# Specify number of topics
L_number_topics = [100]

In [45]:
# Prep data for BERTopic as list
docs = df["text_clean"].tolist()

In [46]:
# Loop: Fit models and get quantiative and qualitative measures
dict_results = {}

for number_topics in L_number_topics:

    print("number_topics:", number_topics)
    
    # LDA
    out_lda = lda_fun(dtm=dtm_tf, vocab=vocab_tf, number_topics=number_topics, seed=seed)
    umass_lda = compute_umass(df_topic_word=out_lda["df_topic_word"], dtm=dtm_tf, vocab=vocab_tf, number_top_words=5)
    topic_div_lda = compute_topic_diversity(df_topic_word=out_lda["df_topic_word"], number_top_words=5)
    intruder_lda = intruder_analysis(df_topic_word=out_lda["df_topic_word"], vocab=vocab_tf, number_top_words=5, seed=seed)
    dict_results[f"LDA_{number_topics}"] = {"df_doc_topic": out_lda["df_doc_topic"], "df_topic_word": out_lda["df_topic_word"], "umass": umass_lda, "topic_diversity": topic_div_lda, "intruder": intruder_lda}

    # NMF
    # out_nmf = nmf_fun(dtm=dtm_tfidf, vocab=vocab_tfidf, number_topics=number_topics)
    # umass_nmf = compute_umass(df_topic_word=out_nmf["df_topic_word"], dtm=dtm_tf, vocab=vocab_tf, number_top_words=5)
    # topic_div_nmf = compute_topic_diversity(df_topic_word=out_nmf["df_topic_word"], number_top_words=5)
    # intruder_nmf = intruder_analysis(df_topic_word=out_nmf["df_topic_word"], vocab=vocab_tf, number_top_words=5, seed=seed)
    # dict_results[f"NMF_{number_topics}"] = {"df_doc_topic": out_nmf["df_doc_topic"], "df_topic_word": out_nmf["df_topic_word"], "umass": umass_nmf, "topic_diversity": topic_div_nmf, "intruder": intruder_nmf}

    # # BERTopic
    # out_bert = bertopic_fun(docs=docs, number_topics=number_topics, seed=seed)
    # umass_bert = compute_umass(df_topic_word=out_bert["df_topic_word"], dtm=dtm_tf, vocab=vocab_tf, number_top_words=5)
    # topic_div_bert = compute_topic_diversity(df_topic_word=out_bert["df_topic_word"], number_top_words=5)
    # intruder_bert = intruder_analysis(df_topic_word=out_bert["df_topic_word"], vocab=vocab_tf, number_top_words=5, seed=seed)
    # dict_results[f"BERTopic_{number_topics}"] = {"df_doc_topic": out_bert["df_doc_topic"], "df_topic_word": out_bert["df_topic_word"], "umass": umass_bert, "topic_diversity": topic_div_bert, "intruder": intruder_bert}

number_topics: 100


#### Quantiative results

In [47]:
# Create summary table from dict_results
summary_rows = []

for label, result in dict_results.items():
    summary_rows.append({
        "model_number_topics": label,
        "umass": result["umass"],
        "topic_diversity": result["topic_diversity"]})

df_summary = pd.DataFrame(summary_rows)
print(df_summary)

  model_number_topics     umass  topic_diversity
0             LDA_100 -0.977191            0.664


#### Qualitative results

In [48]:
# print the intruder analysis from dict_results
for label, result in dict_results.items():
    print(f"{label} intruder analysis")
    print(result["intruder"])
    print()

LDA_100 intruder analysis
Topic 0 words: store|comp|online|traffic|retail | Intruder: deutsche
Topic 1 words: tool|new product|currency|critical|unfavorable | Intruder: thing like
Topic 2 words: organic|organic growth|organic revenue|services|organically | Intruder: cetera
Topic 3 words: senior|senior vice|yeah|markets|capital markets | Intruder: hope
Topic 4 words: brand|retail|channel|wholesale|store | Intruder: cowen
Topic 5 words: aircraft|fleet|united|delivery|schedule | Intruder: sale increase
Topic 6 words: cloud|enterprise|data|datum center|provider | Intruder: quarter also
Topic 7 words: currency|constant|constant currency|new product|americas | Intruder: reform
Topic 8 words: loan|deposit|net interest|ratio|fee | Intruder: year year
Topic 9 words: plant|packaging|ton|export|brazil | Intruder: offering
Topic 10 words: gross margin|shipment|new product|ship|manufacturing | Intruder: finish
Topic 11 words: marketing|user|app|mobile|advertising | Intruder: consolidated
Topic 12 w

## Output

Note: In this section the document-topic and topic-word matrices are saved as csv

In [49]:
# Ensure metadata is clean and aligned
df_valid = df[df["text_clean_lemma"].notna()].reset_index(drop=True)
df_metadata = df_valid.drop(columns=["text", "text_clean", "text_clean_lemma"]).reset_index(drop=True)

# Save normalized matrices with consistent and clear naming
for model_name, result in dict_results.items():
    # Normalize both matrices
    df_doc_topic_norm = normalize_matrix(result["df_doc_topic"]).reset_index(drop=True)
    df_topic_word_norm = normalize_matrix(result["df_topic_word"])

    # Augment the document-topic matrix with metadata
    df_doc_topic_norm_augmented = pd.concat([df_metadata, df_doc_topic_norm], axis=1)

    # Save both matrices
    df_doc_topic_norm_augmented.to_csv(f"{model_name}_doc_topic_norm_augmented.csv", index=False)
    df_topic_word_norm.to_csv(f"{model_name}_topic_word_norm.csv", index=False)