# Install necessary packages

In [16]:
# !pip install top2vec
# !pip install pyLDAvis
!pip install bertopic
# !python -m spacy download en_core_web_lg

Collecting bertopic
  Downloading bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m122.9/154.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.6-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

# Read the transcripts

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the transcript pickle and the saved
# models used below are all read from / written to /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
# Load the transcript table from Drive.
# NOTE(review): unpickling can execute arbitrary code embedded in the file —
# only load this pickle if its origin is trusted.
df = pd.read_pickle("/content/drive/MyDrive/transcript_texts.pkl")
# Preview the first rows (appears to be one row per speaker turn of a call — TODO confirm).
df.head()

Unnamed: 0,Company,Call,Date_of_call,Type_of_text,Person,Text,Order,Industry
0,Abbott Laboratories,"Abbott Laboratories, Q1 2018 Earnings Call, Ap...",2018-04-18,Presentation Operator Message,Operator,Good morning and thank you for standing by. We...,0,Healthcare
1,Abbott Laboratories,"Abbott Laboratories, Q1 2018 Earnings Call, Ap...",2018-04-18,Presenter Speech,Scott Leinenweber,Good morning and thank you for joining us. \r\...,1,Healthcare
2,Abbott Laboratories,"Abbott Laboratories, Q1 2018 Earnings Call, Ap...",2018-04-18,Presenter Speech,Miles White,"Okay. Thanks, Scott, and good morning. \r\nTod...",2,Healthcare
3,Abbott Laboratories,"Abbott Laboratories, Q1 2018 Earnings Call, Ap...",2018-04-18,Presenter Speech,Brian Yoor,"Okay. Thanks, Miles. \r\nAnd as Scott mentione...",3,Healthcare
4,Abbott Laboratories,"Abbott Laboratories, Q1 2018 Earnings Call, Ap...",2018-04-18,Question and Answer Operator Message,Operator,[Operator Instructions] And our first question...,4,Healthcare


In [3]:
# Restrict the analysis to calls dated 2021-01-01 or later.
df = df.loc[df['Date_of_call'] >= '2021-01-01']

In [4]:
# Distinct industries present after the date filter.
df['Industry'].unique()

array(['Healthcare', 'Semiconductor', 'Transportation', 'Financial',
       'Technology', 'Industrial', 'Oil and Gas', 'Consumer'],
      dtype=object)

In [5]:
# Distinct kinds of text segment found in the transcripts.
df['Type_of_text'].unique()

array(['Presentation Operator Message', 'Presenter Speech',
       'Question and Answer Operator Message', 'Question', 'Answer'],
      dtype=object)

In [6]:
# Build one document per call: every text fragment that shares a 'Call'
# value is concatenated with single spaces, in the frame's existing row order.
df2 = (
    df.groupby('Call')['Text']
      .apply(' '.join)
      .to_frame(name='Text')
)
df2.head(5)

Unnamed: 0_level_0,Text
Call,Unnamed: 1_level_1
"Abbott Laboratories, Q1 2021 Earnings Call, Apr 20, 2021","Good morning, and thank you for standing by. W..."
"Abbott Laboratories, Q1 2022 Earnings Call, Apr 20, 2022","Good morning, and thank you for standing by. W..."
"Abbott Laboratories, Q2 2021 Earnings Call, Jul 22, 2021","Good morning, and thank you for standing by. W..."
"Abbott Laboratories, Q3 2021 Earnings Call, Oct 20, 2021","Good morning, and thank you for standing by. W..."
"Abbott Laboratories, Q4 2020 Earnings Call, Jan 27, 2021","Good morning, and thank you for standing by. W..."


In [7]:
# Sanity check: (number of distinct calls, 1 text column).
df2.shape

(242, 1)

# Pre-processing

In [8]:
import re
# Normalise the raw transcript text before tokenisation.
# - Line breaks and tabs are replaced by a space (not deleted) so the words on
#   either side of a break cannot be fused into a single token.
# - Patterns are raw strings with a `+` quantifier: the previous non-raw literal
#   relied on the invalid escape `\[`, and `*` also matched the empty string at
#   every position.
# - Punctuation (including apostrophes) is deleted without inserting a space,
#   so contractions collapse to one token, e.g. "that's" -> "thats".
df2['Text'] = df2['Text'].map(lambda x: re.sub(r'[\n\t\r]+', ' ', x))
df2['Text'] = df2['Text'].map(lambda x: re.sub(r"[.,;'\"\[\]]+", '', x))
# df2['Text'] = df2['Text'].apply(lambda x: [w for w in x if w.isalpha()])
# Convert to lowercase
df2['Text'] = df2['Text'].map(lambda x: x.lower())

In [9]:
from gensim.utils import simple_preprocess

def sent_to_words(sentences):
    """Lazily turn each document in ``sentences`` into a list of tokens.

    Every item is coerced to ``str`` and handed to gensim's
    ``simple_preprocess`` with ``deacc=True``; one token list is yielded
    per input item, in input order.
    """
    # Per the gensim docs, deacc=True strips accent marks from tokens
    # (it is not what removes punctuation).
    yield from (simple_preprocess(str(text), deacc=True) for text in sentences)

# Tokenise every per-call document, then peek at the first 30 tokens
# of the first document as a quick sanity check.
data = df2['Text'].tolist()
data_words = list(sent_to_words(data))

print(data_words[0][:30])

['good', 'morning', 'and', 'thank', 'you', 'for', 'standing', 'by', 'welcome', 'to', 'abbotts', 'first', 'quarter', 'earnings', 'conference', 'call', 'operator', 'instructions', 'this', 'call', 'is', 'being', 'recorded', 'by', 'abbott', 'with', 'the', 'exception', 'of', 'any']


In [10]:
import spacy
# spaCy pipeline used by lemmatization(), which reads only token.pos_ and
# token.lemma_. The dependency parser and named-entity recogniser are therefore
# disabled to cut load time and per-document processing cost.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [11]:
#NLTK Stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stop-word list = NLTK's English list + domain words that show up in almost
# every earnings call and would otherwise dominate the topics.
# The extension list is de-duplicated, and the former ' thats' entry (leading
# space) is dropped: it could never equal a token and 'thats' is already listed.
# The set of effective stop words is unchanged.
# NOTE(review): apostrophes are stripped from the text earlier in the notebook,
# so any stop words that contain an apostrophe cannot match — presumably why
# 'weve', 'thats', 'youre' and 'im' were added by hand; consider adding
# apostrophe-free variants systematically.
stop_words = stopwords.words('english')
stop_words.extend([
    # e-mail style / ordinal leftovers
    'from', 'subject', 're', 'ie', 'st', 'th', 'rd',
    # reporting periods and call boilerplate
    'quarter', 'year', 'years', 'month', 'today', 'first', 'second', 'last', 'next', 'time',
    'earnings', 'call', 'transcript', 'conference', 'operator', 'recorded',
    'question', 'answer', 'ask', 'analyst', 'investor',
    'good', 'morning', 'afternoon', 'evening', 'thank',
    # generic finance vocabulary
    'billion', 'million', 'thousands', 'thousand', 'percent', 'percentage',
    'point', 'points', 'market', 'stock', 'price', 'share', 'capital',
    'revenue', 'income', 'company', 'business', 'results', 'report',
    'statement', 'statements', 'performance', 'content', 'product',
    # forward-looking-statement disclaimer wording
    'forward', 'looking', 'information', 'future', 'events', 'circumstances',
    'risks', 'uncertainties', 'assumptions', 'factors', 'developments',
    # numerals
    'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
    'eight', 'nine', 'ten',
    # conversational filler and contractions without apostrophes
    'us', 'well', 'would', 'could', 'may', 'might', 'think', 'know', 'said',
    'like', 'going', 'get', 'got', 'go', 'say', 'talk', 'take', 'come', 'make',
    'see', 'look', 'continue', 'expect', 'new', 'great', 'strong', 'also',
    'really', 'back', 'kind', 'lot', 'bit', 'little', 'across', 'still',
    'obviously', 'yes', 'right', 'weve', 'thats', 'youre', 'im',
])
# Drop any overlap with the NLTK list while keeping first-seen order.
stop_words = list(dict.fromkeys(stop_words))


# Define helper functions for stop-word removal and lemmatization (the bigram/trigram helpers below are currently commented out)
def remove_stopwords(texts):
    """Return each document as a token list with stop words removed.

    Args:
        texts: iterable of documents. A document may be a list/tuple of
            tokens or any other object; non-sequence objects are coerced
            with ``str`` exactly as before.

    Returns:
        A list with one list of tokens per input document. Tokens are
        produced by ``simple_preprocess`` and filtered against the
        module-level ``stop_words``.
    """
    # Set membership is O(1); scanning the stop-word list for every token was O(len(list)).
    stop_set = set(stop_words)
    cleaned = []
    for doc in texts:
        if isinstance(doc, (list, tuple)):
            # Join tokens with spaces instead of tokenising the list's repr
            # ("['good', 'morning', ...]"), which depended on quote/bracket
            # characters happening to be discarded by the tokenizer.
            raw_text = " ".join(map(str, doc))
        else:
            raw_text = str(doc)
        cleaned.append([word for word in simple_preprocess(raw_text)
                        if word not in stop_set])
    return cleaned

# def make_bigrams(texts):
    # return [bigram_mod[doc] for doc in texts]

# def make_trigrams(texts):
#     return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenised documents, keeping only selected parts of speech.

    Args:
        texts: iterable of documents, each an iterable of string tokens.
        allowed_postags: container of spaCy coarse POS tags (``token.pos_``)
            to keep. The default is an immutable tuple rather than a list so
            the shared default object cannot be mutated between calls.

    Returns:
        A list with one list of lemmas per input document, in input order.
    """
    texts_out = []
    for sent in texts:
        # The module-level spaCy pipeline `nlp` expects a single string per document.
        doc = nlp(" ".join(sent))
        texts_out.append(
            [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        )
    return texts_out

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# x = lemmatization(data_words)

In [13]:
# Filter the NLTK + domain stop words out of every tokenised transcript.
data_words_nostops = remove_stopwords(data_words)

# BERTopic



In [14]:
# BERTopic takes plain strings, so glue each filtered token list back
# into one space-separated document.
flatten_data = [' '.join(tokens) for tokens in data_words_nostops]

In [15]:
# !pip install --upgrade joblib==1.1.0  # Workaround: downgrade joblib if importing BERTopic in Colab raises a joblib metadata error

In [16]:
from bertopic import BERTopic
# Fit BERTopic on the stop-word-filtered documents with min_topic_size=10 and
# the topic count reduced to nr_topics=8 (the log below shows 9 -> 8).
# NOTE(review): no random seed is fixed anywhere in this notebook; the BERTopic
# docs suggest passing a UMAP model with a fixed random_state for reproducible
# topics — consider doing so before comparing runs.
model = BERTopic(verbose=True, min_topic_size= 10,nr_topics=8)
# fit_transform returns two values; only the per-document topic assignments are kept.
topics, _ = model.fit_transform(flatten_data)

2024-04-17 09:02:53,532 - BERTopic - Embedding - Transforming documents to embeddings.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

2024-04-17 09:04:06,679 - BERTopic - Embedding - Completed ✓
2024-04-17 09:04:06,683 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-04-17 09:04:18,302 - BERTopic - Dimensionality - Completed ✓
2024-04-17 09:04:18,304 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-04-17 09:04:18,322 - BERTopic - Cluster - Completed ✓
2024-04-17 09:04:18,324 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-04-17 09:04:19,880 - BERTopic - Representation - Completed ✓
2024-04-17 09:04:19,882 - BERTopic - Topic reduction - Reducing number of topics
2024-04-17 09:04:21,839 - BERTopic - Topic reduction - Reduced number of topics from 9 to 8


In [17]:
# Inspect topic 1: get_topic returns (word, score) pairs, as shown in the output below.
top_words = model.get_topic(1)
print(top_words)

[('aircraft', 0.028551487478929216), ('travel', 0.027077028176867222), ('demand', 0.024930205421425145), ('capacity', 0.02344249280703237), ('united', 0.022958652322622564), ('airline', 0.018917784054019563), ('network', 0.01819053582201494), ('customers', 0.017953621050692252), ('people', 0.01784872566259215), ('airlines', 0.017566559021067484)]


In [18]:
# Persist the fitted model to Drive.
# NOTE(review): the file name says "nr_topics_7" but the model above was built
# with nr_topics=8 — confirm which is intended before relying on the name.
model.save("/content/drive/MyDrive/model_min_topic_size_10_nr_topics_7")



In [19]:
# Second copy of the same fitted model under a different name; the load cell
# below reads the other file (model_min_topic_size_10_nr_topics_7), not this one.
model.save('/content/drive/MyDrive/BERTopic_model8')



In [20]:
# Reload the model from Drive; the visualisation and evaluation cells below
# all work on this reloaded instance rather than on `model`.
saved_model = BERTopic.load("/content/drive/MyDrive/model_min_topic_size_10_nr_topics_7")

## Visualization

In [24]:
# Bar charts of the 10 highest-scoring terms for up to 8 topics.
# The previous call passed custom_labels=top_words, i.e. the list of
# (word, score) tuples for topic 1. Per the BERTopic API, custom_labels takes a
# bool (or the name of a label aspect), and no custom labels are set on this
# model, so the argument is dropped and the default topic labels are used.
saved_model.visualize_barchart(n_words=10, width=300, height=300, top_n_topics=8)

In [23]:
# Heatmap view of the reloaded model's topics (default settings).
saved_model.visualize_heatmap()

In [None]:
import numpy as np
# `topics` is the per-document assignment returned by fit_transform earlier in
# the notebook, so this cell needs that cell to have run in the same kernel.
np_topics = np.array(topics)

# Saving the topic array to local
# NOTE(review): the two commented paths below disagree ("drive/MyDrive" vs
# "content/drive/MyDrive", neither with a leading slash) — fix before re-enabling.
# np.save("drive/MyDrive/topics_arr",np_topics)

# Loading the topic array from local
# topics = np.load("content/drive/MyDrive/topics_arr.npy")
# Round trip through the array gives back a plain Python list.
topics = np_topics.tolist()

In [None]:
# One row per transcript with its assigned topic.
documents = pd.DataFrame(
    {
        "Transcript": flatten_data,
        "ID": range(len(flatten_data)),
        "Topic": topics,
    }
)
# Merge all transcripts of a topic into a single space-joined document.
documents_per_topic = (
    documents
    .groupby(['Topic'], as_index=False)
    .agg({'Transcript': ' '.join})
)
# Run the merged documents through the model's own text preprocessing.
cleaned_docs = saved_model._preprocess_text(documents_per_topic['Transcript'].values)

In [None]:
# Extract vectorizer and analyzer from BERTopic
# The analyzer built from the model's vectorizer is reused below to tokenise
# the per-topic documents for the coherence computation.
vectorizer = saved_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

In [None]:
import gensim.corpora as corpora
# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out()
# Tokenise each per-topic document with the model's analyzer and build the
# gensim dictionary / bag-of-words corpus from those tokens.
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
# Collect the top words of every assigned topic except the outlier topic (-1).
# The former `range(len(set(topics)) - 1)` assumed that -1 is always present;
# when no document is an outlier it silently dropped the last real topic.
# The loop variable is also renamed so it no longer shadows `words`.
topic_ids = sorted(topic for topic in set(topics) if topic != -1)
topic_words = [[word for word, _ in saved_model.get_topic(topic)]
               for topic in topic_ids]

## Evaluation

In [None]:
from gensim.models.coherencemodel import CoherenceModel
# Evaluate
# Score the topics' top words with gensim's 'c_v' coherence measure, using the
# per-topic token lists, corpus and dictionary built in the previous cell.
coherence_model = CoherenceModel(topics=topic_words,
                                 texts=tokens,
                                 corpus=corpus,
                                 dictionary=dictionary,
                                 coherence='c_v')
# Single aggregate coherence value for the whole topic set.
coherence = coherence_model.get_coherence()

In [None]:
# Report the c_v coherence score computed above.
print(coherence)

0.571279689526497
