In [1]:
%%capture
!pip install bertopic

In [2]:
## Mounting of Gdrive in the colab nb

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Specify the path to your CSV file
csv_file_path = '/content/drive/My Drive/Consultancy ADB/Fed/fed_data.csv'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file_path)

df.tail(10)

Unnamed: 0,date,content
72,11/1/2023,"For release at 2:00 p.m. EDT November 1, 2023 ..."
73,11/1/2023,______________________________________________...
74,12/13/2023,"For release at 2:00 p.m. EST December 13, 2023..."
75,12/12/2023,_ ____________________________________________...
76,12/12/2023,_ ____________________________________________...
77,1/31/2024,"For release at 2:00 p.m. EST January 31, 2024 ..."
78,1/30/2024,______________________________________________...
79,3/20/2024,"For release at 2:00 p.m. EDT March 20, 2024 Re..."
80,3/19/2024,______________________________________________...
81,5/1/2024,"For release at 2:00 p.m. EDT May 1, 2024 Recen..."


In [4]:
import pandas as pd
import re
import nltk

# Uncomment to download "stopwords"
nltk.download("stopwords")
from nltk.corpus import stopwords

def text_preprocessing(s):
    """
    Preprocess text by:
    - Lowercasing the sentence
    - Changing "'t" to "not"
    - Removing "@name"
    - Removing all non-word characters except whitespace
    - Removing stopwords except "not" and "can"
    - Removing trailing whitespace
    """
    if isinstance(s, str):  # Check if s is a string
        s = s.lower()
        # Change 't to 'not'
        s = re.sub(r"\'t", " not", s)
        # Remove @name
        s = re.sub(r'(@.*?)[\s]', ' ', s)
        # Remove all non-word characters except whitespace
        s = re.sub(r'[^\w\s]', '', s)
        # Remove stopwords except 'not' and 'can'
        stop_words = set(stopwords.words('english'))
        stop_words.discard('not')  # Discard 'not' from stopwords
        stop_words.discard('can')  # Discard 'can' from stopwords
        s = " ".join([word for word in s.split() if word not in stop_words])
        # Remove trailing whitespace
        s = re.sub(r'\s+', ' ', s).strip()
    else:
        s = ""  # Handle non-string types, e.g., NaN
    return s

# Assuming you have a DataFrame named df with a 'content' column
df['clean_content'] = df['content'].apply(text_preprocessing)
df.tail()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,date,content,clean_content
77,1/31/2024,"For release at 2:00 p.m. EST January 31, 2024 ...",release 200 pm est january 31 2024 recent indi...
78,1/30/2024,______________________________________________...,______________________________________________...
79,3/20/2024,"For release at 2:00 p.m. EDT March 20, 2024 Re...",release 200 pm edt march 20 2024 recent indica...
80,3/19/2024,______________________________________________...,______________________________________________...
81,5/1/2024,"For release at 2:00 p.m. EDT May 1, 2024 Recen...",release 200 pm edt may 1 2024 recent indicator...


In [22]:
import pandas as pd

# Assuming you have a DataFrame named df with clean_content column
# You can replace df['clean_content'] with your actual column containing text data
docs = df['clean_content'].tolist()

# Now you can proceed with your analysis using the docs variable


In [23]:
from bertopic import BERTopic

topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(docs)

2024-05-02 15:30:15,615 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-05-02 15:30:34,198 - BERTopic - Embedding - Completed ✓
2024-05-02 15:30:34,199 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-02 15:30:36,852 - BERTopic - Dimensionality - Completed ✓
2024-05-02 15:30:36,855 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-02 15:30:36,873 - BERTopic - Cluster - Completed ✓
2024-05-02 15:30:36,881 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-02 15:30:37,340 - BERTopic - Representation - Completed ✓


In [24]:
freq = topic_model.get_topic_info(); freq.head(5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,26,0_federal_committee_percent_market,"[federal, committee, percent, market, rate, op...",[release 200 pm edt november 1 2023 recent ind...
1,1,24,1_board_market_federal_governors,"[board, market, federal, governors, division, ...",[_____________________________________________...
2,2,18,2_federal_committee_agency_securities,"[federal, committee, agency, securities, marke...",[release 2 pm edt april 28 2021 federal reserv...
3,3,14,3_board_division_inflation_participants,"[board, division, inflation, participants, mar...",[_ ___________________________________________...


In [25]:
topic_model.get_topic(0)  # Select the most frequent topic

[('federal', 0.09674019699307575),
 ('committee', 0.08145697562462453),
 ('percent', 0.06353361200011001),
 ('market', 0.06196893270176667),
 ('rate', 0.05990714580772814),
 ('open', 0.059794806406564656),
 ('policy', 0.05914688934135025),
 ('inflation', 0.05465106079395126),
 ('agency', 0.04923693764609946),
 ('securities', 0.04460828371096056)]

In [26]:
topic_model.topics_[:10]

[0, 1, 0, 1, 0, 1, 0, 1, 0, 1]

In [27]:
topic_model.visualize_topics()

In [28]:
topic_model.visualize_barchart(top_n_topics=100)