In [1]:
import webbrowser
from pathlib import Path

import pandas as pd
from bertopic import BERTopic




In [3]:
# Read the sentiment dataset (refined textual data CSV) into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs on the original author's machine - consider a configurable data dir.
csv_path = "C:\\Users\\KIIT\\Documents\\git\\StockOracle-AI-Powered-Stock-Prediction-Forecasting-System\\data\\refined_textual_data.csv"
df = pd.read_csv(csv_path)

In [4]:
# Parse the 'date' column into pandas datetime values and store it back
# under the same column name.
parsed_dates = pd.to_datetime(df["date"])
df["date"] = parsed_dates

In [5]:
# Collect the non-null entries of the 'processed' column as a plain Python
# list; this list is the corpus handed to the topic model.
processed_text = df["processed"]
documents = processed_text[processed_text.notna()].tolist()

In [6]:
# Initialize BERTopic model.
# Only `verbose=True` is passed, so every other setting is left at the
# library's defaults. Verbose mode appears to be what produces the
# stage-by-stage "BERTopic - ..." log lines when the model is fitted.
# NOTE(review): no random seed is configured here - topic assignments
# presumably vary between runs; confirm whether reproducibility is needed.
topic_model = BERTopic(verbose=True)

In [7]:
# Fit the topic model on `documents` and transform them in the same call.
# The call returns two values, unpacked into `topics` and `probs`
# (presumably one topic id and one probability per document - confirm
# against the BERTopic documentation).
# The log printed by this cell shows the stages it runs: embedding,
# dimensionality reduction, clustering, and topic representation.
topics, probs = topic_model.fit_transform(documents)

2025-03-23 11:28:47,602 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/393 [00:00<?, ?it/s]

2025-03-23 11:30:16,554 - BERTopic - Embedding - Completed ✓
2025-03-23 11:30:16,556 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-03-23 11:31:00,303 - BERTopic - Dimensionality - Completed ✓
2025-03-23 11:31:00,305 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-03-23 11:31:00,879 - BERTopic - Cluster - Completed ✓
2025-03-23 11:31:00,897 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-03-23 11:31:01,299 - BERTopic - Representation - Completed ✓


In [8]:
# Fetch the per-topic summary table from the fitted model and print its
# first rows as a quick sanity check of the discovered topics.
topic_info = topic_model.get_topic_info()
topic_preview = topic_info.head()
print(topic_preview)

   Topic  Count                                 Name  \
0     -1    185        -1_nvda_optimistic_at_surging   
1      0    249       0_googl_excited_strong_surging   
2      1    216  1_jpm_disappointed_trading_dropping   
3      2    215      2_csco_optimistic_higher_closed   
4      3    212     3_msft_disappointed_to_struggled   

                                      Representation  \
0  [nvda, optimistic, at, surging, feeling, for, ...   
1  [googl, excited, strong, surging, higher, clos...   
2  [jpm, disappointed, trading, dropping, today, ...   
3  [csco, optimistic, higher, closed, strong, sur...   
4  [msft, disappointed, to, struggled, today, dro...   

                                 Representative_Docs  
0  [nvda had a steady day, closing at 125.18. mar...  
1  [strong day for googl! closed higher at 139.64...  
2  [jpm struggled today, dropping to 79.36. inves...  
3  [strong day for csco! closed higher at 47.08, ...  
4  [msft struggled today, dropping to 223.86. inv..

In [9]:
# Build the cluster figure and the per-topic bar chart from the fitted model.
# The bar chart is limited to the first `n_bar_topics` topics.
n_bar_topics = 10
fig_cluster = topic_model.visualize_topics()
fig_barchart = topic_model.visualize_barchart(top_n_topics=n_bar_topics)

In [10]:
# Generate Topics Over Time plot
# `documents` was built from df["processed"] with NaN rows dropped, so the
# timestamps must come from exactly those same rows. Passing the full
# df["date"] column would give a different length (or silently pair documents
# with the wrong dates) as soon as a single NaN had been removed.
# A plain list is passed so the Series index cannot affect the pairing.
doc_timestamps = df.loc[df["processed"].notna(), "date"].tolist()
assert len(doc_timestamps) == len(documents), (
    "documents and timestamps are out of sync: "
    f"{len(documents)} documents vs {len(doc_timestamps)} timestamps"
)

# nr_bins=20 groups the timestamps into 20 bins before topics are computed
# per bin.
topics_over_time = topic_model.topics_over_time(documents, doc_timestamps, nr_bins=20)
fig_time = topic_model.visualize_topics_over_time(topics_over_time)

20it [00:04,  4.50it/s]


In [11]:
# Save the visualization
# All three figures are written into one standalone HTML page, in this order.
report_sections = [
    ("Topic Clusters", fig_cluster),
    ("Topic Bar Chart", fig_barchart),
    ("Topics Over Time", fig_time),
]

with open("Topic_Visualization.html", "w", encoding="utf-8") as f:
    # Emit a complete document with an explicit charset: the file is encoded
    # as UTF-8, and a page opened from disk has no HTTP header to tell the
    # browser that, so non-ASCII topic words could otherwise be mis-decoded.
    f.write(
        "<!DOCTYPE html>\n"
        "<html>\n"
        "<head>\n"
        '<meta charset="utf-8">\n'
        "<title>Topic Visualization</title>\n"
        "</head>\n"
        "<body>\n"
    )

    for position, (heading, figure) in enumerate(report_sections):
        f.write(f"<h1>{heading}</h1>")
        # plotly.js only has to be loaded once per page: the first figure
        # pulls it from the CDN, later figures reuse the already-loaded copy.
        plotlyjs_mode = "cdn" if position == 0 else False
        f.write(figure.to_html(full_html=False, include_plotlyjs=plotlyjs_mode))

    f.write("\n</body>\n</html>\n")

print("Completed! Visualization saved as 'Topic_Visualization.html'")

Completed! Visualization saved as 'Topic_Visualization.html'


In [13]:
# (Optional) Open the saved file directly
# webbrowser.open() is documented for URLs; handing it a bare relative
# filename is described by the Python docs as neither supported nor portable.
# Resolve the report against the current working directory and pass an
# absolute file:// URI instead.
report_uri = Path("Topic_Visualization.html").resolve().as_uri()
webbrowser.open(report_uri)

True