## Citation

```bibtex
@misc{grootendorst2020bertopic,
  author       = {Maarten Grootendorst},
  title        = {BERTopic: Leveraging BERT and c-TF-IDF to create easily interpretable topics.},
  year         = 2020,
  publisher    = {Zenodo},
  version      = {v0.9.4},
  doi          = {10.5281/zenodo.4381785},
  url          = {https://doi.org/10.5281/zenodo.4381785}
}
```

In [None]:
# Smoke test: confirm that the runtime executes cells.
# Reference: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=J4QxBareshEV
smoke_message = 'test'
print(smoke_message)

test


In [None]:
# BERTopic Maarten Grootendorst
# Installation with sentence-transformers

! pip install bertopic

#    3 main algorithm components
# 1. Embed Documents: Extract document embeddings with Sentence Transformers
# 2. Cluster Documents: Create groups of similar documents with UMAP (to reduce the dimensionality of embeddings) 
#    and HDBSCAN (to identify and cluster semantically similar documents)
# 3. Create Topic Representation: Extract and reduce topics with c-TF-IDF 
#    (class-based term frequency, inverse document frequency)


Collecting bertopic
  Downloading bertopic-0.9.4-py2.py3-none-any.whl (57 kB)
[?25l[K     |█████▊                          | 10 kB 27.3 MB/s eta 0:00:01[K     |███████████▍                    | 20 kB 9.4 MB/s eta 0:00:01[K     |█████████████████               | 30 kB 8.2 MB/s eta 0:00:01[K     |██████████████████████▊         | 40 kB 3.6 MB/s eta 0:00:01[K     |████████████████████████████▍   | 51 kB 4.0 MB/s eta 0:00:01[K     |████████████████████████████████| 57 kB 2.7 MB/s 
Collecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 7.6 MB/s 
[?25hCollecting hdbscan>=0.8.27
  Downloading hdbscan-0.8.28.tar.gz (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 37.7 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting umap-learn>=0.5.0
  Downloading

In [None]:
# Extract topics and generate probabilities for the 20 Newsgroups corpus.

from bertopic import BERTopic
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups

# Load every subset of 20 Newsgroups with headers, footers and quotes removed,
# keeping only the raw document texts.
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']

# Fit a BERTopic model with default settings; fit_transform returns the topic
# assignments and the probabilities for the documents it was fitted on.
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)

In [None]:
# Inspect the frequent topics that were generated.
# Topic -1 refers to all outliers and should typically be ignored.

topic_info = topic_model.get_topic_info()
topic_info

SyntaxError: invalid syntax (<ipython-input-3-22b5019ff26f>, line 3)

In [None]:
# Topic 0 is the most frequent topic that was generated; show its representation.

most_frequent_topic = 0
topic_model.get_topic(most_frequent_topic)

In [None]:
# Visualize Topics
#
# A notebook cell only renders the value of its last expression, so the figure
# returned by visualize_topics() would be silently discarded if it were left as
# a bare call. Each figure is therefore shown explicitly.
# NOTE(review): relies on both methods returning plotly figures (which expose
# .show()), as documented by BERTopic.

topic_model.visualize_topics().show()
topic_model.visualize_barchart().show()

In [None]:
# Dynamic Topic Modeling (DTM) is a collection of techniques aimed at analyzing the evolution of topics over time.
# These methods allow you to understand how a topic is represented over time.
# Here, we use all of Donald Trump's tweets to see how he talked about certain topics over time.

import re
import pandas as pd


def clean_tweet(text: str) -> str:
    """Normalize a single tweet for topic modeling.

    Links and @-mentions are removed, the text is lowercased, and every
    character that is not an ASCII letter is treated as a separator.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text with words separated by single spaces. The result is
        an empty string when nothing but links, mentions or non-letters remain.
    """
    # Strip links first, then lowercase what is left.
    text = re.sub(r"http\S+", "", text).lower()
    # Drop every whitespace-separated token that starts with "@".
    text = " ".join(token for token in text.split() if not token.startswith("@"))
    # Turn each run of non-letters into a space and collapse the whitespace.
    return " ".join(re.sub("[^a-zA-Z]+", " ", text).split())


trump_raw = pd.read_csv('https://drive.google.com/uc?export=download&id=1xRKHaP-QwACMydlDnyFPEaFdtskJuBa6')

# Clean the text column in a single pass (instead of three row-wise
# DataFrame.apply passes) and leave the loaded frame untouched.
trump = trump_raw.assign(text=trump_raw.text.map(clean_tweet))
# Keep only rows whose isRetweet column is "f" and whose cleaned text is non-empty.
trump = trump.loc[(trump.isRetweet == "f") & (trump.text != ""), :]
timestamps = trump.date.to_list()
tweets = trump.text.to_list()

In [None]:
# Create and train a BERTopic model on the tweets to extract the global topic representations.
# Note: this rebinds topic_model, topics and probs from the earlier 20 Newsgroups cell.

topic_model = BERTopic(verbose=True)
fit_result = topic_model.fit_transform(tweets)
topics, probs = fit_result

In [None]:
# From these topics, generate the topic representations at each timestamp for each topic
# by calling topics_over_time with the tweets, the corresponding timestamps, and the related topics.
# The arguments are passed by keyword so that the call does not depend on the
# positional parameter order of the installed bertopic release.

topics_over_time = topic_model.topics_over_time(
    docs=tweets,
    topics=topics,
    timestamps=timestamps,
    nr_bins=20,
)

In [None]:
# Plot the per-timestamp topic representations with visualize_topics_over_time(),
# passing top_n_topics=6.

topic_model.visualize_topics_over_time(
    topics_over_time,
    top_n_topics=6,
)