In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
!pip install transformers sentence-transformers datasets fsspec bertopic

In [None]:
from datasets import load_dataset
# Fetch the ArXiv NLP dataset and keep only its "train" split.
dataset = load_dataset(path="maartengr/arxiv_nlp")["train"]

In [None]:
# List the available columns; 'Abstracts' and 'Titles' are extracted from them next.
dataset.column_names

In [None]:
# Materialise both columns as plain Python lists once. Later cells are inconsistent
# (some wrap `abstracts` in list(), others pass it as-is), and depending on the
# installed `datasets` version column access may not return a real list.
abstracts = list(dataset['Abstracts'])
titles = list(dataset['Titles'])

In [None]:
!pip uninstall huggingface_hub -y
!pip install huggingface-hub==0.25.2


In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Load the "thenlper/gte-small" sentence-transformer; it is used below to embed the
# abstracts and is also handed to BERTopic as its embedding model.
embedding_model = SentenceTransformer("thenlper/gte-small")

In [None]:
from transformers.utils import cached_file

# Resolve where the embedding model's vocabulary file is stored and show the path.
model_id, file_name = "thenlper/gte-small", "vocab.txt"
file_path = cached_file(model_id, file_name)
print(file_path)

In [None]:
# Encode every abstract into an embedding, showing a progress bar while it runs.
embeddings = embedding_model.encode(list(abstracts), show_progress_bar=True)

In [None]:
from umap import UMAP

In [None]:
# Dimensionality reduction: project the embeddings to 5 components using cosine
# distance, seeded with random_state=42.
umap_model = UMAP(
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
)

# Hand the embeddings over unchanged (exactly as the BERTopic fit further down does)
# instead of first wrapping them in list(), which only adds a needless conversion.
reduced_embeddings = umap_model.fit_transform(embeddings)


In [None]:
from hdbscan import HDBSCAN

In [None]:
# Configure the clusterer first, then fit it on the reduced embeddings.
hdbscan_model = HDBSCAN(min_cluster_size=15, metric="euclidean", cluster_selection_method="eom")
hdbscan_model = hdbscan_model.fit(reduced_embeddings)

In [None]:
# Labels produced by the fitted clusterer, followed by the number of distinct labels.
# NOTE(review): if HDBSCAN marks outliers with a dedicated label (-1), that label is
# included in this count - confirm before quoting it as the number of clusters.
clusters = hdbscan_model.labels_
len(set(clusters))

In [None]:
from bertopic import BERTopic

In [None]:
# Assemble BERTopic from the components built above and fit it in one go, passing
# the embeddings computed earlier alongside the abstracts.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True,
).fit(list(abstracts), embeddings)


In [None]:
# Display the fitted model's per-topic overview.
topic_model.get_topic_info()

In [None]:
# Inspect the representation stored for topic 1.
topic_model.get_topic(1)

In [None]:
# Search the fitted topics with the free-text query "topic embedding".
topic_model.find_topics("topic embedding")

In [None]:
from bertopic.representation import KeyBERTInspired

In [None]:
# Representation model (default settings) passed to update_topics further down.
representation_model = KeyBERTInspired()

In [None]:
from copy import deepcopy

# Snapshot the current topic representations before update_topics is called, so
# topic_differences can compare later representations against this baseline.
original_topics = deepcopy(topic_model.topic_representations_)

In [None]:
# Environment check: print the installed sentence-transformers version.
import sentence_transformers
print(sentence_transformers.__version__)


In [None]:
import pandas as pd

def topic_differences(model, original_topics, nr_topics=5):
  """Compare original and current keyword representations of the first topics.

  Args:
    model: Object exposing ``get_topic(topic_id)``. The call is expected to
      return a sequence of ``(word, score)`` entries, or a falsy value when
      the topic does not exist.
    original_topics: Topic id -> sequence of ``(word, score)`` entries captured
      before the representations were updated.
    nr_topics: How many topic ids, counted upwards from 0, to compare.

  Returns:
    A DataFrame with the columns "Topic", "Original" and "Updated"; the last
    two hold the top five words of each representation joined by " | ".
    Topic ids that are absent (or empty) in either source are skipped, so
    asking for more topics than exist yields fewer rows instead of an error.
  """
  def top_words(representation, limit=5):
    # Only the word (first item) of every entry is shown; scores are dropped.
    return " | ".join([entry[0] for entry in representation][:limit])

  rows = []
  for topic in range(nr_topics):
    try:
      before = original_topics[topic]
    except (KeyError, IndexError):
      continue
    after = model.get_topic(topic)
    if not before or not after:
      continue
    rows.append((topic, top_words(before), top_words(after)))

  # Build the frame once instead of enlarging it row by row.
  return pd.DataFrame(rows, columns=["Topic", "Original", "Updated"])


In [None]:
# Recompute the topic representations using the representation model defined above.
topic_model.update_topics(docs=abstracts, representation_model=representation_model)


In [None]:
# Show the saved original representations next to the updated ones.
topic_differences(topic_model, original_topics)

In [None]:
from bertopic.representation import MaximalMarginalRelevance  # MMR

# Switch the representation model to MaximalMarginalRelevance (diversity 0.2)
# and recompute the topic representations with it.
representation_model = MaximalMarginalRelevance(diversity=0.2)
topic_model.update_topics(docs=abstracts, representation_model=representation_model)

# Line the new representations up against the saved originals.
topic_differences(topic_model, original_topics)

In [None]:
from transformers import pipeline
from bertopic.representation import TextGeneration

prompt = """I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: '[KEYWORDS]'.
Based on the documents and keywords, what is this topic about?"""

# Flan-T5 (small) serves as the generator behind BERTopic's TextGeneration wrapper.
generator = pipeline(task="text2text-generation", model="google/flan-t5-small")
representation_model = TextGeneration(
    generator,
    prompt=prompt,
    doc_length=50,
    tokenizer="whitespace",
)

# Regenerate the topic representations, then compare them with the saved originals.
topic_model.update_topics(docs=abstracts, representation_model=representation_model)
topic_differences(topic_model, original_topics)

In [None]:
import os
from getpass import getpass

import openai
from bertopic.representation import OpenAI

prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]
Based on the information above, extract a short topic label in the following
format:
topic: <short topic label>
"""

# Build the API client from the `openai` package itself. The bare name `OpenAI` is
# BERTopic's representation class (imported above), so using it for the client would
# construct the wrong object. The key is read from the OPENAI_API_KEY environment
# variable, with an interactive prompt as fallback, so it is never stored in the notebook.
client = openai.OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
)

# Update our topic representations using GPT-3.5
representation_model = OpenAI(
    client, model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt
)
topic_model.update_topics(abstracts, representation_model=representation_model)
topic_differences(topic_model, original_topics)
