## Loading Packages

In [1]:
from sentence_transformers import SentenceTransformer
import umap.umap_ as UMAP 
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

### https://www.pinecone.io/learn/bertopic/ ###

## Loading Data

In [2]:
data = pd.read_csv("abstract_title.csv")

## Lemmatization

In [3]:

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
 
# Create WordNetLemmatizer object
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lassehyldighansen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
def lemmatize_text(text):
     return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

In [5]:
data['Abstract_Lemma'] = data['Abstract'].apply(lemmatize_text)

In [6]:
data['title_Lemma'] = data['title'].apply(lemmatize_text)

In [7]:
data = data.astype(str)


## Running model

In [55]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
import nltk
nltk.download('stopwords')
from hdbscan import HDBSCAN

embedding_model = SentenceTransformer('all-mpnet-base-v2')
umap_model = UMAP()
hdbscan_model = HDBSCAN(min_cluster_size=10, min_samples=5,
                        gen_min_span_tree=True,
                        prediction_data=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lassehyldighansen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [56]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

stopwords = list(stopwords.words('english'))

# we add this to remove stopwords that can pollute topcs
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=stopwords)

In [57]:
model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    top_n_words=5,
    language='english',
    calculate_probabilities=True,
    verbose=True
)
topics, probs = model.fit_transform(data['Abstract'])

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

2022-09-05 13:53:03,972 - BERTopic - Transformed documents to Embeddings
2022-09-05 13:53:05,617 - BERTopic - Reduced dimensionality
2022-09-05 13:53:05,629 - BERTopic - Clustered reduced embeddings


In [53]:
preds, probs = model.transform(data['title_Lemma'])

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

2022-09-05 13:51:44,239 - BERTopic - Reduced dimensionality
2022-09-05 13:51:44,260 - BERTopic - Calculated probabilities with HDBSCAN
2022-09-05 13:51:44,262 - BERTopic - Predicted clusters


In [58]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,7,-1_user_evaluation_robot_system
1,0,131,0_robot_human_empathy_robots
2,1,10,1_service_robot_robots_social


In [22]:
probs

array([[9.99999996e-001, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 9.99999996e-001, 0.00000000e+000],
       [1.17862566e-001, 1.52212575e-001, 1.23407877e-001,
        1.25214778e-001, 2.48501885e-001, 1.42033557e-001],
       [1.62205138e-099, 1.36541665e-001, 8.37676435e-002,
        2.13535804e-134, 6.17066187e-001, 1.49660704e-001],
       [1.43528032e-001, 1.08148424e-001, 7.40811026e-002,
        9.61580273e-002, 9.40703933e-002, 1.30608820e-001],
       [1.52519815e-001, 1.22721237e-001, 9.68931602e-002,
        7.42868678e-002, 8.33020430e-002, 9.02835825e-002],
       [9.99999996e-001, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
       [1.22883466e-001, 1.51115493e-001, 1.34473448e-001,
        1.32068981e-001, 2.11646771e-001, 1.44372581e-001],
       [1.02990082e-001, 1.38850745e-001, 8.5211

In [51]:
model.visualize_barchart()


In [17]:
model.visualize_topics()