In [1]:
# script for testing the bertopic functionality and classes
from bertopic import BERTopic
from logging import error
import data
import conf
import pandas as pd
import tools
import model
import json
import os
from sentence_transformers import SentenceTransformer

# Resolve the project paths once; the loaders below read the "SDGs_inf" and "ref" entries.
paths = conf.get_paths()
# Load every corpus used in this notebook. Each loader result is unpacked into the
# texts (raw_*) and, presumably, their associated SDG labels (sdgs_*) -- TODO confirm
# the exact return types in the data module.
raw_orgFiles, sdgs_orgFiles = data.get_sdgs_org_files(paths["SDGs_inf"])
# The two nature loaders return a third value as well (index_*).
raw_natureShort, sdgs_nature, index_abstracts = data.get_nature_abstracts()
raw_natureExt, sdgs_natureAll, index_full = data.get_nature_files(abstract=True, kw=True, intro=True, body=True, concl=True)
# NOTE(review): the pathfinder results are not referenced by any of the cells shown here.
raw_pathFinder, sdgs_pathFinder = data.get_sdgs_pathfinder(paths["ref"], min_words=200)
raw_extraFiles, sdgs_extra = data.get_extra_manual_files(paths["ref"])
raw_healthcare, sdgs_healthcare = data.get_health_care_files(paths["ref"], n_files=100)

def prepare_texts(corpus):
    """Tokenize every text of ``corpus`` and join its tokens back into one string.

    Each text is passed to ``tools.tokenize_text`` with lemmatization and
    stemming disabled and the extended stop-word list enabled; the tokens it
    returns are joined with single spaces.

    Args:
        corpus: Iterable of texts.

    Returns:
        A new list holding one space-joined string per input text, in input order.
    """
    def _clean(document):
        tokens = tools.tokenize_text(document, lemmatize=False, stem=False, extended_stopwords=True)
        return " ".join(tokens)

    return [_clean(document) for document in corpus]
        
# NOTE(review): raw_trainFiles is not defined in the cells shown here; the line below is dead code.
# trainFiles = prepare_texts(raw_trainFiles)
# Tokenized, space-joined copies of the raw corpora (see prepare_texts).
orgFiles = prepare_texts(raw_orgFiles)
extraFiles = prepare_texts(raw_extraFiles)
healthcareFiles = prepare_texts(raw_healthcare)
natureShort = prepare_texts(raw_natureShort)
natureExt = prepare_texts(raw_natureExt)

# Seed list handed to train_global_model as seed_topic_list in the training cell.
sdgs_seed_list = data.get_sdgs_seed_list(paths["ref"])

# NOTE(review): sentence_model is not passed to any call in the cells shown here --
# confirm whether the classifier is expected to receive it.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

- 510 sdgs files were found
- 164 nature abstracts were found
- 186 nature files were found
- 434 texts in the pathfinder dataset
- 34 manual files were found
- 2000 health care texts


In [21]:
# trainData = [raw_orgFiles + raw_extraFiles + raw_healthcare, sdgs_orgFiles + sdgs_extra + sdgs_healthcare]
# Training set as a two-item list: the tokenized corpora concatenated, and the
# matching sdgs_* lists concatenated in the same order.
trainData = [orgFiles + extraFiles + healthcareFiles, sdgs_orgFiles + sdgs_extra + sdgs_healthcare]
# trainData = [raw_orgFiles, sdgs_orgFiles]
topic_model = model.BERTopic_classifier(paths)

# inherit the whole class?

# TODO: try to generate a methodology for assigning labels to the training texts in order to classify them?
# NOTE(review): the output saved with this cell ends in
# "AttributeError: 'BERTopic' object has no attribute 'get_num_topics'"; it presumably
# originates inside train_global_model -- confirm in the model module.
topic_model.train_global_model(trainData, seed_topic_list=sdgs_seed_list)
# topic_model.load_global_model()
# topic_model.map_model_topics_to_sdgs(associated_sdgs=trainData[1])

Batches: 100%|██████████| 21/21 [00:28<00:00,  1.34s/it]
2022-05-07 11:17:38,775 - BERTopic - Transformed documents to Embeddings
Batches: 100%|██████████| 1/1 [00:00<00:00, 26.89it/s]
2022-05-07 11:17:43,820 - BERTopic - Reduced dimensionality
2022-05-07 11:17:43,915 - BERTopic - Clustered reduced embeddings


AttributeError: 'BERTopic' object has no attribute 'get_num_topics'

In [24]:
# Bar chart (18 topics, 20 words each) drawn from the model stored in the wrapper's global_model attribute.
topic_model.global_model.visualize_barchart(top_n_topics=18, n_words=20)

In [26]:
# Hand-written value for the wrapper's topics_association attribute.
# NOTE(review): 17 entries in which 3 appears twice and 17 never appears -- presumably a
# topic-to-SDG mapping; confirm the duplicate is intended.
topic_model.topics_association = [3, 3, 11, 16, 9, 5, 14, 15, 2, 4, 8, 1, 6, 7, 13, 12, 10]
# Call transform on the first prepared nature abstract only (a single string, not a list).
topics, probs = topic_model.global_model.transform(natureShort[0])

Batches: 100%|██████████| 1/1 [00:00<00:00, 50.25it/s]
2022-05-07 11:23:49,659 - BERTopic - Reduced dimensionality
2022-05-07 11:23:49,663 - BERTopic - Calculated probabilities with HDBSCAN
2022-05-07 11:23:49,664 - BERTopic - Predicted clusters


In [None]:
# Bar chart of 20 words for each of the first 17 topics.
# topic_model is the BERTopic_classifier wrapper; the executed cells of this notebook
# reach the BERTopic API through its global_model attribute, so this call does too.
topic_model.global_model.visualize_barchart(top_n_topics=17, n_words=20)

In [None]:
# Predict a topic and the topic probabilities for every prepared extended nature text.
# The call goes through global_model, matching the executed single-document transform cell.
topics, probs = topic_model.global_model.transform(natureExt)

In [None]:
# Print, for up to the first 60 transformed documents, every probability of that
# document (formatted as "x<column index>:<probability>") next to the matching
# sdgs_natureAll entry.
# `docs` holds row indices into `probs`, not document texts; it is capped at
# len(probs) so that a shorter result cannot raise IndexError.
docs = range(min(60, len(probs)))
for doc in docs:
    probs_ascii = ["x{}:{:.3f}".format(ii, prob) for ii, prob in enumerate(probs[doc])]
    print(['|'.join(probs_ascii), sdgs_natureAll[doc]])

In [None]:

# freq = topic_model.get_topic_info(); freq.head(10)
# A notebook cell renders only its last expression, so the first figure is shown
# explicitly instead of being discarded. All calls go through global_model, as in
# the executed cells of this notebook.
topic_model.global_model.visualize_topics().show()
# Representative documents, kept for manual inspection
# (presumably to understand what each topic captures).
documents = topic_model.global_model.get_representative_docs(topic=None)
# Distribution plot for entry 2 of probs; rendered as the cell output.
topic_model.global_model.visualize_distribution(probs[2], min_probability=0.015)

In [None]:
# Display entry 4 of probs (presumably the topic probabilities of the fifth transformed document -- verify).
probs[4]

In [None]:
# Hierarchy plot limited to 50 topics; called on global_model because the executed
# cells of this notebook reach the BERTopic API through that attribute of the wrapper.
topic_model.global_model.visualize_hierarchy(top_n_topics=50)

In [None]:
# Topic heatmap (15 clusters, 1000x1000); called on global_model because the executed
# cells of this notebook reach the BERTopic API through that attribute of the wrapper.
topic_model.global_model.visualize_heatmap(n_clusters=15, width=1000, height=1000)

In [None]:
# combine with the elbow method to adjust the best number of words per topic
# (called on global_model, the attribute through which the executed cells reach BERTopic)
topic_model.global_model.visualize_term_rank()

In [None]:
# FINE TUNE THE MODEL FOR BETTER UNDERSTANDING
# `topics` and `probs` were produced by transform(natureExt), so natureExt is the
# matching document list; `docs` is only a range of row indices at this point and
# must not be passed as the documents.
# NOTE(review): BERTopic documents update_topics/reduce_topics as taking the fit-time
# documents and topics -- confirm that refitting on the nature texts is intended.
topic_model.global_model.update_topics(natureExt, topics, n_gram_range=(1, 2))
new_topics, new_probs = topic_model.global_model.reduce_topics(natureExt, topics, probs, nr_topics=60)
# Save the BERTopic instance itself so that BERTopic.load below can read it back.
topic_model.global_model.save("my_model")
my_model = BERTopic.load("my_model")