# MEDBLAST

## LOAD LIB

In [2]:
%%capture 
%pip install -U bertopic numpy pandas

In [3]:
# !conda install -c conda-forge hdbscan

In [4]:
# %pip install bertopic

In [5]:
import numpy as np
import pandas as pd
from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm


## GET THE DATA

In [6]:
# Base name of the saved artifacts: it is used to build the embeddings file name
# ("<MODEL_NAME>-embedded.npy") and as the path of the locally saved model.
MODEL_NAME = "BERTOPIC_MEDBLAST"

In [7]:
# Read the array stored in "<MODEL_NAME>-embedded.npy".
# NOTE(review): presumably one precomputed embedding row per document — confirm.
embeddings_file = f"{MODEL_NAME}-embedded.npy"
embeddings = np.load(embeddings_file)

In [8]:
# Yearly cleaned exports, newest first; the order determines the row order of `df`
# and therefore the order of `docs`.
YEARS = (2023, 2022, 2021, 2020, 2019, 2018)

# One frame per year, read from "clean<year>.csv" in the working directory.
dfs = [pd.read_csv(f"clean{year}.csv") for year in YEARS]

# Keep the per-year names available for any cell that inspects a single year.
df2023, df2022, df2021, df2020, df2019, df2018 = dfs

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it every
# yearly frame contributes its own 0..k labels and `df.loc[i]` matches several rows.
df = pd.concat(dfs, ignore_index=True)

# Documents fed to the topic model, as plain Python strings
# (astype(str) also turns missing values into the string "nan").
docs = df["Title_Abstract"].astype(str).tolist()

## LOAD MODEL

In [9]:
# Load the saved topic model ("Tsunnami/BERTopic-ALI-LARGE" is presumably a
# Hugging Face Hub repo id — confirm).
# `embedding_model` is meant for an embedding backend (e.g. a sentence-transformers
# model or its name), not for the precomputed vectors held in `embeddings`, so the
# array is not passed here. Precomputed vectors belong to the calls that accept an
# `embeddings=` argument.
topic_model = BERTopic.load("Tsunnami/BERTopic-ALI-LARGE")

In [10]:
# Per-topic summary table of the loaded model; only the first rows are displayed.
freq = topic_model.get_topic_info()

freq.head()

# NOTE(review): Representative_Docs is NaN in the recorded output. Presumably the
# saved model does not carry representative documents — confirm before relying on
# that column; the other columns are populated.

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,5490,-1_and_the_of_was,"[and, the, of, was, to, in, for, were, with, by]",
1,0,528,0_proton_collisions_tev_boson,"[proton, collisions, tev, boson, cms, search, ...",
2,1,260,1_af_cardiac_heart_ventricular,"[af, cardiac, heart, ventricular, patients, ab...",
3,2,194,2_supply_innovation_chain_smes,"[supply, innovation, chain, smes, business, or...",
4,3,152,3_firms_board_corporate_market,"[firms, board, corporate, market, takeover, fi...",


In [11]:
# Pair every document with the topic id stored on the model (`topics_`);
# both sequences must have the same length.
doc_topic_columns = dict(Document=docs, Topic=topic_model.topics_)
pd.DataFrame(doc_topic_columns)

Unnamed: 0,Document,Topic
0,Graphene oxide-alginate hydrogel-based indicat...,21
1,Rare coordination behavior of triethanolamine ...,230
2,Total ammonia nitrogen removal and microbial c...,168
3,Effects of microaeration and sludge recirculat...,107
4,Bioaccumulation of heavy metals in commerciall...,17
...,...,...
19545,Association between leukocyte telomere length ...,117
19546,Anaerobic co-digestion of hydrolysate from alk...,-1
19547,Bullying at work: Cognitive appraisal of negat...,369
19548,Three-dimensional interaction diagram for the ...,18


### BACKUPS

In [12]:
# Backup path: load the copy of the model saved locally under MODEL_NAME.
# This rebinds `topic_model`, so every later cell uses this local copy.
# `embedding_model` is meant for an embedding backend (e.g. a sentence-transformers
# model or its name), not for the precomputed vectors held in `embeddings`, so the
# array is not passed here.
topic_model = BERTopic.load(MODEL_NAME)

## VISUALIZER

In [13]:
%pip install --upgrade nbformat

Note: you may need to restart the kernel to use updated packages.


In [39]:
%pip install matplotlib

Collecting matplotlib
  Downloading matplotlib-3.8.4-cp311-cp311-win_amd64.whl.metadata (5.9 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.2.1-cp311-cp311-win_amd64.whl.metadata (5.8 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.51.0-cp311-cp311-win_amd64.whl.metadata (162 kB)
     ---------------------------------------- 0.0/162.8 kB ? eta -:--:--
     --------- --------------------------- 41.0/162.8 kB 991.0 kB/s eta 0:00:01
     -------------------------------------- 162.8/162.8 kB 3.3 MB/s eta 0:00:00
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.5-cp311-cp311-win_amd64.whl.metadata (6.5 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Using cached pyparsing-3.1.2-py3-none-any.whl.metadata (5.1 kB)
Downloading matplotlib-3.8.4-cp311-cp311-win_amd64.whl (7.7 MB)
   ---------------------

In [42]:
import nbformat
from umap import UMAP
import matplotlib

In [15]:
# Topic overview figure with BERTopic's default arguments; as the last expression
# of the cell, the returned figure is what the notebook displays.
topic_model.visualize_topics()

In [17]:
# Build the topic hierarchy figure. The name `hierarchical_topics` is kept for
# compatibility, but note that it holds the figure returned by visualize_hierarchy().
hierarchical_topics = topic_model.visualize_hierarchy()

# An assignment alone displays nothing; end the cell with the figure so it is rendered.
hierarchical_topics

In [19]:
from sentence_transformers import SentenceTransformer

In [25]:
# Load the saved predictions. NOTE: this rebinds `docs` (previously the list of
# document strings) to a DataFrame read from the CSV.
docs = pd.read_csv("BERTOPIC_MEDBLAST-pred.csv")

# Texts to plot, taken from the "Document" column.
text_data = docs['Document'].tolist()

# Sentence-transformers model used only to position the documents in the plot.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every document. NOTE: this rebinds `embeddings` (previously the array
# loaded from the .npy file) to the freshly computed vectors.
embeddings = sentence_model.encode(text_data, show_progress_bar=False)

# Project the vectors to 2-D once and hand the projection to visualize_documents
# through `reduced_embeddings`. A cell displays only its last expression, so a
# single call on the reduced vectors is all that is needed; building an additional
# figure from the full-size embeddings would only repeat the expensive reduction.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
topic_model.visualize_documents(text_data, reduced_embeddings=reduced_embeddings)

KeyboardInterrupt: 

In [26]:
# Bar chart figure for the model's topics, default arguments; displayed as the
# cell's last expression.
topic_model.visualize_barchart()

In [27]:
# Heatmap figure for the model's topics, default arguments; displayed as the
# cell's last expression.
topic_model.visualize_heatmap()

In [28]:
# Term-rank figure with the default (non-log) scale; the log-scaled variant is
# produced separately with log_scale=True.
topic_model.visualize_term_rank()

In [29]:
# Same term-rank figure, this time requested with log_scale=True.
topic_model.visualize_term_rank(log_scale=True)

In [41]:
# Re-read the saved predictions and keep only the "Document" column;
# `docs` ends up bound to that column (a pandas Series).
pred_frame = pd.read_csv("BERTOPIC_MEDBLAST-pred.csv")
docs = pred_frame["Document"]

# Approximate topic distributions for every document; calculate_tokens=True also
# returns the token-level distributions needed for the visualization below.
distributions = topic_model.approximate_distribution(docs, calculate_tokens=True)
topic_distr, topic_token_distr = distributions

# Token-level view for the document at index 1.
# Rendering the result needs matplotlib: the recorded run failed with
# "ImportError: background_gradient requires matplotlib."
dft = topic_model.visualize_approximate_distribution(docs[1], topic_token_distr[1])
dft


100%|██████████| 20/20 [02:28<00:00,  7.41s/it]


ImportError: background_gradient requires matplotlib.

<pandas.io.formats.style.Styler at 0x2027a37a510>