In [24]:
!pip install nbib
!pip install bertopic



In [44]:
from bertopic import BERTopic

In [26]:
## Script for converting PubMed search results file into .csv

# import pandas as pd
# import nbib
# import os
# import sys

## This script takes a "citation manager file" of metadata from a PubMed search, which arrives in .nbib format, and converts
## it into a .csv using the Python library nbib.

## input: filepath to .nbib file, desired name of the output .csv

# def make_csv(filepath,output_name):

#     docs = nbib.read_file(filepath)
#     entries_list = []
#     bad_list = 0
#     for doc in docs:
#         try:
#             entry = {
#                 'pubmed_id':doc['pubmed_id'],
#                 'title':doc['title'],
#                 'date':doc['pubmed_time'],
#                 'doi':doc['doi'],
#                 'descriptors':doc['descriptors'],
#                 'abstract':doc['abstract'],
#                 'journal':doc['journal']
#             }
#             entries_list.append(entry)
#         except:
#             bad_list +=1

#     # Check to see if the /data folder exists.  Create one if not.

#     path = os.getcwd()+'/data'
#     if not os.path.exists(path):
#         os.makedirs(path)
#     print(path)
#     pd.DataFrame(entries_list).to_csv(path+'/'+output_name+'.csv',index=False)

# make_csv('./pubmed-oxycontin-set(1).nbib','oxycontin_96_23')



## Load corpus

In [27]:
from google.colab import drive
# Mount Google Drive at /content/drive; the corpus CSV is read from, and the HTML
# figures are written to, paths under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
import pandas as pd
# Load the abstracts export from Drive. Later cells read the title, abstract,
# date, doi and pubmed_id columns from this frame.
corpus = pd.read_csv(
    '/content/drive/MyDrive/a_datasets/oxycontin_abstracts_96_23.csv'
)

# One abstract per corpus row, in row order; this list is what gets embedded and modeled.
abstracts = corpus['abstract'].tolist()

## Create embeddings for each document in the corpus using the "all-MiniLM-L6-v2" language model that is the default for BERTopic

In [29]:
from sentence_transformers import SentenceTransformer

# Encode every abstract once up front, so later pipeline stages can be re-run
# and tuned without paying the embedding cost again.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(
    abstracts,
    show_progress_bar=True,
)

Batches:   0%|          | 0/118 [00:00<?, ?it/s]

## Set up the pipeline for the remainder of the topic modeling process

Note: because the documents are now pre-embedded, you can tune hyperparameters and try different approaches much more quickly than you could otherwise

## 1. Reduce the dimensionality of the embeddings

In [30]:
from umap import UMAP

# Fixing random_state prevents stochastic behavior when the pipeline is re-run.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=4,
)

## 2. Cluster the resulting dense vectors

In [31]:
from hdbscan import HDBSCAN

# Clusterer applied to the dimensionality-reduced embeddings.
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)


## 3. Prep for topic modeling

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
# Stop words are removed here. The author of the library rarely finds that helpful
# for the topic naming process, but with this dataset it removed some noise from
# the topic names.
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=2,
    ngram_range=(1, 4),
)

## 4. Select the representation model that will draw the words that best describe the core themes of each topic cluster

In [33]:
from bertopic.representation import KeyBERTInspired

# Create your representation model
# NOTE(review): this assignment is overwritten by the PartOfSpeech model in the
# next cell before BERTopic is constructed, so the fitted topic_model does not
# use KeyBERTInspired. Confirm which representation is intended.
representation_model = KeyBERTInspired()

# Use the representation model in BERTopic on top of the default pipeline
# topic_model = BERTopic(representation_model=representation_model)

## 5. Run the pipeline to create a topic_model

In [34]:
from bertopic import BERTopic

from bertopic.representation import PartOfSpeech

# Part-of-speech based representation model; this is the one handed to BERTopic below.
representation_model = PartOfSpeech("en_core_web_sm")

# The pre-built components for each stage of the pipeline.
pipeline_models = dict(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
)

# Assemble the model: pipeline components plus hyperparameters.
topic_model = BERTopic(**pipeline_models, top_n_words=10, verbose=True)

# Fit on the abstracts, supplying the pre-computed embeddings so they are not re-encoded.
topics, probs = topic_model.fit_transform(abstracts, embeddings=embeddings)

2023-09-21 13:45:19,984 - BERTopic - Reduced dimensionality
2023-09-21 13:45:20,121 - BERTopic - Clustered reduced embeddings


## Review topics
Note: Topic -1 contains the outliers, i.e. documents that were not assigned to any topic cluster.

In [35]:
# Summary table of the fitted topics (id, size, generated name, representation words, representative docs).
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1095,-1_opioid_oxycodone_pain_patients,"[opioid, oxycodone, pain, patients, opioids, u...",[BACKGROUND: This review is one of a series on...
1,0,270,0_cancer_pain_patients_oxycodone,"[cancer, pain, patients, oxycodone, morphine, ...",[BACKGROUND: Many people with cancer experienc...
2,1,213,1_urine_ng_samples_testing,"[urine, ng, samples, testing, method, specimen...","[OxyContin, a controlled-release formulation o..."
3,2,170,2_opioid_surgery_patients_postoperative,"[opioid, surgery, patients, postoperative, pre...",[BACKGROUND: While there is an increasing burd...
4,3,170,3_receptor_antinociceptive_morphine_effects,"[receptor, antinociceptive, morphine, effects,...",[Effective treatments for chronic pain without...
5,4,160,4_oxycodone_rats_self_mice,"[oxycodone, rats, self, mice, administration, ...",[Recent evidence suggests that inhibition of t...
6,5,137,5_group_knee_patients_postoperative,"[group, knee, patients, postoperative, pain, a...",[BACKGROUND: The utility of a femoral nerve bl...
7,6,111,6_constipation_bowel_naloxone_patients,"[constipation, bowel, naloxone, patients, func...",[In patients managed with opioids for chronic ...
8,7,109,7_women_cesarean_delivery_pain,"[women, cesarean, delivery, pain, opioid, grou...",[OBJECTIVE: To identify characteristics associ...
9,8,107,8_group_kg_oxycodone_mg,"[group, kg, oxycodone, mg, groups, postoperati...",[BACKGROUND: The aim of this study was to eval...


In [36]:
# Reuse the exact list the model was fitted on, so downstream calls
# (hierarchical_topics, topics_over_time) see the same documents in the same
# order as fit_transform did, instead of re-extracting them from the frame.
docs = abstracts

In [37]:
from scipy.cluster import hierarchy as sch
# Hierarchical topics
def linkage_function(x):
  """Single-linkage hierarchical clustering of `x` with optimal leaf ordering."""
  return sch.linkage(x, 'single', optimal_ordering=True)


hierarchical_topics = topic_model.hierarchical_topics(
    docs,
    linkage_function=linkage_function,
)

# Document visualization: titles are passed in place of the full abstracts,
# alongside the pre-computed embeddings. The figure is saved as a standalone
# HTML file on Drive and displayed as the cell output.
titles = corpus['title'].tolist()
cluster_fig = topic_model.visualize_documents(titles, embeddings=embeddings)
cluster_fig.write_html('/content/drive/MyDrive/a_datasets/topic_distro.html')
cluster_fig

100%|██████████| 36/36 [00:32<00:00,  1.12it/s]


In [47]:
# Build the topic-hierarchy figure from the hierarchical_topics computed earlier,
# save it as a standalone HTML file on Drive, then display it as the cell output.
hierarch_fig = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
hierarch_fig.write_html('/content/drive/MyDrive/a_datasets/hierarch_cluster.html')

hierarch_fig

In [39]:
# Document-level view of the hierarchy, using titles and the pre-computed embeddings.
topic_model.visualize_hierarchical_documents(titles, hierarchical_topics, embeddings=embeddings)

In [None]:
# Parse the date column to datetimes. This overwrites corpus['date'] in place.
corpus['date'] = pd.to_datetime(corpus['date'])
# One timestamp per document, in the same row order as docs.
timestamps = corpus['date'].tolist()
# Topic representation over time, with nr_bins=20 passed to limit the number of time bins.
topics_over_time = topic_model.topics_over_time(docs, timestamps,nr_bins=20)
# Save the figure as a standalone HTML file on Drive and display it as the cell output.
time_fig = topic_model.visualize_topics_over_time(topics_over_time)
time_fig.write_html('/content/drive/MyDrive/a_datasets/topics_over_time.html')
time_fig

## Explore Search Uses

In [None]:
def search_model(search_text, top_n=5, max_hits=10):
  """Print the topics most similar to a query, plus sample papers from the best match.

  Args:
    search_text: Free-text query passed to ``topic_model.find_topics``.
    top_n: Number of similar topics to request (default 5, the previous fixed value).
    max_hits: Maximum number of papers to list from the best-matching topic
      (default 10, the previous fixed value).

  Returns:
    None. Prints the list of similar topic ids, then the title and PubMed URL
    of each listed paper.
  """
  # The similarity scores are returned too but are not needed for the listing.
  similar_topics, _similarity = topic_model.find_topics(search_text, top_n=top_n)
  print('Topics:{}'.format(similar_topics))

  # Nothing to list if no topic was returned (previously an IndexError).
  if len(similar_topics) == 0:
    return

  # topic_model.topics_ is paired positionally with the corpus rows the model was
  # fitted on; attach the corpus index so the boolean mask lines up with corpus.
  topic_ids = pd.Series(topic_model.topics_, index=corpus.index)
  in_best_topic = topic_ids == similar_topics[0]

  # Only the columns that are actually printed are selected.
  hits = corpus.loc[in_best_topic, ['title', 'pubmed_id']].head(max_hits)
  for row in hits.itertuples():
    print(row.title)
    print('https://pubmed.ncbi.nlm.nih.gov/' + str(row.pubmed_id))

In [40]:
# Example query: list the closest topics and sample papers for the term 'horses'.
search_model('horses')

Topics:[28, 4, 36, 35, 3]
Prophylactic vaccination protects against the development of oxycodone self-administration.
https://pubmed.ncbi.nlm.nih.gov/29936242
Opioid Dose- and Route-Dependent Efficacy of Oxycodone and Heroin Vaccines in Rats.
https://pubmed.ncbi.nlm.nih.gov/29535156
Pharmacological mechanisms underlying the efficacy of antibodies generated by a vaccine to treat oxycodone use disorder.
https://pubmed.ncbi.nlm.nih.gov/34126123
Polymer-mediated delivery of vaccines to treat opioid use disorders and to reduce opioid-induced toxicity.
https://pubmed.ncbi.nlm.nih.gov/32439214
Pre-clinical safety and toxicology profile of a candidate vaccine to treat oxycodone use disorder.
https://pubmed.ncbi.nlm.nih.gov/35469698
Structures of drug-specific monoclonal antibodies bound to opioids and nicotine reveal a common mode of binding.
https://pubmed.ncbi.nlm.nih.gov/36513069
Effect of currently approved carriers and adjuvants on the pre-clinical efficacy of a conjugate vaccine against 

## Possible evaluation approaches

In [45]:
# Count abstracts that mention 'abuse' or 'overdose' (case-insensitive via lowercasing).
abstracts_lower = corpus.abstract.str.lower()
mentions_abuse = abstracts_lower.str.contains('abuse')
mentions_overdose = abstracts_lower.str.contains('overdose')
len(corpus[mentions_abuse | mentions_overdose])

863

In [42]:
# Pair each document's assigned topic id with its title and abstract.
df = pd.DataFrame({'topic':topic_model.topics_,'title':corpus['title'],'abstract':corpus['abstract']})


In [46]:
# Number of documents assigned to a hand-picked set of topic ids.
# NOTE(review): presumably these are the topics judged to be about abuse/overdose,
# for comparison with the keyword count above. Confirm, and note that topic ids
# depend on the fitted model.
len(df[df.topic.isin([10, 35, 12, 22, 30])])

233