In [1]:
import sys
import pandas as pd
from bertopic import BERTopic
from time import sleep
import os
from transformers import AutoTokenizer, AutoModel
from openai import OpenAI

# Add the scripts directory to the path
sys.path.append('../scripts')

In [4]:
# Own scripts
import pubmed_helpers
import text_cleaning as tc

In [5]:
pubmed_200_docs = ['39823503','39821144','39820866','39820942','39823462','39823446','39820610','39820615','39823483','39820171','39823449','39820811','39820188','39821177','39820243','39820578','39823394','39820631','39820026','39823512','39823427','39823480','39820608','39820210','39821163','39820153','39823443','39821188','39820819','39821125','39823507','39820181','39820939','39821170','39821152','39823409','39823466','39821157','39820180','39823447','39823453','39823470','39823401','39820616','39823452','39823402','39823476','39823458','39823435','39823505','39820922','39820018','39821120','39820032','39820962','39821117','39820892','39823489','39820204','39820091','39820222','39823399','39823442','39820183','39823421','39823490','39820857','39820552','39823484','39823496','39820943','39823485','39820186','39823506','39821186','39823481','39820176','39823397','39820622','39820590','39820894','39820059','39820967','39823422','39820033','39820937','39820199','39823448','39823460','39821194','39820585','39823411','39820848','39823504','39820173','39823441','39820142','39820056','39823510','39823438','39820015','39820031','39820158','39823404','39821178','39823478','39820822','39823461','39820812','39823494','39820847','39821181','39820800','39820600','39823479','39820810','39823396','39821102','39821142','39820827','39820104','39820592','39820175','39823501','39820955','39821245','39823491','39823405','39823407','39823428','39823472','39823444','39820599','39820858','39820225','39820159','39821168','39820611','39821118','39820604','39823486','39821171','39820223','39823464','39820233','39820579','39823430','39823493','39820936','39821197','39820868','39820192','39821173','39823417','39820149','39823502','39821111','39820571','39823431','39820594','39823419','39821228','39823432','39820588','39823425','39820627','39823488','39823423','39820785','39820788','39820184','39823412','39820808','39820216','39820167','39823469','39820607','39820168','39823433','39820583','39820793','39820987','39823527','39821124','39820229','39820771','39820096','39820178','39820609','39823436','39821154','39823465','39820587','39823471','39820172','39823500','39820798','39820613','39820205','39820614']


In [15]:
# Get the data, append abstract and title to tuple.
# Wait 0.4 seconds between each request to avoid getting blocked

pubmed_data = []
for doc_id in pubmed_200_docs:
    pubmed_data.append(pubmed_helpers.fetch_pubmed_data(doc_id))
    # print status for every 10th document
    if len(pubmed_data) % 10 == 0:
        print(f'Fetched {len(pubmed_data)} documents')
    sleep(0.4)

In [17]:
# Add a random integer as a publication year to the data. This is just for testing purposes. Publication year should be between 2000 and 2021
import random
for i in range(len(pubmed_data)):
    pubmed_data[i] = pubmed_data[i] + (random.randint(2000,2021),)

In [None]:
# Store the docs as a Parquet file
# Don't overwrite if the file exists
if not os.path.exists('data/pubmed_data_200.parquet'):
    pd.DataFrame(pubmed_data, columns=['pmid','title', 'abstract','publication_year']).to_parquet('data/pubmed_data_200.parquet')

In [6]:
df = pd.read_parquet('data/pubmed_data_200.parquet')

In [6]:
def fetch_single_pubmed_article():
    pubmed_id = "31452104"
    pmid, title, abstract = pubmed_helpers.fetch_pubmed_data(pubmed_id)
    return pmid, title, abstract

fetch_single_pubmed_article()

('31452104',
 'Molegro Virtual Docker for Docking.',
 'Molegro Virtual Docker is a protein-ligand docking simulation program that allows us to carry out docking simulations in a fully integrated computational package. MVD has been successfully applied to hundreds of different proteins, with docking performance similar to other docking programs such as AutoDock4 and AutoDock Vina. The program MVD has four search algorithms and four native scoring functions. Considering that we may have water molecules or not in the docking simulations, we have a total of 32 docking protocols. The integration of the programs SAnDReS ( https://github.com/azevedolab/sandres ) and MVD opens the possibility to carry out a detailed statistical analysis of docking results, which adds to the native capabilities of the program MVD. In this chapter, we describe a tutorial to carry out docking simulations with MVD and how to perform a statistical analysis of the docking results with the program SAnDReS. To illustr

In [7]:
cleaned_docs = [tc.clean_text_ext_spacy(doc) for doc in df['abstract']]


In [8]:
df['cleaned_abstract'] = cleaned_docs

In [9]:
# BERTopic often wants the documents to be longer, so we can multiply the documents by 10
cleaned_docs = cleaned_docs*10

In [9]:
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
embed_model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')

In [50]:
# Fit the topic model
topic_model = BERTopic(verbose=True, embedding_model=embed_model, language='english', calculate_probabilities=True, nr_topics=10)
topics, probs = topic_model.fit_transform(df['cleaned_abstract'])
topic_model.get_topic_info()

2025-01-20 23:02:35,717 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

2025-01-20 23:02:37,764 - BERTopic - Embedding - Completed ✓
2025-01-20 23:02:37,764 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-01-20 23:02:37,862 - BERTopic - Dimensionality - Completed ✓
2025-01-20 23:02:37,863 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-01-20 23:02:37,875 - BERTopic - Cluster - Completed ✓
2025-01-20 23:02:37,875 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-01-20 23:02:37,903 - BERTopic - Representation - Completed ✓
2025-01-20 23:02:37,904 - BERTopic - Topic reduction - Reducing number of topics
2025-01-20 23:02:37,905 - BERTopic - Topic reduction - Reduced number of topics from 6 to 6


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,78,-1_patient_health_value_base,"[patient, health, value, base, high, aim, time...",[aim provide update overview medical error tax...
1,0,26,0_feature_image_classification_accuracy,"[feature, image, classification, accuracy, pro...",[paper present novel improve semantic segmenta...
2,1,24,1_carbon_soil_financial_community,"[carbon, soil, financial, community, specie, s...",[integrated crop livestock system livestock gr...
3,2,34,2_cell_cancer_mtx_gene,"[cell, cancer, mtx, gene, abl, protein, viral,...",[Rheumatoid arthritis RA long term autoinflamm...
4,3,23,3_child_pregnancy_health_risk,"[child, pregnancy, health, risk, mother, dengu...",[effective prevention mother child transmissio...
5,4,15,4_social_fi_ses_self,"[social, fi, ses, self, report, measure, adult...",[food insecurity FI lack access adequate food ...


In [52]:
# topic_model.set_topic_labels({0: 'Covid-19', 1: 'Vaccines', 2: 'Healthcare', 3: 'Cancer', 4: 'Mental Health', 5: 'Diabetes', 6: 'Cardiovascular', 7: 'Neuroscience', 8: 'Genetics', 9: 'Public Health'})

In [46]:
def get_topic_representation(keywords):
    client = OpenAI(
        api_key='ollama',  
        # api_version = "2024-02-15-preview",
        base_url='http://localhost:11434/v1/',
    )
    # Create the prompt to generate a concise topic label
    prompt = f"Here is a list of keywords that represent a topic: {', '.join(keywords)}. Can you generate a concise and meaningful topic label for this? Please respond only with the topic label."

    # Generate the topic label using the GPT-4 model
    response = client.chat.completions.create(
            # model = "gpt4-nnfimpact",
            model = "llama3.2",
            temperature = 0.0,
            messages = [
                    {"role": "user", "content": prompt}
            ]
        )
    return response.choices[0].message.content

In [65]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,Representative_Docs
0,-1,78,-1_patient_health_value_base,Healthcare Quality Standards and Benchmarking.,"[patient, health, value, base, high, aim, time...",[aim provide update overview medical error tax...
1,0,26,0_feature_image_classification_accuracy,Machine Learning Model Development,"[feature, image, classification, accuracy, pro...",[paper present novel improve semantic segmenta...
2,1,24,1_carbon_soil_financial_community,Soil Carbon Analysis in Community Development ...,"[carbon, soil, financial, community, specie, s...",[integrated crop livestock system livestock gr...
3,2,34,2_cell_cancer_mtx_gene,Viral oncogenesis and gene regulation in cancer.,"[cell, cancer, mtx, gene, abl, protein, viral,...",[Rheumatoid arthritis RA long term autoinflamm...
4,3,23,3_child_pregnancy_health_risk,"""Health Risks During Pregnancy: Dengue Prevent...","[child, pregnancy, health, risk, mother, dengu...",[effective prevention mother child transmissio...
5,4,15,4_social_fi_ses_self,Adult Social Injury Reporting and Health Measu...,"[social, fi, ses, self, report, measure, adult...",[food insecurity FI lack access adequate food ...


In [60]:
def get_topic_labels():
    topic_label = []
    for topic in topic_model.get_topic_info()['Representation']:
        topic_label.append(get_topic_representation(topic))
    return topic_label

In [62]:
topic_labels = get_topic_labels()

In [63]:
topic_model.set_topic_labels(topic_labels)

In [66]:
topic_model.visualize_topics(top_n_topics=10, custom_labels=True, width=800, height=800)

In [20]:
zeroshot_topic_list = ["Docking", "Program", "simulation"]
from bertopic.representation import KeyBERTInspired


topic_model_zeroshot = BERTopic(
    embedding_model="thenlper/gte-small", 
    min_topic_size=15,
    zeroshot_topic_list=zeroshot_topic_list,
    zeroshot_min_similarity=.85,
    representation_model=KeyBERTInspired()
)

In [13]:
topics_zeroshot, _ = topic_model_zeroshot.fit_transform(cleaned_docs)


In [16]:
topic_model_zeroshot.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,10,-1_lta_transition_regression_latent,"[lta, transition, regression, latent, , , , , , ]",[Latent transition analysis LTA useful statist...
1,0,120,0_pandemic_stress_athlete_anxiety,"[pandemic, stress, athlete, anxiety, mental, d...",[pandemic affected elite athlete lead increase...
2,1,100,1_vegetation_forest_saharan_africa,"[vegetation, forest, saharan, africa, africana...",[Dry evergreen Afromontane forest severely thr...
3,2,90,2_simulation_classification_critical_medical,"[simulation, classification, critical, medical...",[aim provide update overview medical error tax...
4,3,80,3_carbon_sustainable_global_approach,"[carbon, sustainable, global, approach, influe...",[global climate change large scale widespread ...
5,4,60,4_utilization_dental_health_oral,"[utilization, dental, health, oral, iran, prev...",[World Health Organization invite nation progr...
6,5,50,5_microbiome_microbiota_microbial_gut,"[microbiome, microbiota, microbial, gut, bee, ...",[disruption host associate microbial community...
7,6,50,6_hepatitis_ghana_hbv_hcv,"[hepatitis, ghana, hbv, hcv, viral, endemic, h...",[Hepatitis B c viral HBV HCV infection endemic...
8,7,50,7_sars_dog_dogs_qpcr,"[sars, dog, dogs, qpcr, animal, detection, tes...",[Dogs discriminate people infect SARS uninfect...
9,8,50,8_watershed_classification_attention_pipeline,"[watershed, classification, attention, pipelin...",[rapid development artificial intelligence tec...


In [17]:
topic_model_zeroshot.visualize_topics()