In [29]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from openai import OpenAI
from umap import UMAP
import pandas as pd
import sys

In [6]:
# Add the scripts directory to the path
sys.path.append('../scripts')
# Own scripts
import pubmed_helpers
import text_cleaning as tc

In [2]:
df = pd.read_parquet('data/pubmed_data_200.parquet')

In [7]:
# Clean the text with Spacy
cleaned_docs = [tc.clean_text_ext_spacy(doc) for doc in df['abstract']]

In [8]:
# Add the cleaned text back in the dataframe
df['cleaned_abstract'] = cleaned_docs

In [12]:
df_applications = pd.read_excel('../data/test_records.xlsx', engine='calamine')

In [15]:
df_applications.columns

Index(['application_reference', 'project_title', 'is_grant', 'is_application',
       'call_name', 'granted_amount', 'brief_project_description',
       'project_description'],
      dtype='object')

In [16]:
cleaned_application_project_description = [tc.clean_text_ext_spacy(doc) for doc in df_applications['project_description']]

In [17]:
cleaned_application_brief_project_description = [tc.clean_text_ext_spacy(doc) for doc in df_applications['brief_project_description']]

In [23]:
df_applications['cleaned_project_description'] = cleaned_application_project_description
df_applications['cleaned_brief_project_description'] = cleaned_application_brief_project_description

In [24]:
# Write df_applications to parquet
df_applications.to_parquet('../data/test_records_cleaned.parquet')

In [43]:
sentence_model = SentenceTransformer('sentence-transformers/allenai-specter')

embeddings = sentence_model.encode(df_applications['cleaned_project_description'], show_progress_bar=True)

topic_model = BERTopic().fit(df_applications['cleaned_project_description'], embeddings)

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

In [44]:
topic_model.visualize_barchart()

In [45]:
def get_topic_representation(keywords, ollama_model="llama3.2:1b"):
    client = OpenAI(
        api_key='ollama',  
        # api_version = "2024-02-15-preview",
        base_url='http://localhost:11434/v1/',
    )
    # Create the prompt to generate a concise topic label
    prompt = f"Here is a list of keywords that represent a topic: {', '.join(keywords)}. Can you generate a concise and meaningful topic label for this? Please respond only with the topic label."

    # Generate the topic label using the GPT-4 model
    response = client.chat.completions.create(
            # model = "gpt4-nnfimpact",
            model = ollama_model,
            temperature = 0.0,
            messages = [
                    {"role": "user", "content": prompt}
            ]
        )
    return response.choices[0].message.content

In [46]:
def get_topic_labels():
    """Call the get_topic_representation function for each topic in the topic model."""
    topic_label = []
    for topic in topic_model.get_topic_info()['Representation']:
        topic_label.append(get_topic_representation(topic))
    return topic_label

In [155]:
# Use the Ollama local model to generate topic labels
topic_labels = get_topic_labels()

In [156]:
# Set the topic labels in the topic model
topic_model.set_topic_labels(topic_labels)

In [157]:
df_applications['application_reference'] = df_applications['application_reference'].astype(str)

In [55]:
# Run the visualization with the original embeddings
topic_model.visualize_documents(df_applications['application_reference'], embeddings=embeddings, custom_labels=True, )

In [39]:

reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
topic_model.visualize_documents(df_applications['cleaned_project_description'], reduced_embeddings=reduced_embeddings, custom_labels=True)

In [57]:
df_applications_large = pd.read_excel('../data/endocrinology_metabolism_2020_to_2024_and_bbm_project_grants_2018_to_2024.xlsx', engine='calamine')

In [59]:
cleaned_application_project_description = [tc.clean_text_ext_spacy(doc) for doc in df_applications_large['project_description']]
cleaned_application_brief_project_description = [tc.clean_text_ext_spacy(doc) for doc in df_applications_large['brief_project_description']]
df_applications_large['cleaned_project_description'] = cleaned_application_project_description
df_applications_large['cleaned_brief_project_description'] = cleaned_application_brief_project_description
df_applications_large.to_parquet('../data/endocrinology_metabolism_2020_to_2024_and_bbm_project_grants_2018_to_2024.parquet')
sentence_model = SentenceTransformer('sentence-transformers/allenai-specter')
embeddings = sentence_model.encode(df_applications_large['cleaned_project_description'], show_progress_bar=True)
topic_model = BERTopic().fit(df_applications_large['cleaned_project_description'], embeddings)
# Use the Ollama local model to generate topic labels
topic_labels = get_topic_labels()

Batches:   0%|          | 0/73 [00:00<?, ?it/s]

In [154]:
topic_model.set_topic_labels(topic_labels)

ValueError: Make sure that `topic_labels` contains the same number of labels as there are topics.

In [115]:
topics, probs = topic_model.fit_transform(df_applications_large['cleaned_project_description'])

In [116]:
new_topics = topic_model.reduce_outliers(df_applications_large['cleaned_project_description'], topics)
topic_model_smaller = topic_model
topic_model_smaller.update_topics(df_applications_large['cleaned_project_description'], topics=new_topics)
topic_model_smaller.get_topic_info()



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,174,0_islet_cell_insulin_beta,"[islet, cell, insulin, beta, pancreatic, secre...",[Purpose aim incapacity endocrine pancrea main...
1,1,106,1_antibiotic_infection_phage_bacteria,"[antibiotic, infection, phage, bacteria, bacte...",[abbreviation AMR antimicrobial resistance CRI...
2,2,180,2_protein_structure_peptide_bind,"[protein, structure, peptide, bind, drug, comp...",[BIOLOGICAL Autophagy lysosomal self eat proce...
3,3,98,3_muscle_exercise_skeletal_glucose,"[muscle, exercise, skeletal, glucose, insulin,...",[PURPOSE exercise essential regulate energy me...
4,4,85,4_liver_nafld_nash_hepatic,"[liver, nafld, nash, hepatic, hepatocyte, fibr...",[NAFLD nash disease correlate hepatocyte macro...
5,5,79,5_heart_cardiac_af_channel,"[heart, cardiac, af, channel, cam, cardiomyocy...",[purpose Aim heart failure critical clinical d...
6,6,74,6_pcos_follicle_woman_ovarian,"[pcos, follicle, woman, ovarian, sex, hormone,...",[PURPOSE Polycystic ovary syndrome PCOS lead c...
7,7,62,7_bone_fracture_osteoporosis_estrogen,"[bone, fracture, osteoporosis, estrogen, osteo...",[purpose project healthy bone dynamic tissue c...
8,8,62,8_ldl_cholesterol_lipoprotein_lipid,"[ldl, cholesterol, lipoprotein, lipid, aaa, hd...",[purpose examine basic molecular mechanism inv...
9,9,98,9_risk_diabetes_genetic_patient,"[risk, diabetes, genetic, patient, disease, da...",[past decade prevalence diabete develop develo...


In [114]:
topic_model.save('../data/endocrinology_metabolism_2020_to_2024_and_bbm_project_grants_2018_to_2024_model.safetensors', serialization='safetensors')

In [122]:
load_bertopic = BERTopic.load('../data/endocrinology_metabolism_2020_to_2024_and_bbm_project_grants_2018_to_2024_model.safetensors')




In [64]:
# topic_model.save('../data/endocrinology_metabolism_2020_to_2024_and_bbm_project_grants_2018_to_2024_model.safetensors', serialization='safetensors')

In [None]:
# new_topics = topic_model.reduce_outliers(cleaned_docs, topics)
# topic_model.update_topics(cleaned_docs, topics=new_topics)
# topic_model.get_topic_info()

In [70]:
df_applications_large.columns

Index(['application_reference', 'project_title', 'is_grant', 'is_application',
       'call_name', 'granted_amount', 'brief_project_description',
       'project_description', 'time_dimension_date',
       'cleaned_project_description', 'cleaned_brief_project_description'],
      dtype='object')

In [72]:
# topic_model.visualize_topics_over_time(df_applications_large['time_dimension_date'])

topics_over_time = topic_model.topics_over_time(df_applications_large['cleaned_project_description'], df_applications_large['time_dimension_date'], nr_bins=20)

In [125]:
load_bertopic.visualize_topics_over_time(topics_over_time, custom_labels=True, topics=range(10))

In [129]:
df_applications_large.columns

Index(['application_reference', 'project_title', 'is_grant', 'is_application',
       'call_name', 'granted_amount', 'brief_project_description',
       'project_description', 'time_dimension_date',
       'cleaned_project_description', 'cleaned_brief_project_description'],
      dtype='object')

In [131]:
with_topics = load_bertopic.get_document_info(df_applications_large['application_reference'].astype(str))

In [136]:
with_topics = with_topics.rename(columns={'Document': 'application_reference'})

In [138]:
df_applications_large['application_reference'] = df_applications_large['application_reference'].astype(str)

In [140]:
result = with_topics.merge(df_applications_large, on='application_reference')

In [142]:
result.to_parquet('../data/endocrinology_metabolism_2020_to_2024_and_bbm_project_grants_2018_to_2024_with_topics.parquet')

In [147]:
result[['application_reference','CustomName', 'is_grant', 'time_dimension_date', 'granted_amount', 'call_name']].to_clipboard(index=False)

In [152]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,174,0_islet_cell_insulin_beta,"[islet, cell, insulin, beta, pancreatic, secre...",[Purpose aim incapacity endocrine pancrea main...
1,1,106,1_antibiotic_infection_phage_bacteria,"[antibiotic, infection, phage, bacteria, bacte...",[abbreviation AMR antimicrobial resistance CRI...
2,2,180,2_protein_structure_peptide_bind,"[protein, structure, peptide, bind, drug, comp...",[BIOLOGICAL Autophagy lysosomal self eat proce...
3,3,98,3_muscle_exercise_skeletal_glucose,"[muscle, exercise, skeletal, glucose, insulin,...",[PURPOSE exercise essential regulate energy me...
4,4,85,4_liver_nafld_nash_hepatic,"[liver, nafld, nash, hepatic, hepatocyte, fibr...",[NAFLD nash disease correlate hepatocyte macro...
5,5,79,5_heart_cardiac_af_channel,"[heart, cardiac, af, channel, cam, cardiomyocy...",[purpose Aim heart failure critical clinical d...
6,6,74,6_pcos_follicle_woman_ovarian,"[pcos, follicle, woman, ovarian, sex, hormone,...",[PURPOSE Polycystic ovary syndrome PCOS lead c...
7,7,62,7_bone_fracture_osteoporosis_estrogen,"[bone, fracture, osteoporosis, estrogen, osteo...",[purpose project healthy bone dynamic tissue c...
8,8,62,8_ldl_cholesterol_lipoprotein_lipid,"[ldl, cholesterol, lipoprotein, lipid, aaa, hd...",[purpose examine basic molecular mechanism inv...
9,9,98,9_risk_diabetes_genetic_patient,"[risk, diabetes, genetic, patient, disease, da...",[past decade prevalence diabete develop develo...


In [84]:
topics_over_time.columns

Index(['Topic', 'Words', 'Frequency', 'Timestamp', 'Name'], dtype='object')

In [86]:
topics_over_time['Year'] = topics_over_time['Timestamp'].dt.year


In [106]:
# Frequency of topics over time
topics_over_time[['Year', 'Name', 'Frequency',]].to_clipboard(index=False)

In [101]:
topics_over_time.groupby(['Year', "Name"]).size().unstack(fill_value=0)

Unnamed: 0,Topic,Words,Frequency,Timestamp,Name,Year
0,-1,"cell, protein, project, gene, dna",56,2018-12-28 19:45:07.200,Cellular Disease Study Analysis Proteomics Res...,2018
1,0,"cell, proinsulin, insulin, crs, islet",5,2018-12-28 19:45:07.200,Islet Cell Function in Diabetes Research,2018
2,1,"muscle, exercise, skeletal, tendon, training",6,2018-12-28 19:45:07.200,Muscle Training and Nutrition for Optimal Heal...,2018
3,2,"antibiotic, biofilm, phage, bacteria, strain",14,2018-12-28 19:45:07.200,Antimicrobial Resistance in Bacterial Infectio...,2018
4,3,"liver, nafld, nash, hcc, hepatic",6,2018-12-28 19:45:07.200,Liver Disease and Metabolic Disorders,2018
...,...,...,...,...,...,...
319,37,"sortilin, cnv, rpe, retinal, amd",1,2024-08-29 03:36:00.000,Retinal Cell Signaling Pathways in AMD and CNV,2024
320,38,"sleep, circadian, hcrt, melatonin, rhythm",4,2024-08-29 03:36:00.000,Sleep Regulation and Circadian Rhythms,2024
321,39,"club, lung, airway, dirty, cell",2,2024-08-29 03:36:00.000,Respiratory Cell Biology and Disease Research,2024
322,40,"motor, neuron, striatal, error, sensory",2,2024-08-29 03:36:00.000,Neurological Circuitry in Motor Control and Ne...,2024


In [100]:
palette = "viridis"
df_topics = topics_over_time.groupby(['Year', "Name"]).size().unstack(fill_value=0)

df_topics.to_clipboard()#.plot(kind='bar', stacked=True, colormap=palette)
# topics_over_time.plot(kind='bar', stacked=True, figsize=(10, 6), colormap=palette)

In [162]:
with_topics

Unnamed: 0,application_reference,Topic,Name,CustomName,Representation,Representative_Docs,Top_n_words,Representative_document
0,51994,7,7_obesity_imp_fat_gut,Nutritional Risk Factors for Obesity and Relat...,"[obesity, imp, fat, gut, microbiota, diet, ris...",,obesity - imp - fat - gut - microbiota - diet ...,False
1,52000,13,13_pd_ad_protein_aggregation,Neurodegenerative Disease Research on Protein ...,"[pd, ad, protein, aggregation, neuron, disease...",,pd - ad - protein - aggregation - neuron - dis...,False
2,52001,-1,-1_cell_project_protein_datum,Cellular Disease Study Analysis Proteomics Res...,"[cell, project, protein, datum, study, mouse, ...",,cell - project - protein - datum - study - mou...,False
3,52003,31,31_te_ecm_fn_cell,Cell Differentiation and Oxidation,"[te, ecm, fn, cell, nend, ipscs, differentiati...",,te - ecm - fn - cell - nend - ipscs - differen...,False
4,52004,29,29_synaptic_protein_mutation_brain,Neurotransmitter Regulation in Organoids and B...,"[synaptic, protein, mutation, brain, adnp, neu...",,synaptic - protein - mutation - brain - adnp -...,False
...,...,...,...,...,...,...,...,...
2327,96466,10,10_adipocyte_adipose_insulin_lipid,Metabolic Regulation in Obesity and Adiposity ...,"[adipocyte, adipose, insulin, lipid, obesity, ...",,adipocyte - adipose - insulin - lipid - obesit...,False
2328,96470,-1,-1_cell_project_protein_datum,Cellular Disease Study Analysis Proteomics Res...,"[cell, project, protein, datum, study, mouse, ...",,cell - project - protein - datum - study - mou...,False
2329,96480,9,9_cell_antigen_immune_chemokine,Cellular Immune Response to Antigens and Chemo...,"[cell, antigen, immune, chemokine, tcr, ms, es...",,cell - antigen - immune - chemokine - tcr - ms...,False
2330,96490,30,30_skin_melanoma_tumor_cell,Molecular Pathways in Cutaneous Melanoma,"[skin, melanoma, tumor, cell, lv, psoriasis, s...",,skin - melanoma - tumor - cell - lv - psoriasi...,False


In [170]:
result.columns

Index(['application_reference', 'Topic', 'Name', 'CustomName',
       'Representation', 'Representative_Docs', 'Top_n_words',
       'Representative_document', 'project_title', 'is_grant',
       'is_application', 'call_name', 'granted_amount',
       'brief_project_description', 'project_description',
       'time_dimension_date', 'cleaned_project_description',
       'cleaned_brief_project_description'],
      dtype='object')

In [176]:
# Concate application_reference, project_title and CustomName for the visualization
result['cluster_title'] = result['application_reference']  + ' - ' + result['project_title'] + ' - ' + result['CustomName']

In [177]:
topic_model.visualize_documents(result['cluster_title'], custom_labels=True, hide_annotations=True, width=2000, height=1000)

In [178]:
fig = topic_model.visualize_documents(result['cluster_title'], custom_labels=True, hide_annotations=True, width=2000, height=1000)

In [179]:
fig.write_html('../data/demo.html')