In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter

In [2]:
from bertopic import BERTopic
from sklearn.mixture import BayesianGaussianMixture
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pickle
from typing import List, Dict, Any, Tuple


In [3]:
import colorcet
import glob
from tqdm.notebook import tqdm
import json

In [4]:
# Text rendering setup: reset every rcParam to the library default, then switch
# text rendering to LaTeX with a sans-serif font stack.
# Note: text.usetex=True needs a working LaTeX installation at draw time.
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({
    'text.usetex': True,
    'font.family': 'sans-serif',
    'font.sans-serif': ['DejaVu Sans', 'Liberation Sans', 'Bitstream Vera Sans', 'sans-serif'],
})

In [5]:
# Load the precomputed document-summary embeddings (document id -> embedding).
embedding_file = "../data/doc_summary_embedding_dict.pkl"

# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only point embedding_file at a file you produced yourself.
# The values are stacked into a numeric array further down, so they are
# embedding vectors rather than dicts.
# NOTE(review): exact element type (list vs. ndarray) is not visible here -- confirm.
with open(embedding_file, "rb") as embedding_fh:
    doc2embedding: Dict[str, Any] = pickle.load(embedding_fh)

In [6]:
# Stack the per-document embeddings into one array, one row per document, in
# the iteration order of doc2embedding.
# (np.ndarray is the array type; np.array is only the factory function.)
embeddings: np.ndarray = np.array(list(doc2embedding.values()))

In [7]:
# Document ids in the iteration order of doc2embedding, i.e. aligned with the
# values taken from the same dict.
document_indexes: List[str] = list(doc2embedding.keys())

In [8]:
# Read article metadata (title, TL;DR, semantic tags) from the JSON files
text_path = '../data/json_files/' 

# every *.json file directly inside text_path
json_files: List[str] = glob.glob(text_path + '*.json')


def load_article_metadata(json_file: str) -> Tuple[str, str, str, str]:
    """Read the fields used in this notebook from one article JSON file.

    Args:
        json_file: '/'-separated path of the JSON file.

    Returns:
        A tuple ``(doc_ID, revised_title, tldr, tags)`` where ``doc_ID`` is the
        file name up to its first dot and ``tags`` is the ``semantic_tags``
        list joined with single spaces.

    Raises:
        KeyError: if ``revised_title``, ``tldr`` or ``semantic_tags`` is
            missing from the file.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # so non-ASCII titles are read the same way on every machine.
    with open(json_file, 'r', encoding='utf-8') as f:
        article_json = json.load(f)

    doc_ID = json_file.split('/')[-1].split('.')[0]
    return (
        doc_ID,
        article_json['revised_title'],
        article_json['tldr'],
        ' '.join(article_json['semantic_tags']),
    )


# Plain dicts: an unknown document id must raise KeyError rather than be
# silently created with an empty value.
doc_titles_dict: Dict[str, str] = {}
doc_tldr_dict: Dict[str, str] = {}
# the tags are stored as one space-joined string per document
doc_topics_dict: Dict[str, str] = {}

# read all json files
for file in json_files:
    doc_ID, title, tldr, tags = load_article_metadata(file)

    doc_titles_dict[doc_ID] = title
    doc_tldr_dict[doc_ID] = tldr
    doc_topics_dict[doc_ID] = tags

In [9]:
# Titles and tag strings in the iteration order of doc2embedding, so that
# position k in both lists refers to the same document as embedding row k.
doc_titles = [doc_titles_dict[doc_id] for doc_id in doc2embedding]
doc_topics = [doc_topics_dict[doc_id] for doc_id in doc2embedding]

In [10]:
# Project the document embeddings to two dimensions for plotting.
# The fixed random_state keeps the layout repeatable; verbose prints progress.
tsne = TSNE(
    n_components=2,
    perplexity=100,
    random_state=42,
    verbose=True,
)
tsne_embeddings = tsne.fit_transform(embeddings)

[t-SNE] Computing 301 nearest neighbors...
[t-SNE] Indexed 47558 samples in 0.007s...
[t-SNE] Computed neighbors for 47558 samples in 36.473s...
[t-SNE] Computed conditional probabilities for sample 1000 / 47558
[t-SNE] Computed conditional probabilities for sample 2000 / 47558
[t-SNE] Computed conditional probabilities for sample 3000 / 47558
[t-SNE] Computed conditional probabilities for sample 4000 / 47558
[t-SNE] Computed conditional probabilities for sample 5000 / 47558
[t-SNE] Computed conditional probabilities for sample 6000 / 47558
[t-SNE] Computed conditional probabilities for sample 7000 / 47558
[t-SNE] Computed conditional probabilities for sample 8000 / 47558
[t-SNE] Computed conditional probabilities for sample 9000 / 47558
[t-SNE] Computed conditional probabilities for sample 10000 / 47558
[t-SNE] Computed conditional probabilities for sample 11000 / 47558
[t-SNE] Computed conditional probabilities for sample 12000 / 47558
[t-SNE] Computed conditional probabilities for s

In [17]:
class GMMWrapper(BayesianGaussianMixture): # or regular GaussianMixture
    """Bayesian Gaussian mixture that records its cluster assignment in ``labels_``.

    ``fit`` clusters the data and stores one label per sample on the estimator.
    This notebook passes an instance to ``BERTopic(hdbscan_model=...)``.
    """

    def fit(self, *args, **kwargs):
        """Fit the mixture and store compact integer cluster labels.

        Args:
            *args, **kwargs: forwarded unchanged to ``fit_predict``.

        Returns:
            self, with ``labels_`` set to an integer array in which the
            component ids that actually occur are renumbered ``0..k-1``
            (in ascending order of the original ids).
        """
        clusters = self.fit_predict(*args, **kwargs)

        # Components that received no samples leave gaps in the predicted ids.
        # return_inverse yields, for every sample, the position of its id among
        # the sorted ids in use -- i.e. gap-free labels -- and keeps an integer
        # dtype (indexing a float lookup table would produce float labels).
        _, compact_labels = np.unique(clusters, return_inverse=True)

        # save it in labels attribute
        self.labels_ = compact_labels
        return self



# Topic model whose clustering step is the Gaussian-mixture wrapper.
# 300 is only an upper bound on the number of clusters: unused mixture
# components are dropped by GMMWrapper.fit.
topic_model_gmm = BERTopic(
    verbose=True,
    # Seed the mixture fit (same seed as the t-SNE projection) so that the
    # clustering is repeatable for a given reduced embedding.
    # NOTE(review): BERTopic's own dimensionality-reduction step is presumably
    # still unseeded, so results can still differ between runs -- confirm.
    hdbscan_model=GMMWrapper(n_components=300, random_state=42),
)

# Fit on the space-joined semantic tags, reusing the precomputed embeddings
# instead of letting BERTopic embed the texts itself.
topics, probs = topic_model_gmm.fit_transform(
    doc_topics,
    embeddings,
)

2024-11-19 21:01:29,621 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


2024-11-19 21:01:55,357 - BERTopic - Dimensionality - Completed ✓
2024-11-19 21:01:55,358 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-19 21:07:03,167 - BERTopic - Cluster - Completed ✓
2024-11-19 21:07:03,179 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-19 21:07:03,728 - BERTopic - Representation - Completed ✓


In [18]:
# Tabulate the fitted topics: one row per topic with its id, document count,
# name, top terms and representative documents.
topic_model_gmm.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,1091,0_selection_population_evolutionary_genetics,"[selection, population, evolutionary, genetics...",['Biogeography' 'Population genetics' 'Morphol...
1,1,804,1_biographies_engineering_obituaries_history,"[biographies, engineering, obituaries, history...",[Biographies History of Science Science and So...
2,2,770,2_flow_fluid_layer_dynamics,"[flow, fluid, layer, dynamics, boundary, vorte...",[Fluid Dynamics Vortex Flow Boundary Layer Flo...
3,3,754,3_paleontology_marine_zoology_coral,"[paleontology, marine, zoology, coral, taxonom...",[Zoology Sponge biology Taxonomy Animal classi...
4,4,728,4_relativity_general_gravitational_field,"[relativity, general, gravitational, field, qu...",[Quantum Field Theory Electromagnetism Particl...
...,...,...,...,...,...
182,182,23,182_acoustic_nondestructive_characterization_r...,"[acoustic, nondestructive, characterization, r...",[Scanning electron acoustic microscopy Imaging...
183,183,23,183_finance_economic_portfolio_pricing,"[finance, economic, portfolio, pricing, option...",[Finance Theory Mathematical Models Financial ...
184,184,14,184_airsea_interaction_layer_boundary,"[airsea, interaction, layer, boundary, oceanog...",[Air-Sea Interaction Boundary Layer Meteorolog...
185,185,10,185_pigmentation_skin_racial_tyrosinase,"[pigmentation, skin, racial, tyrosinase, coat,...",[Human Genetics Skin Pigmentation Racial Chara...


In [19]:
# # Reduce outliers
# topics = topic_model.reduce_outliers(doc_topics, topics)
# topic_model.update_topics(doc_titles, topics=topics)

# topic_set = set(topics)

# color_set = sns.color_palette(
#     colorcet.glasbey,
#     n_colors=len(topic_set)
# )

# topic2color = {
#     topic: color
#     for topic, color in zip(topic_set, color_set)
# }

# topic2color[-1] = (0.6, 0.6, 0.6)

# topic_names = topic_model.get_topic_info()['Name'].values

# # topic2name = {
# #     int(t_name[:t_name.find('_')]): t_name
# #     for t_name in topic_names
# # }

In [38]:
# Mapping topic id -> list of (term, weight) pairs.
# NOTE(review): despite the variable name, these are the top terms per topic,
# not embedding vectors.
topic_embeddings = topic_model_gmm.get_topics()
topic_embeddings

{0: [('selection', 0.045419305374342474),
  ('population', 0.0431430886519868),
  ('evolutionary', 0.03861937388797094),
  ('genetics', 0.03293301157111295),
  ('behavior', 0.029284020697224165),
  ('sexual', 0.028656156650525972),
  ('ecology', 0.025654129235928955),
  ('biology', 0.021983525034910523),
  ('competition', 0.019201526829181043),
  ('species', 0.018808787186836005)],
 1: [('biographies', 0.06546370440841473),
  ('engineering', 0.0316387512060673),
  ('obituaries', 0.03031623462624815),
  ('history', 0.028141066036226382),
  ('scientific', 0.027160608180978475),
  ('education', 0.02608903704680164),
  ('society', 0.024293135698882973),
  ('royal', 0.023429844538920313),
  ('war', 0.022080715787449134),
  ('world', 0.02201424915384716)],
 2: [('flow', 0.07319254856149228),
  ('fluid', 0.07196182983185481),
  ('layer', 0.059580597626644104),
  ('dynamics', 0.05673076935101542),
  ('boundary', 0.051899563530878226),
  ('vortex', 0.051319151288773075),
  ('aerodynamics', 0.03

In [20]:
# Build the topic hierarchy from the same texts the model was fitted on (the
# space-joined semantic tags in doc_topics), so the documents match the fitted
# vocabulary and topic assignments. doc_titles are only display labels and are
# not what fit_transform saw.
hierarchical_topics = topic_model_gmm.hierarchical_topics(doc_topics)

  0%|          | 0/186 [00:00<?, ?it/s]

100%|██████████| 186/186 [00:00<00:00, 353.98it/s]


In [21]:
# article_titles = [
#     doc2json[doc]["article_id"] + ': ' + doc2json[doc]["revised_title"] + '. Keyterms: ' + ', '.join(doc2json[doc]["topics"])
#     for doc in document_indexes
# ]

# Plot the documents at their precomputed 2-D t-SNE coordinates.
# NOTE(review): doc_titles is passed as the document text, presumably so that
# titles are what the plot displays per point -- confirm.
topic_model_gmm.visualize_documents(doc_titles, reduced_embeddings=tsne_embeddings)

In [22]:
# Render the topic hierarchy from the precomputed hierarchical_topics table.
topic_model_gmm.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [24]:
# Bar charts of the 10 highest-weighted terms for every topic. The topic count
# is taken from the fitted model instead of a hard-coded number, because it
# changes from run to run (unused mixture components are dropped).
topic_model_gmm.visualize_barchart(
    top_n_topics=len(topic_model_gmm.get_topics()),
    n_words=10,
)

In [26]:
# Document id -> assigned topic; document_indexes and topics are paired by position.
doc2topic = dict(zip(document_indexes, topics))

In [29]:
# Spot check: topic assigned to one specific article id.
doc2topic['rstb_1994_0126']

117

In [78]:
# topic_model.get_document_info(doc_titles)

In [28]:
# Heatmap over the fitted topics.
# NOTE(review): presumably a topic-to-topic similarity matrix -- confirm against the BERTopic docs.
topic_model_gmm.visualize_heatmap()

In [30]:
# Print title, TL;DR and tags of every article assigned to topic 117, the
# topic of the article spot-checked above.
for doc, topic in zip(document_indexes, topics):
    if topic != 117:
        continue

    print(f"Title: {doc_titles_dict[doc]}")
    print(f"TL;DR: {doc_tldr_dict[doc]}")
    print(f"Topics: {doc_topics_dict[doc]}")
    print("\n")

Title: Edge Pitch Perception in Harmonic Complex Sounds
TL;DR: The authors conducted experiments to study how the human auditory system processes complex sounds and perceives pitch related to the spectral edges of broadband signals. They found that subjects could accurately adjust the frequency of a comparison tone to match the edge pitch in harmonic complexes. The accuracy of these matches was consistent across different fundamental frequencies and upper-edge frequencies of the complex sounds. This suggests that the perception of edge pitch in harmonic signals is related to the temporal resolution of the hearing system, which depends on the time constants of basilar-membrane filters and neuronal processes.
Topics: Auditory System Pitch Perception Spectral Edges Complex Sounds Harmonic Complexes


Title: Investigating Electrical Responses of the Cochlea and Auditory Tract to Phase Reversal in Musical Tones
TL;DR: The authors investigated the electrical responses in the cochlea and audi

In [106]:
from bertopic import BERTopic
from sklearn.mixture import BayesianGaussianMixture
import numpy as np
import pandas as pd

# NOTE(review): GMMWrapper is also defined in an earlier cell of this notebook;
# this later definition shadows it when the cells run top to bottom.
class GMMWrapper(BayesianGaussianMixture): # or regular GaussianMixture
    """Bayesian Gaussian mixture that records its cluster assignment in ``labels_``.

    ``fit`` clusters the data and stores one label per sample on the estimator.
    This notebook passes an instance to ``BERTopic(hdbscan_model=...)``.
    """

    def fit(self, *args, **kwargs):
        """Fit the mixture and store compact integer cluster labels.

        Args:
            *args, **kwargs: forwarded unchanged to ``fit_predict``.

        Returns:
            self, with ``labels_`` set to an integer array in which the
            component ids that actually occur are renumbered ``0..k-1``
            (in ascending order of the original ids).
        """
        clusters = self.fit_predict(*args, **kwargs)

        # Components that received no samples leave gaps in the predicted ids.
        # return_inverse yields, for every sample, the position of its id among
        # the sorted ids in use -- i.e. gap-free labels -- and keeps an integer
        # dtype (indexing a float lookup table would produce float labels).
        _, compact_labels = np.unique(clusters, return_inverse=True)

        # save it in labels attribute
        self.labels_ = compact_labels
        return self



# Coarser topic model: same pipeline with at most 20 mixture components.
topic_model_gmm = BERTopic(
    verbose=True,
    # Seed the mixture fit so that the clustering is repeatable for a given
    # reduced embedding.
    # NOTE(review): BERTopic's own dimensionality-reduction step is presumably
    # still unseeded, so results can still differ between runs -- confirm.
    hdbscan_model=GMMWrapper(n_components=20, random_state=42),
)

# Fit on the space-joined semantic tags, reusing the precomputed embeddings
# instead of letting BERTopic embed the texts itself.
topics, probs = topic_model_gmm.fit_transform(
    doc_topics,
    embeddings,
)

2024-11-19 20:33:29,742 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-19 20:34:13,014 - BERTopic - Dimensionality - Completed ✓
2024-11-19 20:34:13,016 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-19 20:34:51,972 - BERTopic - Cluster - Completed ✓
2024-11-19 20:34:52,000 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-19 20:34:52,460 - BERTopic - Representation - Completed ✓


In [109]:
# Tabulate the topics of the coarser model: one row per topic with its id,
# document count, name, top terms and representative documents.
topic_model_gmm.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,4368,0_chemistry_chemical_reactions_gas,"[chemistry, chemical, reactions, gas, kinetics...",[Organic Chemistry Chemical Kinetics Reaction ...
1,1,4351,1_astronomy_solar_optics_celestial,"[astronomy, solar, optics, celestial, light, o...",[Astronomy Lunar Theory Celestial Mechanics Hi...
2,2,4102,2_plant_biology_protein_cell,"[plant, biology, protein, cell, enzyme, microb...",[Plant Biology Photosynthesis Carbon Cycle Nit...
3,3,2878,3_history_scientific_science_society,"[history, scientific, science, society, biogra...",[History of Science Royal Society Bibliography...
4,4,2675,4_crystallography_materials_crystal_science,"[crystallography, materials, crystal, science,...",[Metallurgy Material Science Stress Analysis C...
5,5,2571,5_spectroscopy_atomic_physics_molecular,"[spectroscopy, atomic, physics, molecular, qua...",[Photoelectron Spectroscopy Molecular Spectros...
6,6,2488,6_electrical_electrochemistry_electricity_cond...,"[electrical, electrochemistry, electricity, co...",[Electrical Discharges Plasma Physics Gas Disc...
7,7,2405,7_meteorology_magnetic_geomagnetism_atmospheric,"[meteorology, magnetic, geomagnetism, atmosphe...",[Meteorology Tides Atmospheric Pressure Weathe...
8,8,2351,8_population_genetics_biology_behavior,"[population, genetics, biology, behavior, sele...",[Evolutionary Biology Population Genetics Unit...
9,9,2348,9_paleontology_anatomy_biology_fossil,"[paleontology, anatomy, biology, fossil, evolu...",[Vertebrate anatomy Skull morphology Comparati...
