Script to rank different embeddings based on their clustering capabilities

In [13]:
import pandas as pd

# Location of the gold-standard papers CSV
file_path = "./GoldStandardPapers/standards_v1_140125.csv"  # Replace with actual file path

# Read the CSV, discard rows lacking any field needed downstream, and build the
# "title [SEP] abstract" string that is later passed to the embedding model.
df = (
    pd.read_csv(file_path)
    .dropna(subset=["title", "abstract", "catalysis_type"])
    .assign(text=lambda frame: frame["title"] + " [SEP] " + frame["abstract"])
)

# Report which catalysis categories remain after filtering
categories = df["catalysis_type"].unique()
print(f"Unique categories: {categories}")

Unique categories: ['bio' 'electro' 'hetero' 'homo' 'organo' 'photo' 'unknown'
 'not_catalysis']


In [33]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from adapters import AutoAdapterModel
import umap.umap_ as umap
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool, TapTool, OpenURL, LabelSet
from bokeh.transform import factor_cmap

In [None]:

# Local imports: `tqdm` is used by the loop below but no earlier cell in the
# visible notebook imports it, so a fresh "Restart & Run All" would fail with a
# NameError. `torch` is needed for the no-grad context.
import torch
from tqdm import tqdm

# Load Specter2 tokenizer and base model, then activate the "allenai/specter2"
# adapter under the name "specter2".
tokenizer = AutoTokenizer.from_pretrained("allenai/specter2_base")
model = AutoAdapterModel.from_pretrained("allenai/specter2_base")
model.load_adapter("allenai/specter2", source="hf", load_as="specter2", set_active=True)
model.eval()  # inference only: make sure dropout is disabled

# Generate embeddings: one CLS vector per "title [SEP] abstract" string.
texts = df["text"].tolist()
batch_size = 1
embeddings = []

# No gradients are needed for inference; disabling autograd avoids building a
# computation graph for every batch (less memory, faster forward pass).
with torch.no_grad():
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating Embeddings", unit="batch"):
        batch = texts[i:i + batch_size]
        # return_token_type_ids=False matches the tokenizer call used by the
        # later embedding cell in this notebook.
        inputs = tokenizer(batch, padding=True, truncation=True,
                           return_tensors="pt", return_token_type_ids=False, max_length=512)
        outputs = model(**inputs)
        # Position 0 of the last hidden state is the CLS token.
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_embeddings)

# Combine all per-batch arrays into a single (n_texts, hidden_dim) array
embeddings = np.vstack(embeddings)

In [15]:
# --------------------------------------------------
# 3. Clustering
# --------------------------------------------------
# Local imports: KMeans, v_measure_score and silhouette_score are used below
# but are not imported by any cell visible in this notebook, so the cell would
# raise NameError on a fresh kernel.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, v_measure_score

# Guard against stale kernel state: `embeddings` and `df` are produced by
# different cells, and the row-wise assignments below are only meaningful when
# they describe the same set of papers.
if len(embeddings) != len(df):
    raise ValueError(
        f"embeddings has {len(embeddings)} rows but df has {len(df)} rows; "
        "re-run the embedding cell for the currently loaded data."
    )

# Example: KMeans clustering with one cluster per distinct catalysis_type
n_clusters = len(df["catalysis_type"].unique())
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(embeddings)

# Add cluster labels to the DataFrame
df["cluster"] = cluster_labels

# --------------------------------------------------
# 4. Evaluate Clustering
# --------------------------------------------------
# V-Measure compares the cluster assignment against the catalysis_type labels
v_measure = v_measure_score(df["catalysis_type"], df["cluster"])
print(f"V-Measure Score: {v_measure:.2f}")

# Optional: Silhouette Score (label-free clustering quality in embedding space)
silhouette = silhouette_score(embeddings, cluster_labels, metric="euclidean")
print(f"Silhouette Score: {silhouette:.2f}")

# --------------------------------------------------
# 5. Visualization with UMAP (optional)
# --------------------------------------------------
umap_reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
embeddings_2d = umap_reducer.fit_transform(embeddings)

# Add UMAP results to DataFrame for visualization
df["x"] = embeddings_2d[:, 0]
df["y"] = embeddings_2d[:, 1]

# --------------------------------------------------
# 6. Save Results
# --------------------------------------------------
# Save embeddings and clustering results for analysis
np.save("embeddings_specter2.npy", embeddings)
df.to_csv("clustering_results.csv", index=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


V-Measure Score: 0.23
Silhouette Score: 0.03


  warn(
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [65]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from adapters import AutoAdapterModel
import umap.umap_ as umap
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool, TapTool, OpenURL
from tqdm import tqdm

# --------------------------------------------------
# 1. Load and Prepare Data
# --------------------------------------------------
import torch  # local import: needed for the torch.no_grad() context below

file_path = './GoldStandardPapers/standards_v2_140125.csv'
df = pd.read_csv(file_path)

# Drop rows with missing abstracts or titles
df = df.dropna(subset=["abstract", "title"])

# Remove rows where `catalysis_type` is the literal string "unknown".
# `.copy()` makes the result an independent frame, so the column assignments
# below do not write into a slice of the original (SettingWithCopyWarning).
df = df[df["catalysis_type"] != "unknown"].copy()

# Concatenate title and abstract for embedding generation
df["text"] = df["title"] + " [SEP] " + df["abstract"]

# Add OpenAlex link
df["weblink"] = "http://openalex.org/works/" + df["oaid"]

# --------------------------------------------------
# 2. Generate Embeddings with SPECTER2
# --------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained("allenai/specter2_base")
model = AutoAdapterModel.from_pretrained("allenai/specter2_base")
model.load_adapter("allenai/specter2_classification", source="hf", load_as="classification", set_active=True)
model.eval()  # inference only: make sure dropout is disabled

batch_size = 1
texts = df["text"].tolist()
embeddings = []

# Inference does not need gradients; disabling autograd avoids building a
# computation graph for every batch.
with torch.no_grad():
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating Embeddings", unit="batch"):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True,
                           return_tensors="pt", return_token_type_ids=False, max_length=512)
        outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # CLS token
        embeddings.append(batch_embeddings)

# Combine embeddings into a single array (one row per text)
embeddings = np.vstack(embeddings)


Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 27191.60it/s]
  state_dict = torch.load(weights_file, map_location="cpu")
Generating Embeddings: 100%|██████████| 1951/1951 [02:10<00:00, 14.95batch/s]


In [54]:
# Sanity check: one row per embedded text, one column per embedding dimension
embeddings.shape

(1951, 768)

In [58]:
# Display the prepared DataFrame, including the derived `text` and `weblink` columns
df

Unnamed: 0,order,doi,oaid,data_source,review_check,catalysis_type,application_theme,standard_class,standard_logic,cited_by_patent,...,cit_count,primary_topic_code,primary_topic_name,all_topics,topic_matches_1,topic_matches_2,all_keywords,catalysis_concept,text,weblink
0,1,10.1039/c6nr04181g,W2472592562,Lens_catalysis_filter,False,bio,unknown,background,unknown,FALSE,...,49.0,T11048,Bacteriophages and microbial interactions,{'Bacteriophages and microbial interactions': ...,{},,"{'nanocages': 0.55919415, 'catalytic-efficienc...",Catalysis,Highly efficient enzyme encapsulation in a pro...,http://openalex.org/works/W2472592562
1,2,10.1002/anie.201606269,W2517719610,Lens_catalysis_filter,False,bio,unknown,background,unknown,FALSE,...,11.0,T12441,Porphyrin Metabolism and Disorders,"{'Porphyrin Metabolism and Disorders': 0.9984,...",{'Metalloenzymes and iron-sulfur proteins': 0....,Metalloenzymes and iron-sulfur proteins': 0.992,{'methyl-group': 0.4104057},Catalysis,The Biological Methane‐Forming Reaction: Mecha...,http://openalex.org/works/W2517719610
2,3,10.1039/c6nr06115j,W2523753282,Lens_catalysis_filter,False,bio,unknown,background,unknown,FALSE,...,63.0,T10212,Electrochemical sensors and biosensors,{'Electrochemical sensors and biosensors': 0.9...,{'Enzyme Catalysis and Immobilization': 0.9987...,"Enzyme Catalysis and Immobilization': 0.9987, ...","{'horseradish-peroxidase': 0.7020004, 'glucose...",Catalysis,"Highly active, stable and self-antimicrobial e...",http://openalex.org/works/W2523753282
3,4,10.1002/cplu.201600617,W2579595625,Lens_catalysis_filter,False,bio,unknown,background,unknown,FALSE,...,18.0,T10908,Analytical Chemistry and Chromatography,{'Analytical Chemistry and Chromatography': 0....,{'Enzyme Catalysis and Immobilization': 0.9942},Enzyme Catalysis and Immobilization': 0.9942,"{'chemoselectivity': 0.8698373, 'ingredient': ...",Catalysis,Sustainable Manufacture of a Valuable Fragranc...,http://openalex.org/works/W2579595625
4,5,10.1021/acs.accounts.6b00321,W2586486617,Lens_catalysis_filter,False,bio,unknown,background,unknown,FALSE,...,121.0,T10044,Protein Structure and Dynamics,"{'Protein Structure and Dynamics': 0.9995, 'Ph...",{},,"{'reaction-rate': 0.43159986, 'entropy-of-acti...",Catalysis,Entropy and Enzyme Catalysis [SEP] The role pl...,http://openalex.org/works/W2586486617
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3391,3392,10.1002/anie.202301239,W4320715477,uk_catalysis_hub,False,photo,unknown,soft_postive,catalyst_hub,unknown,...,22.0,T10078,Advanced Photocatalysis Techniques,"{'Advanced Photocatalysis Techniques': 0.9998,...","{'Advanced Photocatalysis Techniques': 0.9998,...","Advanced Photocatalysis Techniques': 0.9998, '...",{},,Cd/Pt Precursor Solution for Solar H<sub>2</su...,http://openalex.org/works/W4320715477
3392,3393,10.1039/d2su00082b,W4321210958,uk_catalysis_hub,False,photo,unknown,soft_postive,catalyst_hub,unknown,...,4.0,T10078,Advanced Photocatalysis Techniques,"{'Advanced Photocatalysis Techniques': 0.9998,...","{'Advanced Photocatalysis Techniques': 0.9998,...",Advanced Photocatalysis Techniques': 0.9998,{},,Highly selective CO<sub>2</sub> photoreduction...,http://openalex.org/works/W4321210958
3393,3394,10.1021/acscatal.3c00858,W4380449793,uk_catalysis_hub,False,photo,unknown,soft_postive,application_relevant,unknown,...,17.0,T10078,Advanced Photocatalysis Techniques,"{'Advanced Photocatalysis Techniques': 0.9971,...","{'Advanced Photocatalysis Techniques': 0.9971,...","Advanced Photocatalysis Techniques': 0.9971, '...",{},Catalysis,Mechanistic Study of Glucose Photoreforming ov...,http://openalex.org/works/W4380449793
3394,3395,10.1021/acs.jpcc.3c00952,W4381432868,uk_catalysis_hub,False,photo,unknown,soft_postive,application_relevant,unknown,...,0.0,T10024,TiO2 Photocatalysis and Solar Cells,{'TiO2 Photocatalysis and Solar Cells': 0.9967...,{'TiO2 Photocatalysis and Solar Cells': 0.9967...,"TiO2 Photocatalysis and Solar Cells': 0.9967, ...","{'stearic-acid': 0.78613615, 'fade': 0.48767388}",Photocatalysis,Study and Modeling of the Kinetics of the Phot...,http://openalex.org/works/W4381432868


In [68]:
from bokeh.palettes import Category20_20
from bokeh.models import ColumnDataSource, HoverTool, TapTool, OpenURL, LabelSet
from bokeh.models import CustomJS, TapTool

# --------------------------------------------------
# 3. UMAP for Dimensionality Reduction
# --------------------------------------------------
# Local import: `factor_mark` drives the marker mapping below but is not
# imported by any cell visible in this notebook (only `factor_cmap` is).
from bokeh.transform import factor_mark

umap_reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
embeddings_2d = umap_reducer.fit_transform(embeddings)

# Add UMAP results to the DataFrame
df["x"] = embeddings_2d[:, 0]
df["y"] = embeddings_2d[:, 1]

# --------------------------------------------------
# 4. Assign Colors to `application_theme`
# --------------------------------------------------
unique_themes = df["application_theme"].unique()  # Get unique themes
# Cycle through the 20-colour palette so that themes beyond the 20th still get
# a colour instead of being silently dropped by a truncated zip().
theme_to_color = {
    theme: Category20_20[idx % len(Category20_20)]
    for idx, theme in enumerate(unique_themes)
}

# Map shapes based on `cited_by_patent`. Values are normalised to upper-case
# strings first so that both the strings "TRUE"/"FALSE" and real booleans map
# correctly; anything else (e.g. "unknown", missing) falls back to a circle.
marker_shapes = ["square", "triangle", "circle"]
df["shape"] = (
    df["cited_by_patent"]
    .astype(str)
    .str.upper()
    .map({"TRUE": "square", "FALSE": "triangle"})
    .fillna("circle")
)

# --------------------------------------------------
# 5. Create Bokeh Plot
# --------------------------------------------------
# Hover and tap are added explicitly below (with tooltips / callback), so they
# are not listed here; listing them as well would create duplicate tools.
p = figure(
    title="UMAP Projection of Specter2 Embeddings (by Application Theme)",
    tools="pan,wheel_zoom,box_zoom,reset,save",
    width=1200,
    height=800
)

# Plot each `application_theme` separately with shapes
for theme, color in theme_to_color.items():
    theme_data = df[df["application_theme"] == theme]
    if theme_data.empty:
        # Happens for a missing (NaN) theme, which never compares equal to
        # itself; skip it rather than adding an empty renderer/legend entry.
        continue
    source = ColumnDataSource(data={
        "x": theme_data["x"],
        "y": theme_data["y"],
        "oaid": theme_data["oaid"],
        "title": theme_data["title"],
        "application_theme": theme_data["application_theme"],
        "cited_by_patent": theme_data["cited_by_patent"],  # referenced by the hover tooltip
        "weblink": theme_data["weblink"],
        "shape": theme_data["shape"]
    })
    p.scatter(
        x="x",
        y="y",
        source=source,
        size=10,
        fill_alpha=0.7,
        color=color,
        marker=factor_mark("shape", marker_shapes, marker_shapes),
        legend_label=str(theme)
    )

# Add hover tool
hover_tool = HoverTool()
hover_tool.tooltips = [
    ("OAID", "@oaid"),
    ("Title", "@title"),
    ("Application Theme", "@application_theme"),
    ("Cited by Patent", "@cited_by_patent"),
    ("Weblink", "@weblink")
]
p.add_tools(hover_tool)

# Add tap tool for clickable links with JavaScript.
# The callback reads the data source of the tapped glyph from `cb_data`
# instead of capturing one Python-side `source`: each theme has its own
# ColumnDataSource, so a captured source would only be right for one theme.
tap_tool = TapTool()
tap_tool.callback = CustomJS(code="""
    const source = cb_data.source;  // Data source of the glyph that was tapped
    const selected_index = source.selected.indices[0];  // Get the first selected index
    if (selected_index !== undefined) {
        const weblink = source.data["weblink"][selected_index];  // Get the weblink for the selected point
        if (weblink) {
            window.open(weblink, "_blank");  // Open in a new tab
        } else {
            console.log("No valid weblink found for the selected point.");
        }
    } else {
        console.log("No point selected.");
    }
""")
p.add_tools(tap_tool)

# --------------------------------------------------
# 6. Add Labels for Clusters
# --------------------------------------------------
# One label per theme, placed at the mean UMAP position of its points
centroids = df.groupby("application_theme")[["x", "y"]].mean().reset_index()

centroid_source = ColumnDataSource(data={
    "x": centroids["x"],
    "y": centroids["y"],
    "application_theme": centroids["application_theme"]
})

labels = LabelSet(
    x="x",
    y="y",
    text="application_theme",
    source=centroid_source,
    background_fill_color="white",
    background_fill_alpha=0.6,
    text_font_size="10pt"
)
p.add_layout(labels)

# --------------------------------------------------
# 7. Configure Legend Click Policy
# --------------------------------------------------
p.legend.location = "top_left"
p.legend.title = "Application Theme"
p.legend.click_policy = "hide"  # Allows toggling themes by clicking legend items

# --------------------------------------------------
# 8. Show Plot
# --------------------------------------------------
show(p)

  warn(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Metrics

