In [2]:
from adapters import AutoAdapterModel

# Instantiate the SPECTER2 base checkpoint as an adapter-capable model.
model = AutoAdapterModel.from_pretrained("allenai/specter2_base")

# Fetch the "allenai/specter2" adapter from the Hugging Face hub and make it
# the active adapter; whatever load_adapter returns is kept as `adapter_name`.
adapter_name = model.load_adapter(
    "allenai/specter2",
    source="hf",
    set_active=True,
)


Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 19784.45it/s]


In [None]:
from transformers import AutoTokenizer
from adapters import AutoAdapterModel
from tqdm import tqdm

# Minimal end-to-end example: embed two toy papers with SPECTER2.

# Load the tokenizer that belongs to the SPECTER2 base checkpoint.
tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')

# Load the base model with adapter support.
model = AutoAdapterModel.from_pretrained('allenai/specter2_base')

# Load the adapter for the required task, register it under the identifier
# given in `load_as`, and activate it.
model.load_adapter("allenai/specter2", source="hf", load_as="specter2", set_active=True)

# Toy input: one dict per paper with a 'title' and an (optional) 'abstract'.
# (A dangling `papers = ` statement that followed this list was a syntax error
# and has been removed.)
papers = [{'title': 'BERT', 'abstract': 'We introduce a new language representation model called BERT'},
          {'title': 'Attention is all you need', 'abstract': ' The dominant sequence transduction models are based on complex recurrent or convolutional neural networks'}]

# Concatenate title and abstract, separated by the tokenizer's separator token.
# A missing or empty abstract falls back to the empty string.
text_batch = [d['title'] + tokenizer.sep_token + (d.get('abstract') or '') for d in papers]

# Tokenize the batch: pad to the longest item, truncate to 512 tokens.
inputs = tokenizer(text_batch, padding=True, truncation=True,
                                   return_tensors="pt", return_token_type_ids=False, max_length=512)
output = model(**inputs)

# Use the hidden state of the first token of every sequence as that paper's embedding.
embeddings = output.last_hidden_state[:, 0, :]


Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 41221.66it/s]


In [None]:
import pandas as pd
from transformers import AutoTokenizer
from adapters import AutoAdapterModel
from tqdm import tqdm

# Build SPECTER2 embeddings for the gold-standard and candidate paper lists,
# then persist the embedding matrix and the row-aligned metadata.

# numpy is needed for stacking/saving; torch is needed to disable autograd
# during inference. Both are imported here so this cell is self-contained.
import numpy as np
import torch

# Load the two CSV files
gold_standard_csv = "./dummyPapers/goldStandardPapersList.csv"
all_papers_csv = "./dummyPapers/testCatalysisPapersList.csv"

gold_df = pd.read_csv(gold_standard_csv)
all_df = pd.read_csv(all_papers_csv)

# Mark gold vs. non-gold
gold_df["is_gold"] = 1
all_df["is_gold"] = 0

# Combine
combined_df = pd.concat([gold_df, all_df], ignore_index=True)

# Keep only rows that have an abstract. The title stays optional. (The former
# extra "title or abstract present" filter could never remove a row after this
# step, so it was dropped.) Resetting the index here keeps row i of the frame
# aligned with row i of the embedding matrix built below.
combined_df = combined_df.dropna(subset=["abstract"]).reset_index(drop=True)

# Concatenate title and abstract (if title is missing, use only the abstract)
combined_df["text"] = combined_df.apply(
    lambda row: (row["title"] + " [SEP] " if pd.notna(row["title"]) else "") + row["abstract"], axis=1
)

texts = combined_df["text"].tolist()
if not texts:
    # Fail early with a clear message instead of a cryptic np.vstack error later.
    raise ValueError("No papers with an abstract were found; nothing to embed.")

tokenizer = AutoTokenizer.from_pretrained("allenai/specter2_base")
model = AutoAdapterModel.from_pretrained("allenai/specter2_base")

# Load and activate the SPECTER2 classification adapter
model.load_adapter("allenai/specter2_classification", source="hf", load_as="classification", set_active=True)

# Number of texts per forward pass (1 = one paper at a time).
batch_size = 1
embeddings = []

# Inference only: no_grad() stops autograd from recording a graph for every
# forward pass, which saves memory and time without changing the outputs.
with torch.no_grad():
    # Process the input texts in batches with a progress bar
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating Embeddings", unit="batch"):
        # Prepare the batch
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True,
                           return_tensors="pt", return_token_type_ids=False, max_length=512)

        # Compute embeddings: hidden state of the first token of each sequence
        outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_embeddings)

# Combine embeddings into a single array (one row per paper)
embeddings = np.vstack(embeddings)

# The metadata is cross-referenced by row position, so the counts must match.
assert embeddings.shape[0] == len(combined_df), "embedding rows and metadata rows are out of sync"

# Save embeddings to a file for future use
np.save("specter2_embeddings.npy", embeddings)

# Save the corresponding paper metadata to a CSV, with an explicit index
# column for cross-referencing rows of the saved embedding matrix.
combined_df["embedding_index"] = np.arange(len(combined_df))
combined_df.to_csv("paper_metadata_with_embeddings.csv", index=False)

print(f"Processed {len(combined_df)} papers and saved embeddings.")

Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 21050.46it/s]
  state_dict = torch.load(weights_file, map_location="cpu")
Generating Embeddings: 100%|██████████| 57231/57231 [1:03:11<00:00, 15.09batch/s]


Processed 57231 papers and saved embeddings.


In [1]:
import numpy as np
import pandas as pd
import umap.umap_ as umap
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool

# Project the saved SPECTER2 embeddings to 2-D with UMAP and show them in an
# interactive Bokeh scatter plot, coloured by gold / non-gold status.

# Load the saved metadata and embeddings
metadata_path = "paper_metadata_with_embeddings.csv"
embeddings_path = "specter2_embeddings.npy"

metadata_df = pd.read_csv(metadata_path)
embeddings = np.load(embeddings_path)

# Row i of the metadata is assumed to describe row i of the embedding matrix;
# refuse to plot if the two files are out of sync.
if len(metadata_df) != len(embeddings):
    raise ValueError(
        f"Metadata has {len(metadata_df)} rows but embeddings has {len(embeddings)} rows"
    )

# --------------------------------------------------
# 1. UMAP for Dimensionality Reduction
# --------------------------------------------------
umap_reducer = umap.UMAP(
    n_neighbors=15,  # Number of neighbors to consider for manifold learning
    min_dist=0.1,    # Minimum distance between points
    n_components=2,  # Reduce to 2 dimensions for visualization
    random_state=42
)
embeddings_2d = umap_reducer.fit_transform(embeddings)

# Add UMAP results to the metadata DataFrame
metadata_df["x"] = embeddings_2d[:, 0]
metadata_df["y"] = embeddings_2d[:, 1]

# Human-readable label for the hover tooltip, derived from the is_gold flag.
# (The tooltip previously referenced "@color", a column this notebook never creates.)
metadata_df["category"] = metadata_df["is_gold"].map({1: "Gold", 0: "Non-Gold"})

# --------------------------------------------------
# 2. Prepare Bokeh Data Sources
# --------------------------------------------------
# Create separate data sources for gold (red) and non-gold (blue)
red_source = ColumnDataSource(metadata_df[metadata_df["is_gold"] == 1])
blue_source = ColumnDataSource(metadata_df[metadata_df["is_gold"] == 0])

# --------------------------------------------------
# 3. Create Bokeh Plot
# --------------------------------------------------
# "hover" is intentionally NOT listed in `tools`: a customised HoverTool is
# added below, and listing it here as well would register a second hover tool
# and show duplicate tooltips.
p = figure(
    title="UMAP Projection of Specter2 Embeddings",
    tools="pan,wheel_zoom,box_zoom,reset,save",
    width=800,
    height=600
)

# Glyphs added later are drawn on top, so the non-gold points go first and the
# gold-standard points stay visible above them.
# scatter() draws circle markers by default; circle(size=...) is deprecated in
# recent Bokeh releases.

# Plot blue points (non-gold) with alpha=0.5
p.scatter(
    x="x",
    y="y",
    source=blue_source,
    size=6,
    fill_alpha=0.5,
    color="blue",
    legend_label="Non-Gold (Blue)"
)

# Plot red points (gold-standard) with alpha=1
p.scatter(
    x="x",
    y="y",
    source=red_source,
    size=6,
    fill_alpha=1.0,
    color="red",
    legend_label="Gold (Red)"
)

# Add hover tool
# NOTE(review): "@oaid" assumes the input CSVs provide an `oaid` column — confirm.
hover_tool = HoverTool(
    tooltips=[
        ("OAID", "@oaid"),
        ("Title", "@title"),
        ("Category", "@category"),
    ]
)
p.add_tools(hover_tool)

# Configure legend
p.legend.location = "top_left"
p.legend.title = "Category"
p.legend.click_policy = "hide"  # Allows toggling categories

# --------------------------------------------------
# 4. Show Plot
# --------------------------------------------------
show(p)

  from .autonotebook import tqdm as notebook_tqdm
  warn(
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [45]:
# Sanity check: dimensions of the loaded embedding matrix (rows, columns).
embeddings.shape

(57231, 768)