In [1]:
import pandas as pd

df_pubmed = pd.read_csv('pubmed_part1.csv')

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load BioBERT
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-v1.1")

def get_biobert_embedding(text):
    """
    Get BioBERT embedding for a text
    
    Args:
        text (str): Input text
    Returns:
        numpy.ndarray: Embedding vector
    """
    # Tokenize text
    inputs = tokenizer(text, return_tensors="pt", max_length=512, 
                      truncation=True, padding=True)
    
    # Get embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        # Use [CLS] token embedding or mean of all tokens
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    
    return embeddings

# Replace the Word2Vec functions
def get_sentence_vector(sentence):
    """
    Get embedding vector for a sentence using BioBERT
    """
    return get_biobert_embedding(sentence)

In [3]:
from tqdm.notebook import tqdm
from tqdm import tqdm
import pandas as pd

# Add tqdm to pandas
tqdm.pandas()

def process_in_batches(texts, batch_size=32):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
        all_embeddings.extend(embeddings)
    return all_embeddings

def process_embeddings(df, batch_size=32):
    """
    Process embeddings for abstract texts in dataframe using batches
    """
    print("Generating embeddings...")
    
    # Process abstract texts in batches
    abstract_texts = df['abstract_text'].tolist()
    abstract_embeddings = process_in_batches(abstract_texts, batch_size)
    df['abstract_vector'] = abstract_embeddings
    
    return df

In [4]:
# Take sample of 10 rows from df_pubmed_1
# df_pubmed_sample = df_pubmed.sample(n=10, random_state=42)


In [5]:
process_embeddings(df_pubmed)

Generating embeddings...


Unnamed: 0,abstract_id,abstract_text,abstract_vector
9932,8825533,The effect of ranitidine on postoperative naus...,"[0.06694699, -0.025046367, 0.029512007, 0.1212..."
21459,10213349,Physical training improves exercise capacity i...,"[0.0422495, 0.022081487, -0.019242544, 0.15092..."
10146,8853730,To study the effect of HIV-1 resistance to lam...,"[0.16013415, -0.019622369, 0.08715239, 0.13335..."
16554,9572224,To test the null hypothesis of no association ...,"[0.02369017, -0.08694814, 0.023889067, 0.07609..."
3699,7802127,Three issues relevant to revising the DSM-III-...,"[0.0974562, -0.009266093, 0.04439923, -0.01585..."
19969,9892302,This study compared the effect of clozapine an...,"[0.012985589, -0.03693167, 0.028529573, 0.0491..."
17948,9702441,To evaluate the efficacy of combining electrot...,"[0.12210363, -0.15359151, 0.026292726, 0.22490..."
17605,9673476,To compare the safety and efficacy of polyacry...,"[0.12443222, -0.06251675, 0.012834996, 0.18939..."
18166,9721763,To compare intraoperative and postoperative ou...,"[0.09425821, -0.041623063, 0.047945935, 0.1804..."
2851,7659476,To compare the reactogenicity of a licensed co...,"[0.15273434, -0.090804145, 0.14593138, 0.19430..."


In [6]:
# Check length of abstract vectors
def check_length(df):
    vector_lengths = df['abstract_vector'].apply(len)
    print("\nAll vectors have same length:", all(vector_lengths == vector_lengths.iloc[0]))
    print("Vector lengths:", vector_lengths.unique())

def check_type(df):
    print("First vector type:", type(df['abstract_vector'].iloc[0]))
    print("First vector shape:", df['abstract_vector'].iloc[0].shape)

check_length(df_pubmed)
check_type(df_pubmed)


All vectors have same length: True
Vector lengths: [768]
First vector type: <class 'numpy.ndarray'>
First vector shape: (768,)


In [7]:
def save_embeddings(df, num):
    df_export = df.copy()
    df_export['abstract_vector'] = df_export['abstract_vector'].apply(lambda x: ','.join(map(str, x)))
    df_export.to_csv(f'processed_embeddings_{num}.csv', index=False)


In [8]:
save_embeddings(df_pubmed, 1)

In [None]:
save_embeddings(df_pubmed, 2)

In [None]:
save_embeddings(df_pubmed, 3)

In [None]:
save_embeddings(df_pubmed, 4)

In [None]:
# save_embeddings(df_pubmed, 5)

In [None]:
# save_embeddings(df_pubmed, 6)

In [None]:
# save_embeddings(df_pubmed, 7)

In [None]:
# save_embeddings(df_pubmed, 8)