# Data and model prep

### Check env is correctly configured

In [2]:
import torch; print(torch.version); print(torch.version.cuda); print(torch.cuda.is_available()); print(torch.cuda.device_count())

<module 'torch.version' from '/home/teghipco/.conda/envs/llm_v100/lib/python3.8/site-packages/torch/version.py'>
12.4
True
2


### Download and save Biobert for later

In [4]:
save_dir ='/work/teghipco/LLMs/Biobert'
import os
os.chdir(save_dir)

In [5]:
from transformers import pipeline, AutoModel, AutoTokenizer
from accelerate import Accelerator

accelerator = Accelerator() # multi-gpu
model = AutoModel.from_pretrained("dmis-lab/biobert-v1.1")
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = accelerator.prepare(model) # test accelerator/multi-gpu

# Pipeline
pipe = pipeline("feature-extraction", model=model, tokenizer=tokenizer, device=accelerator.device)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [6]:
# Save locally
pipe.model.save_pretrained(save_dir)
pipe.tokenizer.save_pretrained(save_dir)

('/work/teghipco/LLMs/Biobert/tokenizer_config.json',
 '/work/teghipco/LLMs/Biobert/special_tokens_map.json',
 '/work/teghipco/LLMs/Biobert/vocab.txt',
 '/work/teghipco/LLMs/Biobert/added_tokens.json',
 '/work/teghipco/LLMs/Biobert/tokenizer.json')

### Concatenate sharded json files with publication-level data

In [17]:
import pandas as pd
import glob
import gzip
import json
import os
import re

json_directory = '/work/teghipco/LLMs/Biobert/data/pubs'
json_files = glob.glob(os.path.join(json_directory, "*.json"))
json_files = sorted(json_files, key=lambda x: int(re.search(r'(\d+)', x).group())) # sort by shard
#print(json_files)

In [None]:
from concurrent.futures import ProcessPoolExecutor
def load_json_file(file):
    try:
        print(f"Processing file: {file}") 
        dataframes = []
        with gzip.open(file, 'rt', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)
                df = pd.json_normalize(data)
                dataframes.append(df)
        return pd.concat(dataframes, ignore_index=True)
    except Exception as e:
        print(f"Error processing file {file}: {e}")
        return pd.DataFrame()  # Return an empty DataFrame on error

In [27]:
# IGNORE--TESTING
#del(result_dataframes)
#del(test_full_dataframe)

In [28]:
batch_size = 50  # Process files in batches--faster on my particular HPC setup ymmv
result_dataframes = []

for i in range(0, len(json_files), batch_size):
    batch_files = json_files[i:i + batch_size]
    #with ProcessPoolExecutor(max_workers=4) as executor:
    with ProcessPoolExecutor as executor:
        batch_dataframes = list(executor.map(load_json_file, batch_files))
        result_dataframes.append(pd.concat(batch_dataframes, ignore_index=True))

Processing file: /work/teghipco/LLMs/Biobert/data/pubs/pubWhole_export_000000000002.jsonProcessing file: /work/teghipco/LLMs/Biobert/data/pubs/pubWhole_export_000000000000.jsonProcessing file: /work/teghipco/LLMs/Biobert/data/pubs/pubWhole_export_000000000003.jsonProcessing file: /work/teghipco/LLMs/Biobert/data/pubs/pubWhole_export_000000000001.json



Processing file: /work/teghipco/LLMs/Biobert/data/pubs/pubWhole_export_000000000004.json
Processing file: /work/teghipco/LLMs/Biobert/data/pubs/pubWhole_export_000000000005.json
Processing file: /work/teghipco/LLMs/Biobert/data/pubs/pubWhole_export_000000000006.json
Processing file: /work/teghipco/LLMs/Biobert/data/pubs/pubWhole_export_000000000007.json
Processing file: /work/teghipco/LLMs/Biobert/data/pubs/pubWhole_export_000000000008.json
Processing file: /work/teghipco/LLMs/Biobert/data/pubs/pubWhole_export_000000000009.json
Processing file: /work/teghipco/LLMs/Biobert/data/pubs/pubWhole_export_000000000010.json
Processing file: /wor

In [29]:
# Concatenate into one dataframe
full_dataframe = pd.concat(result_dataframes, ignore_index=True)
print(full_dataframe.head())

   publication_id  year        date date_online  \
0  pub.1142700518  2021  2021-11-19  2021-11-19   
1  pub.1157734354  2023  2023-05-04  2023-05-04   
2  pub.1176144398  2024  2024-09-30  2024-09-30   
3  pub.1133314038  2020  2020-11-01         NaN   
4  pub.1168937635  2024  2024-02-06         NaN   

                                     title_preferred clinical_trial_ids  \
0  P11‐17: Outcomes of COVID 19 critical care pat...                      
1  The Bronte Creek Project: Outdoor Environmenta...                      
2  (Mis)recognising the symbolic violence of acad...                      
3                                  Lunar dust buster                      
4               Lessons from COVID-19: UK experience                      

  times_cited recent_citations publication_type  is_open_access  ...  \
0           0                0          article            True  ...   
1           0                0          chapter            True  ...   
2           0             

In [30]:
save_df = '/work/teghipco/LLMs/Biobert/pub_data_concat.parquet'
full_dataframe.to_parquet(save_df, index=False)

# Load prepared data and model

#### Assuming you are coming back to the notebook (new kernel)

#### Load data

In [1]:
import pandas as pd
save_df = '/work/teghipco/LLMs/Biobert/pub_data_concat.parquet'
full_dataframe = pd.read_parquet(save_df)
print(full_dataframe.head())

   publication_id  year        date date_online  \
0  pub.1142700518  2021  2021-11-19  2021-11-19   
1  pub.1157734354  2023  2023-05-04  2023-05-04   
2  pub.1176144398  2024  2024-09-30  2024-09-30   
3  pub.1133314038  2020  2020-11-01        None   
4  pub.1168937635  2024  2024-02-06        None   

                                     title_preferred clinical_trial_ids  \
0  P11‐17: Outcomes of COVID 19 critical care pat...                      
1  The Bronte Creek Project: Outdoor Environmenta...                      
2  (Mis)recognising the symbolic violence of acad...                      
3                                  Lunar dust buster                      
4               Lessons from COVID-19: UK experience                      

  times_cited recent_citations publication_type  is_open_access  ...  \
0           0                0          article            True  ...   
1           0                0          chapter            True  ...   
2           0             

#### Extract just the abstracts to save memory

In [2]:
abstracts = full_dataframe['abstract_preferred'].dropna().tolist()
del full_dataframe

### Load model

In [3]:
import torch; print(torch.version); print(torch.version.cuda); print(torch.cuda.is_available()); print(torch.cuda.device_count())

<module 'torch.version' from '/home/teghipco/.conda/envs/llm_v100/lib/python3.8/site-packages/torch/version.py'>
12.4
True
2


In [22]:
# In case this was previously run...
try:
    del(abstract_embeddings)
except NameError:
    pass # Does not exist

In [5]:
# Also in case you were messing around with models previously...(this is not the first attempt there is much data to optimize for!:) )
import gc
try:
    del model
    del tokenizer
    torch.cuda.empty_cache()  # Clear any remaining memory cache
    gc.collect()  # Run garbage collection to free up RAM as well
except NameError:
    pass # Not loaded yet

In [4]:
# Load model and tokenizer
from transformers import AutoModel, AutoTokenizer
save_dir = '/work/teghipco/LLMs/Biobert'
model = AutoModel.from_pretrained(save_dir)
tokenizer = AutoTokenizer.from_pretrained(save_dir)

# Enable data parallelism through cuda instead of accelerate (see scratch; faster)
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    model = torch.nn.DataParallel(model)
model.to('cuda')

Using 2 GPUs


DataParallel(
  (module): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

## Run model

In [8]:
import numpy as np
import os
from tqdm import tqdm
import pickle

In [12]:
# Set batch size and other parameters
batch_size = 32
max_length = 512
expected_embedding_dim = 768
checkpoint_path = "/work/teghipco/LLMs/Biobert/checkpoint.pkl"  # Path to save progress
abstract_embeddings = []

#### Load data from an existing checkpoint if you ran the main loop previously

In [13]:
start_batch = 0
if os.path.exists(checkpoint_path):
    with open(checkpoint_path, "rb") as f:
        checkpoint = pickle.load(f)
        abstract_embeddings = checkpoint["embeddings"]
        start_batch = checkpoint["abstract_index"]
    print(f"Resuming from batch {start_batch}.")

Resuming from batch 2240000.


In [14]:
print(f"Loaded {len(abstract_embeddings)} embeddings, resuming from batch {start_batch}.")

Loaded 2240000 embeddings, resuming from batch 2240000.


#### Helper function to get CLS embedding from model output

In [15]:
def get_cls_embedding(batch_texts):
    inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    inputs = {key: val.to('cuda') for key, val in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    return cls_embeddings

#### Loop over batches of abstracts and get model embeddings

In [16]:
for i in tqdm(range(start_batch, len(abstracts), batch_size), desc="Processing Batches"):
    
    if i == start_batch:
        print(f"First abstract after resuming from batch {start_batch}:")
        print(abstracts[i])  # Print the first abstract after resuming (debug)
    
    batch_texts = []

    # Chunk long abstracts or process in one go
    for abstract in abstracts[i:i + batch_size]:
        tokens = tokenizer(abstract, return_tensors="pt", padding=False, truncation=False)
        input_ids = tokens['input_ids'][0]

        if len(input_ids) > max_length:
            # Split into 512-token chunks and add to batch_texts
            for j in range(0, len(input_ids), max_length):
                chunk_input_ids = input_ids[j:j+max_length]
                # Decode each chunk to pass into the pipeline as text
                chunk_text = tokenizer.decode(chunk_input_ids, skip_special_tokens=True)
                batch_texts.append(chunk_text)
        else:
            # Short abstracts can be added directly
            batch_texts.append(abstract)

    # Extract embeddings for the batch
    batch_embeddings = get_cls_embedding(batch_texts)

    # Aggregate and store embeddings
    idx = 0
    for abstract in abstracts[i:i + batch_size]:
        tokens = tokenizer(abstract, return_tensors="pt", padding=False, truncation=False)
        input_ids = tokens['input_ids'][0]
        
        if len(input_ids) > max_length:
            # Number of chunks in this abstract
            num_chunks = len(input_ids) // max_length + (len(input_ids) % max_length > 0)
            cls_embedding = np.mean(batch_embeddings[idx:idx + num_chunks], axis=0)
            idx += num_chunks
        else:
            cls_embedding = batch_embeddings[idx]
            idx += 1

        abstract_embeddings.append(cls_embedding)

    # Save checkpoint every 10000 batches to safeguard progress
    if (i // batch_size + 1) % 10000 == 0:
        with open(checkpoint_path, "wb") as f:
            pickle.dump({"embeddings": abstract_embeddings, "abstract_index": i + batch_size}, f)
        print(f"Checkpoint saved at batch {i + batch_size}.")

# Convert embeddings to tensor after final batch
abstract_embeddings = torch.tensor(abstract_embeddings)

Processing Batches:   0%|          | 0/8055 [00:00<?, ?it/s]

First abstract after resuming from batch 2240000:
The role of healthcare and climate change in sustainable development is crucial for ensuring a healthy and prosperous future for people and the planet. Addressing the health impacts of climate change requires a multi-faceted approach that includes reducing greenhouse gas emissions to mitigate the drivers of climate change, enhancing adaptation strategies to build resilience in communities, and implementing public health measures to protect vulnerable populations. Collaboration between governments, international organizations, healthcare providers, and communities is essential in confronting the health challenges posed by climate change. Healthcare and climate change are interconnected and play pivotal roles in sustainable development. A holistic approach that considers the health impacts of climate change promotes sustainable healthcare practices, and addresses health disparities is vital for achieving a sustainable and resilient future

Processing Batches: 100%|██████████| 8055/8055 [36:09<00:00,  3.71it/s] 
  abstract_embeddings = torch.tensor(abstract_embeddings)


In [17]:
import numpy as np
embed = np.array(abstract_embeddings)
save_path = "/work/teghipco/LLMs/Biobert/abstract_embeddings.npy"
np.save(save_path, embed)

In [18]:
embed2 = np.load(save_path, allow_pickle=True)
len(embed2)

2497751

In [19]:
len(abstracts)

2497751

# Scratch

### Slower pipeline alternative

In [5]:
#from transformers import pipeline, AutoModel, AutoTokenizer
#from accelerate import Accelerator
#save_dir ='/work/teghipco/LLMs/Biobert'  # Replace with your actual save path

#accelerator = Accelerator()
#model = AutoModel.from_pretrained(save_dir)
#tokenizer = AutoTokenizer.from_pretrained(save_dir)
#model = accelerator.prepare(model)
#pipe = pipeline("feature-extraction", model=model, tokenizer=tokenizer, device=accelerator.device)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
test_text = "This is a test input to verify the pipeline is functioning correctly."
features = pipe(test_text)
print("Number of tokens:", len(features[0]))
print("Embedding vector size for each token:", len(features[0][0]))
print("First token embedding:", features[0][0])  # Embedding for [CLS] token