In [4]:
# %pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu

# Windows, NVIDIA 1660 Ti, CUDA 12.1
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForTokenClassification
from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Report GPU visibility. The per-device details are only queried when CUDA is
# actually available, because torch.cuda.current_device() raises otherwise.
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}, CUDA device count: {torch.cuda.device_count()}")
if cuda_available:
    # Look up the name of the device whose id is printed, rather than always device 0.
    current_device_id = torch.cuda.current_device()
    print(f"Current deviceId: {current_device_id}, Device name: {torch.cuda.get_device_name(current_device_id)}")
else:
    print("No CUDA device detected.")

CUDA available: True, CUDA device count: 1
Current deviceId: 0, Device name: NVIDIA GeForce GTX 1660 Ti


## Load data

In [23]:
# Load the patient records, keep only the PMID and title columns, drop rows
# that repeat the same (PMID, title) pair, and renumber the index from zero.
data = (
    pd.read_json('../data/PMC-Patients.json')
    .loc[:, ['PMID', 'title']]
    .drop_duplicates(subset=['PMID', 'title'])
    .reset_index(drop=True)
)


In [24]:
# Preview the first two rows of the de-duplicated frame.
data.head(2)

Unnamed: 0,PMID,title
0,33492400,Early Physical Therapist Interventions for Pat...
1,34956745,Deranged Liver Function Tests and Liver Insult...


In [25]:
# Print the frame summary: index range, per-column non-null counts and dtypes, memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140897 entries, 0 to 140896
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   PMID    140897 non-null  int64 
 1   title   140897 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.2+ MB


## NER Extraction

In [26]:
# Hugging Face model id shared by the tokenizer and the token-classification model.
NER_MODEL_NAME = "Clinical-AI-Apollo/Medical-NER"
# Use the first GPU when CUDA is available; -1 tells the pipeline to run on the CPU,
# so the notebook still runs on machines without a GPU.
NER_DEVICE = 0 if torch.cuda.is_available() else -1

tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_NAME)
# aggregation_strategy="simple" asks the pipeline to return grouped entities
# (the rest of the notebook reads their 'word', 'entity_group' and 'score' keys).
ner_pipeline = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=NER_DEVICE,
)

Device set to use cuda:0


In [53]:
def extract_ner(title):
    """Run the NER pipeline on one title and flatten the result.

    Args:
        title: Title text to analyse. Values that are not strings (for
            example ``None`` or a missing-value marker such as NaN) and
            blank strings are treated as having no entities.

    Returns:
        A list of ``[word, entity_group, score]`` lists, one per entity
        returned by ``ner_pipeline``, with ``score`` converted to a
        built-in ``float``. An empty list for missing or blank titles.
    """
    # Skip the model call for missing/blank titles instead of handing them to the pipeline.
    if not isinstance(title, str) or not title.strip():
        return []

    ner_output = ner_pipeline(title)
    # Cast the score to a plain float: if the pipeline hands back a NumPy scalar,
    # the stringified list written to CSV could otherwise contain text such as
    # "np.float32(...)" instead of an ordinary numeric literal.
    return [
        [entity['word'], entity['entity_group'], float(entity['score'])]
        for entity in ner_output
    ]
    
def process_batch(batch_df):
    """Add an ``entities`` column to ``batch_df`` and return the frame.

    ``extract_ner`` is applied to every value of the ``title`` column. The
    new column is written onto the frame that was passed in (no copy is
    made here), and that same frame object is returned.
    """
    batch_df['entities'] = batch_df['title'].apply(extract_ner)
    return batch_df

def process_large_csv(input_json, output_csv, batch_size=1000):
    """Extract entities for every unique (PMID, title) pair and write them to CSV.

    The JSON file is loaded in full, reduced to its ``PMID`` and ``title``
    columns and de-duplicated on that pair. The rows are then handed to
    ``process_batch`` in slices of ``batch_size`` rows, and each processed
    slice is written to ``output_csv`` straight away (the first slice creates
    the file with a header row, later slices are appended).

    Args:
        input_json: Path of the JSON file to read with ``pd.read_json``.
        output_csv: Path of the CSV file to create; an existing file is
            overwritten.
        batch_size: Number of rows per batch. Must be greater than zero.

    Raises:
        ValueError: If ``batch_size`` is not greater than zero.
    """
    # Validate before the (potentially slow) load: zero would divide by zero
    # below and a negative size would silently produce no output.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    data = pd.read_json(input_json)
    data = data[['PMID', 'title']]
    data = data.drop_duplicates(subset=['PMID', 'title']).reset_index(drop=True)

    total_rows = len(data)
    if total_rows == 0:
        # Nothing to process: still create the output with its header row so
        # downstream readers find a well-formed (empty) CSV.
        data.assign(entities=[]).to_csv(output_csv, index=False, mode='w')
        return

    num_batches = (total_rows + batch_size - 1) // batch_size  # Round up division

    for batch_num, start_idx in enumerate(range(0, total_rows, batch_size)):
        end_idx = min(start_idx + batch_size, total_rows)
        # Copy the slice so process_batch adds its column to an independent frame.
        batch = data.iloc[start_idx:end_idx].copy()

        processed_batch = process_batch(batch)

        # First batch creates the file and writes the header; later batches append rows only.
        is_first_batch = batch_num == 0
        processed_batch.to_csv(
            output_csv,
            index=False,
            mode='w' if is_first_batch else 'a',
            header=is_first_batch,
        )

        print(f"Processed batch {batch_num + 1}/{num_batches} "
              f"(rows {start_idx} to {end_idx - 1})")

In [55]:
# Run the full extraction over the patient JSON and write the results to CSV.
# batch_size=10000 overrides the function's default of 1000 rows per batch.
process_large_csv(
    input_json='../data/PMC-Patients.json',
    output_csv='../data/titles_entities.csv',
    batch_size=10000
)

Processed batch 1/15 (rows 0 to 9999)
Processed batch 2/15 (rows 10000 to 19999)
Processed batch 3/15 (rows 20000 to 29999)
Processed batch 4/15 (rows 30000 to 39999)
Processed batch 5/15 (rows 40000 to 49999)
Processed batch 6/15 (rows 50000 to 59999)
Processed batch 7/15 (rows 60000 to 69999)
Processed batch 8/15 (rows 70000 to 79999)
Processed batch 9/15 (rows 80000 to 89999)
Processed batch 10/15 (rows 90000 to 99999)
Processed batch 11/15 (rows 100000 to 109999)
Processed batch 12/15 (rows 110000 to 119999)
Processed batch 13/15 (rows 120000 to 129999)
Processed batch 14/15 (rows 130000 to 139999)
Processed batch 15/15 (rows 140000 to 140896)


In [31]:
# Attempt 1 - Did not work
# tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# # model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

# ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=0)

# Function to extract medical/clinical named entities
# def extract_entities(title):
#     entities = ner_pipeline(title)
#     return entities
#     # return [entity['word'] for entity in entities if entity['entity_group'] in ['ORG', 'PER', 'LOC', 'MISC']]

In [None]:
# Extract named entities for each title and store in new column
# data['named_entities'] = data['title'].apply(lambda x: ner_pipeline(x))


In [None]:
# Collect every title (repeats included) and the set of distinct title strings,
# then report both counts.
titles = data['title'].tolist()
unique_titles = set(titles)
print(f"Total titles: {len(titles)}")
print(f"Unique titles: {len(unique_titles)}")


In [None]:
# Map each distinct title string to the raw output of ner_pipeline for it.
# Iterating over the set means a title that occurs on several rows is only
# sent to the model once; entries are added one by one as they are computed.
ner_mapping = {}
for title in unique_titles:
    ner_mapping[title] = ner_pipeline(title)



In [7]:
# Titles at row positions 20-39, as a plain list, for quick experiments.
sample_titles = data.iloc[20:40]['title'].tolist()
sample_titles

['An Extremely Rare Case of Metastatic Merkel Carcinoma of the Liver',
 'Multiple Sclerosis Masquerading as Post Septorhinoplasty Complication: A Case Report',
 'Upper Urinary Tract Urothelial Carcinoma With Squamous, Glandular, and Sarcomatoid Variants in a Horseshoe Kidney: A Novel Case Report and Literature Review',
 'Nitroglycerin as a Treatment Modality for Recurrent Isolated Paracentral Acute Middle Maculopathy: A Case Report',
 'A Case of Incidental Detection of Asymptomatic Bladder Cancer by Transvaginal Ultrasound',
 'Bradycardia Related to Remdesivir During COVID-19: Persistent or Permanent?',
 'Bradycardia Related to Remdesivir During COVID-19: Persistent or Permanent?',
 'Leptomeningeal Disease as an Initial Presenting Manifestation in Breast Cancer',
 'Rare Case of Central Pontine Myelinolysis: Etiological Dilemma',
 'COVID Booster Shots: The Need of the Hour',
 'COVID Booster Shots: The Need of the Hour',
 'COVID Booster Shots: The Need of the Hour',
 'COVID Booster Shots

In [11]:

# Spot-check the first sample title. extract_ner is reused so this check goes
# through the same flattening code as the batch run, and the result is left as
# the last expression so the cell actually displays it.
print(f"Sample title: {sample_titles[0]}\n")
flattened_output = extract_ner(sample_titles[0])
flattened_output


Sample title: An Extremely Rare Case of Metastatic Merkel Carcinoma of the Liver



TypeError: string indices must be integers, not 'str'

In [None]:
# Collect the raw pipeline output for every sample title, in order.
# A list is used (a dict has no .append) and it keeps one entry per sample,
# including samples whose title text repeats.
output = []
for title in sample_titles:
    print(f"Sample title: {title}\n")
    # Run the model once per title and store that result.
    extracted_terms = ner_pipeline(title)
    output.append(extracted_terms)
output



In [None]:
# data['named_entities'] = data['title'].apply(extract_entities)