In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import spacy
import re
from typing import List, Dict, Tuple
from collections import defaultdict
import json
import icd_codes
import importlib



  from .autonotebook import tqdm as notebook_tqdm


### Similarity based mapping

In [45]:
class ICD10Mapper:
    def __init__(self, icd10_file_path: str):
        """
        Initialize the ICD-10 code mapper
        
        Args:
            icd10_file_path: Path to the ICD-10 codes file
        """
        # Load models
        self.clinical_bert_name = "emilyalsentzer/Bio_ClinicalBERT"
        self.tokenizer = AutoTokenizer.from_pretrained(self.clinical_bert_name)
        self.clinical_bert = AutoModel.from_pretrained(self.clinical_bert_name)

        self.nlp = spacy.load("en_core_sci_md")
        
        # Model parameters
        self.max_length = 512
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.clinical_bert.to(self.device)
        
        # Load and process ICD-10 codes
        self.icd10_codes = self._load_icd10_codes(icd10_file_path)
        self.target_codes = icd_codes.target_code_list
        self.code_keywords = self._initialize_code_keywords()
        
        # Cache for embeddings
        self.code_embeddings = self._precompute_code_embeddings()

    def _load_icd10_codes(self, file_path: str) -> Dict[str, str]:
        """Load and parse ICD-10 codes from file"""
        codes_dict = {}
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                #TO DO  Assuming format: "CODE DESCRIPTION"
                code, description = line.strip().split(' ', 1)
                code = code.strip()  # Remove leading/trailing spaces from code
                description = description.strip()  # Remove leading/trailing spaces from description
                codes_dict[code] = description
        return codes_dict

    

    def _initialize_code_keywords(self) -> Dict[str, List[str]]:
        """Initialize keyword mappings for each target ICD-10 code"""
        return {
            'I10': ['hypertension', 'high blood pressure', 'elevated bp', 'htn'],
            'I25.10': ['coronary artery disease', 'cad', 'coronary heart disease', 'chd'],
            'I50.9': ['heart failure', 'chf', 'cardiac failure', 'congestive heart'],
            'I48.91': ['atrial fibrillation', 'afib', 'a-fib'],
            'J44.9': ['copd', 'chronic obstructive pulmonary disease'],
            'J45.909': ['asthma', 'reactive airway'],
            'J84.10': ['pulmonary fibrosis', 'lung fibrosis'],
            'E10.9': ['type 1 diabetes', 't1dm', 'type i diabetes'],
            'E11.9': ['type 2 diabetes', 't2dm', 'type ii diabetes'],
            'E03.9': ['hypothyroidism', 'underactive thyroid'],
            'E78.5': ['hyperlipidemia', 'high cholesterol', 'dyslipidemia'],
            'E66.9': ['obesity', 'overweight', 'high bmi']
        }

    def _precompute_code_embeddings(self) -> Dict[str, torch.Tensor]:
        """Precompute embeddings for all target ICD-10 code descriptions"""
        embeddings = {}
        
        for code in self.target_codes:
            description = self.icd10_codes[code]
            embeddings[code] = self._get_bert_embeddings(description)
        return embeddings

    def _get_bert_embeddings(self, text: str) -> torch.Tensor:
        """Get BERT embeddings for input text"""
        inputs = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        ).to(self.device)
        
        with torch.no_grad():
            outputs = self.clinical_bert(**inputs)
            embeddings = outputs.last_hidden_state[:, 0, :]  # [CLS] token embedding
        
        return embeddings


    def calculate_similarity_scores(self, note_embedding: torch.Tensor, text: str) -> Dict[str, List[Tuple[str, float]]]:
        """Calculate similarity scores for all target codes"""
        results = defaultdict(list)
        
        for category, codes in self.target_codes.items():
            for code, description in codes.items():
                # BERT similarity
                bert_sim = torch.cosine_similarity(
                    note_embedding,
                    self.code_embeddings[code],
                    dim=1
                ).item()
                
                
                # Keyword matching
                keyword_match = any(kw in text for kw in self.code_keywords[code])
                
                # Combined score
                score = bert_sim 
                if keyword_match:
                    score += 0.2
                
                if score > 0.5:  # Confidence threshold
                    results[category].append((code, score))
        
        return results

    def map_clinical_note(self, clinical_note: str) -> Dict[str, List[Dict[str, any]]]:
        """
        Map clinical note to ICD-10 codes
        
        Returns:
            Dictionary with categories and their matched codes, scores, and descriptions
        """
        # Preprocess the note
        processed_text = self.preprocess_text(clinical_note)
        
        # Get BERT embeddings for the note
        note_embedding = self._get_bert_embeddings(processed_text)
        
        # Calculate similarities
        similarities = self.calculate_similarity_scores(note_embedding, processed_text)
        
        # Format results
        results = {}
        for category, matches in similarities.items():
            results[category] = [
                {
                    'code': code,
                    'description': self.target_codes[category][code],
                    'confidence': round(score, 3)
                }
                for code, score in sorted(matches, key=lambda x: x[1], reverse=True)
            ]
        return results

In [46]:
mapper = ICD10Mapper('icd10cm-codes-April-2025.txt')


KeyError: 'I25.10'

In [47]:
mapper.icd10_codes['I25.10']

KeyError: 'I25.10'

In [None]:



# Test cases
test_notes = [
    """
    Patient presents with shortness of breath and chest pain.
    History of hypertension and type 2 diabetes mellitus.
    Current blood pressure reading 165/95.
    """,
    
    """
    Follow-up visit for COPD exacerbation.
    Patient also has atrial fibrillation and is on anticoagulation.
    Reports increased wheezing and using rescue inhaler more frequently.
    """
]

# Process test cases
for i, note in enumerate(test_notes, 1):
    print(f"\nTest Case {i}:")
    print("Clinical Note:")
    print(note.strip())
    print("\nMapped ICD-10 Codes:")
    
    results = mapper.map_clinical_note(note)
    for category, codes in results.items():
        print(f"\n{category.title()}:")
        for match in codes:
            print(f"- {match['code']}: {match['description']}")
            print(f"  Confidence: {match['confidence']}")

### load the dataset

In [1]:
import pandas as pd
raw_df = pd.read_csv('pmc_patients.csv')
# print(df.head())
raw_df.info()
df = raw_df.sample(frac=1, random_state=42).reset_index(drop=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167034 entries, 0 to 167033
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   patient_id         167034 non-null  int64 
 1   patient_uid        167034 non-null  object
 2   PMID               167034 non-null  int64 
 3   file_path          167034 non-null  object
 4   title              167034 non-null  object
 5   patient            167034 non-null  object
 6   age                167034 non-null  object
 7   gender             167034 non-null  object
 8   relevant_articles  167034 non-null  object
 9   similar_patients   167034 non-null  object
dtypes: int64(2), object(8)
memory usage: 12.7+ MB


In [18]:
temp_df = df[:50].copy(deep=True)


In [None]:
import time
from tqdm import tqdm
tqdm.pandas()
from llm_utils import extract_codes_from_response
start_time = time.time()
temp_df['icd10_codes'] = None  
temp_df['raw_llm_response'] = temp_df['patient'].progress_apply(analyze_clinical_note)
temp_df['json_llm'] = temp_df['raw_llm_response'].progress_apply(extract_codes_from_response)
temp_df['icd10_codes'] = temp_df['json_llm'].progress_apply(lambda x: x[0])
temp_df['json_llm'] = temp_df['json_llm'].progress_apply(lambda x: x[1])
temp_df['summary'] = temp_df['patient'].progress_apply(summarize_clinical_note)
temp_df['raw_llm_response_on_summary'] = temp_df['summary'].progress_apply(analyze_clinical_note)
temp_df['json_llm_on_summary'] = temp_df['raw_llm_response_on_summary'].progress_apply(extract_codes_from_response)
temp_df['icd10_codes_on_summary'] = temp_df['json_llm_on_summary'].progress_apply(lambda x: x[0])
temp_df['json_llm_on_summary'] = temp_df['json_llm_on_summary'].progress_apply(lambda x: x[1])

# temp_df['icd10_codes_title'] = temp_df['title'].progress_apply(analyze_clinical_note)
# temp_df['json_llm_title'] = temp_df['icd10_codes_title'].progress_apply(extract_codes_from_response)
# temp_df['icd10_codes_title'] = temp_df['icd10_codes_title'].progress_apply(lambda x: x[0])
# temp_df['json_llm_title'] = temp_df['json_llm_title'].progress_apply(lambda x: x[1])

print(temp_df[['patient', 'icd10_codes']].head())
end_time = time.time()
print(f"Time taken for {len(temp_df)} rows: {end_time - start_time} seconds")

In [22]:
temp_df['icd10_codes_compare'] = temp_df.apply(lambda row: set(row['icd10_codes']).intersection(set(row['icd10_codes_on_summary'])), axis=1)
temp_df.to_clipboard()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Train a Clinical Bert Model 

In [23]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

In [50]:
temp_df.iloc[0]['icd10_codes']

{'', 'B95.62', 'P23.4', 'P74.1'}

### Generate Labels 

In [28]:
from icd_codes import target_code_list
def generate_labels(detected_codes: set, target_code_list: list) -> list:
    """
    Generate binary labels for ICD-10 codes based on detected codes.

    Args:
        detected_codes (set): A set of detected ICD-10 codes.
        all_codes (list): A list of all possible ICD-10 codes.

    Returns:
        list: A binary list indicating the presence (1) or absence (0) of each code.
    """
    return [1 if code in detected_codes else 0 for code in target_code_list]

print(target_code_list, len(target_code_list))
temp_df['labels'] = temp_df['icd10_codes'].progress_apply(lambda x: generate_labels(x, target_code_list))
temp_df['target_codes_detected'] = temp_df['icd10_codes'].apply(lambda x: set(x) & set(target_code_list))

target_code_prefixes = {code[:3] for code in target_code_list}

# Update the target_codes_detected to include common codes based on the first three characters
temp_df['target_codes_detected_uptil_3'] = temp_df['icd10_codes'].apply(
    lambda x: set(code[:3] for code in x) & target_code_prefixes
)
temp_df.head()

['I10', 'I25.10', 'I50.9', 'I48.91', 'J44.9', 'J45.909', 'J84.10', 'E10.9', 'E11.9', 'E03.9', 'E78.5', 'E66.9'] 12


100%|██████████| 50/50 [00:00<00:00, 38130.04it/s]


Unnamed: 0,patient_id,patient_uid,PMID,file_path,title,patient,age,gender,relevant_articles,similar_patients,icd10_codes,raw_llm_response,json_llm,summary,raw_llm_response_on_summary,json_llm_on_summary,icd10_codes_on_summary,icd10_codes_compare,labels,target_codes_detected
0,8752,6532217-4,31117962,comm/PMC006xxxxxx/PMC6532217.xml,Clinical diagnosis and mutation analysis of fo...,Patient 4 was an 8-month-old girl born at full...,"[[8.0, 'month']]",F,"{'27056292': 1, '32093054': 1, '27311541': 1, ...","{'6532217-1': 2, '6532217-2': 2, '6532217-3': 2}","{, E71.510}",identified_codes=[ICD10Code(primary_code='E71....,"[{'primary_code': 'E71.510', 'secondary_code':...",summary='8-month-old girl with developmental d...,identified_codes=[ICD10Code(primary_code='E71....,"[{'primary_code': 'E71.39', 'secondary_code': ...","{, E71.39}",{},"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",{}
1,48758,8072118-1,33912440,comm/PMC008xxxxxx/PMC8072118.xml,Case Report: Next-Generation Sequencing Reveal...,A 63-year-old female patient was presented to ...,"[[63.0, 'year']]",F,"{'25844674': 1, '30231931': 1, '27621679': 1, ...",{},"{, C79.31, C78.00, C73}",identified_codes=[ICD10Code(primary_code='C73'...,"[{'primary_code': 'C73', 'secondary_code': 'C7...","summary=""The patient presented with dizziness ...",identified_codes=[ICD10Code(primary_code='C73'...,"[{'primary_code': 'C73', 'secondary_code': 'C7...","{J96.90, C79.31, C73, C79.51}","{C79.31, C73}","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",{}
2,8034,6485064-1,31027486,comm/PMC006xxxxxx/PMC6485064.xml,Antibiotic-impregnated articulating cement spa...,A 44-year-old man was admitted to our clinic b...,"[[44.0, 'year']]",M,"{'8314821': 1, '20878287': 1, '2203567': 1, '9...","{'8447608-1': 1, '8447608-2': 1}","{, A15.0, M24.66, M01.X2, T84.84XA}",identified_codes=[ICD10Code(primary_code='M01....,"[{'primary_code': 'M01.X2', 'secondary_code': ...",summary='A man with a history of pulmonary tub...,identified_codes=[ICD10Code(primary_code='M01....,"[{'primary_code': 'M01.X6', 'secondary_code': ...","{M01.X6, , A15.0, Z94.1}","{, A15.0}","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",{}
3,158610,3416908-1,22948470,noncomm/PMC003xxxxxx/PMC3416908.xml,Diagnosis of an ectopic adrenocorticotropic ho...,A 25-year-old Caucasian woman was admitted for...,"[[25.0, 'year']]",F,"{'20089611': 1, '18209857': 1, '15914534': 1, ...",{'8684410-1': 1},"{, D3A.090, E24.0, C7A.090}",identified_codes=[ICD10Code(primary_code='E24....,"[{'primary_code': 'E24.0', 'secondary_code': '...",summary='The patient presented with oligomenor...,identified_codes=[ICD10Code(primary_code='C7A....,"[{'primary_code': 'C7A.090', 'secondary_code':...","{, Cushing's syndrome due to ectopic ACTH, E24...","{, C7A.090}","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",{}
4,82995,4575995-1,26435857,comm/PMC004xxxxxx/PMC4575995.xml,Dental Considerations in Children with Glucose...,"In the autumn of 2014, a 9-year-and-2-month-ol...","[[9.0, 'year'], [2.0, 'month']]",M,"{'10916676': 1, '25079187': 1, '18177777': 1, ...",{},"{, D55.0, K02.9}",identified_codes=[ICD10Code(primary_code='D55....,"[{'primary_code': 'D55.0', 'secondary_code': '...","summary='The 9-year-old male child, previously...",identified_codes=[ICD10Code(primary_code='D55....,"[{'primary_code': 'D55.0', 'secondary_code': '...","{, K01.1, D55.0, K02.9, K08.1, K04.0}","{, D55.0, K02.9}","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",{}


In [29]:
temp_df.to_clipboard()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [25]:
# Tried hugging face model 
from transformers import AutoTokenizer, BertForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("AkshatSurolia/ICD-10-Code-Prediction")
model = BertForSequenceClassification.from_pretrained("AkshatSurolia/ICD-10-Code-Prediction")
config = model.config
text = "A 15-day-old neonate presented with 3 days of irritability, fever (38.5°C), poor sucking, and left preauricular swelling. Examination showed a 5 cm × 5 cm fluctuant left parotid gland swelling with pus from the Stensen's duct. Laboratory results indicated elevated white blood cells (17.6 × 10^9/L). Ultrasound suggested acute suppurative parotitis. Initial treatment involved intravenous cefotaxime and rehydration, followed by surgical drainage. Pus culture identified methicillin-resistant S. aureus, prompting a switch to intravenous vancomycin for 10 days, leading to full recovery without residual parotid issues."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

results = output.logits.detach().cpu().numpy()[0].argsort()[::-1][:5]
res = [config.id2label[ids] for ids in results]
res 

['T59.91', 'G40.00', 'T59.92', 'I82.43', 'T47.91']

### Sampling the positive cases by the keyword search 

In [2]:
from icd_codes import keyword_bank
import icd_codes
import importlib
importlib.reload(icd_codes)
from tqdm import tqdm
tqdm.pandas()

from icd_codes import keyword_bank
import icd_codes
import importlib
import re
import pandas as pd
from tqdm import tqdm
importlib.reload(icd_codes)

# 1. Vectorized approach - build a mapping dictionary first
def optimize_keyword_matching(df):
    # Initialize empty lists
    df['Sampling_label_list'] = [[] for _ in range(len(df))]
    
    # Build a reverse mapping: keyword -> code
    keyword_to_code = {}
    for code, value in keyword_bank.items():
        for keyword in value['keywords']:
            keyword_to_code[keyword.lower()] = code
    
    # Process each patient text only once
    def find_keywords(text):
        if pd.isna(text):
            return []
        
        text = text.lower()
        found_codes = set()  # Use set to avoid duplicates
        
        for keyword, code in keyword_to_code.items():
            if keyword in text:
                found_codes.add(code)
        
        return list(found_codes)
    
    # Apply the function to each patient text
    tqdm.pandas(desc="Processing patient texts")
    df['Sampling_label_list'] = df['patient'].progress_apply(find_keywords)
    
    return df

df = optimize_keyword_matching(df)

Processing patient texts: 100%|██████████| 167034/167034 [00:20<00:00, 8075.87it/s]


In [11]:
positive_df = df[df['Sampling_label_list'].apply(lambda x: len(x) > 0)]
positive_df.shape
negative_df = df[df['Sampling_label_list'].apply(lambda x: len(x) == 0)]
negative_df.shape

(94898, 11)

In [15]:
import pandas as pd
import numpy as np

# Create a dictionary to hold sampled dataframes for each code
sampled_dfs = {}

# Process each unique ICD code
all_codes = set(keyword_bank.keys())

for code in all_codes:
    # Find all rows containing this code
    mask = positive_df['Sampling_label_list'].apply(lambda x: code in x)
    code_df = positive_df[mask]
    
    # Sample 300 rows (or fewer if not enough available)
    if len(code_df) <= 300:
        sampled_dfs[code] = code_df
        print(f"Warning: Only {len(code_df)} samples available for code {code}")
    else:
        sampled_dfs[code] = code_df.sample(n=300, random_state=42)
    
    print(f"Selected {len(sampled_dfs[code])} samples for code {code}")

# Combine all sampled dataframes without using drop_duplicates
combined_df = pd.concat(sampled_dfs.values())

# Use drop_duplicates on the index to remove duplicate rows
balanced_df = combined_df.loc[~combined_df.index.duplicated(keep='first')]

print(f"Final balanced dataset has {len(balanced_df)} rows")

Selected 300 samples for code I25.10
Selected 300 samples for code E78.5
Selected 300 samples for code E66.9
Selected 300 samples for code E03.9
Selected 300 samples for code J44.9
Selected 300 samples for code J84.10
Selected 300 samples for code I50.9
Selected 300 samples for code E10.9
Selected 300 samples for code E11.9
Selected 300 samples for code I10
Selected 300 samples for code J45.909
Selected 300 samples for code I48.91
Final balanced dataset has 3525 rows


In [16]:
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
balanced_df.shape


(3525, 11)

In [17]:
import os
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from tqdm import tqdm
import llm_utils
from llm_utils import analyze_clinical_note, extract_codes_from_response, get_icd10_code_df
import importlib
importlib.reload(llm_utils)
import time 
output_dir = 'data_labelled/positive'
os.makedirs(output_dir, exist_ok=True)
# Initialize a list to hold results
all_results = []
# Process data in chunks
chunk_size = 50
for start in range(0, len(balanced_df), chunk_size):
    chunk_file_path = os.path.join(output_dir, f'chunk_{start // chunk_size + 1}.csv')
    print('processing chunk', start // chunk_size + 1)
    if os.path.exists(chunk_file_path):
        print('chunk already exists')
        continue
    start_time = time.time()
    # time taken to process the chunk
    end = start + chunk_size
    chunk = balanced_df.iloc[start:end]

    # Initialize a list to hold chunk results
    try:
        icd10_code_df = get_icd10_code_df(chunk)
    except Exception as e:
        print(f"Error processing chunk {start // chunk_size + 1}: {str(e)}")
        continue
     
    
    icd10_code_df.to_csv(chunk_file_path, index=False)
    all_results.append(icd10_code_df)
    time_taken = time.time() - start_time
    print(f"Time taken to process the chunk {time_taken} seconds")
    time.sleep(10)
print("Processing complete. Results saved in chunks.")

processing chunk 1
chunk already exists
processing chunk 2
chunk already exists
processing chunk 3
chunk already exists
processing chunk 4
chunk already exists
processing chunk 5
chunk already exists
processing chunk 6
chunk already exists
processing chunk 7
chunk already exists
processing chunk 8
chunk already exists
processing chunk 9
chunk already exists
processing chunk 10
chunk already exists
processing chunk 11
chunk already exists
processing chunk 12
chunk already exists
processing chunk 13
chunk already exists
processing chunk 14
chunk already exists
processing chunk 15
chunk already exists
processing chunk 16
chunk already exists
processing chunk 17
chunk already exists
processing chunk 18
chunk already exists
processing chunk 19
chunk already exists
processing chunk 20
chunk already exists
processing chunk 21


100%|██████████| 50/50 [04:04<00:00,  4.89s/it]
100%|██████████| 50/50 [00:00<00:00, 61572.28it/s]
100%|██████████| 50/50 [00:00<00:00, 178481.02it/s]
100%|██████████| 50/50 [00:00<00:00, 231218.52it/s]


Time taken to process the chunk 244.33357191085815 seconds
processing chunk 22


100%|██████████| 50/50 [03:47<00:00,  4.55s/it]
100%|██████████| 50/50 [00:00<00:00, 64093.89it/s]
100%|██████████| 50/50 [00:00<00:00, 188762.56it/s]
100%|██████████| 50/50 [00:00<00:00, 240223.60it/s]


Time taken to process the chunk 227.34556198120117 seconds
processing chunk 23


100%|██████████| 50/50 [04:09<00:00,  4.98s/it]
100%|██████████| 50/50 [00:00<00:00, 62341.02it/s]
100%|██████████| 50/50 [00:00<00:00, 189273.65it/s]
100%|██████████| 50/50 [00:00<00:00, 174182.06it/s]


Time taken to process the chunk 249.21735215187073 seconds
processing chunk 24


100%|██████████| 50/50 [04:05<00:00,  4.90s/it]
100%|██████████| 50/50 [00:00<00:00, 31198.33it/s]
100%|██████████| 50/50 [00:00<00:00, 124979.26it/s]
100%|██████████| 50/50 [00:00<00:00, 146143.00it/s]


Time taken to process the chunk 245.20695877075195 seconds
processing chunk 25


100%|██████████| 50/50 [04:26<00:00,  5.33s/it]
100%|██████████| 50/50 [00:00<00:00, 61881.14it/s]
100%|██████████| 50/50 [00:00<00:00, 185753.06it/s]
100%|██████████| 50/50 [00:00<00:00, 201649.23it/s]


Time taken to process the chunk 266.7536609172821 seconds
processing chunk 26


100%|██████████| 50/50 [03:10<00:00,  3.81s/it]
100%|██████████| 50/50 [00:00<00:00, 62751.41it/s]
100%|██████████| 50/50 [00:00<00:00, 186579.36it/s]
100%|██████████| 50/50 [00:00<00:00, 202232.59it/s]


Time taken to process the chunk 190.5955147743225 seconds
processing chunk 27


100%|██████████| 50/50 [03:17<00:00,  3.95s/it]
100%|██████████| 50/50 [00:00<00:00, 68045.17it/s]
100%|██████████| 50/50 [00:00<00:00, 238042.22it/s]
100%|██████████| 50/50 [00:00<00:00, 194541.00it/s]


Time taken to process the chunk 197.42775082588196 seconds
processing chunk 28


100%|██████████| 50/50 [02:58<00:00,  3.57s/it]
100%|██████████| 50/50 [00:00<00:00, 60839.92it/s]
100%|██████████| 50/50 [00:00<00:00, 165651.82it/s]
100%|██████████| 50/50 [00:00<00:00, 192399.27it/s]


Time taken to process the chunk 178.54345297813416 seconds
processing chunk 29


100%|██████████| 50/50 [03:10<00:00,  3.82s/it]
100%|██████████| 50/50 [00:00<00:00, 55805.00it/s]
100%|██████████| 50/50 [00:00<00:00, 187245.71it/s]
100%|██████████| 50/50 [00:00<00:00, 202427.80it/s]


Time taken to process the chunk 190.99709391593933 seconds
processing chunk 30


100%|██████████| 50/50 [03:42<00:00,  4.45s/it]
100%|██████████| 50/50 [00:00<00:00, 73326.99it/s]
100%|██████████| 50/50 [00:00<00:00, 132983.64it/s]
100%|██████████| 50/50 [00:00<00:00, 113605.20it/s]


Time taken to process the chunk 222.65141105651855 seconds
processing chunk 31


100%|██████████| 50/50 [03:56<00:00,  4.73s/it]
100%|██████████| 50/50 [00:00<00:00, 59024.82it/s]
100%|██████████| 50/50 [00:00<00:00, 179858.66it/s]
100%|██████████| 50/50 [00:00<00:00, 181257.74it/s]


Time taken to process the chunk 236.28401494026184 seconds
processing chunk 32


100%|██████████| 50/50 [03:24<00:00,  4.09s/it]
100%|██████████| 50/50 [00:00<00:00, 66302.62it/s]
100%|██████████| 50/50 [00:00<00:00, 193821.81it/s]
100%|██████████| 50/50 [00:00<00:00, 174182.06it/s]


Time taken to process the chunk 204.3374571800232 seconds
processing chunk 33


100%|██████████| 50/50 [03:52<00:00,  4.65s/it]
100%|██████████| 50/50 [00:00<00:00, 66407.60it/s]
100%|██████████| 50/50 [00:00<00:00, 174182.06it/s]
100%|██████████| 50/50 [00:00<00:00, 233796.21it/s]


Time taken to process the chunk 232.63435196876526 seconds
processing chunk 34


100%|██████████| 50/50 [03:54<00:00,  4.68s/it]
100%|██████████| 50/50 [00:00<00:00, 56879.63it/s]
100%|██████████| 50/50 [00:00<00:00, 187245.71it/s]
100%|██████████| 50/50 [00:00<00:00, 171897.70it/s]


Time taken to process the chunk 234.13359212875366 seconds
processing chunk 35


100%|██████████| 50/50 [03:57<00:00,  4.75s/it]
100%|██████████| 50/50 [00:00<00:00, 66555.13it/s]
100%|██████████| 50/50 [00:00<00:00, 169947.49it/s]
100%|██████████| 50/50 [00:00<00:00, 198593.94it/s]


Time taken to process the chunk 237.37146711349487 seconds
processing chunk 36


100%|██████████| 50/50 [04:03<00:00,  4.87s/it]
100%|██████████| 50/50 [00:00<00:00, 66555.13it/s]
100%|██████████| 50/50 [00:00<00:00, 187916.85it/s]
100%|██████████| 50/50 [00:00<00:00, 156270.64it/s]


Time taken to process the chunk 243.33157014846802 seconds
processing chunk 37


100%|██████████| 50/50 [03:55<00:00,  4.72s/it]
100%|██████████| 50/50 [00:00<00:00, 60822.27it/s]
100%|██████████| 50/50 [00:00<00:00, 176527.95it/s]
100%|██████████| 50/50 [00:00<00:00, 177424.03it/s]


Time taken to process the chunk 235.95317387580872 seconds
processing chunk 38


100%|██████████| 50/50 [04:08<00:00,  4.97s/it]
100%|██████████| 50/50 [00:00<00:00, 51692.19it/s]
100%|██████████| 50/50 [00:00<00:00, 196178.86it/s]
100%|██████████| 50/50 [00:00<00:00, 159722.16it/s]


Time taken to process the chunk 248.66791200637817 seconds
processing chunk 39


100%|██████████| 50/50 [04:27<00:00,  5.35s/it]
100%|██████████| 50/50 [00:00<00:00, 29086.71it/s]
100%|██████████| 50/50 [00:00<00:00, 163840.00it/s]
100%|██████████| 50/50 [00:00<00:00, 244994.39it/s]


Time taken to process the chunk 267.60115909576416 seconds
processing chunk 40


100%|██████████| 50/50 [04:12<00:00,  5.04s/it]
100%|██████████| 50/50 [00:00<00:00, 57096.43it/s]
100%|██████████| 50/50 [00:00<00:00, 403298.46it/s]
100%|██████████| 50/50 [00:00<00:00, 427990.20it/s]


Time taken to process the chunk 252.1866271495819 seconds
processing chunk 41


 16%|█▌        | 8/50 [1:49:41<9:35:53, 822.69s/it]


KeyboardInterrupt: 

In [12]:
# Doing the same above for the negative cases 

import os
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from tqdm import tqdm
import llm_utils
from llm_utils import analyze_clinical_note, extract_codes_from_response, get_icd10_code_df
import importlib
importlib.reload(llm_utils)
import time 
output_dir = 'data_labelled/negative'
os.makedirs(output_dir, exist_ok=True)
# Initialize a list to hold results
all_results = []
# Process data in chunks
chunk_size = 50
for start in range(0, len(negative_df), chunk_size):
    chunk_file_path = os.path.join(output_dir, f'chunk_{start // chunk_size + 1}.csv')
    print('processing chunk', start // chunk_size + 1)
    if os.path.exists(chunk_file_path):
        print('chunk already exists')
        continue
    start_time = time.time()
    # time taken to process the chunk
    end = start + chunk_size
    chunk = negative_df.iloc[start:end]

    # Initialize a list to hold chunk results
    try:
        icd10_code_df = get_icd10_code_df(chunk)
    except Exception as e:
        print(f"Error processing chunk {start // chunk_size + 1}: {str(e)}")
        continue
    icd10_code_df.to_csv(chunk_file_path, index=False)
    all_results.append(icd10_code_df)
    time_taken = time.time() - start_time
    print(f"Time taken to process the chunk {time_taken} seconds")
    time.sleep(10)
print("Processing complete. Results saved in chunks.")

processing chunk 1


100%|██████████| 50/50 [02:57<00:00,  3.55s/it]
100%|██████████| 50/50 [00:00<00:00, 60297.64it/s]
100%|██████████| 50/50 [00:00<00:00, 187245.71it/s]
100%|██████████| 50/50 [00:00<00:00, 198406.05it/s]


Time taken to process the chunk 177.47610187530518 seconds
processing chunk 2


100%|██████████| 50/50 [02:37<00:00,  3.15s/it]
100%|██████████| 50/50 [00:00<00:00, 80752.87it/s]
100%|██████████| 50/50 [00:00<00:00, 214432.72it/s]
100%|██████████| 50/50 [00:00<00:00, 223101.28it/s]


Time taken to process the chunk 157.4871621131897 seconds
processing chunk 3


100%|██████████| 50/50 [02:59<00:00,  3.59s/it]
100%|██████████| 50/50 [00:00<00:00, 67130.35it/s]
100%|██████████| 50/50 [00:00<00:00, 159237.05it/s]
100%|██████████| 50/50 [00:00<00:00, 166176.86it/s]


Time taken to process the chunk 179.60151290893555 seconds
processing chunk 4


100%|██████████| 50/50 [02:53<00:00,  3.48s/it]
100%|██████████| 50/50 [00:00<00:00, 88487.43it/s]
100%|██████████| 50/50 [00:00<00:00, 175493.89it/s]
100%|██████████| 50/50 [00:00<00:00, 187413.05it/s]


Time taken to process the chunk 173.98093914985657 seconds
processing chunk 5


100%|██████████| 50/50 [02:48<00:00,  3.38s/it]
100%|██████████| 50/50 [00:00<00:00, 74288.06it/s]
100%|██████████| 50/50 [00:00<00:00, 198406.05it/s]
100%|██████████| 50/50 [00:00<00:00, 209296.61it/s]


Time taken to process the chunk 168.87911987304688 seconds
processing chunk 6


100%|██████████| 50/50 [02:43<00:00,  3.27s/it]
100%|██████████| 50/50 [00:00<00:00, 79739.62it/s]
100%|██████████| 50/50 [00:00<00:00, 184446.09it/s]
100%|██████████| 50/50 [00:00<00:00, 124018.45it/s]


Time taken to process the chunk 163.5576729774475 seconds
processing chunk 7


100%|██████████| 50/50 [02:47<00:00,  3.34s/it]
100%|██████████| 50/50 [00:00<00:00, 51175.01it/s]
100%|██████████| 50/50 [00:00<00:00, 113114.99it/s]
100%|██████████| 50/50 [00:00<00:00, 180633.25it/s]


Time taken to process the chunk 167.0477170944214 seconds
processing chunk 8


100%|██████████| 50/50 [02:43<00:00,  3.26s/it]
100%|██████████| 50/50 [00:00<00:00, 69350.26it/s]
100%|██████████| 50/50 [00:00<00:00, 195265.55it/s]
100%|██████████| 50/50 [00:00<00:00, 210135.47it/s]


Time taken to process the chunk 163.15091824531555 seconds
processing chunk 9


100%|██████████| 50/50 [02:39<00:00,  3.20s/it]
100%|██████████| 50/50 [00:00<00:00, 72565.81it/s]
100%|██████████| 50/50 [00:00<00:00, 183317.48it/s]
100%|██████████| 50/50 [00:00<00:00, 174908.42it/s]


Time taken to process the chunk 159.97604298591614 seconds
processing chunk 10


100%|██████████| 50/50 [02:34<00:00,  3.08s/it]
100%|██████████| 50/50 [00:00<00:00, 60245.68it/s]
100%|██████████| 50/50 [00:00<00:00, 191520.73it/s]
100%|██████████| 50/50 [00:00<00:00, 207433.43it/s]


Time taken to process the chunk 154.141037940979 seconds
processing chunk 11


100%|██████████| 50/50 [02:39<00:00,  3.18s/it]
100%|██████████| 50/50 [00:00<00:00, 60769.40it/s]
100%|██████████| 50/50 [00:00<00:00, 196178.86it/s]
100%|██████████| 50/50 [00:00<00:00, 196915.68it/s]


Time taken to process the chunk 159.25764298439026 seconds
processing chunk 12


100%|██████████| 50/50 [02:43<00:00,  3.26s/it]
100%|██████████| 50/50 [00:00<00:00, 73999.72it/s]
100%|██████████| 50/50 [00:00<00:00, 179090.69it/s]
100%|██████████| 50/50 [00:00<00:00, 196915.68it/s]


Time taken to process the chunk 163.15227699279785 seconds
processing chunk 13


100%|██████████| 50/50 [02:25<00:00,  2.91s/it]
100%|██████████| 50/50 [00:00<00:00, 81284.96it/s]
100%|██████████| 50/50 [00:00<00:00, 165130.08it/s]
100%|██████████| 50/50 [00:00<00:00, 152520.15it/s]


Time taken to process the chunk 145.61909294128418 seconds
processing chunk 14


100%|██████████| 50/50 [02:38<00:00,  3.16s/it]
100%|██████████| 50/50 [00:00<00:00, 91899.74it/s]
100%|██████████| 50/50 [00:00<00:00, 227456.83it/s]
100%|██████████| 50/50 [00:00<00:00, 241607.37it/s]


Time taken to process the chunk 158.044193983078 seconds
processing chunk 15


100%|██████████| 50/50 [02:56<00:00,  3.53s/it]
100%|██████████| 50/50 [00:00<00:00, 52520.71it/s]
100%|██████████| 50/50 [00:00<00:00, 119021.11it/s]
100%|██████████| 50/50 [00:00<00:00, 191695.80it/s]


Time taken to process the chunk 176.57262587547302 seconds
processing chunk 16


100%|██████████| 50/50 [02:43<00:00,  3.28s/it]
100%|██████████| 50/50 [00:00<00:00, 68601.64it/s]
100%|██████████| 50/50 [00:00<00:00, 206615.96it/s]
100%|██████████| 50/50 [00:00<00:00, 185097.26it/s]


Time taken to process the chunk 163.8972270488739 seconds
processing chunk 17


100%|██████████| 50/50 [02:34<00:00,  3.09s/it]
100%|██████████| 50/50 [00:00<00:00, 72141.45it/s]
100%|██████████| 50/50 [00:00<00:00, 190304.17it/s]
100%|██████████| 50/50 [00:00<00:00, 131979.36it/s]


Time taken to process the chunk 154.7181167602539 seconds
processing chunk 18


100%|██████████| 50/50 [02:37<00:00,  3.15s/it]
100%|██████████| 50/50 [00:00<00:00, 74871.55it/s]
100%|██████████| 50/50 [00:00<00:00, 117356.02it/s]
100%|██████████| 50/50 [00:00<00:00, 190997.45it/s]


Time taken to process the chunk 157.51738476753235 seconds
processing chunk 19


100%|██████████| 50/50 [02:45<00:00,  3.31s/it]
100%|██████████| 50/50 [00:00<00:00, 86373.64it/s]
100%|██████████| 50/50 [00:00<00:00, 202232.59it/s]
100%|██████████| 50/50 [00:00<00:00, 210981.09it/s]


Time taken to process the chunk 165.29848861694336 seconds
processing chunk 20


100%|██████████| 50/50 [02:33<00:00,  3.07s/it]
100%|██████████| 50/50 [00:00<00:00, 75983.77it/s]
100%|██████████| 50/50 [00:00<00:00, 171335.95it/s]
100%|██████████| 50/50 [00:00<00:00, 188423.36it/s]


Time taken to process the chunk 153.3799340724945 seconds
processing chunk 21


100%|██████████| 50/50 [03:15<00:00,  3.91s/it]
100%|██████████| 50/50 [00:00<00:00, 69144.48it/s]
100%|██████████| 50/50 [00:00<00:00, 195265.55it/s]
100%|██████████| 50/50 [00:00<00:00, 191520.73it/s]


Time taken to process the chunk 195.6796588897705 seconds
processing chunk 22


100%|██████████| 50/50 [02:44<00:00,  3.30s/it]
100%|██████████| 50/50 [00:00<00:00, 71648.51it/s]
100%|██████████| 50/50 [00:00<00:00, 167772.16it/s]
100%|██████████| 50/50 [00:00<00:00, 175493.89it/s]


Time taken to process the chunk 164.86446285247803 seconds
processing chunk 23


100%|██████████| 50/50 [02:39<00:00,  3.19s/it]
100%|██████████| 50/50 [00:00<00:00, 81696.61it/s]
100%|██████████| 50/50 [00:00<00:00, 199349.05it/s]
100%|██████████| 50/50 [00:00<00:00, 172889.69it/s]


Time taken to process the chunk 159.66344499588013 seconds
processing chunk 24


100%|██████████| 50/50 [03:21<00:00,  4.03s/it]
100%|██████████| 50/50 [00:00<00:00, 67934.95it/s]
100%|██████████| 50/50 [00:00<00:00, 192222.91it/s]
100%|██████████| 50/50 [00:00<00:00, 195995.51it/s]


Time taken to process the chunk 201.3462131023407 seconds
processing chunk 25


100%|██████████| 50/50 [02:39<00:00,  3.19s/it]
100%|██████████| 50/50 [00:00<00:00, 79709.31it/s]
100%|██████████| 50/50 [00:00<00:00, 183638.53it/s]
100%|██████████| 50/50 [00:00<00:00, 148418.40it/s]


Time taken to process the chunk 159.4175307750702 seconds
processing chunk 26


100%|██████████| 50/50 [02:51<00:00,  3.44s/it]
100%|██████████| 50/50 [00:00<00:00, 63782.00it/s]
100%|██████████| 50/50 [00:00<00:00, 175347.16it/s]
100%|██████████| 50/50 [00:00<00:00, 193821.81it/s]


Time taken to process the chunk 172.0001621246338 seconds
processing chunk 27


100%|██████████| 50/50 [02:50<00:00,  3.41s/it]
100%|██████████| 50/50 [00:00<00:00, 76315.57it/s]
100%|██████████| 50/50 [00:00<00:00, 187245.71it/s]
100%|██████████| 50/50 [00:00<00:00, 207433.43it/s]


Time taken to process the chunk 170.31840991973877 seconds
processing chunk 28


100%|██████████| 50/50 [02:51<00:00,  3.43s/it]
100%|██████████| 50/50 [00:00<00:00, 88301.14it/s]
100%|██████████| 50/50 [00:00<00:00, 168988.88it/s]
100%|██████████| 50/50 [00:00<00:00, 139716.99it/s]


Time taken to process the chunk 171.75133681297302 seconds
processing chunk 29


 18%|█▊        | 9/50 [00:27<02:05,  3.06s/it]


KeyboardInterrupt: 

In [20]:
import llm_utils
import importlib
importlib.reload(llm_utils)
tempodf = get_icd10_code_df(chunk[:1])
tempodf.head()

100%|██████████| 1/1 [00:02<00:00,  2.90s/it]
100%|██████████| 1/1 [00:00<00:00, 4760.84it/s]


identified_codes=[ICD10Code(code='H18.20', code_type='Primary', description='Corneal edema, unspecified', confidence=0.95, supporting_keyword_evidence='Corneal edema after vitrectomy and lensectomy, marked corneal edema throughout the entire corneal tissue.'), ICD10Code(code='H59.81', code_type='Secondary', description='Corneal decompensation after ocular surgery', confidence=0.9, supporting_keyword_evidence='Corneal decompensation in the left eye after combined pars plana vitrectomy and lensectomy'), ICD10Code(code='S05.8XXA', code_type='Secondary', description='Other specified injuries of eye and orbit, initial encounter', confidence=0.85, supporting_keyword_evidence='Management of a traumatic eye injury')]


100%|██████████| 1/1 [00:00<00:00, 5983.32it/s]
100%|██████████| 1/1 [00:00<00:00, 9709.04it/s]


Unnamed: 0,patient_id,patient_uid,PMID,file_path,title,patient,age,gender,relevant_articles,similar_patients,Sampling_label_list,icd10_codes,raw_llm_response,json_llm
125671,17069,7047886-1,32158637,comm/PMC007xxxxxx/PMC7047886.xml,Descemet’s membrane endothelial keratoplasty i...,A 56-year-old male patient presented with corn...,"[[56.0, 'year']]",M,"{'34296044': 1, '25990654': 1, '28834814': 1, ...",{},[I50.9],"{S05.8XXA, H18.20, H59.81}","identified_codes=[ICD10Code(code='H18.20', cod...","[{'code': 'H18.20', 'code_type': 'Primary', 'd..."


In [None]:
import os
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from tqdm import tqdm
import llm_utils
from llm_utils import analyze_clinical_note, extract_codes_from_response, get_icd10_code_df
import importlib
importlib.reload(llm_utils)
from time import time, sleep
import concurrent.futures

output_dir = 'data_labelled/positive'
os.makedirs(output_dir, exist_ok=True)

def process_chunk(chunk_info):
    """Process a chunk of the dataframe"""
    start, chunk_size, chunk_index = chunk_info
    start_time = time()
    end = start + chunk_size
    chunk = balanced_df.iloc[start:end]
    
    try:
        icd10_code_df = get_icd10_code_df(chunk)
        
        # Save chunk results to a CSV file
        chunk_file_path = os.path.join(output_dir, f'chunk_{chunk_index}.csv')
        icd10_code_df.to_csv(chunk_file_path, index=False)
        
        time_taken = time() - start_time
        print(f"Chunk {chunk_index} completed in {time_taken:.2f} seconds")
        
        return icd10_code_df
    except Exception as e:
        print(f"Error processing chunk {chunk_index}: {str(e)}")
        return None

# Process data in chunks
chunk_size = 50
total_chunks = (len(balanced_df) + chunk_size - 1) // chunk_size  # ceiling division

# Prepare the arguments for each chunk
chunk_args = [(i, chunk_size, i // chunk_size + 1) 
              for i in range(0, len(balanced_df), chunk_size)]

# Use ThreadPoolExecutor for I/O bound tasks (API calls)
# Use ProcessPoolExecutor for CPU bound tasks
max_workers = min(8, os.cpu_count() or 4)  # Limit to avoid overwhelming the API
print(f"Processing with {max_workers} workers")

all_results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit all chunks for processing
    future_to_chunk = {executor.submit(process_chunk, arg): arg for arg in chunk_args}
    
    # Process results as they complete
    for future in tqdm(concurrent.futures.as_completed(future_to_chunk), total=len(future_to_chunk)):
        chunk_arg = future_to_chunk[future]
        try:
            result = future.result()
            if result is not None:
                all_results.append(result)
        except Exception as exc:
            print(f'Chunk {chunk_arg[2]} generated an exception: {exc}')
        
        # Optional: Add a small delay between submissions to avoid rate limits
        sleep(1)

# Combine all results
if all_results:
    combined_results = pd.concat(all_results, ignore_index=True)
    combined_file_path = os.path.join(output_dir, 'all_results_combined.csv')
    combined_results.to_csv(combined_file_path, index=False)
    print(f"Combined results saved to {combined_file_path}")
else:
    print("No valid results to combine")

print("Processing complete. Results saved in chunks and combined file.")