Make sure you have MIMIC-IV datasets downloaded, particularly discharge.csv and diagnoses.csv.

In [13]:
import numpy as np
import pandas as pd
from collections import defaultdict
import random


In [2]:
cd ~/Downloads

/Users/roshanswaroop/Downloads


In [12]:
N = 5000

#Load MIMIC IV data (clinical notes and diagnoses)
discharge_df = pd.read_csv('discharge.csv')
diagnoses_df = pd.read_csv('diagnoses_icd.csv')  
diagnoses_df = diagnoses_df.query("icd_version == 10")

#Filter clinical notes
icd_10_hadm_ids = random.sample(list(set(diagnoses_df["hadm_id"].values.tolist())), N)
discharge_df = discharge_df[discharge_df["hadm_id"].isin(icd_10_hadm_ids)]

#Load filtered hadm and subject ids into new data frame
hadm_to_subject_id = dict()
for index, entry in diagnoses_df.iterrows():
    if (entry["hadm_id"] in hadm_to_subject_id):
        continue
    else:
        hadm_to_subject_id[entry["hadm_id"]] = entry["subject_id"]
icd_10_subject_ids = [hadm_to_subject_id[hadm_id] for hadm_id in icd_10_hadm_ids]
query_df = pd.DataFrame()
query_df["hadm_id"] = icd_10_hadm_ids
query_df["subject_id"] = icd_10_subject_ids

#Add ICD codes to new data frame
hadm_to_icd = dict()
for i in icd_10_hadm_ids:
    icd_codes = diagnoses_df.loc[diagnoses_df['hadm_id'] == i, 'icd_code'].values.tolist()
    icd_code_string = ""
    for code in icd_codes:
        icd_code_string += code + " "
    hadm_to_icd[i] = icd_code_string.strip()
icd_10_codes = [hadm_to_icd[hadm_id] for hadm_id in icd_10_hadm_ids]
query_df["icd_codes"] = icd_10_codes

# text, ICD-10 mapping for eval purposes
merged_df = pd.merge(discharge_df, query_df, on='hadm_id')
eval_df = merged_df[['text', 'icd_codes']]

In [94]:
# Given OAI costs, we are using a subset, 500 randomly sampled examples

# We also keep our notes used to around the average length, the average word count is:
# Average word count: 1754.476, as seen a cell down below
import random

# Function to calculate word count
def count_words(text):
    return len(text.split())

# Filter rows with word count under 2000
filtered_df = eval_df[eval_df['text'].apply(count_words) < 2000]

# Randomly sample 500 rows
sampled_df = filtered_df.sample(n=500, random_state=42)

# Display the sampled DataFrame
sampled_df

Unnamed: 0,text,icd_codes,word_count
1437,\nName: ___ Unit No: ___...,I2510 N179 I4891 I9789 D62 R001 I129 K219 N189...,1373
3408,\nName: ___ Unit No: ___\n...,I472 Z951 F17210 I4891 I2510 G4730 I255 Z95810...,1081
628,\nName: ___ Unit No: _...,K5521 D62 I739 I10 E785 Z8546 I6522 Z87891 I48...,1556
3679,\nName: ___ Unit No: ___\n \...,O133 D259 O76 O99824 O3413 O99344 F419 Z3A37 Z370,559
612,\nName: ___ Unit No: ___\n...,O4703 O98613 Z3A32 O99513 J45909 B360,630
...,...,...,...
132,\nName: ___ Unit No: __...,T814XXA M341 L03116 M96830 Y838 Y929 I4891 Z79...,1168
356,\nName: ___ Unit No: ___...,I63411 R64 E46 N179 M6282 N390 R471 R29700 I10...,1837
850,\nName: ___ Unit No: __...,I214 I5031 E1122 E1140 T82855A I130 E785 I2510...,1639
3162,\nName: ___ Unit No: ___\n ...,D171 Z7902 Z86711 I10 E785 G4733 J309 K219 B00...,622


## Determine longest texts for LLM context window purposes

In [88]:
# Sort the DataFrame by text length in descending order
sorted_df = sampled_df.assign(text_length=sampled_df['text'].str.len()).sort_values('text_length', ascending=False)

# Get the top 10 longest texts and their indices
top_10_longest_texts = sorted_df.head(10)['text'].tolist()
top_10_longest_texts_indices = sorted_df.head(10).index.tolist()

# Print the top 10 longest texts with their indices
# print("Top 10 longest text indices:")
# for i, (index, text) in enumerate(zip(top_10_longest_texts_indices, top_10_longest_texts)):
#     print(f"Index: {index}")

# Make a copy of the DataFrame to avoid SettingWithCopyWarning
sampled_df_copy = sampled_df.copy()

# Calculate word count for each text
sampled_df_copy['word_count'] = sampled_df_copy['text'].apply(lambda x: len(str(x).split()))

# Calculate average word count
average_word_count = sampled_df_copy['word_count'].mean()
print(f"Average word count: {average_word_count}")

# Sort the DataFrame by word count in descending order
sampled_df = sampled_df_copy.sort_values('word_count', ascending=False)

# Get the top 10 texts with the highest word counts and their indices
top_10_word_counts = sampled_df.head(10)['word_count'].tolist()
top_10_word_counts_indices = sampled_df.head(10).index.tolist()

# Print the top 10 texts with the highest word counts and their indices
print("Top 10 highest word counts:")
for i, (index, word_count) in enumerate(zip(top_10_word_counts_indices, top_10_word_counts)):
    print(f"Index: {index}, Text {i+1}: {word_count} words")


Average word count: 1754.476
Top 10 highest word counts:
Index: 1718, Text 1: 5861 words
Index: 2032, Text 2: 4857 words
Index: 3008, Text 3: 4389 words
Index: 1989, Text 4: 4155 words
Index: 2476, Text 5: 4037 words
Index: 2625, Text 6: 3860 words
Index: 2162, Text 7: 3809 words
Index: 1260, Text 8: 3802 words
Index: 2203, Text 9: 3792 words
Index: 1750, Text 10: 3769 words


## Determine 50 most common ICD-10 codes in dataset

In [138]:
# vvv supresses output for conciseness vvv
%%capture

from collections import Counter

# Split the space-delimited ICD-10 codes into a list
codes_list = eval_df['icd_codes'].str.split()

# Flatten the list of ICD-10 codes
flattened_codes = [code for sublist in codes_list for code in sublist]

# Count the occurrences of each ICD-10 code
code_counter = Counter(flattened_codes)

# Get the 50 most common ICD-10 codes
top_50_common_codes = code_counter.most_common(50)

# Create an ordered list of codes
ordered_codes = [code for code, _ in top_50_common_codes]

# Create a dictionary mapping code to frequency count
code_freq_dict = {code: count for code, count in top_50_common_codes}

# Print the ordered list of codes
print("Ordered List of Codes:")
for i, code in enumerate(ordered_codes):
    print(f"{i+1}. {code}")

# Print the dictionary mapping code to frequency count
print("\nCode to Frequency Count Dictionary:")
for code, count in code_freq_dict.items():
    print(f"{code}: {count}")

UsageError: Line magic function `%%capture` not found.


## Baseline: Predict top 16 codes 

In [148]:
from sklearn.metrics import roc_auc_score, f1_score, precision_score
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

top_16 = ordered_codes[:16]

# Assuming that 'codes' column in dataframe and top_16 codes are space-delimited strings
def process_codes(codes):
    return set(codes.split())

# Create binary representations for each code string and top_16
mlb = MultiLabelBinarizer()
mlb.fit([process_codes(code) for code in sampled_df['icd_codes']] + [set(top_16)])

y_true = []
for line in sampled_df['icd_codes']:
    y_true.append(line.split())
    
y_true = mlb.transform(y_true)
y_pred = mlb.transform([set(top_16) for _ in range(500)])

# Calculate metrics
micro_auc = roc_auc_score(y_true, y_pred, average='micro')
macro_auc = roc_auc_score(y_true, y_pred, average='macro')
micro_f1 = f1_score(y_true, y_pred, average='micro')
macro_f1 = f1_score(y_true, y_pred, average='macro')


def precision_at_k(y_true, y_pred, k):
    precisions = []
    for true_codes in y_true:
        top_k_preds = y_pred[:k]  # Get the top k predictions
        # Count the number of correct predictions
        correct_preds = sum([1 for code in top_k_preds if code in true_codes])
        # Calculate precision and append it to the list
        precisions.append(correct_preds / len(top_k_preds))
    return np.mean(precisions)


# Convert y_true back to the set representation
y_true_sets = [set(codes) for codes in mlb.inverse_transform(y_true)]

# The top_16 codes
y_pred = top_16

precision_at_5 = precision_at_k(y_true_sets, y_pred, 5)

# specified
print('Micro AUC:', micro_auc)
print('Macro AUC:', macro_auc)
print('Micro F1:', micro_f1)
print('Macro F1:', macro_f1)
print('Precision P@5:', precision_at_5)


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Micro AUC: 0.6037045694606421
Macro AUC: 0.5
Micro F1: 0.18547008547008545
Macro F1: 0.002613984616350635
Precision P@5: 0.2748


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Baseline: GPT 3.5 eval (turbo)

In [132]:
# standard GPT-3.5/4

# for Azure
# openai.api_type = "azure"
# openai.api_key = "..."
# openai.api_base = "https://example-endpoint.openai.azure.com"
# openai.api_version = "2023-03-15-preview"

def call_model(prompt):
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo", 
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_tokens=200,           # Maximum number of tokens in the generated text. If the generated response exceeds this limit, it will be cut off.
        temperature=1,          # Controls randomness. Higher values (closer to 1) make output more random, lower values (closer to 0) make it more deterministic.
        top_p=1,                  # Sets the nucleus sampling value, controls diversity via probability threshold, can be used as an alternative to temperature.
        frequency_penalty=0.0,    # Penalizes new tokens based on their frequency in the model's training data. Ranges from -2.0 to 2.0.
        presence_penalty=0.0,     # Penalizes new tokens based on whether they appear in the context. Ranges from -2.0 to 2.0.
        n=1,                      # The number of completions to generate. More completions means more diversity, but at a higher computational cost.
        stream=False,             # If true, generate the response as a stream to reduce latency.
        stop=None,                # A sequence (or list of sequences) where the API will stop generating further tokens.
    )
    return completion.choices[0].message.content

import openai
openai.api_key = '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'

In [68]:
import json
with open('/Users/roshanswaroop/rema/rema/codemCodes.json', 'r') as f:
    # Load JSON data from file
    legit_codes = json.load(f)

In [133]:
prompt ='You are an experienced medical coder. You must identify all correct ICD-10 codes for the following clinical note. Pay attention to areas describing present illness, chart review, imaging, discharge labs, active issues, medications, chief complaint, major surgery/procedure, etc. Return your answer in the following format, but note that the actual correct codes may vary greatly from these: I10, E78.5, Z87.891\n'

print(prompt)

You are an experienced medical coder. You must identify all correct ICD-10 codes for the following clinical note. Pay attention to areas describing present illness, chart review, imaging, discharge labs, active issues, medications, chief complaint, major surgery/procedure, etc. Return your answer in the following format, but note that the actual codes suggested may vary greatly from these: I10, E78.5, Z87.891



In [74]:
def get_code_descriptions(index):
    # Get the row corresponding to the index
    row = eval_df.loc[index]

    # Extract the text and ICD codes from the row
    text = row['text']
    icd_codes = row['icd_codes'].split()


    code_list = []
    description_list = []
    # For each ICD code, print the code and its description
    for code in icd_codes:
        description = icd_code_descriptions.get(code, "No description available")
        code_list.append(code)
        description_list.append(description)

    return code_list, description_list

def print_code_descriptions(index):
    # Get the row corresponding to the index
    row = eval_df.loc[index]

    # Extract the text and ICD codes from the row
    text = row['text']
    icd_codes = row['icd_codes'].split()

    #print("Text:", text, "\n")

    
    code_list = []
    description_list = []
    # For each ICD code, print the code and its description
    for code in icd_codes:
        description = icd_code_descriptions.get(code, "No description available")
        code_list.append(code)
        description_list.append(description)
        
    print("Code:", code_list)
    print("Description:", description_list)

In [134]:
import pandas as pd

# Empty dictionary to store the mapping
results = {'Original Codes': [], 'Predicted Codes': []}

for i in range(5):
    note = sampled_df[['text']].iloc[i][0]
    inference = call_model(prompt + note)

    # Append original codes and descriptions to the results
    original_codes, _ = get_code_descriptions(i)
    results['Original Codes'].append(original_codes)

    # Predicted codes are inferred from the model and need to be processed to match the format of original codes
    predicted_codes = inference.replace(",", " ").split()

    # Check each predicted code against the list of legit codes before appending
    # Removing periods from predicted codes only for lookup
    legit_predicted_codes = [code for code in predicted_codes if code.replace(".", "") in legit_codes]
    results['Predicted Codes'].append(legit_predicted_codes)

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

In [135]:
results_df

Unnamed: 0,Original Codes,Predicted Codes
0,"[G3183, F0280, R441, R296, E785, Z8546]","[I25.10, I50.32, Z99.2, Z87.891, Z85.46, Z82.49, Z90.49, Z79.4, Z79.899, Z96.651, Z94.0, Z79.01]"
1,"[C675, I10, D259, Z87891, E785, E890]","[I44.1, I48.0, I25.9, I42.9, F51.01, Z85.048, Z90.49, Z87.01, Z91.83, Z79.01, Z79.899, Z13.88, Z68.30, Z98.61, R53.81]"
2,"[J441, N179, Z9981, I4891, D649, I10, E785, G5622, I2510, M1990, Z96649, Z87891, J45909, F419, G4700, R040, I739]","[I10, E78.5, Z85.038, Z92.0, Z87.891, Z96.651, Z94.1, Z85.49, Z01.810]"
3,"[K31811, B1910, S0990XA, G629, D62, F1120, I452, I6523, G40909, I951, F319, Q2733, I10, W01198A, Y92008, I701, M5416, E039, E785, J449, K219, Z86718, Z87891, K2270, R110, T402X5A, Y929, I739, I69398, R531, R42, N3090, R079, I459, K5900]","[O10.13, O82, I10, Z87.891]"
4,"[T8453XA, D62, N179, D709, B9562, D696, I10, E785, I2510, E860, H409, B9689, N400, Z951, Z8673, Z96652, Z954, Y792, Y929]","[Z32.01, Z86.79, J45.909, Z3A.32, Z3A.09.]"


In [125]:
# Get the simplified codes
original_codes_simplified = results_df['Original Codes'].apply(lambda codes: [code[:3] for code in codes])
predicted_codes_simplified = results_df['Predicted Codes'].apply(lambda codes: [code[:3] for code in codes])

# Create binary representations for each code
mlb = MultiLabelBinarizer()

# Get all unique codes in the dataset
all_codes = set()
for codes in original_codes_simplified:
    all_codes.update(codes)
for codes in predicted_codes_simplified:
    all_codes.update(codes)

#print(all_codes)
# Fit the binarizer on all unique codes
mlb.fit([all_codes])

# Now transform your labels
print(original_codes_simplified, predicted_codes_simplified)
y_true = []
for line in original_codes_simplified:
    y_true.append(line)
#print(y_true)
y_true = mlb.transform(y_true)
y_pred = mlb.transform(predicted_codes_simplified)

#print(y_true)


# Calculate metrics
micro_auc = roc_auc_score(y_true, y_pred, average='micro')
auc_scores = []
for class_index in range(y_true.shape[1]):
    try:
        class_auc = roc_auc_score(y_true[:, class_index], y_pred[:, class_index])
        auc_scores.append(class_auc)
    except ValueError:
        auc_scores.append(0.5)  # means it's as good as random for that class instance
macro_auc = np.mean(auc_scores)
micro_f1 = f1_score(y_true, y_pred, average='micro')
macro_f1 = f1_score(y_true, y_pred, average='macro')

# Convert y_true back to the set representation
y_true_sets = [set(codes) for codes in mlb.inverse_transform(y_true)]
y_pred_sets = [set(codes) for codes in mlb.inverse_transform(y_pred)]

# Calculate precision at 5
precision_at_5 = precision_at_k(y_true_sets, y_pred_sets, 5)

# Print the results
print('Micro AUC:', micro_auc)
print('Macro AUC:', macro_auc)
print('Micro F1:', micro_f1)
print('Macro F1:', macro_f1)
print('Precision P@5:', precision_at_5)

0                       [G31, F02, R44, R29, E78, Z85]
1                       [C67, I10, D25, Z87, E78, E89]
2    [J44, N17, Z99, I48, D64, I10, E78, G56, I25, ...
3    [K31, B19, S09, G62, D62, F11, I45, I65, G40, ...
4    [T84, D62, N17, D70, B95, D69, I10, E78, I25, ...
Name: Original Codes, dtype: object 0    [I25, N18, K57, I10, K21, M10, M17, C61, H26, ...
1    [I47, I48, I25, Z95, Z87, G47, Z48, Z13, E87, ...
2             [D50, E78, I21, I48, I70, Z12, Z85, Z87]
3                  [O10, O82, R03, Z87, Z71, Z79, N18]
4                  [Z34, J45, Z87, Z01, Z11, Z11, Z04]
Name: Predicted Codes, dtype: object
Micro AUC: 0.4654605263157895
Macro AUC: 0.4809782608695653
Micro F1: 0.07407407407407407
Macro F1: 0.019021739130434784
Precision P@5: 0.0


In [121]:
column_sums = np.sum(y_true, axis=0)
if np.any(column_sums == len(y_true)) or np.any(column_sums == 0):
    print('There is a class with only one type of instance.')
else:
    print('Every class has at least one positive and one negative instance.')

There is a class with only one type of instance.


In [129]:
pd.set_option('display.max_colwidth', None)
pd.DataFrame(original_codes_simplified)

Unnamed: 0,Original Codes
0,"[G31, F02, R44, R29, E78, Z85]"
1,"[C67, I10, D25, Z87, E78, E89]"
2,"[J44, N17, Z99, I48, D64, I10, E78, G56, I25, M19, Z96, Z87, J45, F41, G47, R04, I73]"
3,"[K31, B19, S09, G62, D62, F11, I45, I65, G40, I95, F31, Q27, I10, W01, Y92, I70, M54, E03, E78, J44, K21, Z86, Z87, K22, R11, T40, Y92, I73, I69, R53, R42, N30, R07, I45, K59]"
4,"[T84, D62, N17, D70, B95, D69, I10, E78, I25, E86, H40, B96, N40, Z95, Z86, Z96, Z95, Y79, Y92]"


In [130]:
pd.DataFrame(predicted_codes_simplified)

Unnamed: 0,Predicted Codes
0,"[I25, N18, K57, I10, K21, M10, M17, C61, H26, Z87, Z96, Z12, Z82, Z96, Z96, Z79, Z89, Z99, Z91]"
1,"[I47, I48, I25, Z95, Z87, G47, Z48, Z13, E87, R55, F41, Z76, Z71, Z68, Z79, Z88, Z95, Z85]"
2,"[D50, E78, I21, I48, I70, Z12, Z85, Z87]"
3,"[O10, O82, R03, Z87, Z71, Z79, N18]"
4,"[Z34, J45, Z87, Z01, Z11, Z11, Z04]"


In [20]:
# input an index, get MIMIC IV's code suggestions and the original clinical text
print_code_descriptions(0)

Code: ['G3183', 'F0280', 'R441', 'R296', 'E785', 'Z8546']
Description: ['Dementia with Lewy bodies', 'Dementia in other diseases classified elsewhere without behavioral disturbance', 'Visual hallucinations', 'Repeated falls', 'Hyperlipidemia, unspecified', 'Personal history of malignant neoplasm of prostate']


## GPT-4 zero-shot

In [139]:
import openai
openai.api_key = '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'

def call_model(prompt):
    completion = openai.ChatCompletion.create(
        model="gpt-4", 
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_tokens=200,           # Maximum number of tokens in the generated text. If the generated response exceeds this limit, it will be cut off.
        temperature=1,          # Controls randomness. Higher values (closer to 1) make output more random, lower values (closer to 0) make it more deterministic.
        top_p=1,                  # Sets the nucleus sampling value, controls diversity via probability threshold, can be used as an alternative to temperature.
        frequency_penalty=0.0,    # Penalizes new tokens based on their frequency in the model's training data. Ranges from -2.0 to 2.0.
        presence_penalty=0.0,     # Penalizes new tokens based on whether they appear in the context. Ranges from -2.0 to 2.0.
        n=1,                      # The number of completions to generate. More completions means more diversity, but at a higher computational cost.
        stream=False,             # If true, generate the response as a stream to reduce latency.
        stop=None,                # A sequence (or list of sequences) where the API will stop generating further tokens.
    )
    return completion.choices[0].message.content

prompt ='You are an experienced medical coder. You must identify all correct ICD-10 codes for the following clinical note. Pay attention to areas describing present illness, chart review, imaging, discharge labs, active issues, medications, chief complaint, major surgery/procedure, etc. Return your answer in the following format, but note that the actual correct codes may vary greatly from these: I10, E78.5, Z87.891\n'

print(prompt)

def get_code_descriptions(index):
    # Get the row corresponding to the index
    row = eval_df.loc[index]

    # Extract the text and ICD codes from the row
    text = row['text']
    icd_codes = row['icd_codes'].split()


    code_list = []
    description_list = []
    # For each ICD code, print the code and its description
    for code in icd_codes:
        description = icd_code_descriptions.get(code, "No description available")
        code_list.append(code)
        description_list.append(description)

    return code_list, description_list

import pandas as pd

# Empty dictionary to store the mapping
results = {'Original Codes': [], 'Predicted Codes': []}

for i in range(5):
    note = sampled_df[['text']].iloc[i][0]
    inference = call_model(prompt + note)

    # Append original codes and descriptions to the results
    original_codes, _ = get_code_descriptions(i)
    results['Original Codes'].append(original_codes)

    # Predicted codes are inferred from the model and need to be processed to match the format of original codes
    predicted_codes = inference.replace(",", " ").split()

    # Check each predicted code against the list of legit codes before appending
    # Removing periods from predicted codes only for lookup
    legit_predicted_codes = [code for code in predicted_codes if code.replace(".", "") in legit_codes]
    results['Predicted Codes'].append(legit_predicted_codes)

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

You are an experienced medical coder. You must identify all correct ICD-10 codes for the following clinical note. Pay attention to areas describing present illness, chart review, imaging, discharge labs, active issues, medications, chief complaint, major surgery/procedure, etc. Return your answer in the following format, but note that the actual correct codes may vary greatly from these: I10, E78.5, Z87.891



In [141]:
results_df

Unnamed: 0,Original Codes,Predicted Codes
0,"[G3183, F0280, R441, R296, E785, Z8546]","[I25.10, N18.9, K20.9, I10, K44.9, K21.9, M10.9, E78.5, M19.90, C61, Z87.891, Z94.1, Z95.1, Z95.5, Z96.649, Z98.890]"
1,"[C675, I10, D259, Z87891, E785, E890]","[I48.91, I25.10, I21.9, Z95.1, I25.5, I50.23, Z95.810, I47.2, I47.9, G47.33, Z99.89]"
2,"[J441, N179, Z9981, I4891, D649, I10, E785, G5622, I2510, M1990, Z96649, Z87891, J45909, F419, G4700, R040, I739]","[D62, K55.21, D50.9, I73.9, I65.23, I48.91]"
3,"[K31811, B1910, S0990XA, G629, D62, F1120, I452, I6523, G40909, I951, F319, Q2733, I10, W01198A, Y92008, I701, M5416, E039, E785, J449, K219, Z86718, Z87891, K2270, R110, T402X5A, Y929, I739, I69398, R531, R42, N3090, R079, I459, K5900]","[O10.93, O82, Z37.0, Z3A.37, Z87.59, I10]"
4,"[T8453XA, D62, N179, D709, B9562, D696, I10, E785, I2510, E860, H409, B9689, N400, Z951, Z8673, Z96652, Z954, Y792, Y929]","[Z33.1, A54.9, J45.909, Z3A.32, O60.03, O76, O99.320, A49.02, J45.20, B37.89, Z79.84]"


In [140]:
# Get the simplified codes
original_codes_simplified = results_df['Original Codes'].apply(lambda codes: [code[:3] for code in codes])
predicted_codes_simplified = results_df['Predicted Codes'].apply(lambda codes: [code[:3] for code in codes])

# Create binary representations for each code
mlb = MultiLabelBinarizer()

# Get all unique codes in the dataset
all_codes = set()
for codes in original_codes_simplified:
    all_codes.update(codes)
for codes in predicted_codes_simplified:
    all_codes.update(codes)

#print(all_codes)
# Fit the binarizer on all unique codes
mlb.fit([all_codes])

# Now transform your labels
print(original_codes_simplified, predicted_codes_simplified)
y_true = []
for line in original_codes_simplified:
    y_true.append(line)
#print(y_true)
y_true = mlb.transform(y_true)
y_pred = mlb.transform(predicted_codes_simplified)

#print(y_true)


# Calculate metrics
micro_auc = roc_auc_score(y_true, y_pred, average='micro')
auc_scores = []
for class_index in range(y_true.shape[1]):
    try:
        class_auc = roc_auc_score(y_true[:, class_index], y_pred[:, class_index])
        auc_scores.append(class_auc)
    except ValueError:
        auc_scores.append(0.5)  # means it's as good as random for that class instance
macro_auc = np.mean(auc_scores)
micro_f1 = f1_score(y_true, y_pred, average='micro')
macro_f1 = f1_score(y_true, y_pred, average='macro')

# Convert y_true back to the set representation
y_true_sets = [set(codes) for codes in mlb.inverse_transform(y_true)]
y_pred_sets = [set(codes) for codes in mlb.inverse_transform(y_pred)]

# Calculate precision at 5
precision_at_5 = precision_at_k(y_true_sets, y_pred_sets, 5)

# Print the results
print('Micro AUC:', micro_auc)
print('Macro AUC:', macro_auc)
print('Micro F1:', micro_f1)
print('Macro F1:', macro_f1)
print('Precision P@5:', precision_at_5)

0                                                                                                                                                     [G31, F02, R44, R29, E78, Z85]
1                                                                                                                                                     [C67, I10, D25, Z87, E78, E89]
2                                                                                              [J44, N17, Z99, I48, D64, I10, E78, G56, I25, M19, Z96, Z87, J45, F41, G47, R04, I73]
3    [K31, B19, S09, G62, D62, F11, I45, I65, G40, I95, F31, Q27, I10, W01, Y92, I70, M54, E03, E78, J44, K21, Z86, Z87, K22, R11, T40, Y92, I73, I69, R53, R42, N30, R07, I45, K59]
4                                                                                    [T84, D62, N17, D70, B95, D69, I10, E78, I25, E86, H40, B96, N40, Z95, Z86, Z96, Z95, Y79, Y92]
Name: Original Codes, dtype: object 0    [I25, N18, K20, I10, K44, K21, M10, E78, M19, C61, Z87

In [147]:
def print_code_descriptions(results_df, index):
    # Get the row corresponding to the index
    row = results_df.loc[index]

    # Extract the original and predicted ICD codes from the row
    original_codes = row['Original Codes']
    predicted_codes = row['Predicted Codes']

    # For each ICD code in original and predicted codes, print the code and its description
    for code_list, label in zip([original_codes, predicted_codes], ['Original', 'Predicted']):
        print(f"{label} Codes and Descriptions:")
        for code in code_list:
            description = icd_code_descriptions.get(code.replace(".", ""), "No description available")
            print("Code:", code)
            print("Description:", description)
        print("\n")

print_code_descriptions(results_df, 1)

Original Codes and Descriptions:
Code: C675
Description: Malignant neoplasm of bladder neck
Code: I10
Description: Essential (primary) hypertension
Code: D259
Description: Leiomyoma of uterus, unspecified
Code: Z87891
Description: Personal history of nicotine dependence
Code: E785
Description: Hyperlipidemia, unspecified
Code: E890
Description: Postprocedural hypothyroidism


Predicted Codes and Descriptions:
Code: I48.91
Description: Unspecified atrial fibrillation
Code: I25.10
Description: Atherosclerotic heart disease of native coronary artery without angina pectoris
Code: I21.9
Description: Acute myocardial infarction, unspecified
Code: Z95.1
Description: Presence of aortocoronary bypass graft
Code: I25.5
Description: Ischemic cardiomyopathy
Code: I50.23
Description: Acute on chronic systolic (congestive) heart failure
Code: Z95.810
Description: Presence of automatic (implantable) cardiac defibrillator
Code: I47.2
Description: Ventricular tachycardia
Code: I47.9
Description: Paroxy