In [1]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns

from matplotlib import pyplot as plt
from medcat.cat import CAT

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load model pack and Create CAT - the main class from medcat used for concept annotation

model_pack_path = 'medcat/mc_modelpack_snomed_int_16_mar_2022_25be3857ba34bdd5'
cat = CAT.load_model_pack(model_pack_path)

Found an existing unziped model pack at: medcat\mc_modelpack_snomed_int_16_mar_2022_25be3857ba34bdd5, the provided zip will not be touched.
{
  "Model ID": "25be3857ba34bdd5",
  "Last Modifed On": "16 March 2022",
  "History (from least to most recent)": [
    "a474096eb4566638",
    "009617d7ff372682"
  ],
  "Description": "SNOMED INT enriched with UMLS and trained unsupervised on MIMIC-III",
  "Source Ontology": "SnomedCT_InternationalRF2_PRODUCTION_20210131T120000Z",
  "Location": "MedCAT Rosalind machine, available for public download from https://github.com/CogStack/MedCAT",
  "MetaCAT models": {
    "Status": "Detects is a concept Affirmed or Negated/Hypothetical"
  },
  "Basic CDB Stats": {
    "Number of concepts": 354448,
    "Number of names": 2049216,
    "Number of concepts that received training": 29674,
    "Number of seen training examples in total": 20585988,
    "Average training examples per concept": 693.7382220125362
  },
  "Performance": {
    "ner": {},
    "meta"

In [3]:
test_notes = pd.read_csv('MIMIC_IV_clean_test_set_3digitcode.csv')
test_notes

Unnamed: 0.1,Unnamed: 0,hadm_id,text,subject_id,seq_num,icd_code,icd_version
0,0,20000024,sex f medicine allergies aspirin chief complai...,16925328.0,1.0,D50.,10.0
1,1,20000024,sex f medicine allergies aspirin chief complai...,16925328.0,2.0,K52.,10.0
2,2,20000024,sex f medicine allergies aspirin chief complai...,16925328.0,3.0,I10.,10.0
3,3,20000024,sex f medicine allergies aspirin chief complai...,16925328.0,4.0,E53.,10.0
4,4,20000024,sex f medicine allergies aspirin chief complai...,16925328.0,5.0,M81.,10.0
...,...,...,...,...,...,...,...
245122,245122,29997858,sex m medicine allergies no known allergies ad...,17657693.0,2.0,F12.,10.0
245123,245123,29997858,sex m medicine allergies no known allergies ad...,17657693.0,3.0,F17.,10.0
245124,245124,29997858,sex m medicine allergies no known allergies ad...,17657693.0,4.0,Z79.,10.0
245125,245125,29997858,sex m medicine allergies no known allergies ad...,17657693.0,5.0,F41.,10.0


In [4]:
test_notes[test_notes.hadm_id == 20002800].icd_code.unique()

array(['J93.', 'T85.', 'Y83.', 'Y92.', 'J96.', 'D72.', 'R33.', 'E10.',
       'L98.', 'Z95.', 'Z89.', 'J45.', 'I11.', 'I50.', 'M86.', 'B95.',
       'M46.', 'F32.', 'G47.', 'G25.', 'N83.', 'I48.', 'Z79.', 'F41.',
       'I25.'], dtype=object)

In [5]:
test_notes.drop(columns = ['Unnamed: 0'], inplace = True)

In [6]:
unique_hadm_notes = test_notes[['hadm_id', 'text']].drop_duplicates(subset='hadm_id')


In [7]:
unique_hadm_notes

Unnamed: 0,hadm_id,text
0,20000024,sex f medicine allergies aspirin chief complai...
10,20002636,sex m medicine allergies prevacid ciprofloxaci...
27,20002800,sex f medicine allergies penicillins tetanus t...
52,20003013,sex m neurology allergies no known allergies a...
64,20003425,sex m otolaryngology allergies no known allerg...
...,...,...
245061,29995981,sex m surgery allergies no known allergies adv...
245065,29996421,no sex m medicine allergies penicillins chief ...
245086,29996653,sex f medicine allergies erythromycin base ato...
245115,29997523,sex f surgery allergies no known allergies adv...


In [8]:
unique_hadm_notes = unique_hadm_notes.reset_index(drop = True)

In [28]:
icd10_data = {
    'hadm_id': [],
    'icd10_codes': []
}



In [29]:
i = 0
for index, row in unique_hadm_notes.iterrows():
    i +=1
    
    if i%100 ==0:
        print(f'{i} notes have been processed!', '\n')
        
        
    single_note = row['text']
    hadm_id = row['hadm_id']

    # Get the entities from the text
    all_entities = cat.get_entities(single_note)

    icd_codes = set()  # Using a set to automatically remove duplicates

    for entity_id, entity_data in all_entities['entities'].items():
        icd10 = entity_data.get('icd10', [])
        for code in icd10:
            if code:  # Check if the code is not an empty string
                icd_codes.add(code)

    icd10_data['hadm_id'].append(hadm_id)
    icd10_data['icd10_codes'].append(list(icd_codes))



100 notes have been processed! 

200 notes have been processed! 

300 notes have been processed! 

400 notes have been processed! 

500 notes have been processed! 

600 notes have been processed! 

700 notes have been processed! 

800 notes have been processed! 

900 notes have been processed! 

1000 notes have been processed! 

1100 notes have been processed! 

1200 notes have been processed! 

1300 notes have been processed! 

1400 notes have been processed! 

1500 notes have been processed! 

1600 notes have been processed! 

1700 notes have been processed! 

1800 notes have been processed! 

1900 notes have been processed! 

2000 notes have been processed! 

2100 notes have been processed! 

2200 notes have been processed! 

2300 notes have been processed! 

2400 notes have been processed! 

2500 notes have been processed! 

2600 notes have been processed! 

2700 notes have been processed! 

2800 notes have been processed! 

2900 notes have been processed! 

3000 notes have been pr

In [30]:
# Create a DataFrame from the processed data
df_icd10 = pd.DataFrame(icd10_data)
df_icd10

Unnamed: 0,hadm_id,icd10_codes
0,20000024,"[R58, H40.9, K66.1, R29.6, K59.0, K52.9, H35.3..."
1,20002636,"[H15.9, R07.4, I95.8, R06.0, R57.9, R62.8, M10..."
2,20002800,"[F44.8, J98.8, E87.7, F41.9, J96.99, Z88.0, R0..."
3,20003013,"[I63.9, R25.1, R50.8, D44.3, R51, K59.0, H57.8..."
4,20003425,"[T88.7, T81.9, C80.9, B99, R52.9, C32.9, R53, ..."
...,...,...
18340,29995981,"[I10, R58, R50.8, K13.7, R20.0, K59.0, K52.9, ..."
18341,29996421,"[R45.2, I10, E14.9, F44.8, T14.9, E11.6, Z88.0..."
18342,29996653,"[E14.9, T14.9, R07.4, R06.0, K59.0, H55, R05, ..."
18343,29997523,"[R58, R20.0, K59.0, K91.3, R52.9, J18.9, T79.3..."


In [32]:
def reformat(code):
    code = ''.join(code.split('.'))
    code = code[:3] + '.'
    return code



In [33]:

# Apply the reformat function to each element in the 'icd10_codes' column
df_icd10['icd10_codes'] = df_icd10['icd10_codes'].apply(lambda codes: [reformat(code) for code in codes])
df_icd10

Unnamed: 0,hadm_id,icd10_codes
0,20000024,"[R58., H40., K66., R29., K59., K52., H35., K44..."
1,20002636,"[H15., R07., I95., R06., R57., R62., M10., R00..."
2,20002800,"[F44., J98., E87., F41., J96., Z88., R06., M86..."
3,20003013,"[I63., R25., R50., D44., R51., K59., H57., R05..."
4,20003425,"[T88., T81., C80., B99., R52., C32., R53., R40..."
...,...,...
18340,29995981,"[I10., R58., R50., K13., R20., K59., K52., K92..."
18341,29996421,"[R45., I10., E14., F44., T14., E11., Z88., R06..."
18342,29996653,"[E14., T14., R07., R06., K59., H55., R05., D63..."
18343,29997523,"[R58., R20., K59., K91., R52., J18., T79., C18..."


In [36]:
# Define a function to remove duplicates within a list
def remove_duplicates(codes):
    seen = set()
    result = []
    for code in codes:
        if code not in seen:
            result.append(code)
            seen.add(code)
    return result

In [37]:

# Apply the remove_duplicates function to the 'icd10_codes' column for each 'hadm_id'
df_icd10['icd10_codes'] = df_icd10['icd10_codes'].apply(remove_duplicates)
df_icd10

Unnamed: 0,hadm_id,icd10_codes
0,20000024,"[R58., H40., K66., R29., K59., K52., H35., K44..."
1,20002636,"[H15., R07., I95., R06., R57., R62., M10., R00..."
2,20002800,"[F44., J98., E87., F41., J96., Z88., R06., M86..."
3,20003013,"[I63., R25., R50., D44., R51., K59., H57., R05..."
4,20003425,"[T88., T81., C80., B99., R52., C32., R53., R40..."
...,...,...
18340,29995981,"[I10., R58., R50., K13., R20., K59., K52., K92..."
18341,29996421,"[R45., I10., E14., F44., T14., E11., Z88., R06..."
18342,29996653,"[E14., T14., R07., R06., K59., H55., R05., D63..."
18343,29997523,"[R58., R20., K59., K91., R52., J18., T79., C18..."


In [38]:
df_icd10.to_csv('test_set_mapped_icd_hadmids_18k_3digits.csv')

unique_hadm_notes.to_csv('test_set_unique_notes_hadmids_18k_3digits.csv')


In [39]:
df_icd10.dtypes

hadm_id         int64
icd10_codes    object
dtype: object

In [40]:
np.array(df_icd10[df_icd10.hadm_id==20000024].icd10_codes[0])

array(['R58.', 'H40.', 'K66.', 'R29.', 'K59.', 'K52.', 'H35.', 'K44.',
       'M19.', 'K40.', 'R52.', 'R27.', 'M81.', 'Z88.', 'D64.', 'R60.',
       'R19.', 'R32.', 'J30.', 'S32.', 'K57.', 'R20.', 'R23.', 'C22.',
       'N83.', 'A09.', 'C34.', 'K58.', 'C50.', 'R01.', 'R53.', 'Z82.',
       'Q89.', 'R71.', 'T14.', 'R09.', 'R06.'], dtype='<U4')

In [41]:
test_notes[test_notes.hadm_id == 20000024].icd_code.unique().astype('str')

array(['D50.', 'K52.', 'I10.', 'E53.', 'M81.', 'R27.', 'Z91.', 'H54.',
       'T47.', 'Y92.'], dtype='<U4')

In [42]:
df_icd10.drop_duplicates(subset = ['hadm_id'], inplace = True)

In [43]:
test_notes

Unnamed: 0,hadm_id,text,subject_id,seq_num,icd_code,icd_version
0,20000024,sex f medicine allergies aspirin chief complai...,16925328.0,1.0,D50.,10.0
1,20000024,sex f medicine allergies aspirin chief complai...,16925328.0,2.0,K52.,10.0
2,20000024,sex f medicine allergies aspirin chief complai...,16925328.0,3.0,I10.,10.0
3,20000024,sex f medicine allergies aspirin chief complai...,16925328.0,4.0,E53.,10.0
4,20000024,sex f medicine allergies aspirin chief complai...,16925328.0,5.0,M81.,10.0
...,...,...,...,...,...,...
245122,29997858,sex m medicine allergies no known allergies ad...,17657693.0,2.0,F12.,10.0
245123,29997858,sex m medicine allergies no known allergies ad...,17657693.0,3.0,F17.,10.0
245124,29997858,sex m medicine allergies no known allergies ad...,17657693.0,4.0,Z79.,10.0
245125,29997858,sex m medicine allergies no known allergies ad...,17657693.0,5.0,F41.,10.0


In [44]:

unique_icd_codes = set(test_notes['icd_code']) #.union(set(icd_code for icd_list in df_icd10['icd10_codes'] for icd_code in icd_list))



In [45]:
len(unique_icd_codes)

1402

In [46]:
test_notes.icd_code.nunique()

1402

In [47]:
# Step 3: Initialize TP, FP, FN, TN counts
counts = {icd_code: (0, 0, 0, 0) for icd_code in unique_icd_codes}



In [48]:
i = 0
for icd_code in unique_icd_codes:
    i+=1
    if i%100 == 0:
        print(f'{i} codes are done!')
    
    true_icd_codes = test_notes[test_notes['icd_code'] == icd_code]['hadm_id'].values
    predicted_icd_codes = df_icd10[df_icd10['icd10_codes'].apply(lambda x: icd_code in x)]['hadm_id'].values

    TP = np.intersect1d(true_icd_codes, predicted_icd_codes)
    FP = np.setdiff1d(predicted_icd_codes, true_icd_codes)
    FN = np.setdiff1d(true_icd_codes, predicted_icd_codes)
    TN = len(np.union1d(predicted_icd_codes, true_icd_codes)) -len(TP) -len(FP) -len(FN)


    counts[icd_code] = (len(TP), len(FP), len(FN), TN)

100 codes are done!
200 codes are done!
300 codes are done!
400 codes are done!
500 codes are done!
600 codes are done!
700 codes are done!
800 codes are done!
900 codes are done!
1000 codes are done!
1100 codes are done!
1200 codes are done!
1300 codes are done!
1400 codes are done!


In [49]:
counts

{'M89.': (11, 273, 34, 0),
 'G03.': (23, 242, 1, 0),
 'H83.': (4, 13, 1, 0),
 'O75.': (0, 0, 22, 0),
 'M43.': (45, 339, 25, 0),
 'C71.': (11, 8, 64, 0),
 'D45.': (0, 0, 26, 0),
 'L22.': (0, 2, 1, 0),
 'A56.': (0, 0, 2, 0),
 'D55.': (0, 0, 22, 0),
 'R89.': (0, 52, 7, 0),
 'M75.': (19, 107, 25, 0),
 'B20.': (0, 0, 121, 0),
 'Z75.': (0, 0, 14, 0),
 'V47.': (0, 0, 24, 0),
 'O86.': (0, 0, 25, 0),
 'B95.': (55, 119, 678, 0),
 'L71.': (17, 75, 5, 0),
 'D61.': (325, 181, 92, 0),
 'C64.': (60, 228, 33, 0),
 'R81.': (2, 67, 2, 0),
 'G93.': (526, 1210, 313, 0),
 'C01.': (0, 0, 4, 0),
 'A31.': (5, 46, 3, 0),
 'N93.': (45, 485, 7, 0),
 'B46.': (2, 54, 0, 0),
 'C08.': (0, 2, 1, 0),
 'K62.': (70, 301, 80, 0),
 'T47.': (0, 0, 39, 0),
 'B99.': (7, 8201, 1, 0),
 'R19.': (467, 11509, 165, 0),
 'O44.': (7, 16, 3, 0),
 'S76.': (3, 2, 9, 0),
 'S23.': (0, 0, 2, 0),
 'Q04.': (4, 83, 8, 0),
 'C13.': (0, 0, 3, 0),
 'A87.': (9, 26, 0, 0),
 'Z33.': (1, 222, 3, 0),
 'F53.': (0, 15, 1, 0),
 'F02.': (2, 3, 292, 0),


In [50]:



# Calculate Precision and Recall for Each ICD-10 Code
precisions = {}
recalls = {}
for icd_code in unique_icd_codes:
    TP, FP, FN, TN = counts[icd_code]
    
    # Handle division by zero
    if TP + FP == 0:
        precision = 0
    else:
        precision = TP / (TP + FP)

    if TP + FN == 0:
        recall = 0
    else:
        recall = TP / (TP + FN)
    
    precisions[icd_code] = precision
    recalls[icd_code] = recall


In [51]:
precisions

{'M89.': 0.03873239436619718,
 'G03.': 0.08679245283018867,
 'H83.': 0.23529411764705882,
 'O75.': 0,
 'M43.': 0.1171875,
 'C71.': 0.5789473684210527,
 'D45.': 0,
 'L22.': 0.0,
 'A56.': 0,
 'D55.': 0,
 'R89.': 0.0,
 'M75.': 0.15079365079365079,
 'B20.': 0,
 'Z75.': 0,
 'V47.': 0,
 'O86.': 0,
 'B95.': 0.3160919540229885,
 'L71.': 0.18478260869565216,
 'D61.': 0.642292490118577,
 'C64.': 0.20833333333333334,
 'R81.': 0.028985507246376812,
 'G93.': 0.3029953917050691,
 'C01.': 0,
 'A31.': 0.09803921568627451,
 'N93.': 0.08490566037735849,
 'B46.': 0.03571428571428571,
 'C08.': 0.0,
 'K62.': 0.18867924528301888,
 'T47.': 0,
 'B99.': 0.0008528265107212475,
 'R19.': 0.038994655978623914,
 'O44.': 0.30434782608695654,
 'S76.': 0.6,
 'S23.': 0,
 'Q04.': 0.04597701149425287,
 'C13.': 0,
 'A87.': 0.2571428571428571,
 'Z33.': 0.004484304932735426,
 'F53.': 0.0,
 'F02.': 0.4,
 'E09.': 0,
 'C41.': 0.0,
 'S25.': 0.07692307692307693,
 'G13.': 0.0,
 'D09.': 0.07692307692307693,
 'S41.': 0.0,
 'G05.': 

In [52]:
# Step 6: Calculate Macro Precision and Macro Recall
macro_precision = sum(precisions.values()) / len(precisions)
macro_recall = sum(recalls.values()) / len(recalls)


In [53]:
macro_precision

0.19464000821596777

In [54]:
macro_recall

0.3811432146132433

In [55]:
# Calculate F1 Score for Each ICD-10 Code
f1_scores = {}
for icd_code in unique_icd_codes:
    precision = precisions[icd_code]
    recall = recalls[icd_code]

    # Handle division by zero for precision and recall
    if precision + recall == 0:
        f1_score = 0
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)

    f1_scores[icd_code] = f1_score



In [56]:
# Step 8: Calculate Macro F1 Score
macro_f1_score = sum(f1_scores.values()) / len(f1_scores)

# Now you have macro precision, macro recall, and macro F1 score for your multi-label classification evaluation.

In [57]:
macro_f1_score

0.1998874173030856

In [58]:
# Calculate Precision and Recall for Each ICD-10 Code
precisions = {}
recalls = {}
for icd_code in unique_icd_codes:
    TP, FP, FN, TN = counts[icd_code]
    
    # Handle division by zero
    if TP + FP == 0:
        precision = 0
    else:
        precision = TP / (TP + FP)

    if TP + FN == 0:
        recall = 0
    else:
        recall = TP / (TP + FN)
    
    precisions[icd_code] = precision
    recalls[icd_code] = recall

# Filter out zero precision values and calculate average precision
average_precision = np.mean([precision for precision in precisions.values() if precision > 0])

In [59]:
average_precision

0.2807461846901098

In [60]:
average_recall = np.mean([recall for recall in recalls.values() if recall > 0])

In [61]:
average_recall

0.5497559535882379

In [62]:
# Calculate F1 Score for Each ICD-10 Code
f1_scores = {}
for icd_code in unique_icd_codes:
    precision = precisions[icd_code]
    recall = recalls[icd_code]

    # Handle division by zero for precision and recall
    if precision + recall == 0:
        f1_score = 0
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)

    f1_scores[icd_code] = f1_score


In [63]:
average_f1 = np.mean([f1_score for f1_score in f1_scores.values() if f1_score > 0])

In [64]:
average_f1

0.288314978455685

In [65]:
len([f1_score for f1_score in f1_scores.values() if f1_score > 0])

972

#### Top 50 ICD codes identification

In [66]:
top_50_icd_codes = test_notes['icd_code'].value_counts().index[:50]
top_50_icd_codes

Index(['E78.', 'I10.', 'Z79.', 'Z87.', 'Y92.', 'E11.', 'K21.', 'I25.', 'E87.',
       'Z68.', 'F32.', 'I50.', 'N18.', 'I48.', 'N17.', 'Z86.', 'F41.', 'Z85.',
       'G47.', 'Z95.', 'E66.', 'E03.', 'D64.', 'F17.', 'J44.', 'I12.', 'J45.',
       'Y83.', 'G89.', 'I95.', 'Z66.', 'J96.', 'N39.', 'D62.', 'D69.', 'N40.',
       'K59.', 'E86.', 'Z91.', 'F10.', 'I13.', 'Z99.', 'B96.', 'Z96.', 'D63.',
       'I11.', 'M54.', 'D72.', 'A41.', 'E83.'],
      dtype='object')

In [67]:
filtered_test_notes = test_notes[test_notes['icd_code'].isin(top_50_icd_codes)]

filtered_test_notes

Unnamed: 0,hadm_id,text,subject_id,seq_num,icd_code,icd_version
2,20000024,sex f medicine allergies aspirin chief complai...,16925328.0,3.0,I10.,10.0
6,20000024,sex f medicine allergies aspirin chief complai...,16925328.0,7.0,Z91.,10.0
9,20000024,sex f medicine allergies aspirin chief complai...,16925328.0,10.0,Y92.,10.0
10,20002636,sex m medicine allergies prevacid ciprofloxaci...,12527107.0,1.0,E86.,10.0
11,20002636,sex m medicine allergies prevacid ciprofloxaci...,12527107.0,2.0,N17.,10.0
...,...,...,...,...,...,...
245116,29997523,sex f surgery allergies no known allergies adv...,16012077.0,2.0,D64.,10.0
245119,29997523,sex f surgery allergies no known allergies adv...,16012077.0,5.0,M54.,10.0
245123,29997858,sex m medicine allergies no known allergies ad...,17657693.0,3.0,F17.,10.0
245124,29997858,sex m medicine allergies no known allergies ad...,17657693.0,4.0,Z79.,10.0


In [68]:
filtered_unique_hadm_notes = filtered_test_notes[['hadm_id', 'text']].drop_duplicates(subset='hadm_id')


In [69]:
filtered_unique_hadm_notes = filtered_unique_hadm_notes.reset_index(drop = True)
filtered_unique_hadm_notes

Unnamed: 0,hadm_id,text
0,20000024,sex f medicine allergies aspirin chief complai...
1,20002636,sex m medicine allergies prevacid ciprofloxaci...
2,20002800,sex f medicine allergies penicillins tetanus t...
3,20003013,sex m neurology allergies no known allergies a...
4,20003425,sex m otolaryngology allergies no known allerg...
...,...,...
17750,29995981,sex m surgery allergies no known allergies adv...
17751,29996421,no sex m medicine allergies penicillins chief ...
17752,29996653,sex f medicine allergies erythromycin base ato...
17753,29997523,sex f surgery allergies no known allergies adv...


In [70]:
filtered_icd10_data = {
    'hadm_id': [],
    'icd10_codes': []
}



In [71]:
unique_icd_codes = set(filtered_test_notes['icd_code'])

In [72]:
# Step 3: Initialize TP, FP, FN, TN counts
filtered_counts = {icd_code: (0, 0, 0, 0) for icd_code in unique_icd_codes}



In [73]:
i = 0
for icd_code in unique_icd_codes:
    i+=1
    if i%10 == 0:
        print(f'{i} codes are done!')
    
    true_icd_codes = filtered_test_notes[filtered_test_notes['icd_code'] == icd_code]['hadm_id'].values
    predicted_icd_codes = df_icd10[df_icd10['icd10_codes'].apply(lambda x: icd_code in x)]['hadm_id'].values

    TP = np.intersect1d(true_icd_codes, predicted_icd_codes)
    FP = np.setdiff1d(predicted_icd_codes, true_icd_codes)
    FN = np.setdiff1d(true_icd_codes, predicted_icd_codes)
    TN = len(np.union1d(predicted_icd_codes, true_icd_codes)) -len(TP) -len(FP) -len(FN)


    filtered_counts[icd_code] = (len(TP), len(FP), len(FN), TN)

10 codes are done!
20 codes are done!
30 codes are done!
40 codes are done!
50 codes are done!


In [74]:
filtered_counts

{'E86.': (753, 1329, 628, 0),
 'N17.': (1390, 130, 1938, 0),
 'E66.': (1738, 1003, 711, 0),
 'G89.': (0, 0, 1722, 0),
 'F17.': (290, 179, 1748, 0),
 'D69.': (74, 60, 1470, 0),
 'E78.': (4812, 848, 2605, 0),
 'N18.': (1856, 225, 1616, 0),
 'Z96.': (11, 29, 1183, 0),
 'F41.': (2551, 1838, 598, 0),
 'E11.': (1259, 608, 3489, 0),
 'Z68.': (0, 0, 3596, 0),
 'Z66.': (0, 0, 1568, 0),
 'Z79.': (0, 0, 6175, 0),
 'E83.': (550, 1442, 472, 0),
 'Y92.': (0, 0, 5281, 0),
 'M54.': (878, 2729, 194, 0),
 'I50.': (2541, 518, 979, 0),
 'I95.': (1118, 1732, 520, 0),
 'Z87.': (413, 711, 5613, 0),
 'I12.': (63, 40, 1810, 0),
 'Z91.': (327, 578, 1035, 0),
 'K59.': (1217, 8250, 168, 0),
 'I11.': (14, 36, 1142, 0),
 'Y83.': (45, 34, 1683, 0),
 'I13.': (71, 35, 1162, 0),
 'Z86.': (568, 1889, 2662, 0),
 'Z99.': (13, 2, 1199, 0),
 'J96.': (1076, 242, 479, 0),
 'D63.': (336, 168, 828, 0),
 'N40.': (364, 144, 1031, 0),
 'G47.': (1816, 2385, 1079, 0),
 'F32.': (2239, 1097, 1336, 0),
 'K21.': (3484, 696, 1009, 0),
 '

In [75]:

# Calculate Precision and Recall for Each ICD-10 Code
filtered_precisions = {}
filtered_recalls = {}
for icd_code in unique_icd_codes:
    TP, FP, FN, TN = filtered_counts[icd_code]
    
    # Handle division by zero
    if TP + FP == 0:
        precision = 0
    else:
        precision = TP / (TP + FP)

    if TP + FN == 0:
        recall = 0
    else:
        recall = TP / (TP + FN)
    
    filtered_precisions[icd_code] = precision
    filtered_recalls[icd_code] = recall

In [76]:
filtered_precisions

{'E86.': 0.361671469740634,
 'N17.': 0.9144736842105263,
 'E66.': 0.6340751550529004,
 'G89.': 0,
 'F17.': 0.6183368869936035,
 'D69.': 0.5522388059701493,
 'E78.': 0.8501766784452297,
 'N18.': 0.8918789043728976,
 'Z96.': 0.275,
 'F41.': 0.5812257917521075,
 'E11.': 0.6743438671665773,
 'Z68.': 0,
 'Z66.': 0,
 'Z79.': 0,
 'E83.': 0.2761044176706827,
 'Y92.': 0,
 'M54.': 0.24341558081508177,
 'I50.': 0.8306636155606407,
 'I95.': 0.39228070175438595,
 'Z87.': 0.36743772241992884,
 'I12.': 0.6116504854368932,
 'Z91.': 0.36132596685082874,
 'K59.': 0.12855181155593112,
 'I11.': 0.28,
 'Y83.': 0.569620253164557,
 'I13.': 0.6698113207547169,
 'Z86.': 0.23117623117623118,
 'Z99.': 0.8666666666666667,
 'J96.': 0.8163884673748103,
 'D63.': 0.6666666666666666,
 'N40.': 0.7165354330708661,
 'G47.': 0.4322780290407046,
 'F32.': 0.6711630695443646,
 'K21.': 0.8334928229665072,
 'E03.': 0.8304,
 'J45.': 0.687750556792873,
 'B96.': 0.6370370370370371,
 'D64.': 0.32331676457297787,
 'E87.': 0.6247262

In [77]:
filtered_recalls

{'E86.': 0.5452570601013759,
 'N17.': 0.4176682692307692,
 'E66.': 0.7096774193548387,
 'G89.': 0.0,
 'F17.': 0.1422963689892051,
 'D69.': 0.04792746113989637,
 'E78.': 0.6487798301199946,
 'N18.': 0.5345622119815668,
 'Z96.': 0.009212730318257957,
 'F41.': 0.8100984439504605,
 'E11.': 0.2651642796967144,
 'Z68.': 0.0,
 'Z66.': 0.0,
 'Z79.': 0.0,
 'E83.': 0.538160469667319,
 'Y92.': 0.0,
 'M54.': 0.8190298507462687,
 'I50.': 0.721875,
 'I95.': 0.6825396825396826,
 'Z87.': 0.06853634251576501,
 'I12.': 0.03363587827015483,
 'Z91.': 0.24008810572687225,
 'K59.': 0.8787003610108304,
 'I11.': 0.012110726643598616,
 'Y83.': 0.026041666666666668,
 'I13.': 0.0575831305758313,
 'Z86.': 0.17585139318885448,
 'Z99.': 0.010726072607260726,
 'J96.': 0.6919614147909968,
 'D63.': 0.28865979381443296,
 'N40.': 0.26093189964157704,
 'G47.': 0.6272884283246978,
 'F32.': 0.6262937062937063,
 'K21.': 0.7754284442466058,
 'E03.': 0.9214380825565912,
 'J45.': 0.8441771459814106,
 'B96.': 0.0711331679073614

In [78]:
# Step 6: Calculate Macro Precision and Macro Recall
macro_precision = sum(filtered_precisions.values()) / len(filtered_precisions)
macro_recall = sum(filtered_recalls.values()) / len(filtered_recalls)


In [79]:
macro_precision

0.5162037835460194

In [80]:
macro_recall

0.40007760437918205

In [81]:
# Calculate F1 Score for Each ICD-10 Code
filtered_f1_scores = {}
for icd_code in unique_icd_codes:
    precision = filtered_precisions[icd_code]
    recall = filtered_recalls[icd_code]

    # Handle division by zero for precision and recall
    if precision + recall == 0:
        f1_score = 0
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)

    filtered_f1_scores[icd_code] = f1_score



In [82]:
# Step 8: Calculate Macro F1 Score
macro_f1_score = sum(filtered_f1_scores.values()) / len(filtered_f1_scores)

# Now you have macro precision, macro recall, and macro F1 score for your multi-label classification evaluation.

In [83]:
macro_f1_score

0.3736513159833468

In [84]:
# Calculate Precision and Recall for Each ICD-10 Code
filtered_precisions = {}
filtered_recalls = {}
for icd_code in unique_icd_codes:
    TP, FP, FN, TN = filtered_counts[icd_code]
    
    # Handle division by zero
    if TP + FP == 0:
        precision = 0
    else:
        precision = TP / (TP + FP)

    if TP + FN == 0:
        recall = 0
    else:
        recall = TP / (TP + FN)
    
    filtered_precisions[icd_code] = precision
    filtered_recalls[icd_code] = recall    
    
    
    
    
    
# Filter out zero precision values and calculate average precision
average_precision = np.mean([precision for precision in filtered_precisions.values() if precision > 0])

In [85]:
average_precision

0.5735597594955769

In [86]:
average_recall = np.mean([recall for recall in filtered_recalls.values() if recall > 0])

In [87]:
average_recall

0.44453067153242437

In [88]:
# Calculate F1 Score for Each ICD-10 Code
filtered_f1_scores = {}
for icd_code in unique_icd_codes:
    precision = filtered_precisions[icd_code]
    recall = filtered_recalls[icd_code]

    # Handle division by zero for precision and recall
    if precision + recall == 0:
        f1_score = 0
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)

    filtered_f1_scores[icd_code] = f1_score


In [89]:
average_f1 = np.mean([f1_score for f1_score in filtered_f1_scores.values() if f1_score > 0])

In [90]:
average_f1

0.4151681288703853