In [5]:
import master_thesis.simple_icd_10_cm as cm
import pandas as pd
from functools import reduce
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import min_weight_full_bipartite_matching
import math
from master_thesis.helper_functions import f

In [6]:
df = pd.read_csv('../data/diagnoses_icd.csv')
icd10 = df.loc[df['icd_version'] == 10]
some_patients = icd10[['hadm_id','seq_num','icd_code']].head(14)
some_patients

patients = []
for index, row in some_patients.iterrows():
    if row['seq_num'] == 1:
        patient = []
        patients.append(patient)
        patient.append(row['icd_code'])
    else:
        patient.append(row['icd_code'])

patien_num = len(patients)
print(patients)


[['G3183', 'F0280', 'R441', 'R296', 'E785', 'Z8546'], ['R4182', 'G20', 'F0280', 'R609', 'E785', 'Z8546'], ['R1310', 'R0989']]


In [3]:
# Define Functions:

# get ancestors of a concept
def ancestors(concept):
    ancestors = (cm.get_ancestors(concept))
    return ancestors

# get levels of a concept (same as IC#1)
def levels(concept):
    levels = len(ancestors(concept))+1
    return levels

# get number of subsumers of a concept
def subsumers(concept):
    subsumer = len(ancestors(concept)) + 2
    return subsumer

# get number of leaves of the origin node (r) of the taxonomy
def leaves_of_origin():
    all_codes = cm.get_all_codes()
    leaves = 0
    for c in all_codes:
        if cm.is_leaf(c) == True:
            leaves = leaves+1
    return leaves

# get number of leaves of a concept
def num_of_leaves(concept):
    descendants = cm.get_descendants(concept)
    leaves = 0
    for c in descendants:
        if cm.is_leaf(c) == True:
            leaves = leaves+1
    return leaves

# get details of list of patients with concepts
def details(patient_list):
    for c in patient_list:
        print('patient#:',patient_list.index(c)+1, '\tnumber of concepts:',len(c))
        for n in c:
            print('\tconcept', c.index(n)+1,':',  n, '\tlevels:',levels(n), '\tancestors:',ancestors(n))

# get average of a list
def Average(list):
    return reduce(lambda a, b: a + b, list) / len(list)

# get average of each list in list of lists
def average_list_of_lists(list_of_lists):
    new_list = []
    for i in list_of_lists:
        average = Average(i)
        new_list.append(average)
    return new_list

# get IC#1 of Least Common Ancestor (LCA) for between 2 concepts
def LCA(concept1,concept2):
    if concept1==concept2:
        print(concept1,concept2,f'\tlevels of LCA: {len(cm.get_ancestors(concept1))+1}', '\tEXACT CONCEPT!')
        return(len(cm.get_ancestors(concept1))+1)
    else:
        for a in cm.get_ancestors(concept1):
            for b in cm.get_ancestors(concept2):
                if a==b:
                    print(concept1,concept2,f'\tlevels of LCA: {len(cm.get_ancestors(a))+1}', f'\tancestor:{a}')
                    return(len(cm.get_ancestors(a))+1)
        print(concept1,concept2,'\tlevels of LCA: 0', '\tno common ancestors')
        return 0

# ============ IC =============

# get IC#1 of a concept (same as levels)
def ic1(concept):
    levels = len(ancestors(concept))+1
    return levels

# get IC#2 of a concept
def ic2(concept):
    in_log = ((num_of_leaves(concept)/subsumers(concept))+1)/(leaves_of_origin()+1)
    ic2 = - math.log(in_log, 10)
    return ic2

# ============ CS =============

# get Code level similarity CS#2 between 2 concepts
def cs2(concept1,concept2,ic_function):
    cs2 = 1 - ((2*LCA(concept1,concept2))/ (ic_function(concept1) + ic_function(concept2)))
    print('cs#2 =',cs2)
    return cs2

# get Code level similarity CS#4 between 2 concepts
def cs4(concept1,concept2,ic_function):
    cs4 = (ic_function('S75.019A') - LCA(concept1,concept2))/ic_function('S75.019A') #7 is total levels of the taxonomy
    print('cs#4 =',cs4)
    return cs4

# ============================

# compare CS
def compareCS(patients_list,ic_function,cs_function):
    n=1
    for c1 in patients_list:
        for c2 in patients_list:
            if patients_list.index(c1) >= patients_list.index(c2):
                patients_list.index(c2) + 1
                continue
            for n1 in c1:
                for n2 in c2:
                    print(f'({n})')
                    n=n+1
                    cs_function(n1,n2,ic_function)

In [86]:
# get Set level similarity SS#5 between 2 patients (2 sets of concepts)
def ss5(patient1,patient2,ic_function,cs_function):
    A = patient1
    B = patient2
    n=1
    csCom = []
    for a in A:
        csComunit = []
        for b in B:
            print(f'({n}): concepts: first_patient_concept: {A.index(a)+1}, second_patient_concept: {B.index(b)+1}')
            n=n+1
            csSimilarity = cs_function(a,b,ic_function)
            csComunit.append(csSimilarity)
        print(f'CS for each concept of first patient: {csComunit}')
        minn = min(csComunit)
        csCom.append(minn)
    print(f'Min CSs of concepts of first patient: {csCom}')
    sum1 = sum(csCom)
    csCom = []
    for b in B:
        csComunit = []
        for a in A:
            print(f'({n}): concepts: second_patient_concept: {B.index(b)+1}, first_patient_concept: {A.index(a)+1}')
            n=n+1
            csSimilarity = cs_function(b,a,ic_function)
            csComunit.append(csSimilarity)
        print(f'CS for each concept of second patient: {csComunit}')
        minn = min(csComunit)
        csCom.append(minn)
    print(f'Min CS\'s of concepts of second patient: {csCom}')
    sum2 = sum(csCom)
    ss5 = (sum1+sum2)/(len(A)+len(B))
    print(f'SS#5: {ss5}')
    return ss5

# get Set level similarity SS#6 between 2 patients (2 sets of concepts)
def ss6(patient1,patient2,ic_function,cs_function):
    A = set(patient1)
    B = set(patient2)
    AUB = A | B
    AdiffB = A - B
    BdiffA = B - A
    n=1
    csCom = []
    for a in AdiffB:
        summ = 0
        for b in B:
            print(f'({n}): concepts: first_patient_concept: {patient1.index(a)+1}, second_patient_concept: {patient2.index(b)+1}')
            n=n+1
            csSimilarity = cs_function(a,b,ic_function)
            summ = summ + csSimilarity
        total1= summ/len(B)
        csCom.append(total1)
    for b in BdiffA:
        summ = 0
        for a in A:
            print(f'({n}): concepts: second_patient_concept: {patient2.index(b)+1}, first_patient_concept: {patient1.index(a)+1}')
            n=n+1
            csSimilarity = cs_function(b,a,ic_function)
            summ = summ + csSimilarity
        total2=summ/len(patient1)
        csCom.append(total2)
    print(f'Average CS\'s of each concept of first and second patients: {csCom}')
    total = sum(csCom)
    print(AUB)
    ss6 = total/len(AUB)
    print(f'SS#6: {ss6}')
    return ss6

# get Set level similarity SS#7 between 2 patients (2 sets of concepts)
def ss7(patient1,patient2,ic_function,cs_function):
    A = patient1
    B = patient2
    n=1
    summ = 0
    for a in A:
        for b in B:
            print(f'({n}): concepts: first_patient_concept: {A.index(a)+1}, second_patient_concept: {B.index(b)+1}')
            n=n+1
            csSimilarity = cs_function(a,b,ic_function)
            summ = summ + csSimilarity
    ss7 = summ/(len(A)*len(B))
    print(f'SS#7: {ss7}')
    return ss7

# get Set level similarity SS#8 Minimum Weighted Bipartite Matching between 2 patients
def ss8(patient1,patient2,ic_function,cs_function):
    A = set(patient1)
    B = set(patient2)
    AdiffB = A - B
    BdiffA = B - A
    n=1
    csCom = []
    if len(AdiffB) <= len(BdiffA):
        start = AdiffB
        end = BdiffA
        start_draft = patient1
        end_draft = patient2
        first_patient = 'first_patient_concept:'
        second_patient = 'second_patient_concept:'
    else:
        start = BdiffA
        end = AdiffB
        start_draft = patient2
        end_draft = patient1
        first_patient = 'second_patient_concept:'
        second_patient = 'first_patient_concept:'
    for c1 in start:
        csComunit = []
        for c2 in end:
            print(f'({n}): concepts: {first_patient} {start_draft.index(c1)+1}, {second_patient} {end_draft.index(c2)+1}')
            n=n+1
            csSimilarity = cs_function(c1,c2,ic_function)
            csComunit.append(csSimilarity)
        csCom.append(csComunit)
    print(f'Disjoint concepts of A and B: {AdiffB, BdiffA}')
    print(f'CS of concepts: {csCom}')
    biadjacency_matrix = csr_matrix(csCom)
    indecies = min_weight_full_bipartite_matching(biadjacency_matrix)[1]
    new_list = []
    print(f'Subset of edges with a minimum sum of weights(CS): {indecies}')
    for xx in csCom:
        dd = xx[indecies[csCom.index(xx)]]
        new_list.append(dd)
    sumCs = sum(new_list)
    ss8 = sumCs/min(len(A),len(B))
    print(f'weight(CS) of edges with a minimum sum of weights(CS): {new_list}')
    print(f'SS#8: {ss8}')
    return ss8

# 
def similarity(patients_list,ic_function,cs_function,ss_funtion):
    n=1
    matrix = []
    for p1 in patients_list:
        row = []
        for p2 in patients_list:
            if patients_list.index(p1) == patients_list.index(p2):
                patients_list.index(p2) + 1
                row.append(1)
                continue
            print(f'(Case:{n}, Patients: first_patient: {patients_list.index(p1)+1}, second_patient: {patients_list.index(p2)+1})')
            n=n+1
            set_level_similarity = ss_funtion(p1,p2,ic_function,cs_function)
            print(set_level_similarity)
            row.append(set_level_similarity)
        matrix.append(row)
    return matrix

In [106]:
sa1 = ['G3183', 'F0280', 'R441', 'R296', 'E785', 'Z8546']
sa2 = ['R4182', 'G20', 'F0280', 'R609', 'E785', 'Z8546']
# sa1 = ['G3183', 'F0280', 'R441', 'R296', 'E785', 'Z8546']
# sa2 = ['G3183', 'F0280', 'R441', 'R296', 'E785', 'Z8546']


ss8(sa1,sa2,ic2,cs2)

(1): concepts: first_patient_concept: 4, second_patient_concept: 4
R296 R609 	levels of LCA: 1 	ancestor:18
cs#2 = 0.7942884532802619
(2): concepts: first_patient_concept: 4, second_patient_concept: 2
R296 G20 	levels of LCA: 0 	no common ancestors
cs#2 = 1.0
(3): concepts: first_patient_concept: 4, second_patient_concept: 1
R296 R4182 	levels of LCA: 1 	ancestor:18
cs#2 = 0.7942884532802619
(4): concepts: first_patient_concept: 1, second_patient_concept: 4
G3183 R609 	levels of LCA: 0 	no common ancestors
cs#2 = 1.0
(5): concepts: first_patient_concept: 1, second_patient_concept: 2
G3183 G20 	levels of LCA: 1 	ancestor:6
cs#2 = 0.7942884532802619
(6): concepts: first_patient_concept: 1, second_patient_concept: 1
G3183 R4182 	levels of LCA: 0 	no common ancestors
cs#2 = 1.0
(7): concepts: first_patient_concept: 3, second_patient_concept: 4
R441 R609 	levels of LCA: 1 	ancestor:18
cs#2 = 0.7942884532802619
(8): concepts: first_patient_concept: 3, second_patient_concept: 2
R441 G20 	leve

0.36285896885350793

In [94]:
similarity(patients,ic1,cs2,ss8)

(Case:1, Patients: first_patient: 1, second_patient: 2)
(1): concepts: first_patient_concept: 4, second_patient_concept: 4
R296 R609 	levels of LCA: 1 	ancestor:18
cs#2 = 0.75
(2): concepts: first_patient_concept: 4, second_patient_concept: 2
R296 G20 	levels of LCA: 0 	no common ancestors
cs#2 = 1.0
(3): concepts: first_patient_concept: 4, second_patient_concept: 1
R296 R4182 	levels of LCA: 1 	ancestor:18
cs#2 = 0.7777777777777778
(4): concepts: first_patient_concept: 1, second_patient_concept: 4
G3183 R609 	levels of LCA: 0 	no common ancestors
cs#2 = 1.0
(5): concepts: first_patient_concept: 1, second_patient_concept: 2
G3183 G20 	levels of LCA: 1 	ancestor:6
cs#2 = 0.75
(6): concepts: first_patient_concept: 1, second_patient_concept: 1
G3183 R4182 	levels of LCA: 0 	no common ancestors
cs#2 = 1.0
(7): concepts: first_patient_concept: 3, second_patient_concept: 4
R441 R609 	levels of LCA: 1 	ancestor:18
cs#2 = 0.75
(8): concepts: first_patient_concept: 3, second_patient_concept: 2


[[1, 0.34259259259259256, 0.7777777777777778],
 [0.34259259259259256, 1, 0.7777777777777778],
 [0.7777777777777778, 0.7777777777777778, 1]]

In [306]:
details(patients)

patient#: 1 	number of concepts: 6
	concept 1 : G3183 	levels: 5 	ancestors: ['G31.8', 'G31', 'G30-G32', '6']
	concept 2 : F0280 	levels: 5 	ancestors: ['F02.8', 'F02', 'F01-F09', '5']
	concept 3 : R441 	levels: 4 	ancestors: ['R44', 'R40-R46', '18']
	concept 4 : R296 	levels: 4 	ancestors: ['R29', 'R25-R29', '18']
	concept 5 : E785 	levels: 4 	ancestors: ['E78', 'E70-E88', '4']
	concept 6 : Z8546 	levels: 5 	ancestors: ['Z85.4', 'Z85', 'Z77-Z99', '21']
patient#: 2 	number of concepts: 6
	concept 1 : R4182 	levels: 5 	ancestors: ['R41.8', 'R41', 'R40-R46', '18']
	concept 2 : G20 	levels: 3 	ancestors: ['G20-G26', '6']
	concept 3 : F0280 	levels: 5 	ancestors: ['F02.8', 'F02', 'F01-F09', '5']
	concept 4 : R609 	levels: 4 	ancestors: ['R60', 'R50-R69', '18']
	concept 5 : E785 	levels: 4 	ancestors: ['E78', 'E70-E88', '4']
	concept 6 : Z8546 	levels: 5 	ancestors: ['Z85.4', 'Z85', 'Z77-Z99', '21']
patient#: 3 	number of concepts: 2
	concept 1 : R1310 	levels: 5 	ancestors: ['R13.1', 'R13'

In [32]:
# biadjacency_matrix = csr_matrix([[0.1, 0.4,0.05], [0.3, 0.8, 0.3], [0.2, 0.3, 0.4]])
biadjacency_matrix = csr_matrix([[10,2,3,1], [3,10,1,5], [8,3,10,4]])
print(min_weight_full_bipartite_matching(biadjacency_matrix)[1])

[3 2 1]


In [45]:
biadjacency_matrix = csr_matrix([[1.0, 0.75, 1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 0.01, 1.0, 1.0, 1.0], [0.5555555555555556, 1.0, 1.0, 0.75, 1.0, 1.0], [0.7777777777777778, 1.0, 1.0, 0.75, 1.0, 1.0], [1.0, 1.0, 1.0, 1.0, 0.01, 1.0], [1.0, 1.0, 1.0, 1.0, 1.0, 0.01]]) 
print(min_weight_full_bipartite_matching(biadjacency_matrix)[1])
# [2 0 1]

[1 2 0 3 4 5]
