In [91]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

from master_thesis import helper_functions as hf


In [39]:
# Build the raw train/test row sets: ICD-10 diagnoses joined to the hospital
# service recorded for the same admission (hadm_id).
#
# pandas is used through its own import instead of reaching into the helper
# module's namespace (hf.pd), so this cell no longer depends on how
# helper_functions happens to import pandas.

# Tunable sizes, named so the sampling choices are visible in one place.
N_DIAGNOSIS_ROWS = 500         # leading ICD-10 diagnosis rows kept for the join
TRAIN_ROWS = slice(0, 34)      # positional rows of the join used for training
TEST_ROWS = slice(240, 254)    # positional rows of the join used for testing

# Diagnoses: keep only rows coded with ICD version 10, then a small sample.
df1 = pd.read_csv('../data/diagnoses_icd.csv')
icd10 = df1.loc[df1['icd_version'] == 10]
some_patients = icd10[['hadm_id','seq_num','icd_code']].head(N_DIAGNOSIS_ROWS)

# Services: only the admission id and the service label are needed.
df2 = pd.read_csv('../data/services.csv')
df2 = df2[['hadm_id','curr_service']]

# Inner join keeps only admissions present in both tables.
join = pd.merge(df2,some_patients,on='hadm_id',how='inner')

# Positional (iloc) slices: these select rows of the join, not patients.
train_patients_raw = join.iloc[TRAIN_ROWS]
test_patients_raw = join.iloc[TEST_ROWS]


In [40]:
# Turn the raw joined rows of each cohort into the per-patient structure the
# later helper calls take as input. The helpers' internals are not visible here;
# judging by the recorded output, get_patients prints and returns one list of
# ICD codes per patient (followed by the patient count) -- TODO confirm.
# Statement order matters: it fixes the order of the printed output.
train_patients = hf.get_patients(train_patients_raw)
test_patients= hf.get_patients(test_patients_raw)
# Service labels for the training rows; used as the classifier target below.
y_train = hf.get_y_train(train_patients_raw)

[['G3183', 'F0280', 'R441', 'R296', 'E785', 'Z8546'], ['R4182', 'G20', 'F0280', 'R609', 'E785', 'Z8546'], ['R1310', 'R0989', 'K31819', 'K219', 'K449', 'F419', 'I341', 'M810', 'Z87891'], ['S72012A', 'W010XXA', 'Y93K1', 'Y92480', 'K219', 'E7800', 'I341', 'G43909', 'Z87891', 'Z87442', 'F419', 'M810', 'Z7901']]
4
[['J441', 'R0902', 'E876', 'I10', 'E780', 'E785', 'Z87891', 'Z9981'], ['J441', 'Z9981', 'I10', 'I4891', 'Z7901', 'Z87891']]
2
['MED', 'MED', 'MED', 'ORTHO']


In [42]:
# Print a per-patient breakdown for the training cohort, then the test cohort.
# The recorded output lists, for every concept, its level count and ancestors.
hf.get_details(train_patients)
# Visual separator between the two cohorts' dumps.
print('-------------------')
hf.get_details(test_patients)

Numer of patients: 4
patient#: 1 	number of concepts: 6
	concept 1 : G3183 	levels: 5 	ancestors: ['G31.8', 'G31', 'G30-G32', '6']
	concept 2 : F0280 	levels: 5 	ancestors: ['F02.8', 'F02', 'F01-F09', '5']
	concept 3 : R441 	levels: 4 	ancestors: ['R44', 'R40-R46', '18']
	concept 4 : R296 	levels: 4 	ancestors: ['R29', 'R25-R29', '18']
	concept 5 : E785 	levels: 4 	ancestors: ['E78', 'E70-E88', '4']
	concept 6 : Z8546 	levels: 5 	ancestors: ['Z85.4', 'Z85', 'Z77-Z99', '21']
patient#: 2 	number of concepts: 6
	concept 1 : R4182 	levels: 5 	ancestors: ['R41.8', 'R41', 'R40-R46', '18']
	concept 2 : G20 	levels: 3 	ancestors: ['G20-G26', '6']
	concept 3 : F0280 	levels: 5 	ancestors: ['F02.8', 'F02', 'F01-F09', '5']
	concept 4 : R609 	levels: 4 	ancestors: ['R60', 'R50-R69', '18']
	concept 5 : E785 	levels: 4 	ancestors: ['E78', 'E70-E88', '4']
	concept 6 : Z8546 	levels: 5 	ancestors: ['Z85.4', 'Z85', 'Z77-Z99', '21']
patient#: 3 	number of concepts: 9
	concept 1 : R1310 	levels: 5 	ances

In [43]:
# Pairwise comparison of every training patient with every other one.
# The three hf.get_* callables are passed through to the helper to choose the
# measures it applies (presumably information content, concept-level and
# set-level measures -- TODO confirm against helper_functions).
# NOTE(review): despite the name get_similarity, the result is used as a
# distance matrix below (zero diagonal in the recorded output, and it is fed
# to a KNN with metric='precomputed').
X = hf.get_similarity(train_patients,hf.get_ic2,hf.get_cs4,hf.get_ss6)

(Case:1, Patients: first_patient: 1, second_patient: 2)
(1): concepts: first_patient_concept: 3, second_patient_concept: 3
R441 F0280 	levels of LCA: 0 	least common ancestor is the root node
cs#4 = 1.0
(2): concepts: first_patient_concept: 3, second_patient_concept: 1
R441 R4182 	levels of LCA: 2 	least common ancestor: R40-R46
cs#4 = 0.38922570725952693
(3): concepts: first_patient_concept: 3, second_patient_concept: 6
R441 Z8546 	levels of LCA: 0 	least common ancestor is the root node
cs#4 = 1.0
(4): concepts: first_patient_concept: 3, second_patient_concept: 2
R441 G20 	levels of LCA: 0 	least common ancestor is the root node
cs#4 = 1.0
(5): concepts: first_patient_concept: 3, second_patient_concept: 5
R441 E785 	levels of LCA: 0 	least common ancestor is the root node
cs#4 = 1.0
(6): concepts: first_patient_concept: 3, second_patient_concept: 4
R441 R609 	levels of LCA: 1 	least common ancestor: 18
cs#4 = 0.5885276973113567
(7): concepts: first_patient_concept: 1, second_patient_

In [44]:
# Bind the pairwise result to the name the classifier cell uses. This is the
# same object as X, not a copy. The bare name on the last line displays it.
X_train_dist_matrix = X
X_train_dist_matrix

[[0, 0.5826796730476845, 0.9525708454511354, 0.9590900391660376],
 [0.5826796730476845, 0, 0.9525708454511354, 0.9590900391660376],
 [0.9525708454511354, 0.9525708454511352, 0, 0.6928035164693924],
 [0.9590900391660376, 0.9590900391660376, 0.6928035164693923, 0]]

In [45]:
y_train

['MED', 'MED', 'MED', 'ORTHO']

In [46]:
# Compare each test patient against each training patient, passing the same
# three hf.get_* callables used for the training matrix. In the recorded result
# there is one row per test patient and one column per training patient (2 x 4).
X_test= hf.get_test_distance(test_patients,train_patients,hf.get_ic2,hf.get_cs4,hf.get_ss6)

(Case:1, Patients: first_patient: 1, second_patient: 1)
(1): concepts: first_patient_concept: 4, second_patient_concept: 3
I10 R441 	levels of LCA: 0 	least common ancestor is the root node
cs#4 = 1.0
(2): concepts: first_patient_concept: 4, second_patient_concept: 2
I10 F0280 	levels of LCA: 0 	least common ancestor is the root node
cs#4 = 1.0
(3): concepts: first_patient_concept: 4, second_patient_concept: 1
I10 G3183 	levels of LCA: 0 	least common ancestor is the root node
cs#4 = 1.0
(4): concepts: first_patient_concept: 4, second_patient_concept: 4
I10 R296 	levels of LCA: 0 	least common ancestor is the root node
cs#4 = 1.0
(5): concepts: first_patient_concept: 4, second_patient_concept: 5
I10 E785 	levels of LCA: 0 	least common ancestor is the root node
cs#4 = 1.0
(6): concepts: first_patient_concept: 4, second_patient_concept: 6
I10 Z8546 	levels of LCA: 0 	least common ancestor is the root node
cs#4 = 1.0
(7): concepts: first_patient_concept: 1, second_patient_concept: 3
J441

In [47]:
# Bind the test-vs-train result to the name the prediction cell uses. This is
# the same object as X_test, not a copy. The bare name displays it.
X_test_dist_matrix = X_test
X_test_dist_matrix

[[0.8637830618080043,
  0.8637830618080043,
  0.9096224688289657,
  0.9023594247326825],
 [0.9575775327873733,
  0.9575775327873733,
  0.9010892429466103,
  0.8439976744395646]]

In [52]:
# Nearest-neighbour classifier driven by the precomputed train-vs-train
# distance matrix instead of raw feature vectors.
k = 1  # number of neighbours that vote on each prediction
knn_classifier = KNeighborsClassifier(
    metric='precomputed',
    n_neighbors=k,
)
# fit() stays the last expression so the cell still evaluates to the estimator.
knn_classifier.fit(X=X_train_dist_matrix, y=y_train)

In [86]:
# Predict one service label per test patient from the test-vs-train distance
# rows; the bare name on the last line displays the predicted labels.
y_pred = knn_classifier.predict(X_test_dist_matrix)
y_pred

array(['MED', 'ORTHO'], dtype='<U5')

In [87]:
# Ground-truth service labels for the test rows, compared against y_pred below.
# NOTE(review): get_y_train is reused here on test_patients_raw; its name
# suggests training data only, so confirm it has no train-specific behaviour.
y_actual = hf.get_y_train(test_patients_raw)

['MED', 'MED']


In [92]:
# Score the predictions against the held-out labels.
cm = confusion_matrix(y_true=y_actual, y_pred=y_pred)
print(f'confusion matrix: \n{cm}')

# NOTE(review): F1 is intentionally not reported here. f1_score defaults to
# average='binary' with pos_label=1, which presumably does not fit the string
# service labels used in this notebook; pass an explicit `average` (or
# `pos_label`) before enabling it.

accuracy = accuracy_score(y_true=y_actual, y_pred=y_pred)
print(f'accuracy_score: {accuracy}')

confusion matrix: 
[[1 1]
 [0 0]]
accuracy_score: 0.5
