In [None]:
import pandas as pd
import numpy as np

# Load the three word2vec embedding tables, one CSV file per table.
w2v_2, w2v_10, w2v_100 = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/w2v/w2v_2.csv',
        './data/w2v/w2v_10.csv',
        './data/w2v/w2v_100.csv',
    )
)

print(*(table.shape for table in (w2v_2, w2v_10, w2v_100)))

# Tab-separated ICD/CUI mapping. The file is read without a header row, so
# pandas labels the columns with integers.
cui_icd = pd.read_csv('./data/w2v/icd_to_cui_mapping.txt', header=None, sep='\t')
print(cui_icd.shape)

In [None]:
# Give the two mapping columns readable names (the file was read with header=None).
cui_icd.columns = ['icd', 'cui']
print(cui_icd.head(), cui_icd.shape)

# Keep only the rows whose ICD code begins with a digit.
starts_with_digit = cui_icd['icd'].str[0].str.isdigit()
cui_icd = cui_icd.loc[starts_with_digit]
print(cui_icd.head(), cui_icd.shape)

In [None]:
def _attach_icd(embedding):
    """Return `embedding` joined with the ICD codes in `cui_icd`.

    The 'Unnamed: 0' column is renamed to 'cui', rows whose CUI does not
    appear in `cui_icd` are dropped, and the remaining rows are merged with
    `cui_icd` on 'cui' (default inner join), which adds the 'icd' column.
    """
    labelled = embedding.rename(columns={'Unnamed: 0': 'cui'})
    known = labelled[labelled['cui'].isin(cui_icd['cui'])]
    return known.merge(cui_icd, left_on='cui', right_on='cui')


w2v_2 = _attach_icd(w2v_2)
w2v_10 = _attach_icd(w2v_10)
w2v_100 = _attach_icd(w2v_100)

print(w2v_2.shape)
print(w2v_2.head())
print(w2v_2.columns)

In [None]:
# Range tables: each file is pipe-separated and transposed so that 'start'
# becomes a column; only the first 17 chapters / 119 sub-chapters are kept.
chapters = pd.read_csv('./data/icd9/chapters.csv', sep="|").transpose()[:17]
chapters_start = chapters['start'].astype(int).values
sub_chapters = pd.read_csv('./data/icd9/subchapters.csv', sep="|").transpose()[:119]
sub_chapters_start = sub_chapters['start'].astype(int).values


def _range_index(starts, major):
    """Return the 0-based index of the range that contains `major`.

    The result is the number of entries in `starts` that are <= int(major),
    minus one; it is -1 when `major` lies below every start.

    The previous `np.argmin(starts <= m)` returned the index of the first
    start *greater* than `m` (i.e. the following range) and fell back to 0
    when no such start existed, so the last range shared the label 0 with
    codes below the first start.

    NOTE(review): assumes `starts` is sorted ascending -- confirm against the
    CSV files.
    """
    return int(np.count_nonzero(starts <= int(major))) - 1


def add_icd_hierarchy(frame):
    """Add 'major', 'chapter' and 'sub_chapter' columns to `frame` in place.

    'major' is the part of the 'icd' string before the first '.'; 'chapter'
    and 'sub_chapter' are positions in `chapters_start` and
    `sub_chapters_start` respectively. The same frame is returned.
    """
    frame['major'] = frame['icd'].str.split('.', expand=True)[0]
    frame['chapter'] = frame['major'].map(
        lambda major: _range_index(chapters_start, major))
    frame['sub_chapter'] = frame['major'].map(
        lambda major: _range_index(sub_chapters_start, major))
    return frame


for frame in (w2v_2, w2v_10, w2v_100):
    add_icd_hierarchy(frame)

print(w2v_2.head())

## 2: in-group vs. out-of-group distances for `w2v_2`

In [None]:

from sklearn.metrics.pairwise import pairwise_distances, euclidean_distances

# Euclidean distance between every pair of rows, using the 'V*' columns only.
# The column selection is evaluated twice on purpose so that two separate
# frame objects are handed to scikit-learn, exactly as before.
vector_columns = [name for name in w2v_2 if name.startswith('V')]
distance_matrix = pairwise_distances(w2v_2[vector_columns],
                                     w2v_2[vector_columns],
                                     metric='euclidean')

in_major, out_major = [], []
in_subchapter, out_subchapter = [], []
in_chapter, out_chapter = [], []

# (grouping column, in-group accumulator, out-of-group accumulator)
levels = (
    ('major', in_major, out_major),
    ('sub_chapter', in_subchapter, out_subchapter),
    ('chapter', in_chapter, out_chapter),
)

# NOTE(review): the row label `i` is used as a position in distance_matrix,
# which relies on w2v_2 having a default 0..n-1 index.
# NOTE(review): `indices` contains the row itself, so each in-group mean
# includes the row's distance to itself.
for i, icd in w2v_2.iterrows():
    row_distances = distance_matrix[i]
    for level, inside, outside in levels:
        same_group = w2v_2[level] == icd[level]
        indices = w2v_2.index[same_group].tolist()
        if len(indices) <= 1:
            # Groups of one are skipped.
            continue
        out_indices = w2v_2.index[~same_group].tolist()
        inside.append(np.mean(row_distances[indices]))
        outside.append(np.mean(row_distances[out_indices]))

# One line per level: mean in-group distance, then mean out-of-group distance.
for inside, outside in ((in_major, out_major),
                        (in_subchapter, out_subchapter),
                        (in_chapter, out_chapter)):
    print(np.mean(inside), np.mean(outside))

## 10: in-group vs. out-of-group distances for `w2v_10`

In [None]:
from sklearn.metrics.pairwise import pairwise_distances, euclidean_distances

# Pairwise Euclidean distances over the 'V*' columns of w2v_10. Two separate
# selections are passed so scikit-learn receives two distinct objects, as before.
vector_columns = [name for name in w2v_10 if name.startswith('V')]
distance_matrix = pairwise_distances(w2v_10[vector_columns],
                                     w2v_10[vector_columns],
                                     metric='euclidean')
print('distance calculated')


def _in_out_means_10(label, row, level):
    """Return (in-group mean, out-of-group mean) for one row, or None.

    The group is every row of w2v_10 sharing `row[level]`; None is returned
    when that group has a single member. `label` is used as the row's
    position in `distance_matrix`.
    NOTE(review): the in-group mean includes the row's distance to itself.
    """
    same_group = w2v_10[level] == row[level]
    members = w2v_10.index[same_group].tolist()
    if len(members) > 1:
        others = w2v_10.index[~same_group].tolist()
        return (np.mean(distance_matrix[label, members]),
                np.mean(distance_matrix[label, others]))
    return None


in_major, out_major = [], []
in_subchapter, out_subchapter = [], []
in_chapter, out_chapter = [], []

accumulators = {
    'major': (in_major, out_major),
    'sub_chapter': (in_subchapter, out_subchapter),
    'chapter': (in_chapter, out_chapter),
}

for i, icd in w2v_10.iterrows():
    for level, (inside, outside) in accumulators.items():
        means = _in_out_means_10(i, icd, level)
        if means is not None:
            inside.append(means[0])
            outside.append(means[1])

# One line per level: mean in-group distance, then mean out-of-group distance.
for inside, outside in accumulators.values():
    print(np.mean(inside), np.mean(outside))

In [None]:
# In-group vs. out-of-group distances for the `w2v_100` embeddings.
from sklearn.metrics.pairwise import pairwise_distances, euclidean_distances

# Take the vector columns from w2v_100 itself. They used to be picked by
# iterating w2v_10, so only the 'V*' names present in w2v_10 were used.
vector_columns = [col for col in w2v_100 if col.startswith('V')]
distance_matrix = pairwise_distances(w2v_100[vector_columns],
                                     w2v_100[vector_columns],
                                     metric='euclidean')
print('distance calculated')

in_major = []
out_major = []

in_subchapter = []
out_subchapter = []

in_chapter = []
out_chapter = []

# (grouping column, in-group accumulator, out-of-group accumulator)
levels = (
    ('major', in_major, out_major),
    ('sub_chapter', in_subchapter, out_subchapter),
    ('chapter', in_chapter, out_chapter),
)

# NOTE(review): the row label `i` is used as a position in distance_matrix,
# which relies on w2v_100 having a default 0..n-1 index.
# NOTE(review): as in the w2v_2 / w2v_10 cells, `indices` contains the row
# itself, so each in-group mean includes the row's distance to itself.
for i, icd in w2v_100.iterrows():
    for level, in_means, out_means in levels:
        # Build the mask on w2v_100; the 'major' mask used to be built on
        # w2v_10 and then applied to w2v_100's index.
        same_group = w2v_100[level] == icd[level]
        indices = w2v_100.index[same_group].tolist()
        # Skip groups of one, matching the w2v_2 and w2v_10 cells; otherwise
        # the in-group mean would be the self-distance alone.
        if len(indices) > 1:
            in_means.append(np.mean(distance_matrix[i, indices]))

            out_indices = w2v_100.index[~same_group].tolist()
            out_means.append(np.mean(distance_matrix[i, out_indices]))

# One line per level: mean in-group distance, then mean out-of-group distance.
print(np.mean(in_major), np.mean(out_major))
# print(np.std(in_major), np.std(out_major))
print(np.mean(in_subchapter), np.mean(out_subchapter))
# print(np.std(in_subchapter), np.std(out_subchapter))
print(np.mean(in_chapter), np.mean(out_chapter))
# print(np.std(in_chapter), np.std(out_chapter))