In [21]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering
from sklearn import preprocessing
from scipy.sparse import csr_matrix
import pickle
import os
from annoy import AnnoyIndex


In [2]:
patient_icd_sparse_path = "./../../data/PATIENT_ICD_BINARY_SPARSE_CSR.p"
pdata = pickle.load(open(patient_icd_sparse_path, "rb"))
pdata.shape

(46520, 6984)

In [25]:
gt1_subject_ids_path = "../../data/PATIENT_ICD_SUBJECT_IDS_GT1_VISITS.csv"
gt1_subject_ids = pd.read_csv(gt1_subject_ids_path)
gt1_subject_ids.MATRIX_ROW_IDX.values

array([    0,     1,     6, ..., 46499, 46505, 46517])

In [18]:
ANNOY_PATH = "../../../large_data_files"
annoy_path = os.path.join(ANNOY_PATH, "200214_patient_similarity_clusters_default.ann")

cos_knn_tree = AnnoyIndex(pdata.shape[1], "angular")
cos_knn_tree.load(annoy_path)
k_neighbors = 50

print("Building dense data matrix with k={} nn...".format(k_neighbors))
knn_data = np.zeros(pdata.shape)
print(knn_data.shape)
for i in tqdm(range(pdata.shape[0])):
    nn_idxs = cos_knn_tree.get_nns_by_item(i, k_neighbors)
    print(nn_idxs)
    distances = [cos_knn_tree.get_distance(i, nn) for nn in nn_idxs]
    knn_data[i, nn_idxs] = distances

  0%|          | 0/46520 [00:00<?, ?it/s]

Loading knn tree
Building dense data matrix with k=50 nn...
(46520, 6984)
[0, 3319, 29662, 24132, 25275, 16666, 37755, 39245, 16376, 14844, 26326, 34948, 43782, 37026, 41266, 34699, 575, 7206, 16593, 44554, 26000, 27994, 27511, 19853, 763, 6929, 8225, 37687, 29390, 26986, 11416, 20125, 42398, 5053, 35708, 14472, 25143, 28145, 5676, 1949, 7456, 11647, 27122, 12725, 28544, 33250, 13828, 16693, 22743, 3149]





IndexError: index 29662 is out of bounds for axis 1 with size 6984

In [3]:
pdata_dense = pdata.todense()
pdata_dense_standardized = preprocessing.scale(pdata_dense)
pdata_dense_standardized.shape

(46520, 6984)

In [4]:
pca = PCA(n_components=500).fit(pdata_dense_standardized.T)

array([89.93565562, 60.35567593, 53.59515779, 48.84506942, 47.97377594,
       44.31714161, 43.3008692 , 40.52960266, 40.19115433, 39.54757514,
       37.55943968, 37.43865033, 36.89173152, 36.45667108, 36.16978749,
       35.65790252, 34.10033089, 33.505376  , 33.12399519, 32.82683878,
       32.53281463, 31.88717304, 31.48308371, 31.40670681, 31.25556872,
       30.82852524, 30.77612917, 30.42236143, 30.39388014, 30.34532643,
       30.07952749, 29.50727945, 28.81160754, 28.67321537, 28.50237307,
       28.2676643 , 28.09031228, 27.94385641, 27.88774024, 27.5983604 ,
       27.44125391, 27.28881415, 27.19954695, 27.0866641 , 27.0329092 ,
       26.90322569, 26.84603619, 26.72044602, 26.58191426, 26.48533893,
       26.3993231 , 26.33454737, 26.22229198, 26.11491315, 26.04798398,
       25.9515286 , 25.8459227 , 25.80857371, 25.66323125, 25.6262571 ,
       25.41592169, 25.35482118, 25.20497713, 25.13879832, 24.93066511,
       24.89279769, 24.79784325, 24.69272524, 24.62043827, 24.46

In [7]:
np.sum(pca.explained_variance_ratio_)

0.2041247218321534

In [6]:
pca.explained_variance_ratio_

array([0.00193422, 0.00129805, 0.00115265, 0.0010505 , 0.00103176,
       0.00095312, 0.00093126, 0.00087166, 0.00086438, 0.00085054,
       0.00080778, 0.00080518, 0.00079342, 0.00078406, 0.00077789,
       0.00076688, 0.00073339, 0.00072059, 0.00071239, 0.000706  ,
       0.00069967, 0.00068579, 0.0006771 , 0.00067545, 0.0006722 ,
       0.00066302, 0.00066189, 0.00065428, 0.00065367, 0.00065263,
       0.00064691, 0.0006346 , 0.00061964, 0.00061667, 0.00061299,
       0.00060794, 0.00060413, 0.00060098, 0.00059977, 0.00059355,
       0.00059017, 0.00058689, 0.00058497, 0.00058254, 0.00058139,
       0.0005786 , 0.00057737, 0.00057467, 0.00057169, 0.00056961,
       0.00056776, 0.00056637, 0.00056395, 0.00056165, 0.00056021,
       0.00055813, 0.00055586, 0.00055506, 0.00055193, 0.00055114,
       0.00054661, 0.0005453 , 0.00054208, 0.00054065, 0.00053618,
       0.00053536, 0.00053332, 0.00053106, 0.0005295 , 0.00052618,
       0.00052168, 0.00051931, 0.00051867, 0.00051575, 0.00051

In [None]:
pca_data = pca.components_.T

n_clusters = 5
sc = SpectralClustering(n_clusters=n_clusters).fit(pca_data)
unique, counts = np.unique(sc.labels_, return_counts=True)
print(unique, counts)

In [None]:
plt.figure(figsize=(40, 20))
sns.heatmap(pdata_sorted[i:, i:])
plt.savefig("sorted_matrix_map_freq_threshold_{}.png".format("None"))