# Subset Edge Files based on K Nearest Neighbors
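Comparing every patient to every other patient across 20+ subgroups is unreasonable; therefore, we reduce the data files to the top k nearest neighbors for each patient. Also, ensure that a patient is never listed as its own neighbor (rows where patient 1 equals patient 2 are removed).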

In [1]:
import os
import pandas as pd
import numpy as np
import re
import copy

In [2]:
# Directory whose entries are listed and read with pd.read_csv below
# (each file has patient_1, patient_2 and similarity columns).
input_dir = "../data/simMatrices/"
# Directory the k-nearest-neighbor subsets are written to, one CSV per
# input file, reusing the input file name.
output_dir = "../data/knnMatrices/"

In [3]:
# Collect the similarity files to process.
# os.listdir returns every directory entry in arbitrary order, so keep only
# CSV files (skipping hidden files, checkpoints and sub-directories that
# pd.read_csv could not parse) and sort them for a reproducible run order.
files = sorted(
    name for name in os.listdir(input_dir)
    if name.lower().endswith(".csv")
)
files

['chart events_activity.csv',
 'chart events_demographics.csv',
 'chart events_diet.csv',
 'chart events_heart lung.csv',
 'chart events_mental drug.csv',
 'chart events_pain.csv',
 'chart events_physical assessment.csv',
 'lab tests_antibodies.csv',
 'lab tests_blood.csv',
 'lab tests_breakdown products.csv',
 'lab tests_enzymes.csv',
 'lab tests_hepatitis.csv',
 'lab tests_other lab.csv',
 'lab tests_proteins.csv']

In [4]:
# Rank cutoff used by the subsetting loop: for each patient_1, rows whose
# dense similarity rank is <= k are kept.
k = 100

### Subset all Files and Export

In [5]:
# Subset every similarity file to each patient's nearest neighbors and export
# the result under the same file name in output_dir.

# DataFrame.to_csv does not create missing directories, so make sure the
# export directory exists before the (slow) loop starts.
os.makedirs(output_dir, exist_ok=True)

for file in files:
    print("Running File: " + file)
    df = pd.read_csv(os.path.join(input_dir, file))
    display(df.head())

    # remove all instances where patient 1 = patient 2
    # (a patient must not be counted as its own neighbor)
    df = df[df['patient_1'] != df['patient_2']]

    # Rank each patient's neighbors from most to least similar.
    # The ranks are kept in a separate Series instead of being written into
    # the filtered frame, which avoids pandas' SettingWithCopyWarning.
    # NOTE: "dense" ranking gives tied similarities the same rank, so a
    # patient with ties can keep more than k rows.
    ranks = df.groupby("patient_1")["similarity"].rank(
        method="dense", ascending=False
    )

    # Filter to the k nearest neighbors and attach the rank column
    # (rows with a missing similarity get a NaN rank and are dropped here).
    keep = ranks <= k
    filtered_df = df.loc[keep].assign(rank=ranks[keep])

    # sort and export
    filtered_df = filtered_df.sort_values(by=['patient_1', 'rank'])
    display(filtered_df)
    print(filtered_df.shape)

    filtered_df.to_csv(os.path.join(output_dir, file), index=False)
print("Complete")

Running File: chart events_activity.csv


Unnamed: 0,patient_1,patient_2,similarity
0,4,4,1.0
1,52,4,0.804625
2,78,4,0.878708
3,117,4,0.496871
4,140,4,0.768637


Unnamed: 0,patient_1,patient_2,similarity,rank
3881757,4,29573,0.982752,1.0
573800,4,4699,0.963890,2.0
4125622,4,31535,0.953409,3.0
5396589,4,57317,0.945740,4.0
1592295,4,12119,0.940062,5.0
...,...,...,...,...
6423690,99923,72290,0.986790,96.0
5356422,99923,56484,0.986682,97.0
4920334,99923,50261,0.986638,98.0
2163225,99923,16979,0.986565,99.0


(302589, 4)
Running File: chart events_demographics.csv


Unnamed: 0,patient_1,patient_2,similarity
0,4,4,1.0
1,52,4,0.81009
2,78,4,0.644189
3,117,4,0.99662
4,140,4,0.581554


Unnamed: 0,patient_1,patient_2,similarity,rank
8652,4,117,0.996620,1.0
761376,4,5801,0.994812,2.0
1052660,4,8028,0.988175,3.0
1110340,4,8516,0.988042,4.0
204764,4,1453,0.987259,5.0
...,...,...,...,...
5695899,99923,60020,0.889391,96.0
6526491,99923,72725,0.887018,97.0
6797587,99923,76780,0.886169,98.0
6601475,99923,73790,0.884302,99.0


(288400, 4)
Running File: chart events_diet.csv


Unnamed: 0,patient_1,patient_2,similarity
0,4,140,0.237708
1,52,140,0.343705
2,78,140,0.463452
3,117,140,0.405096
4,143,140,0.410488


Unnamed: 0,patient_1,patient_2,similarity,rank
170956,4,21575,0.881409,1.0
235876,4,28889,0.881409,1.0
41116,4,4979,0.869866,2.0
103872,4,12539,0.866025,3.0
60592,4,6489,0.718304,4.0
...,...,...,...,...
361387,99923,44996,0.421272,96.0
203415,99923,27096,0.421018,97.0
45443,99923,5458,0.419085,98.0
266171,99923,31928,0.417890,99.0


(256665, 4)
Running File: chart events_heart lung.csv


Unnamed: 0,patient_1,patient_2,similarity
0,4,4445,0.088427
1,52,4445,0.177136
2,78,4445,0.195058
3,117,4445,0.781336
4,140,4445,0.16785


Unnamed: 0,patient_1,patient_2,similarity,rank
2869,4,6800,0.429710,1.0
8607,4,19236,0.366365,2.0
5738,4,12246,0.291573,3.0
0,4,4445,0.088427,4.0
8608,52,19236,0.935569,1.0
...,...,...,...,...
2867,99862,4445,0.156987,4.0
11475,99923,19236,0.695832,1.0
5737,99923,6800,0.499858,2.0
8606,99923,12246,0.479771,3.0


(11476, 4)
Running File: chart events_mental drug.csv


Unnamed: 0,patient_1,patient_2,similarity
0,3632,188,0.045549
1,7009,188,0.926614
2,10248,188,0.684595
3,11912,188,0.547533
4,18892,188,0.607298


Unnamed: 0,patient_1,patient_2,similarity,rank
1659,3632,2100,1.000000,1.0
9006,3632,7339,1.000000,1.0
13509,3632,10531,1.000000,1.0
13746,3632,10686,1.000000,1.0
13983,3632,11003,1.000000,1.0
...,...,...,...,...
148124,99836,59415,0.956002,97.0
176801,99836,66232,0.956002,97.0
58538,99836,32526,0.954930,98.0
225860,99836,76327,0.953953,99.0


(70256, 4)
Running File: chart events_pain.csv


Unnamed: 0,patient_1,patient_2,similarity
0,52,4,0.291644
1,117,4,0.7923
2,143,4,0.154346
3,172,4,0.058033
4,188,4,0.591362


Unnamed: 0,patient_1,patient_2,similarity,rank
1365790,52,95672,0.868076,1.0
759021,52,22663,0.866426,2.0
1070242,52,31496,0.848974,3.0
1379224,52,98402,0.823446,4.0
394064,52,12226,0.819883,5.0
...,...,...,...,...
492579,99923,15645,0.350666,96.0
586617,99923,17924,0.350172,97.0
105232,99923,2683,0.348956,98.0
924706,99923,27166,0.347087,99.0


(368083, 4)
Running File: chart events_physical assessment.csv


Unnamed: 0,patient_1,patient_2,similarity
0,4,4,1.0
1,52,4,0.600567
2,78,4,0.772627
3,117,4,0.568409
4,140,4,0.705856


Unnamed: 0,patient_1,patient_2,similarity,rank
2374317,4,18635,0.998777,1.0
2339865,4,18430,0.998185,2.0
499554,4,4038,0.997918,3.0
3000195,4,23752,0.987992,4.0
734976,4,5689,0.986536,5.0
...,...,...,...,...
31580,99923,236,0.966927,96.0
5032862,99923,51462,0.966795,97.0
6120971,99923,67265,0.966763,98.0
6709526,99923,76251,0.966558,99.0


(306924, 4)
Running File: lab tests_antibodies.csv


Unnamed: 0,patient_1,patient_2,similarity
0,252,252,1.0
1,279,252,0.279399
2,357,252,0.999969
3,433,252,0.927209
4,634,252,-0.024586


Unnamed: 0,patient_1,patient_2,similarity,rank
500,252,357,0.999969,1.0
4250,252,2779,0.997524,2.0
11750,252,8443,0.977332,3.0
3250,252,2195,0.972754,4.0
13250,252,9706,0.970178,5.0
...,...,...,...,...
26999,99197,20182,0.257822,96.0
26249,99197,19991,0.241957,97.0
25499,99197,19031,0.217176,98.0
46499,99197,50141,0.216392,99.0


(25000, 4)
Running File: lab tests_blood.csv


Unnamed: 0,patient_1,patient_2,similarity
0,31958,27144,-0.194081
1,31958,17470,0.61424
2,31958,20182,0.43599
3,31958,28325,-0.617907
4,31958,50782,0.745398


Unnamed: 0,patient_1,patient_2,similarity,rank
2796831,4,5210,0.953846,1.0
2791905,4,3064,0.950744,2.0
2787800,4,1196,0.948790,3.0
2814072,4,10876,0.937858,4.0
2824745,4,13855,0.919266,5.0
...,...,...,...,...
980525,99923,27655,0.628059,96.0
1148638,99923,56734,0.628032,97.0
524732,99923,80536,0.627406,98.0
1288882,99923,79038,0.623720,99.0


(264229, 4)
Running File: lab tests_breakdown products.csv


Unnamed: 0,patient_1,patient_2,similarity
0,1967,279,0.622819
1,2953,279,-0.131435
2,3392,279,0.060455
3,5251,279,0.790911
4,7009,279,0.065494


Unnamed: 0,patient_1,patient_2,similarity,rank
936648,4,2304,0.992673,1.0
841941,4,1005,0.988717,2.0
938025,4,5442,0.984444,3.0
938484,4,5681,0.984049,4.0
768042,4,7758,0.979659,5.0
...,...,...,...,...
1262913,99923,95968,0.613757,96.0
1201173,99923,96686,0.612939,97.0
1551453,99923,87459,0.611507,98.0
1256613,99923,82810,0.611429,99.0


(259437, 4)
Running File: lab tests_enzymes.csv


Unnamed: 0,patient_1,patient_2,similarity
0,45226,1570,-0.063847
1,45226,3681,-0.075879
2,45226,6205,-0.003228
3,45226,21019,0.277447
4,45226,48692,0.747555


Unnamed: 0,patient_1,patient_2,similarity,rank
2034932,4,2585,0.998453,1.0
1048848,4,8309,0.988520,2.0
719618,4,4263,0.985301,3.0
1241568,4,26267,0.981296,4.0
1715338,4,15127,0.977643,5.0
...,...,...,...,...
2249358,99923,79240,0.158540,96.0
2235842,99923,65405,0.147009,97.0
2254590,99923,85655,0.145320,98.0
2234534,99923,63987,0.141333,99.0


(250552, 4)
Running File: lab tests_hepatitis.csv


Unnamed: 0,patient_1,patient_2,similarity
0,226,226,1.0
1,252,226,0.359656
2,279,226,0.902887
3,314,226,0.154535
4,357,226,0.356283


Unnamed: 0,patient_1,patient_2,similarity,rank
19074,226,2451,0.982539,1.0
46240,226,6505,0.977658,2.0
56644,226,7666,0.972427,3.0
80920,226,10963,0.953130,4.0
27744,226,4004,0.946773,5.0
...,...,...,...,...
150857,99197,20117,0.501453,96.0
316743,99197,83854,0.499629,97.0
231777,99197,29909,0.498457,98.0
313853,99197,82279,0.494075,99.0


(57800, 4)
Running File: lab tests_other lab.csv


Unnamed: 0,patient_1,patient_2,similarity
0,422,1284,0.044325
1,1734,1284,0.106324
2,2919,1284,0.57296
3,4118,1284,0.486037
4,5060,1284,0.280722


Unnamed: 0,patient_1,patient_2,similarity,rank
1982923,4,13374,0.960309,1.0
1988605,4,17299,0.917660,2.0
1956407,4,1137,0.894769,3.0
1994287,4,22098,0.883281,4.0
1965877,4,6024,0.870788,5.0
...,...,...,...,...
330661,99923,58019,0.858438,96.0
207109,99923,98573,0.856254,97.0
403877,99923,98717,0.855954,98.0
198269,99923,72931,0.855380,99.0


(182212, 4)
Running File: lab tests_proteins.csv


Unnamed: 0,patient_1,patient_2,similarity
0,117,797,0.998451
1,689,797,0.436063
2,984,797,0.444394
3,2486,797,0.514419
4,3052,797,0.055637


Unnamed: 0,patient_1,patient_2,similarity,rank
410501,4,6824,1.000000,1.0
411108,4,7355,1.000000,1.0
411715,4,28099,1.000000,1.0
412322,4,30903,1.000000,1.0
412929,4,31360,1.000000,1.0
...,...,...,...,...
778053,99836,15150,-0.734837,78.0
776965,99836,5882,-0.745964,79.0
775877,99836,188,-0.801082,80.0
776149,99836,2953,-0.806747,81.0


(146252, 4)
Complete
