# Subset Edge Files based on K Nearest Neighbors
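Comparing every patient to every other patient across 20+ subgroups is unreasonable; therefore, we reduce each data file to the top k nearest neighbors of each patient. We also ensure that a patient is never listed as its own neighbor by removing all rows where patient 1 equals patient 2.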

In [1]:
import os
import pandas as pd
import numpy as np
import re
import copy

In [2]:
# Directory the similarity CSV files are read from. The trailing slash matters:
# later cells build file paths by appending the file name to this string.
input_dir = "../data/simMatrices/"
# Directory the filtered k-nearest-neighbor CSV files are written to, using
# the same file names as the inputs.
output_dir = "../data/knnMatrices/"

In [3]:
# Collect the similarity files to process.
# os.listdir returns entries in arbitrary order and includes everything in the
# directory (hidden files, checkpoint folders, ...), so keep only CSV files and
# sort them to make the processing order reproducible between runs.
files = sorted(
    name for name in os.listdir(input_dir) if name.lower().endswith(".csv")
)
files

['chart events_activity.csv',
 'chart events_demographics.csv',
 'chart events_diet.csv',
 'chart events_heart lung.csv',
 'chart events_medical history.csv',
 'chart events_pain.csv',
 'chart events_physical assessment.csv',
 'lab tests_antibodies.csv',
 'lab tests_blood.csv',
 'lab tests_breakdown products.csv',
 'lab tests_enzymes.csv',
 'lab tests_hepatitis.csv',
 'lab tests_other lab.csv',
 'lab tests_proteins.csv']

### Subset all Files and Export

In [7]:
# Make sure the export directory exists; otherwise every to_csv call below
# fails with an OSError on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): `k` (the highest neighbor rank to keep) is not defined in any
# visible cell -- it must be assigned before this cell is run.
for file in files:
    print("Running File: " + file)
    df = pd.read_csv(os.path.join(input_dir, file))
    display(df.head())

    # Remove all rows where patient 1 == patient 2 so a patient is never its
    # own neighbor. The explicit copy gives an independent frame, so adding
    # the "rank" column below does not write into a slice of the original.
    df = df[df["patient_1"] != df["patient_2"]].copy()

    # Rank each patient's neighbors from most to least similar and keep the
    # rows ranked k or better. Dense ranking assigns tied similarities the
    # same rank, so a patient can keep more than k rows when ties occur.
    df["rank"] = df.groupby("patient_1")["similarity"].rank(
        method="dense", ascending=False
    )
    filtered_df = df[df["rank"] <= k]

    # Sort by patient, then by neighbor rank, and export under the same
    # file name as the input.
    filtered_df = filtered_df.sort_values(by=["patient_1", "rank"])
    display(filtered_df)
    print(filtered_df.shape)

    filtered_df.to_csv(os.path.join(output_dir, file), index=False)
print("Complete")

Running File: chart events_activity.csv


Unnamed: 0,patient_1,patient_2,similarity
0,4,4,1.0
1,52,4,0.804625
2,78,4,0.878708
3,117,4,0.496871
4,140,4,0.768637


Unnamed: 0,patient_1,patient_2,similarity,rank
3881757,4,29573,0.982752,1.0
573800,4,4699,0.963890,2.0
4125622,4,31535,0.953409,3.0
5396589,4,57317,0.945740,4.0
1592295,4,12119,0.940062,5.0
...,...,...,...,...
6423690,99923,72290,0.986790,96.0
5356422,99923,56484,0.986682,97.0
4920334,99923,50261,0.986638,98.0
2163225,99923,16979,0.986565,99.0


(302563, 4)
Running File: chart events_demographics.csv


Unnamed: 0,patient_1,patient_2,similarity
0,4,4,1.0
1,52,4,0.81009
2,78,4,0.644189
3,117,4,0.99662
4,140,4,0.581554


Unnamed: 0,patient_1,patient_2,similarity,rank
8652,4,117,0.996620,1.0
761376,4,5801,0.994812,2.0
1052660,4,8028,0.988175,3.0
1110340,4,8516,0.988042,4.0
204764,4,1453,0.987259,5.0
...,...,...,...,...
6872571,99923,78100,0.932249,96.0
6491883,99923,72327,0.931301,97.0
6298655,99923,69438,0.931164,98.0
6442855,99923,71364,0.929903,99.0


(288400, 4)
Running File: chart events_diet.csv


Unnamed: 0,patient_1,patient_2,similarity
0,4,140,0.237708
1,52,140,0.343705
2,78,140,0.463452
3,117,140,0.405096
4,143,140,0.410488


Unnamed: 0,patient_1,patient_2,similarity,rank
170956,4,21575,0.881409,1.0
235876,4,28889,0.881409,1.0
41116,4,4979,0.869866,2.0
103872,4,12539,0.866025,3.0
1062524,4,78433,0.866025,3.0
...,...,...,...,...
1460699,99923,96151,0.625118,96.0
430635,99923,48342,0.624449,97.0
1038719,99923,76873,0.619584,98.0
893731,99923,70469,0.617686,99.0


(361218, 4)
Running File: chart events_heart lung.csv


Unnamed: 0,patient_1,patient_2,similarity
0,4445,4,0.088427
1,6800,4,0.42971
2,12246,4,0.291573
3,19236,4,0.366365
4,4445,52,0.177136


Unnamed: 0,patient_1,patient_2,similarity,rank
11477,4,6800,0.429710,1.0
11479,4,19236,0.366365,2.0
11478,4,12246,0.291573,3.0
11476,4,4445,0.088427,4.0
11483,52,19236,0.935569,1.0
...,...,...,...,...
22944,99862,4445,0.156987,4.0
22951,99923,19236,0.695832,1.0
22949,99923,6800,0.499858,2.0
22950,99923,12246,0.479771,3.0


(11877, 4)
Running File: chart events_medical history.csv


Unnamed: 0,patient_1,patient_2,similarity
0,3632,188,0.045549
1,7009,188,0.926614
2,10248,188,0.684595
3,11912,188,0.547533
4,18892,188,0.607298


Unnamed: 0,patient_1,patient_2,similarity,rank
329808,188,75994,0.999994,1.0
329836,188,82432,0.999982,2.0
329692,188,43876,0.999879,3.0
329784,188,67788,0.999551,4.0
329889,188,94846,0.999215,5.0
...,...,...,...,...
659290,99923,88565,0.087706,87.0
659305,99923,91123,0.087706,87.0
659311,99923,92695,0.087706,87.0
659160,99923,53862,0.000000,88.0


(399926, 4)
Running File: chart events_pain.csv


Unnamed: 0,patient_1,patient_2,similarity
0,4,52,0.291644
1,78,52,0.206223
2,140,52,0.506018
3,186,52,0.506018
4,279,52,0.291644


Unnamed: 0,patient_1,patient_2,similarity,rank
299460,4,16063,1.000000,1.0
965960,4,66656,1.000000,1.0
1121580,4,78737,1.000000,1.0
835140,4,57010,0.959426,2.0
850020,4,58005,0.906821,3.0
...,...,...,...,...
2775959,99923,15645,0.350666,96.0
2776001,99923,17924,0.350172,97.0
2775786,99923,2683,0.348956,98.0
2776152,99923,27166,0.347087,99.0


(472887, 4)
Running File: chart events_physical assessment.csv


Unnamed: 0,patient_1,patient_2,similarity
0,4,4,1.0
1,52,4,0.600567
2,78,4,0.772627
3,117,4,0.568409
4,140,4,0.705856


Unnamed: 0,patient_1,patient_2,similarity,rank
2374317,4,18635,0.998777,1.0
2339865,4,18430,0.998185,2.0
499554,4,4038,0.997918,3.0
3000195,4,23752,0.987992,4.0
734976,4,5689,0.986536,5.0
...,...,...,...,...
2535092,99923,20133,0.970367,96.0
8245144,99923,63200,0.970227,97.0
5822387,99923,63200,0.970227,98.0
5400350,99923,57199,0.970120,99.0


(306932, 4)
Running File: lab tests_antibodies.csv


Unnamed: 0,patient_1,patient_2,similarity
0,252,252,1.0
1,279,252,0.279399
2,357,252,0.999969
3,433,252,0.927209
4,634,252,-0.024586


Unnamed: 0,patient_1,patient_2,similarity,rank
500,252,357,0.999969,1.0
4250,252,2779,0.997524,2.0
11750,252,8443,0.977332,3.0
3250,252,2195,0.972754,4.0
13250,252,9706,0.970178,5.0
...,...,...,...,...
26999,99197,20182,0.257822,96.0
26249,99197,19991,0.241957,97.0
25499,99197,19031,0.217176,98.0
46499,99197,50141,0.216392,99.0


(25000, 4)
Running File: lab tests_blood.csv


Unnamed: 0,patient_1,patient_2,similarity
0,634,48986,-0.752667
1,7354,48986,-0.423588
2,11825,48986,-0.25786
3,24071,48986,-0.467582
4,634,87941,-0.98196


Unnamed: 0,patient_1,patient_2,similarity,rank
4082959,4,8283,0.965046,1.0
4941631,4,4944,0.954603,2.0
5813957,4,5210,0.953846,3.0
5813951,4,3064,0.950744,4.0
5813946,4,1196,0.948790,5.0
...,...,...,...,...
3847897,99923,29163,0.629105,96.0
3848029,99923,62795,0.628167,97.0
2130033,99923,27655,0.628059,98.0
2298146,99923,56734,0.628032,99.0


(282400, 4)
Running File: lab tests_breakdown products.csv


Unnamed: 0,patient_1,patient_2,similarity
0,5450,634,0.151982
1,7536,634,0.77492
2,5450,773,0.219217
3,7536,773,0.685816
4,5450,1470,-0.146544


Unnamed: 0,patient_1,patient_2,similarity,rank
3793010,4,2304,0.992673,1.0
2117685,4,1005,0.988717,2.0
3793019,4,5442,0.984444,3.0
3793022,4,5681,0.984049,4.0
2181486,4,7758,0.979659,5.0
...,...,...,...,...
2469725,99923,82279,0.661333,96.0
4629104,99923,98196,0.659243,97.0
2407670,99923,80490,0.656552,98.0
2614940,99923,63692,0.655978,99.0


(268000, 4)
Running File: lab tests_enzymes.csv


Unnamed: 0,patient_1,patient_2,similarity
0,52,78,0.861449
1,993,78,0.858231
2,1196,78,0.868819
3,1284,78,0.941508
4,1433,78,0.744648


Unnamed: 0,patient_1,patient_2,similarity,rank
3234647,4,2585,0.998453,1.0
2326916,4,10630,0.989642,2.0
2326978,4,25140,0.988945,3.0
1183715,4,8309,0.988520,4.0
2945239,4,4263,0.985301,5.0
...,...,...,...,...
4053574,99923,60893,0.647000,96.0
4053891,99923,84382,0.645320,97.0
4053786,99923,76541,0.644793,98.0
4053373,99923,46588,0.643923,99.0


(273400, 4)
Running File: lab tests_hepatitis.csv


Unnamed: 0,patient_1,patient_2,similarity
0,226,226,1.0
1,252,226,0.359656
2,279,226,0.902887
3,314,226,0.154535
4,357,226,0.356283


Unnamed: 0,patient_1,patient_2,similarity,rank
19074,226,2451,0.982539,1.0
46240,226,6505,0.977658,2.0
56644,226,7666,0.972427,3.0
80920,226,10963,0.953130,4.0
27744,226,4004,0.946773,5.0
...,...,...,...,...
150857,99197,20117,0.501453,96.0
316743,99197,83854,0.499629,97.0
231777,99197,29909,0.498457,98.0
313853,99197,82279,0.494075,99.0


(57800, 4)
Running File: lab tests_other lab.csv


Unnamed: 0,patient_1,patient_2,similarity
0,52,3866,0.423273
1,78,3866,0.12628
2,117,3866,0.934899
3,188,3866,0.109712
4,433,3866,0.731933


Unnamed: 0,patient_1,patient_2,similarity,rank
2211951,4,4741,0.993242,1.0
2211968,4,5975,0.991820,2.0
2211911,4,1457,0.991110,3.0
2211894,4,78,0.989137,4.0
2211915,4,1704,0.983552,5.0
...,...,...,...,...
1889994,99923,63693,0.859251,96.0
1880530,99923,58019,0.858438,97.0
2211886,99923,98573,0.856254,98.0
1953746,99923,98717,0.855954,99.0


(285500, 4)
Running File: lab tests_proteins.csv


Unnamed: 0,patient_1,patient_2,similarity
0,27404,279,0.538967
1,42281,279,0.376455
2,67089,279,-0.909544
3,76780,279,-0.692547
4,99197,279,-0.506973


Unnamed: 0,patient_1,patient_2,similarity,rank
677816,4,1967,1.000000,1.0
678423,4,6676,1.000000,1.0
679030,4,7144,1.000000,1.0
679637,4,10366,1.000000,1.0
680244,4,10633,1.000000,1.0
...,...,...,...,...
1040225,99836,85141,0.995375,96.0
416223,99836,84972,0.995317,97.0
416087,99836,84970,0.995278,98.0
415951,99836,84934,0.995240,99.0


(169659, 4)
Complete
