# Lab Tests Cross-Sectional Similarity Matrices
This notebook takes in all of the lab test cross-sectional data files and computes a similarity matrix for each one.

In [8]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
import os
import pandas as pd
import numpy as np
import re, math
from sklearn.metrics.pairwise import cosine_similarity


# my functions for cross-sectional similarity
from crossSectionSimilarity import *
from simMatrix import simMatrix

In [10]:
# Directory that is scanned for the lab-test CSV inputs.
# Keep the trailing slash: file names are appended to it with plain string concatenation.
input_dir = "../data/patientData/"
# Directory passed to the similarity matrix's export() call.
output_dir = "../data/simMatrices/"

In [11]:
# Collect the cross-sectional lab-test files found in input_dir.
# os.listdir returns entries in arbitrary order, so sort them to make the
# processing order (and the printed log) reproducible across runs/machines.
files = sorted(
    file for file in os.listdir(input_dir)
    if re.search("lab tests_cross sect", file)
)
files

['lab tests_cross sect_antibodies.csv',
 'lab tests_cross sect_blood.csv',
 'lab tests_cross sect_breakdown products.csv',
 'lab tests_cross sect_enzymes.csv',
 'lab tests_cross sect_hepatitis.csv',
 'lab tests_cross sect_other lab.csv',
 'lab tests_cross sect_proteins.csv']

In [12]:
# we remove patients that do not have a lot of data from this analysis
def removeLowDataPts(df, column_perc = .5):
    """Drop the rows (patients) of ``df`` that are missing too many values.

    A row is kept when its number of null cells is at most
    ``floor(len(df.columns) * column_perc)``. Every column of ``df`` is
    counted, both for the cutoff and for the per-row null count.

    The input frame is left untouched: no helper column is written to it, so
    calling this function repeatedly on the same frame gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per patient.
    column_perc : float, default .5
        Fraction of the columns that may be missing before a row is removed.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows, with the same columns and
        the original index labels.
    """
    # if number of missing is higher than this, the row is removed
    missing_cutoff = math.floor(len(df.columns) * column_perc)

    # count nulls per row without adding a temporary column to the caller's frame
    missing_per_row = df.isnull().sum(axis=1)
    # .copy() so downstream edits never write back into (a view of) df
    filtered_df = df.loc[missing_per_row <= missing_cutoff].copy()

    # print stats and return filtered dataframe
    print(f"Original Number of Patients: {len(df)}")
    print(f"Filtered Number of Patients: {len(filtered_df)}")

    return filtered_df

In [13]:
# single run through of the pipeline to generate cosine similarity
# these functions will likely be pushed to crossSectionSimilarity 
def prepData(df, column_perc = .5):
    """Prepare one lab-test frame for the similarity computation.

    Two steps are applied in order: rows with too much missing data are
    removed via ``removeLowDataPts(df, column_perc)``, and the remainder is
    passed through ``normalizeDF``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as read from disk.
    column_perc : float, default .5
        Forwarded unchanged to ``removeLowDataPts``.

    Returns
    -------
    Whatever ``normalizeDF`` returns for the filtered frame.
    """
    # step 1: discard the patients that have too little data
    dense_df = removeLowDataPts(df, column_perc)

    # step 2: normalize (intended result: mean = 0, sd = 1)
    # NOTE(review): normalizeDF is not defined in this notebook; presumably it
    # arrives via the crossSectionSimilarity star import - confirm.
    return normalizeDF(dense_df)

### Compute the Similarity Matrices
Run the entire process

I am aware that this could be made significantly faster, but it does not matter for now. I suspect that I will finish the base analysis and visualization early, at which point I can improve upon certain areas.

In [14]:
import warnings
# NOTE(review): this silences every warning for the rest of the kernel session,
# not just the ones raised in this cell - consider narrowing the filter.
warnings.filterwarnings("ignore")

# For each lab-test file: load it, prepare it, compute the similarities for
# every column combination, then export and display the collected result.
for file in files:
    print(file)
    df = pd.read_csv(input_dir + file)
    processed_df = prepData(df)
    
    # initialize our similarity matrix class and patient combos
    # (the matrix name uses the part of the file name after the last "_", without ".csv")
    lab_sim = simMatrix("lab tests_" + re.search("_.*_(.*)\\.csv$", file).group(1))
    processed_df, col_combos = columnCombos(processed_df)
    
    # Report progress roughly every 10% of the combos. Clamp to at least 1:
    # with only a handful of combos round(...) is 0, and the modulo below
    # would raise ZeroDivisionError (previously hidden by a bare except,
    # which also swallowed KeyboardInterrupt).
    progress_step = max(1, round(len(col_combos) * .1))
    
    # now compute similarity between each patient-patient
    for i, col_tuple in enumerate(col_combos):
        cosine, cosine_melted = computeCosine(processed_df, col_tuple)
        lab_sim.insertRow(cosine_melted)
        # stop early if the same patient pair shows up twice in one result;
        # note the offending rows have already been inserted at this point
        if any(cosine_melted.duplicated(subset = ['patient_1', 'patient_2'])):
            print("breaking")
            break
            
        if i % progress_step == 0:
            print(f"Round {i} of {len(col_combos)}")
    
    # export
    print("Exporting...")
    lab_sim.export(output_dir)
    display(lab_sim.similarities)
    print("\n")
    

lab tests_cross sect_antibodies.csv
Original Number of Patients: 250
Filtered Number of Patients: 250
Exporting...


Unnamed: 0,patient_1,patient_2,similarity
0,252,252,1.000000
1,279,252,0.279399
2,357,252,0.999969
3,433,252,0.927209
4,634,252,-0.024586
...,...,...,...
62495,96152,99197,0.548144
62496,96364,99197,0.176565
62497,97547,99197,0.999972
62498,98622,99197,0.519501




lab tests_cross sect_blood.csv
Original Number of Patients: 2880
Filtered Number of Patients: 2824
Round 0 of 990
Round 99 of 990
Round 198 of 990
Round 297 of 990
Round 396 of 990
Round 495 of 990
Round 594 of 990
Round 693 of 990
Round 792 of 990
Round 891 of 990
Exporting...


Unnamed: 0,patient_1,patient_2,similarity
0,634,48986,-0.752667
1,7354,48986,-0.423588
2,11825,48986,-0.257860
3,24071,48986,-0.467582
4,634,87941,-0.981960
...,...,...,...
1,9706,10515,-0.274580
2,20994,10515,-0.424933
3,24993,10515,-0.113938
4,25429,10515,-0.180432




lab tests_cross sect_breakdown products.csv
Original Number of Patients: 2880
Filtered Number of Patients: 2680
Round 0 of 2701
Round 270 of 2701
Round 540 of 2701
Round 810 of 2701
Round 1080 of 2701
Round 1350 of 2701
Round 1620 of 2701
Round 1890 of 2701
Round 2160 of 2701
Round 2430 of 2701
Round 2700 of 2701
Exporting...


Unnamed: 0,patient_1,patient_2,similarity
0,5450,634,0.151982
1,7536,634,0.774920
2,5450,773,0.219217
3,7536,773,0.685816
4,5450,1470,-0.146544
...,...,...,...
149,95380,40033,0.616786
150,95447,40033,0.070571
151,98605,40033,-0.434843
0,19164,40033,-0.459876




lab tests_cross sect_enzymes.csv
Original Number of Patients: 2880
Filtered Number of Patients: 2734
Round 0 of 55
Round 6 of 55
Round 12 of 55
Round 18 of 55
Round 24 of 55
Round 30 of 55
Round 36 of 55
Round 42 of 55
Round 48 of 55
Round 54 of 55
Exporting...


Unnamed: 0,patient_1,patient_2,similarity
0,52,78,0.861449
1,993,78,0.858231
2,1196,78,0.868819
3,1284,78,0.941508
4,1433,78,0.744648
...,...,...,...
241,17133,98402,-0.005786
242,17133,98761,0.082601
243,17133,99166,-0.681433
244,17133,99485,-0.254495




lab tests_cross sect_hepatitis.csv
Original Number of Patients: 578
Filtered Number of Patients: 578
Exporting...


Unnamed: 0,patient_1,patient_2,similarity
0,226,226,1.000000
1,252,226,0.359656
2,279,226,0.902887
3,314,226,0.154535
4,357,226,0.356283
...,...,...,...
334079,95816,99197,0.553746
334080,96152,99197,0.713750
334081,96686,99197,0.685834
334082,97547,99197,0.528597




lab tests_cross sect_other lab.csv
Original Number of Patients: 2880
Filtered Number of Patients: 2855
Round 0 of 28
Round 3 of 28
Round 6 of 28
Round 9 of 28
Round 12 of 28
Round 15 of 28
Round 18 of 28
Round 21 of 28
Round 24 of 28
Round 27 of 28
Exporting...


Unnamed: 0,patient_1,patient_2,similarity
0,52,3866,0.423273
1,78,3866,0.126280
2,117,3866,0.934899
3,188,3866,0.109712
4,433,3866,0.731933
...,...,...,...
87119,99740,87846,0.247167
87120,99768,87846,0.986534
87121,99791,87846,0.694785
87122,99830,87846,0.567569




lab tests_cross sect_proteins.csv
Original Number of Patients: 2880
Filtered Number of Patients: 1415
Round 0 of 210
Round 21 of 210
Round 42 of 210
Round 63 of 210
Round 84 of 210
Round 105 of 210
Round 126 of 210
Round 147 of 210
Round 168 of 210
Round 189 of 210
Exporting...


Unnamed: 0,patient_1,patient_2,similarity
0,27404,279,0.538967
1,42281,279,0.376455
2,67089,279,-0.909544
3,76780,279,-0.692547
4,99197,279,-0.506973
...,...,...,...
4126,93077,98573,0.990254
4127,93209,98573,0.970094
4128,96218,98573,0.994059
4129,97529,98573,0.180245




