# Chart Events Similarity Matrices
Converts all nominal categorical variables to continuous values via multiple correspondence analysis, then combines them with the scaled continuous variables. The combined data can then be passed to the cosine similarity functions that I have already developed.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
import numpy as np
import re

from simMatrix import simMatrix
from crossSectionSimilarity import *

In [3]:
# Directory containing the per-patient chart-events CSV files. The trailing
# slash matters: file paths below are built by plain string concatenation.
input_dir = "../data/patientData/"
# Directory handed to simMatrix.export() when the similarity matrices are written.
output_dir = "../data/simMatrices/"

In [4]:
# os.listdir() returns entries in arbitrary, platform-dependent order; sort them
# so the file lists (and the order files are processed in) are reproducible.
files = sorted(os.listdir(input_dir))

# Split the chart-events files by type using the naming convention
# "chart events_<categorical|numeric>_<topic>.csv".
cat_files = [file for file in files if "chart events_categorical" in file]
num_files = [file for file in files if "chart events_numeric" in file]
cat_files, num_files

(['chart events_categorical_activity.csv',
  'chart events_categorical_demographics.csv',
  'chart events_categorical_diet.csv',
  'chart events_categorical_heart lung.csv',
  'chart events_categorical_mental drug.csv',
  'chart events_categorical_pain.csv',
  'chart events_categorical_physical assessment.csv'],
 ['chart events_numeric_demographics.csv',
  'chart events_numeric_heart lung rate.csv',
  'chart events_numeric_medical history.csv',
  'chart events_numeric_pain level.csv',
  'chart events_numeric_weight.csv'])

In [5]:
# Pairs each categorical chart-events file with the numeric chart-events file
# that is merged into it. Categorical files absent from this mapping are
# processed on their own, without any numeric data.
cat2num_map = dict([
    ('chart events_categorical_diet.csv', 'chart events_numeric_weight.csv'),
    ('chart events_categorical_heart lung.csv', 'chart events_numeric_heart lung rate.csv'),
    ('chart events_categorical_demographics.csv', 'chart events_numeric_demographics.csv'),
    ('chart events_categorical_pain.csv', 'chart events_numeric_pain level.csv'),
    ('chart events_categorical_mental drug.csv', 'chart events_numeric_medical history.csv'),
])

<b> Plan of Action: </b>
  * identify the categorical and numeric variables
  * Categorical 
   * Convert into the multi-index dummy matrix incorporating frequency
  * Numeric
   * scale the variables to have mean 0 and sd 1 
  * concatenate the datasets 
  * run cosine similarity scripts

In [6]:
def nominal2Numeric(df):
    """Pivot long-format categorical records into a wide per-subject matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``subject_id``, ``label``, ``value`` and
        ``frequency``.

    Returns
    -------
    pandas.DataFrame
        One row per ``subject_id`` and one column per (``label``, ``value``)
        pair (a two-level column index). Each cell holds the ``frequency``
        for that pair; repeated pairs are combined with ``pivot_table``'s
        default aggregation (the mean).
    """
    return (
        df.pivot_table(values='frequency',
                       index='subject_id',
                       columns=['label', 'value'])
          # a (label, value) pair a subject never recorded has frequency 0
          .fillna(0)
    )


In [7]:
def prepData(df_cat, df_num = None):
    """Build the per-subject feature table used for the similarity computation.

    The categorical records are pivoted into a wide frequency matrix with
    ``nominal2Numeric``; its two-level (label, value) column index is then
    flattened into single lower-case names with periods removed and spaces
    replaced by underscores. If numeric data is supplied it is passed through
    ``normalizeDF`` and left-joined on ``subject_id``.

    Parameters
    ----------
    df_cat : pandas.DataFrame
        Long-format categorical records (``subject_id``, ``label``, ``value``,
        ``frequency``).
    df_num : pandas.DataFrame, optional
        Numeric records containing a ``subject_id`` column. When ``None``
        (the default) only the categorical features are returned.

    Returns
    -------
    pandas.DataFrame
        One row per subject in the categorical data, with ``subject_id`` as a
        regular column.
    """
    wide = nominal2Numeric(df_cat)

    flat_names = []
    for col in wide.columns.values:
        # str() keeps the join from raising TypeError when a level is not a
        # string (e.g. a `value` column that read_csv parsed as numeric).
        name = '_'.join(str(level) for level in col).strip().lower()
        name = re.sub("\\.", "", name)
        flat_names.append(re.sub(" ", "_", name))
    wide.columns = flat_names
    wide = wide.reset_index()

    if df_num is not None:
        # normalizeDF is not defined in this notebook (presumably it comes from
        # the crossSectionSimilarity star-import and scales the numeric
        # columns, per the plan of action -- TODO confirm).
        df_num = normalizeDF(df_num)
        # Left join: every subject with categorical data is kept; subjects
        # missing from df_num get NaN in the numeric columns.
        wide = pd.merge(wide, df_num, on = "subject_id", how = "left")

    return wide

In [9]:
# for testing
# Build and export one patient-patient cosine similarity matrix per categorical file.
for file in cat_files:
    print(file)
    df_cat = pd.read_csv(f"{input_dir}{file}")

    # Only some categorical files have a numeric companion (see cat2num_map);
    # the others are processed on their categorical frequencies alone.
    df_num = None
    if file in cat2num_map:
        df_num = pd.read_csv(f"{input_dir}{cat2num_map[file]}")

    processed_df = prepData(df_cat, df_num)

    # initialize our similarity matrix class and patient combos
    # The matrix is named after the part of the file name that follows the
    # last underscore (without the .csv extension).
    chart_sim = simMatrix("chart events_"+  re.search("_.*_(.*)\\.csv$", file).group(1))
    processed_df, col_combos = columnCombos(processed_df)

    # Report progress roughly every 10% of the combos. The interval is computed
    # once, up front: previously a bare `except: pass` silently hid both a
    # zero interval (ZeroDivisionError when there are only a few combos) and an
    # unsized col_combos (TypeError from len), so progress could vanish unnoticed.
    try:
        n_combos = len(col_combos)
    except TypeError:
        n_combos = 0  # unsized iterable: total unknown, progress reporting disabled
    progress_step = max(1, round(n_combos * .1)) if n_combos else 0

    # now compute similarity between each patient-patient
    for i, col_tuple in enumerate(col_combos):
        cosine, cosine_melted = computeCosine(processed_df, col_tuple)
        chart_sim.insertRow(cosine_melted)
        # Sanity check: a patient pair must not appear twice within one batch.
        # NOTE(review): the offending batch has already been inserted above and
        # the partial matrix is still exported below -- confirm this is intended.
        if cosine_melted.duplicated(subset = ['patient_1', 'patient_2']).any():
            print("breaking")
            break

        if progress_step and i % progress_step == 0:
            print(f"Round {i} of {n_combos}")

    # export
    print("Exporting...")
    chart_sim.export(output_dir)
    display(chart_sim.similarities)
    print("\n")

chart events_categorical_activity.csv
Exporting...


Unnamed: 0,patient_1,patient_2,similarity
0,4,4,1.000000
1,52,4,0.804625
2,78,4,0.878708
3,117,4,0.496871
4,140,4,0.768637
...,...,...,...
8231156,99791,99923,0.905853
8231157,99830,99923,0.821345
8231158,99836,99923,0.821575
8231159,99862,99923,0.850889




chart events_categorical_demographics.csv
Exporting...


Unnamed: 0,patient_1,patient_2,similarity
0,4,4,1.000000
1,52,4,0.810090
2,78,4,0.644189
3,117,4,0.996620
4,140,4,0.581554
...,...,...,...
8317451,99791,99923,0.894443
8317452,99830,99923,0.735930
8317453,99836,99923,0.563377
8317454,99862,99923,0.814624




chart events_categorical_diet.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Exporting...


Unnamed: 0,patient_1,patient_2,similarity
0,4,140,0.237708
1,52,140,0.343705
2,78,140,0.463452
3,117,140,0.405096
4,143,140,0.410488
...,...,...,...
1527779,99756,99791,0.591215
1527780,99830,99791,0.535558
1527781,99836,99791,0.530746
1527782,99862,99791,0.247013




chart events_categorical_heart lung.csv
Exporting...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,patient_1,patient_2,similarity
0,4445,4,0.088427
1,6800,4,0.429710
2,12246,4,0.291573
3,19236,4,0.366365
4,4445,52,0.177136
...,...,...,...
11471,19236,99862,0.545899
11472,4445,99923,0.174426
11473,6800,99923,0.499858
11474,12246,99923,0.479771




chart events_categorical_mental drug.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Exporting...


Unnamed: 0,patient_1,patient_2,similarity
0,3632,188,0.045549
1,7009,188,0.926614
2,10248,188,0.684595
3,11912,188,0.547533
4,18892,188,0.607298
...,...,...,...
329662,98944,99923,0.124035
329663,99067,99923,0.287931
329664,99685,99923,0.701646
329665,99768,99923,0.124035




chart events_categorical_pain.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Exporting...


Unnamed: 0,patient_1,patient_2,similarity
0,4,52,0.291644
1,78,52,0.206223
2,140,52,0.506018
3,186,52,0.506018
4,279,52,0.291644
...,...,...,...
1388175,98254,99923,0.176618
1388176,98402,99923,0.568080
1388177,98944,99923,0.284082
1388178,99514,99923,0.234940




chart events_categorical_physical assessment.csv
Exporting...


Unnamed: 0,patient_1,patient_2,similarity
0,4,4,1.000000
1,52,4,0.600567
2,78,4,0.772627
3,117,4,0.568409
4,140,4,0.705856
...,...,...,...
8242636,99791,99923,0.929754
8242637,99830,99923,0.791460
8242638,99836,99923,0.769693
8242639,99862,99923,0.734261




