In [2]:
import numpy as np
import scipy as sc
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

import SimpSOM as sps
import minisom as msom
import sompy
from umap import UMAP
from sklearn.manifold import TSNE

import random

import os
import sys
from collections import Counter
from collections import namedtuple
import inspect

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import HuberRegressor, RANSACRegressor, BayesianRidge
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA, NMF
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer

from sklearn.cluster import SpectralClustering, AffinityPropagation, OPTICS, AgglomerativeClustering
import HDBSCAN


Data transformation:
* standard scaling
* robust scaling

Feature reduction:
* minimum univariate distribution difference
* PCA 
* UMAP

Clustering methods:
* HDBSCAN
* Spectral
* SOM

This analysis should give us an intuition for how separable the targets are 
given the available features.

**Output**: clusters can be used as features. 

In [3]:
# Switch to the project directory and load the semicolon-separated HeartScore matrix.
# The path is a raw string: in a normal string "\l", "\A" and "\B" are invalid escape
# sequences, which newer Python versions warn about. The resulting value is unchanged.
os.chdir(r"T:\laupodteam\AIOS\Bram")
HS = pd.read_csv("data/HeartScore/Data/MATRIX_FULL_23jul2019_ECG.csv", sep=";")

# Column groups: identifiers (become the index), timestamps, metadata and named feature sets.
index_cols = ['pathos_key', 'upod_id']
date_cols = ['AcquisitionDateTime_ECG']
meta_cols = ['setsrc', 'Door', 'Analyzer']
pheno_cols = ['AGE', 'gender', 'BMI', 'RF_Diab', 'RF_Smok', 'RF_HyperTens', 'RF_HyperChol', 'RF_CVDHist', 'RF_FamHist', 'RF_obese30']
hs_cols = ['HS_AGE', 'HS_History', 'HS_ECG', 'HS_new', 'HS_RiskFacts', 'HS_new2']
tn_cols = ['tn_admission', 'tn_slope2', 'HN_TN']

# Convert boolean columns to integers.
for _col in list(HS.select_dtypes(include='bool').columns):
    HS[_col] = HS[_col].astype(int)

# Rename the outcome column to 'target' and encode it numerically.
# Values missing from the map become NaN (pandas `Series.map` behaviour).
target = 'casecontrol'
HS = HS.rename(columns={target: 'target'})
tmap = {'Control': 0, 'NSTEMI': 1}
HS['target'] = HS['target'].map(tmap)

# Encode gender numerically.
gmap = {'M': 0, 'F': 1}
HS['gender'] = HS['gender'].map(gmap)

HS = HS.set_index(index_cols)

# Feature columns: everything that is not metadata, an identifier, a date or the target.
# Filtering `cols` in place keeps the file's column order; the previous set arithmetic
# produced an order that depends on string hashing and so changed between kernel sessions.
cols = HS.columns.tolist()
_non_feature_cols = set(meta_cols) | set(index_cols) | set(date_cols) | {'target'}
var_cols = [_c for _c in cols if _c not in _non_feature_cols]

# Candidate blood-cell (`c_b_*`) column names plus `delay_Celldyn`.
cell_dyn_cols = ["c_b_wbc","c_b_wvf","c_b_neu","c_b_seg","c_b_bnd","c_b_ig","c_b_lym","c_b_lyme","c_b_vlym","c_b_mon","c_b_mone","c_b_blst",
                 "c_b_eos","c_b_bas","c_b_pneu","c_b_pseg","c_b_pbnd","c_b_pig","c_b_plym","c_b_plyme","c_b_pvlym","c_b_pmon","c_b_pmone",
                 "c_b_pblst","c_b_peos","c_b_pbas","c_b_namn","c_b_nacv","c_b_nimn","c_b_nicv","c_b_npmn","c_b_npcv","c_b_ndmn",
                 "c_b_ndcv","c_b_nfmn","c_b_nfcv","c_b_Lamn","c_b_Lacv","c_b_Limn","c_b_Licv","delay_Celldyn"]
# Candidate ECG column names that carry no lead prefix.
ecg_cols_agg =  ["VentricularRate_ECG","AtrialRate_ECG","P_RInterval_ECG","QRS_Duration_ECG","Q_TInterval_ECG",
                 "QTCCalculation_ECG","PAxis_ECG","RAxis_ECG","TAxis_ECG","QRSCount_ECG","QOnset_ECG",
                 "QOffset_ECG","POnset_ECG","POffset_ECG","T_Onset_ECG","T_Offset_ECG","QRS_Onset_ECG",
                 "QRS_Offset_ECG","AvgRRInterval_ECG","QTcFredericia_ECG","QTcFramingham_ECG","QTc_Bazett_ECG"]

# Lead prefixes and measurement suffixes; a per-lead ECG column name is prefix + suffix.
ecg_leads = ['Lead_I_', 'Lead_II_', 'Lead_III_', 'Lead_V1_', 'Lead_V2_', 'Lead_V3_', 'Lead_V4_', 'Lead_V5_', 'Lead_V6_', 'Lead_aVF_', 'Lead_aVL_', 'Lead_aVR_']
ecg_msrmnt = ['MaxST_ECG',  'Max_R_Ampl_ECG', 'Max_S_Ampl_ECG', 'MinST_ECG', 'PFull_Area_ECG', 'PP_Area_ECG', 'PP_Duration_ECG',
 'PP_PeakAmpl_ECG', 'PP_PeakTime_ECG', 'P_Area_ECG', 'P_Duration_ECG', 'P_PeakAmpl_ECG', 'P_PeakTime_ECG', 'QRS_Area_ECG', 'QRS_Balance_ECG',
 'QRS_Deflection_ECG', 'QRSint_ECG', 'Q_Area_ECG', 'Q_Duration_ECG', 'Q_PeakAmpl_ECG', 'Q_PeakTime_ECG', 'RP_Area_ECG', 'RP_Duration_ECG', 'RP_PeakAmpl_ECG',
 'RP_PeakTime_ECG', 'R_Area_ECG', 'R_Duration_ECG', 'R_PeakAmpl_ECG', 'R_PeakTime_ECG', 'SP_Area_ECG', 'SP_Duration_ECG', 'SP_PeakAmpl_ECG', 
 'SP_PeakTime_ECG', 'STE_ECG', 'STJ_ECG', 'STM_ECG', 'S_Area_ECG', 'S_Duration_ECG', 'S_PeakAmpl_ECG', 'S_PeakTime_ECG',
 'TFull_Area_ECG', 'TP_Area_ECG', 'TP_Duration_ECG', 'TP_PeakAmpl_ECG', 'TP_PeakTime_ECG', 'T_Area_ECG', 'T_Duration_ECG', 'T_End_ECG',
 'T_PeakAmpl_ECG', 'T_PeakTime_ECG', 'T_Special_ECG', 'P_OnsetAmpl_ECG']


def _keep_available(names, available):
    """Return the unique entries of `names` that are in `available`, in their original order."""
    return [_name for _name in dict.fromkeys(names) if _name in available]


# All per-lead ECG column names: every lead prefix combined with every measurement suffix.
ecg_cols_dyn = [_lead+_msrmnt for _lead in ecg_leads for _msrmnt in ecg_msrmnt]

# Keep only the names that exist as feature columns. Order-preserving filters replace
# the earlier set intersections, whose ordering depended on string hashing and therefore
# differed between kernel sessions.
_available_cols = set(var_cols)
ecg_cols_agg = _keep_available(ecg_cols_agg, _available_cols)
ecg_cols_dyn = _keep_available(ecg_cols_dyn, _available_cols)
cell_dyn_cols = _keep_available(cell_dyn_cols, _available_cols)

# All ECG columns (aggregate first, then per-lead), without duplicates.
ecg_cols = list(dict.fromkeys(ecg_cols_agg+ecg_cols_dyn))
# Remaining feature columns that belong to neither the ECG nor the cell-dyn group.
_grouped_cols = set(ecg_cols) | set(cell_dyn_cols)
other_cols = [_c for _c in var_cols if _c not in _grouped_cols]

In [6]:
# Shared seed for every stochastic estimator below, so a restarted kernel reproduces the same results.
SEED = 42

scaler = StandardScaler() # StandardScaler(), MinMaxScaler(), RobustScaler()

# dict with dimension reduction per data group, or one dim red for all, or None, methods: PCA, NMF, UMAP
dim_reduction = {
    'ecg': PCA(n_components=6, random_state=SEED),
    'celldyn': PCA(n_components=20, random_state=SEED),
}

# dict with column name and impute type: median, mean, remove, regressor, (nmf?), or None, or knnimputer, or iterative which uses a round-robin approach using BayesianRidge as the regressor
imputance = {
    'BMI': 'median',
    'P_RInterval_ECG': BayesianRidge(),
    'POnset_ECG': BayesianRidge(),
    'PAxis_ECG': RANSACRegressor(random_state=SEED),
    'POffset_ECG': ExtraTreesRegressor(n_estimators=200, random_state=SEED),
    'delay_Celldyn': MLPRegressor(hidden_layer_sizes=(80,50,30), random_state=SEED),
}

feature_weights = 'glm' # glm, tree, gam
clustering = 'hdbscan' # hdbscan, SOM, spectral

In [7]:
# Scaling
# Build the working frame `dat` from HS. With a scaler configured, the feature columns
# are scaled and the metadata columns are joined back on the index.
# The check uses `scaler`, the name assigned in the configuration cell; the previous
# name `scaling` was never defined and raised a NameError.
if scaler is not None:
    dat = pd.DataFrame(data=scaler.fit_transform(HS[var_cols]), index=HS.index, columns=var_cols)
    dat = dat.join(HS[meta_cols])
else:
    # Copy so that later in-place imputation on `dat` does not modify the raw HS frame.
    dat = HS.copy()

In [8]:
# imputance
# Fill (or drop) missing values in `dat` according to `imputance`:
#   * dict   -> per-column strategy: 'median', 'mean', 'remove', or an estimator with
#               fit/predict that is trained on the rows where the column is present
#   * string -> 'iterative' or 'knnimputer', applied to all feature columns at once
#   * None   -> nothing is imputed

# Feature columns that contain at least one missing value; they are excluded from the
# predictors used by the regression-based imputers.
_has_nan = dat[var_cols].isna().any()
nan_cols = list(_has_nan[_has_nan].index)

if imputance is not None:
    if isinstance(imputance, dict):
        # Predictor columns in a stable order (var_cols order), not set order.
        _nan_set = set(nan_cols)
        _sub_cols = [_c for _c in var_cols if _c not in _nan_set]

        for _imp_key, _imp_val in imputance.items():
            _missing = dat[_imp_key].isna()

            if isinstance(_imp_val, str):
                if _imp_val == 'median':
                    dat.loc[_missing, _imp_key] = np.nanmedian(dat[_imp_key])
                elif _imp_val == 'mean':
                    dat.loc[_missing, _imp_key] = np.nanmean(dat[_imp_key])
                elif _imp_val == 'remove':
                    dat = dat.dropna(subset=[_imp_key])
                else:
                    print("Unknown imputance option '{}' for {}, column left untouched".format(_imp_val, _imp_key))

            elif hasattr(_imp_val, 'fit') and hasattr(_imp_val, 'predict'):
                if not _missing.any():
                    # Nothing to predict; fitting would only fail on an empty test set.
                    print("No missing values in {}".format(_imp_key))
                    print("Hmm, you probably already ran the imputer, please reload the data...")
                    continue

                _y = dat.loc[~_missing, _imp_key]
                _X_train = dat.loc[~_missing, _sub_cols]
                _X_test = dat.loc[_missing, _sub_cols]
                try:
                    dat.loc[_missing, _imp_key] = _imp_val.fit(_X_train, _y).predict(_X_test)
                except Exception as e:
                    # Best effort: report the failure, including its cause, and continue
                    # with the remaining columns.
                    print("Imputance failed for {}, shapes: {}, {}, {}".format(_imp_key, _X_train.shape, _y.shape, _X_test.shape))
                    print("  cause: {}: {}".format(type(e).__name__, e))

            else:
                print("Unsupported imputer {!r} for {}, column left untouched".format(_imp_val, _imp_key))
    else:
        if imputance=='iterative':
            imp = IterativeImputer(estimator=BayesianRidge(), max_iter=10)
        elif imputance=='knnimputer':
            imp = KNNImputer(n_neighbors=5)
        else:
            raise ValueError("Unknown imputance '{}': expected a dict, 'iterative', 'knnimputer' or None".format(imputance))

        # Keep the metadata aside before `dat` is rebuilt from the imputed feature matrix;
        # joining `dat[meta_cols]` afterwards would fail because the rebuilt frame only
        # contains the feature columns.
        _meta = dat[meta_cols]
        dat = pd.DataFrame(data=imp.fit_transform(dat[var_cols]), index=dat.index, columns=var_cols)
        dat = dat.join(_meta)

In [9]:
# Dim reduction
def _reduce_columns(reducer, frame, prefix):
    """Fit `reducer` on `frame` and return its components as a DataFrame.

    The result keeps the index of `frame`; columns are named '<prefix>_0', '<prefix>_1', ...
    The number of columns is taken from the transformed output instead of
    `reducer.n_components`, so reducers whose `n_components` is not an integer still work.
    """
    reduced = reducer.fit_transform(frame)
    names = ['{}_{}'.format(prefix, i) for i in range(reduced.shape[1])]
    return pd.DataFrame(reduced, index=frame.index, columns=names)


if dim_reduction is None:
    # No reduction requested: downstream cells still get a `dat_red` with all feature columns.
    dat_red = dat[var_cols]
elif isinstance(dim_reduction, dict):
    assert(set(dim_reduction.keys()).issubset(['ecg', 'celldyn'])), "Check the dim_reduction keys"
    # A group without a configured reducer keeps its original columns.
    if 'ecg' in dim_reduction:
        ecg_red = _reduce_columns(dim_reduction['ecg'], dat[ecg_cols], 'ecg')
    else:
        ecg_red = dat[ecg_cols]
    ecols = ecg_red.columns.tolist()

    if 'celldyn' in dim_reduction:
        celldyn_red = _reduce_columns(dim_reduction['celldyn'], dat[cell_dyn_cols], 'celldyn')
    else:
        celldyn_red = dat[cell_dyn_cols]
    ccols = celldyn_red.columns.tolist()

    dat_red = dat[other_cols].join(ecg_red).join(celldyn_red)
else:
    # Single reducer applied to all feature columns at once.
    tot_red = _reduce_columns(dim_reduction, dat[var_cols], 'red')
    rcols = tot_red.columns.tolist()
    # NOTE(review): other_cols are also part of var_cols, so they enter dat_red both raw
    # and through the components - confirm this is intended.
    dat_red = dat[other_cols].join(tot_red)

## SOM clustering

In [None]:
# Clustering
# Cluster both dat_red and dat
# SOM: https://github.com/fcomitani/SimpSOM
# miniSOM: https://github.com/JustGlowing/minisom
# SOMPY: https://gist.github.com/sevamoo/035c56e7428318dd3065013625f12a11
# customSOM : https://pythonhosted.org/kohonen/_modules/kohonen/kohonen.html
# Only the SimpSOM variant is run here, on the full feature matrix `dat[var_cols]`.
# 24 x 24 map; per the cell output, PBC enables periodic boundary conditions and PCI
# initialises the weights with PCA.
net = sps.somNet(24, 24, dat[var_cols].values, PBC=True, n_jobs=4, PCI=True)
# presumably (start learning rate, number of epochs) - TODO confirm against the SimpSOM docs
net.train(0.01, 1000)
net.nodes_graph(colnum=0)
net.diff_graph()
#Project the datapoints on the new 2D network map.
# NOTE(review): labels come from HS while the data come from dat; if rows were dropped
# from dat (imputance 'remove') the two no longer line up - verify.
net.project(dat[var_cols].values, labels=HS.target.values) #  labels=labels
#Cluster the datapoints according to the Quality Threshold algorithm.
net.cluster(dat[var_cols].values, type='qthresh')
plt.show()

Periodic Boundary Conditions active.
The weights will be initialised with PCA.
Training SOM... done!


In [None]:
# TODO: clustering methods still to be added in this cell.
# Spectral: pre-set 2 clusters
# Agglomerative: pre-set 2 clusters
# OPTICS
# HDBSCAN


In [240]:
# Separability of clusters by target
# Single-column DataFrame holding the encoded target, on the same index as HS.
y = HS.loc[:, ['target']]




# One more try, using supervised clustering.