In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle as pkl
import os
from SamBA.samba import NeighborHoodClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from random_scm.random_scm_classifier import RandomScmClassifier
from summit.multiview_platform.monoview_classifiers.scm import SCM
from sklearn.ensemble import RandomForestClassifier

rs = np.random.RandomState(42)

splits = 100

def best_feats(importances, limit=5):
    best_inds = np.argsort(-importances)
    return dict((str(ind), np.round(importances[ind], 2)) for ind in best_inds[:limit])

def benchmark(X, y, n_splits=10, train_size=0.75):
    neigh_clf = NeighborHoodClassifier(
                     base_estimator=DecisionTreeClassifier(max_depth=1,
                                                           splitter='best',
                                                           criterion='gini'),
                     n_estimators=10)
    rscm_clf = RandomScmClassifier(n_estimators=10, max_rules=10,
                     p_options=[1.0],
                     model_type="conjunction",
                     random_state=rs)
    scm_clf = SCM(
                random_state=rs,
                model_type="conjunction",
                max_rules=10,
                p=1.0)
    rf_clf = RandomForestClassifier(n_estimators=10, random_state=rs, max_depth=3)

    accuracies = np.zeros((n_splits, 4))
    n_feats = np.zeros((n_splits, 4))
    feature_importances = np.zeros((X.shape[1], n_splits, 4))
    for i in range(n_splits):
        X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=rs, train_size=train_size)
        neigh_clf.fit(X_train, y_train)
        rscm_clf.fit(X_train, y_train)
        scm_clf.fit(X_train, y_train)
        rf_clf.fit(X_train, y_train)

        neigh_clf_preds = neigh_clf.predict(X_test)
        rscm_clf_preds = rscm_clf.predict(X_test)
        scm_clf_preds = scm_clf.predict(X_test)
        rf_clf_preds = rf_clf.predict(X_test)

        accuracies[i, 0] = accuracy_score(neigh_clf_preds, y_test)
        accuracies[i, 1] = accuracy_score(rscm_clf_preds, y_test)
        accuracies[i, 2] = accuracy_score(scm_clf_preds, y_test)
        accuracies[i, 3] = accuracy_score(rf_clf_preds, y_test)

        feature_importances[:, i, 0] = neigh_clf.feature_importances_
        feature_importances[:, i, 1] = rscm_clf.feature_importances_
        feature_importances[:, i, 2] = scm_clf.feature_importances_
        feature_importances[:, i, 3] = rf_clf.feature_importances_

        n_feats[i, 0]= len(np.where(neigh_clf.feature_importances_!=0)[0])
        n_feats[i, 1]= len(np.where(rscm_clf.feature_importances_!=0)[0])
        n_feats[i, 2]= len(np.where(scm_clf.feature_importances_!=0)[0])
        n_feats[i, 3]= len(np.where(rf_clf.feature_importances_!=0)[0])

    mean_accs = np.round(np.mean(accuracies, axis=0), 2)
    stds_accs = np.round(np.std(accuracies, axis=0), 2)
    mean_n_feats = np.round(np.mean(n_feats, axis=0), 2)
    mean_feature_importances = np.round(np.mean(feature_importances, axis=1),2)
    
    print("NeighborHoodClassifier has a mean accuracy of \n\t{} +/-{}, relying on \n\t{} features \n\t {} ".format(mean_accs[0], stds_accs[0], mean_n_feats[0], best_feats(mean_feature_importances[:,0])))
    print("RandomSCM has a mean accuracy of \n\t{} +/-{}, relying on \n\t{} features \n\t {} ".format(mean_accs[1], stds_accs[1], mean_n_feats[1], best_feats(mean_feature_importances[:,1])))
    print("VanillaSCM has a mean accuracy of \n\t{} +/-{}, relying on \n\t{} features \n\t {} ".format(mean_accs[2], stds_accs[2], mean_n_feats[2], best_feats(mean_feature_importances[:,2])))
    print("RandomForest has a mean accuracy of \n\t{} +/-{}, relying on \n\t{} features \n\t {} ".format(mean_accs[3], stds_accs[3], mean_n_feats[3], best_feats(mean_feature_importances[:,3])))

# Biological experiments with Neighborhood classifier

## Recover dataset

Task : predict the persitency of covid symptoms with proteomics, and metabolomics.
To extract the data, we use a script, slightly modified. 

First, let's extract the metadata that will provide the classification labels :

In [2]:
data_path = '/home/baptiste/Documents/Datasets/recover'

metadata_filename = os.path.join(data_path, 'metadata.csv')
meta_df = pd.read_csv(metadata_filename)
meta_df.columns = ['#', 'plate', '-', 'symptoms'] + list(meta_df)[4:]
print('labels metadata :', meta_df['symptoms'].to_list())
meta_idx = meta_df['ID'].to_list()
meta_label = meta_df['symptoms'].to_list()
meta_id_label_dict = {str(k): 1 if v=='S' else 0 for k, v in zip(meta_idx, meta_label)}

labels metadata : ['S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS', 'NS']


Then, we extract the first proteomic features

In [3]:
proteomics_data_filename = os.path.join(data_path, 'proteomics.csv')

dim_df = pd.read_csv(proteomics_data_filename, nrows=1)
dim = len(list(dim_df))
all_cols = [i for i in range(dim)]
feat_cols = all_cols[1:-4]
samplesidx_col = [0]

feat_df = pd.read_csv(proteomics_data_filename, skiprows=4, nrows=1, dtype=str, usecols=feat_cols)
features = list(feat_df)

idx_df = pd.read_csv(proteomics_data_filename, skiprows=6, index_col=0, skipfooter=4, usecols=[0], engine='python')
idx = list(idx_df.index.values)

df1 = pd.read_csv(proteomics_data_filename, skiprows=6, dtype=np.float32, skipfooter=4, usecols=feat_cols, engine='python')
assert df1.shape[0] == len(idx)
assert df1.shape[1] == len(features)

df1['idx'] = idx
df1.set_index('idx', inplace=True)
df1.columns = features

#clean data of samples that are not in metadata :
idx = df1.index.values
y = []
for k in range(len(idx)):
    id = idx[k]
    if id in meta_id_label_dict:
        y.append(meta_id_label_dict[id])
    else:
        # we will not put this sample in the dataset
        #print('sample to remove because of unknown label:', k, id)
        y.append('to_remove')
df1['label'] = y
df1 = df1[df1.label != 'to_remove']

#create X and y matrices for ML :
y = list(df1['label'])
del df1['label']
X = df1.to_numpy()
print('proteomics data :')
print('# of samples : ', df1.shape[0])
print('# of features : ', df1.shape[1])
print('labels:', list(dict.fromkeys(y)))

proteomics data :
# of samples :  100
# of features :  184
labels: [1, 0]


Let's run the neighborhood classifier on this data with several train/test splits : 

In [4]:
benchmark(X, y, n_splits=splits)

NeighborHoodClassifier has a mean accuracy of 
	0.51 +/-0.09, relying on 
	3.26 features 
	 {'86': 0.16, '84': 0.12, '37': 0.06, '67': 0.05, '30': 0.03} 
RandomSCM has a mean accuracy of 
	0.52 +/-0.09, relying on 
	25.15 features 
	 {'86': 0.05, '37': 0.04, '67': 0.04, '84': 0.03, '64': 0.03} 
VanillaSCM has a mean accuracy of 
	0.5 +/-0.09, relying on 
	4.48 features 
	 {'86': 0.07, '37': 0.05, '9': 0.04, '5': 0.04, '0': 0.04} 
RandomForest has a mean accuracy of 
	0.53 +/-0.1, relying on 
	45.77 features 
	 {'67': 0.02, '86': 0.02, '0': 0.01, '110': 0.01, '109': 0.01} 


Let's chek the second proteomic group of features 

In [5]:
print('\nPROTEOMICS CYTOKINES DATA :')
proteomics_cyt_data_filename = os.path.join(data_path, 'proteomics_cyt.csv')

dim_df = pd.read_csv(proteomics_cyt_data_filename, nrows=1)
dim = len(list(dim_df))
all_cols = [i for i in range(dim)]
feat_cols = all_cols[1:-2]
samplesidx_col = [0]

feat_df = pd.read_csv(proteomics_cyt_data_filename, skiprows=4, nrows=1, dtype=str, usecols=feat_cols)
features = list(feat_df)

idx_df = pd.read_csv(proteomics_cyt_data_filename, skiprows=7, index_col=0, skipfooter=14, usecols=[0], engine='python')
idx = list(idx_df.index.values)

df2 = pd.read_csv(proteomics_cyt_data_filename, skiprows=7, dtype=np.float32, skipfooter=14, usecols=feat_cols, na_values=['> ULOQ'], engine='python')
assert df2.shape[0] == len(idx)
assert df2.shape[1] == len(features)

df2['idx'] = idx
df2.set_index('idx', inplace=True)
df2.columns = features

#clean data of samples that are not in metadata :
idx = df2.index.values
y = []
for k in range(len(idx)):
    id = idx[k]
    if id in meta_id_label_dict:
        y.append(meta_id_label_dict[id])
    else:
        # we will not put this sample in the dataset
        #print('sample to remove because of unknown label:', k, id)
        y.append('to_remove')
df2['label'] = y
df2 = df2[df2.label != 'to_remove']

for col in list(df2):
   df2[col].fillna(int(df2[col].mean()), inplace=True)


#create X and y matrices for ML :
y = list(df2['label'])
del df2['label']
X = df2.to_numpy()
print('# of samples : ', df2.shape[0])
print('# of features : ', df2.shape[1])



PROTEOMICS CYTOKINES DATA :
# of samples :  100
# of features :  45


In [6]:
benchmark(X, y, n_splits=splits)

NeighborHoodClassifier has a mean accuracy of 
	0.5 +/-0.09, relying on 
	2.79 features 
	 {'32': 0.29, '30': 0.1, '26': 0.05, '28': 0.05, '21': 0.05} 
RandomSCM has a mean accuracy of 
	0.51 +/-0.09, relying on 
	21.88 features 
	 {'32': 0.1, '9': 0.06, '31': 0.06, '30': 0.05, '19': 0.05} 
VanillaSCM has a mean accuracy of 
	0.51 +/-0.09, relying on 
	5.47 features 
	 {'0': 0.17, '32': 0.09, '14': 0.07, '10': 0.07, '31': 0.06} 
RandomForest has a mean accuracy of 
	0.51 +/-0.09, relying on 
	30.86 features 
	 {'32': 0.05, '30': 0.04, '31': 0.04, '10': 0.03, '19': 0.03} 


Now with the meatbolomics : 

In [7]:
print('\nMETABOLOMICS DATA :')
metabolomics_data_filename = os.path.join(data_path, 'metabolomics.csv')
feat_df = pd.read_csv(metabolomics_data_filename, index_col=0, skiprows=[0], dtype=str, usecols=[0])
features = list(feat_df.index.values)

idx_df = pd.read_csv(metabolomics_data_filename, header=1, nrows=1)
idx = list(idx_df)[1:]
idx = [l[17:22] for l in idx]

labels_df = pd.read_csv(metabolomics_data_filename, nrows=1)
labels = list(labels_df)[1:]

cols_df = pd.read_csv(metabolomics_data_filename, header=1, nrows=1)
cols_list = list(cols_df)

df3 = pd.read_csv(metabolomics_data_filename, header=1, dtype=np.float32, na_values=['#DIV/0!'], usecols=cols_list[1:])
df3 = df3.T
df3['idx'] = idx
df3.set_index('idx', inplace=True)
df3.columns = features
df3 = df3.dropna(axis=1)

#clean data of samples that are not in metadata :
idx = df3.index.values
y = []
for k in range(len(idx)):
    id = idx[k]
    if id in meta_id_label_dict:
        y.append(meta_id_label_dict[id])
    else:
        # we will not put this sample in the dataset
        #print('sample to remove because of unknown label:', k, id)
        y.append('to_remove')
df3['label'] = y
df3 = df3[df3.label != 'to_remove']

#create X and y matrices for ML :
y = list(df3['label'])
del df3['label']
X = df3.to_numpy()

print('metabolomics data :')
print('# of samples : ', df3.shape[0])
print('# of features : ', df3.shape[1])



METABOLOMICS DATA :
metabolomics data :
# of samples :  100
# of features :  3989


In [8]:
benchmark(X, y, n_splits=splits)

NeighborHoodClassifier has a mean accuracy of 
	0.99 +/-0.02, relying on 
	7.51 features 
	 {'1316': 0.07, '2494': 0.06, '2838': 0.06, '2533': 0.05, '899': 0.05} 
RandomSCM has a mean accuracy of 
	0.98 +/-0.03, relying on 
	9.83 features 
	 {'3730': 0.01, '114': 0.01, '328': 0.01, '184': 0.01, '731': 0.01} 
VanillaSCM has a mean accuracy of 
	0.97 +/-0.04, relying on 
	1.0 features 
	 {'76': 0.68, '73': 0.23, '71': 0.07, '68': 0.02, '0': 0.0} 
RandomForest has a mean accuracy of 
	0.98 +/-0.03, relying on 
	12.49 features 
	 {'895': 0.01, '1105': 0.01, '81': 0.01, '78': 0.01, '205': 0.01} 


Now if we try to aggregate some of the views : 

In [9]:
df_1_2 = pd.concat([df1, df2], axis=1)
X = df_1_2.to_numpy()

print('Full proteomic data :')
print('# of samples : ', df_1_2.shape[0])
print('# of features : ', df_1_2.shape[1])

benchmark(X, y, n_splits=splits)

Full proteomic data :
# of samples :  100
# of features :  229
NeighborHoodClassifier has a mean accuracy of 
	0.64 +/-0.07, relying on 
	2.43 features 
	 {'189': 0.46, '211': 0.39, '219': 0.05, '88': 0.02, '197': 0.02} 
RandomSCM has a mean accuracy of 
	0.58 +/-0.1, relying on 
	23.72 features 
	 {'189': 0.19, '211': 0.09, '185': 0.05, '219': 0.05, '192': 0.03} 
VanillaSCM has a mean accuracy of 
	0.62 +/-0.09, relying on 
	4.0 features 
	 {'189': 0.23, '176': 0.07, '70': 0.06, '3': 0.05, '5': 0.05} 
RandomForest has a mean accuracy of 
	0.58 +/-0.09, relying on 
	42.65 features 
	 {'189': 0.05, '211': 0.03, '223': 0.02, '192': 0.02, '197': 0.02} 


Now on the three types of data :

In [10]:
df = pd.concat([df_1_2, df3], axis=1)
df = df.dropna(axis=0)
print('Multi-omics df :')
print('# of samples : ', df.shape[0])
print('# of features : ', df.shape[1])
X = df.to_numpy()

benchmark(X, y, n_splits=splits)

Multi-omics df :
# of samples :  100
# of features :  4218
NeighborHoodClassifier has a mean accuracy of 
	0.95 +/-0.05, relying on 
	2.78 features 
	 {'1078': 0.42, '1103': 0.2, '1245': 0.09, '1968': 0.08, '3100': 0.07} 
RandomSCM has a mean accuracy of 
	0.98 +/-0.03, relying on 
	9.82 features 
	 {'1078': 0.1, '1103': 0.08, '3983': 0.08, '3100': 0.08, '1968': 0.06} 
VanillaSCM has a mean accuracy of 
	0.96 +/-0.05, relying on 
	1.72 features 
	 {'1078': 0.54, '7': 0.35, '1103': 0.05, '1040': 0.02, '1061': 0.02} 
RandomForest has a mean accuracy of 
	0.97 +/-0.04, relying on 
	30.55 features 
	 {'3331': 0.02, '1968': 0.02, '1273': 0.02, '1078': 0.02, '3983': 0.02} 
