In [2]:
import os
import numpy as np
import pandas as pd
from mord import OrdinalRidge
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, precision_recall_fscore_support, matthews_corrcoef, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from scipy.stats import stats

In [10]:
print("RAW DATA: Classification")

# Single root directory for every input file of this notebook. The default is
# the original hard-coded location; set JOINTOMICS_DATA_DIR to run the notebook
# against a copy of the data stored elsewhere.
DATA_DIR = os.environ.get("JOINTOMICS_DATA_DIR", "/home/bram/jointomicscomp/data/CELL")

# Feature arrays of the two modalities; both are later indexed with the same
# split indices, so they are presumably row-aligned -- TODO confirm.
rna_path = os.path.join(DATA_DIR, "pbmc_multimodal_RNA_5000MAD.npy")
adt_path = os.path.join(DATA_DIR, "pbmc_multimodal_ADT_5000MAD.npy")

rna = np.load(rna_path)
adt = np.load(adt_path)

# LOAD Split and y

# Index arrays used to select the train / validation / test rows.
trainInd = np.load(os.path.join(DATA_DIR, "trainInd.npy"))
validInd = np.load(os.path.join(DATA_DIR, "validInd.npy"))
testInd = np.load(os.path.join(DATA_DIR, "testInd.npy"))

# cellTypesl* : presumably the class names per annotation level -- TODO confirm.
cellTypesl1 = np.load(os.path.join(DATA_DIR, "celltype.l1cellTypes.npy"))
cellTypesl2 = np.load(os.path.join(DATA_DIR, "celltype.l2cellTypes.npy"))
cellTypesl3 = np.load(os.path.join(DATA_DIR, "celltype.l3cellTypes.npy"))

# cellTypel* : the label vectors passed as `y` to run_classifier below.
cellTypel1 = np.load(os.path.join(DATA_DIR, "celltype.l1cellType.npy"))
cellTypel2 = np.load(os.path.join(DATA_DIR, "celltype.l2cellType.npy"))
cellTypel3 = np.load(os.path.join(DATA_DIR, "celltype.l3cellType.npy"))

RAW DATA: Classification


In [7]:
# Number of entries in cellTypesl1 (presumably the level-1 class names -- TODO confirm).
len(cellTypesl1)

8

In [14]:
def run_classifier(data, y, trainInd, validInd, testInd, alphas=None):
    """Tune a linear SVM on the validation split and score it on the test split.

    One ``LinearSVC`` (L2 penalty, hinge loss, one-vs-rest) is fitted on
    ``data[trainInd]`` for every candidate value of ``C``. The candidate with
    the highest validation accuracy is kept (ties go to the earliest candidate)
    and is then evaluated on ``data[testInd]``.

    Parameters
    ----------
    data : indexable array
        Feature array; it is indexed with each of the three index arrays.
    y : indexable array
        Labels, indexed the same way as ``data``. Test predictions are cast to
        ``int``, so the labels are assumed to be integer-coded.
    trainInd, validInd, testInd : index arrays
        Row selectors for the training, validation and test splits.
    alphas : array-like of float, optional
        Candidate values for the ``C`` parameter of ``LinearSVC``. ``None``
        (the default) uses the original ten-value grid from 1e-4 to 20.

    Returns
    -------
    list
        The test-split result of ``evaluate_classification``:
        ``[accuracy, precision, recall, f1, mcc, confusion_matrix]``.

    Raises
    ------
    ValueError
        If ``alphas`` is given but empty.
    """
    if alphas is None:
        alphas = np.array([1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1.0, 2.0, 5.0, 10., 20.])
    else:
        alphas = np.asarray(alphas, dtype=float).ravel()
    if alphas.size == 0:
        raise ValueError("alphas must contain at least one candidate value")

    # Only the best model so far is retained; there is no need to keep every
    # fitted candidate alive just to pick one at the end.
    bestModel = None
    bestScore = None

    for a in alphas:
        model = LinearSVC(penalty='l2', loss='hinge', C=a, multi_class='ovr', fit_intercept=True, random_state=1, max_iter=10000)

        # train
        model.fit(data[trainInd], y[trainInd])

        # model selection criterion: accuracy on the validation split
        score = evaluate_classification(y[validInd], model.predict(data[validInd]), training=True)

        # strict '>' keeps the first of several equally good candidates,
        # which is what np.argmax over the score vector would select
        if bestModel is None or score > bestScore:
            bestModel = model
            bestScore = score

    predictions = bestModel.predict(data[testInd]).astype(int)

    classifications = evaluate_classification(y[testInd], predictions)

    # index 0 is the accuracy, index 5 the confusion matrix
    print("Accuracy : ", classifications[0])
    print("Confusion matrix : \n", classifications[5])
    return classifications

In [17]:
def evaluate_classification(y_true, y_pred, training=False, labels=None):
    """Score predicted class labels against the ground truth.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.
    training : bool, default False
        When True only the accuracy is computed and returned as a scalar
        (used as the model selection criterion).
    labels : array-like, optional
        Explicit set and order of classes for the per-class metrics and the
        confusion matrix. With the default ``None`` scikit-learn uses the
        sorted labels that occur in ``y_true`` or ``y_pred``, so the output
        size can differ between runs if a class is absent from both. Pass
        the full label set to get results that line up across runs.

    Returns
    -------
    float
        The accuracy, when ``training`` is True.
    list
        ``[accuracy, precision, recall, f1, mcc, confusion_matrix]`` otherwise;
        precision, recall and f1 are per-class arrays.
    """
    acc = accuracy_score(y_true, y_pred)
    if training:
        return acc

    # zero_division=0 yields the same values as the default ('warn' also
    # reports 0 for classes without predicted/true samples) but does not emit
    # an UndefinedMetricWarning for every such class.
    pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_pred, labels=labels, zero_division=0)
    mcc = matthews_corrcoef(y_true, y_pred)
    confMat = confusion_matrix(y_true, y_pred, labels=labels)

    return [acc, pr, rc, f1, mcc, confMat]

In [10]:
# rna features vs. cellTypel1 labels; the returned metric list is the last
# expression of the cell and is therefore displayed by the notebook.
print("Celltype l1 from raw RNA data")
run_classifier(data=rna, y=cellTypel1, trainInd=trainInd, validInd=validInd, testInd=testInd)

Celltype l1 from raw RNA data




Accuracy :  1.0
Confusion matrix : 
 [[10811     0     0     0     0     0     0     0]
 [    0 33646     0     0     0     0     0     0]
 [    0     0 20479     0     0     0     0     0]
 [    0     0     0  2849     0     0     0     0]
 [    0     0     0     0 38724     0     0     0]
 [    0     0     0     0     0 14720     0     0]
 [    0     0     0     0     0     0  2694     0]
 [    0     0     0     0     0     0     0  5487]]


[1.0,
 array([1., 1., 1., 1., 1., 1., 1., 1.]),
 array([1., 1., 1., 1., 1., 1., 1., 1.]),
 array([1., 1., 1., 1., 1., 1., 1., 1.]),
 1.0,
 array([[10811,     0,     0,     0,     0,     0,     0,     0],
        [    0, 33646,     0,     0,     0,     0,     0,     0],
        [    0,     0, 20479,     0,     0,     0,     0,     0],
        [    0,     0,     0,  2849,     0,     0,     0,     0],
        [    0,     0,     0,     0, 38724,     0,     0,     0],
        [    0,     0,     0,     0,     0, 14720,     0,     0],
        [    0,     0,     0,     0,     0,     0,  2694,     0],
        [    0,     0,     0,     0,     0,     0,     0,  5487]])]

In [11]:
# rna features vs. cellTypel2 labels; the returned metric list is the last
# expression of the cell and is therefore displayed by the notebook.
print("Celltype l2 from raw RNA data")
run_classifier(data=rna, y=cellTypel2, trainInd=trainInd, validInd=validInd, testInd=testInd)

Celltype l2 from raw RNA data


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy :  0.9959817633876825
Confusion matrix : 
 [[   57     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0  1855     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0  2534     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0  6111     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0 33665     0     0     0     0     0     0     0
      0     0     1     0     0     0     0     0     0     0     0     0
      0     0     0     0   

[0.9959817633876825,
 array([1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 0.99580771, 1.        , 0.97446912,
        0.96261407, 0.99977309, 1.        , 0.99957699, 0.9997832 ,
        1.        , 1.        , 1.        , 0.99029126, 1.        ,
        1.        , 1.        , 1.        , 0.99044586, 0.99885845,
        0.99717514, 0.99145299, 1.        , 1.        , 1.        ,
        1.        ]),
 array([1.        , 1.        , 1.        , 1.        , 0.9999703 ,
        1.        , 1.        , 0.99539042, 0.98795181, 0.9836374 ,
        0.93965517, 1.        , 1.        , 0.9966259 , 1.        ,
        0.9894958 , 1.        , 0.99665552, 0.99029126, 0.99914785,
        1.        , 1.        , 1.        , 1.        , 0.9994289 ,
        0.98925736, 1.        , 0.99950224, 1.        , 1.        ,
        1.        ]),
 array([1.        , 1.        , 1.        , 1.        , 0.99998515,
        1.        , 1.        , 0.99559902, 0.99393

In [18]:
# rna features vs. cellTypel3 labels; the metric list is kept in rna_l3 and
# then echoed as the cell output.
print("Celltype l3 from raw RNA data")
rna_l3 = run_classifier(data=rna, y=cellTypel3, trainInd=trainInd, validInd=validInd, testInd=testInd)
rna_l3

Celltype l3 from raw RNA data




Accuracy :  0.9470853681152254
Confusion matrix : 
 [[  4   0   0 ...   0   0   0]
 [  1   3   0 ...   0   0   0]
 [  0   0 103 ...   0   0   0]
 ...
 [  0   0   0 ...  69   0   0]
 [  0   0   0 ...   0  48   0]
 [  0   0   0 ...   0   0  86]]


[0.9470853681152254,
 array([0.66666667, 0.375     , 0.92792793, 0.90839695, 0.96551724,
        0.92307692, 0.95387841, 0.91319444, 0.9909239 , 0.97476341,
        0.97687861, 0.95758929, 0.90909091, 0.875467  , 0.93617021,
        0.87619048, 0.89830508, 0.86046512, 0.84574468, 0.83333333,
        0.97932331, 0.86363636, 0.9       , 0.89285714, 0.96521739,
        0.93333333, 0.90689655, 0.90756303, 0.78571429, 0.9137931 ,
        0.92783505, 0.93846154, 0.94444444, 0.88888889, 1.        ,
        0.92307692, 0.975     , 0.96296296, 0.89108911, 0.91783567,
        0.8622449 , 0.88709677, 0.96774194, 0.82051282, 1.        ,
        0.97368421, 0.91803279, 0.9122807 , 1.        , 0.91666667,
        0.98837209, 0.8125    , 0.89473684, 0.98773006, 1.        ,
        0.94520548, 0.87272727, 0.96629213]),
 array([1.        , 0.75      , 0.93636364, 0.89473684, 0.96078431,
        0.96      , 0.93814433, 0.91637631, 0.99742328, 0.9778481 ,
        0.97126437, 0.98169336, 0.90909091, 0.863

In [13]:
# adt features vs. cellTypel1 labels; the returned metric list is the last
# expression of the cell and is therefore displayed by the notebook.
print("Celltype l1 from raw ADT data")
run_classifier(data=adt, y=cellTypel1, trainInd=trainInd, validInd=validInd, testInd=testInd)

Celltype l1 from raw ADT data




Accuracy :  0.9897225871261881
Confusion matrix : 
 [[10807     0     1     0     0     1     0     2]
 [    0 33536    25     0     3     5     6    71]
 [    0    27 20359     0     2     8     2    81]
 [    0     1     0  2741    99     1     5     2]
 [    4    17     9    87 38492     4   106     5]
 [    4    28    34     0     7 14624     9    14]
 [    4    50    27     2   349     5  2251     6]
 [    1    48   146     0     1    17     4  5270]]


[0.9897225871261881,
 array([0.99879852, 0.99492687, 0.988253  , 0.96855124, 0.98816522,
        0.99720423, 0.94460764, 0.96679508]),
 array([0.99963001, 0.99673067, 0.99414034, 0.96209196, 0.99400888,
        0.99347826, 0.8355605 , 0.96045198]),
 array([0.99921409, 0.99582795, 0.99118793, 0.96531079, 0.99107844,
        0.99533776, 0.88674414, 0.96361309]),
 0.9870709389247482,
 array([[10807,     0,     1,     0,     0,     1,     0,     2],
        [    0, 33536,    25,     0,     3,     5,     6,    71],
        [    0,    27, 20359,     0,     2,     8,     2,    81],
        [    0,     1,     0,  2741,    99,     1,     5,     2],
        [    4,    17,     9,    87, 38492,     4,   106,     5],
        [    4,    28,    34,     0,     7, 14624,     9,    14],
        [    4,    50,    27,     2,   349,     5,  2251,     6],
        [    1,    48,   146,     0,     1,    17,     4,  5270]])]

In [14]:
# adt features vs. cellTypel2 labels; the returned metric list is the last
# expression of the cell and is therefore displayed by the notebook.
print("Celltype l2 from raw ADT data")
run_classifier(data=adt, y=cellTypel2, trainInd=trainInd, validInd=validInd, testInd=testInd)

Celltype l2 from raw ADT data


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy :  0.9485279344718337
Confusion matrix : 
 [[   56     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     1     0     0     0]
 [    0  1538   116   184     1     2     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     2    11     0     1
      0     0     0     0     0     0     0]
 [    0    94  2398    29     1     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     1    10     0     0
      0     0     0     0     0     1     0]
 [    0    88    28  5990     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     4     0     0
      0     0     0     0     0     1     0]
 [    0     2     1     0 33399   165     0     4     0     5     2     4
      0     0     4     8     0     0     0     2     2     5     0     0
     21     0     0    39   

[0.9485279344718337,
 array([1.        , 0.89107764, 0.9414998 , 0.96519497, 0.98374127,
        0.92298806, 0.97730496, 0.95159516, 0.87692308, 0.87540141,
        0.8141743 , 0.97601267, 0.86363636, 0.86788445, 0.96076731,
        0.83552632, 1.        , 0.99665552, 0.96190476, 0.98977853,
        0.96606514, 0.02387268, 0.9478022 , 0.99041534, 0.97257876,
        0.87553854, 0.99137931, 0.97762387, 0.90572391, 0.96072187,
        1.        ]),
 array([0.98245614, 0.82911051, 0.94632991, 0.98019964, 0.99206915,
        0.94780546, 0.98358315, 0.96661545, 0.68674699, 0.89208869,
        0.68994253, 0.97889242, 0.26027397, 0.86166175, 0.961184  ,
        0.26680672, 0.78461538, 0.99665552, 0.98058252, 0.99020026,
        0.99371395, 0.02004454, 0.9212283 , 0.99678457, 0.95202741,
        0.8542737 , 0.99137931, 0.91338975, 0.88778878, 0.95699683,
        0.99550225]),
 array([0.99115044, 0.85897794, 0.94390868, 0.97263944, 0.98788766,
        0.93523215, 0.98043401, 0.9590465 , 0.77027

In [19]:
# adt features vs. cellTypel3 labels; the returned metric list is the last
# expression of the cell and is therefore displayed by the notebook.
print("Celltype l3 from raw ADT data")
run_classifier(data=adt, y=cellTypel3, trainInd=trainInd, validInd=validInd, testInd=testInd)

Celltype l3 from raw ADT data




Accuracy :  0.8622117821598566
Confusion matrix : 
 [[ 1  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  3]
 [ 0  0 62 ...  0  0  0]
 ...
 [ 0  0  0 ... 69  0  0]
 [ 0  0  0 ...  0 43  0]
 [ 0  0  0 ...  0  0 86]]


  _warn_prf(average, modifier, msg_start, len(result))


[0.8622117821598566,
 array([1.        , 0.        , 0.60784314, 0.61386139, 0.57680251,
        0.        , 0.64450867, 0.63636364, 0.97493677, 0.91496063,
        0.92513369, 0.94244604, 0.        , 0.76770186, 0.63265306,
        0.79331307, 0.75206612, 0.42857143, 0.76851852, 0.6       ,
        0.94535519, 0.91666667, 0.25      , 0.62962963, 0.75838926,
        0.89855072, 0.78      , 0.79679144, 0.375     , 0.72429907,
        0.83240223, 0.82608696, 0.75      , 0.71428571, 1.        ,
        0.85714286, 0.96864111, 0.05555556, 0.75330396, 0.78127734,
        0.625     , 0.67142857, 0.76470588, 0.84210526, 0.        ,
        0.95633188, 0.82105263, 0.77037037, 1.        , 0.85714286,
        0.82178218, 1.        , 0.92857143, 0.9702381 , 0.90909091,
        0.80232558, 0.93478261, 0.96629213]),
 array([0.25      , 0.        , 0.56363636, 0.46616541, 0.90196078,
        0.        , 0.91958763, 0.26829268, 0.99320684, 0.9193038 ,
        0.99425287, 0.97425629, 0.        , 0.759