## Set path

In [1]:
import os
# Work from the project's `code/` directory so that `evaluation_utils` can be
# imported and the sibling `outputs/` and `datasets/` folders can be located.
# Guard the chdir: without it, re-running this cell (without restarting the
# kernel) would try to enter `code/code/` and raise FileNotFoundError.
# NOTE(review): assumes the kernel starts in the directory that contains
# `code/` -- confirm against how the notebook is launched.
if os.path.basename(os.getcwd()) != 'code':
    os.chdir(os.path.join(os.getcwd(), 'code/'))

# Absolute paths of the output and dataset folders, siblings of `code/`.
outputs_dir = os.path.abspath(os.path.join(os.getcwd(), '../outputs'))
dataset_dir = os.path.abspath(os.path.join(os.getcwd(), '../datasets'))

## Load necessary libraries

In [2]:
from evaluation_utils import *
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import anndata
import scanpy as sc

## Load test set and preprocess protein expression data to obtain ground truth

In [3]:
# Read the held-out test sample. `anndata.read_h5ad` is used instead of the
# deprecated `anndata.read` alias; it reads the same .h5ad file.
test_data = anndata.read_h5ad(os.path.join(dataset_dir, "different samples/CITE-SLN111-Gayoso/Mouse2.h5ad"))

# Wrap the protein measurements stored in `obsm["protein_expression"]` in their
# own AnnData object, labelled with the names from `uns["protein_name"]` and
# sharing the cell annotations (`obs`) of the loaded object.
test_protein = anndata.AnnData(X=test_data.obsm["protein_expression"])
test_protein_var = pd.DataFrame(test_data.uns["protein_name"], columns=["protein_name"])
test_protein.var = test_protein_var
test_protein.obs = test_data.obs

# Preprocess in place with scanpy: per-cell total normalization, log(1 + x),
# then per-protein scaling. The order of these three steps matters.
sc.pp.normalize_total(test_protein)
sc.pp.log1p(test_protein)
sc.pp.scale(test_protein)

# Ground truth as a cells x proteins DataFrame (rows: cell barcodes from `obs`,
# columns: protein names).
true_protein_expression = pd.DataFrame(test_protein.X, columns=test_protein.var["protein_name"], index=test_protein.obs.index)

true_protein_expression

protein_name,ADT_CD102_A0104,ADT_CD103_A0201,ADT_CD106_A0226,ADT_CD115(CSF-1R)_A0105,ADT_CD117(c-Kit)_A0012,ADT_CD11a_A0595,ADT_CD11c_A0106,ADT_CD122(IL-2Rb)_A0227,ADT_CD127(IL-7Ra)_A0198,ADT_CD134(OX-40)_A0195,...,ADT_TCRVr1.1-Cr4_A0209,ADT_TCRVr2_A0211,ADT_TCRVr3_A0210,ADT_TCRbchain_A0120,ADT_TCRr-d_A0121,ADT_TER-119-ErythroidCells_A0122,ADT_Tim-4_A0567,ADT_XCR1_A0568,ADT_anti-P2RY12_A0415,ADT_integrinb7_A0214
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGAATCTAG-2,0.619120,-0.114163,-0.328064,-0.024065,-0.549299,-0.484973,-1.195373,-0.388596,-0.436230,1.083798,...,1.172691,-1.883148,-0.465361,-0.686746,0.323900,-0.760324,0.666260,-0.070138,0.157762,0.011546
AAACCCACACCGGAAA-2,1.234581,-0.050368,0.343594,0.262031,-0.260612,-1.908212,-0.129476,-0.123102,-0.223401,1.979306,...,-0.619515,-0.336774,1.096280,-0.547913,-2.139202,1.103162,0.411873,-0.524168,0.979452,-0.987889
AAACCCACACTACTTT-2,-0.162405,-0.066871,-0.480999,0.246325,-0.276461,0.027686,0.073638,-1.639926,-0.532065,-0.938885,...,1.001469,-0.729317,0.009238,-1.485906,-1.008919,0.630121,-0.404217,0.948042,-0.029336,0.260531
AAACCCAGTAGGCAAC-2,0.684397,0.394751,1.104627,-0.722757,-0.684387,-0.219446,0.675170,0.646906,0.393864,0.629129,...,-0.619515,-0.042766,0.971294,-0.403301,-0.336378,0.629527,-0.131775,1.210810,0.362341,-1.377752
AAACCCAGTCTCAGGC-2,0.443483,-0.748948,0.152621,-0.118447,-0.644535,-1.494500,-1.195373,-0.473509,-1.439341,0.874759,...,-0.619515,0.888528,0.848808,-0.534204,1.373377,0.672672,-2.771737,-2.060979,1.466940,-0.042302
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGGTAAGATCA-2,0.251860,-1.049443,0.254363,-0.038365,0.184810,-0.547920,0.173439,0.307960,0.831689,0.541697,...,1.154138,-1.883148,0.239056,-0.982248,-2.139202,0.761077,-1.670627,-0.806284,0.137797,-0.111452
TTTGTTGGTCAGACTT-2,-0.174888,0.249366,-2.437645,-0.164985,-0.285121,0.065340,-0.404774,-0.514865,1.871109,-0.387090,...,-0.619515,-1.883148,-0.719246,1.058828,0.135109,0.621864,-0.413961,1.428109,-0.040052,-0.364608
TTTGTTGGTGTTTACG-2,2.367218,-0.579395,1.120177,-0.227285,0.428233,-1.859143,0.015039,1.901414,1.062428,-0.522693,...,1.847485,-0.127116,0.401274,-0.785361,-0.419007,2.386312,0.676703,-0.315806,0.842595,-1.293236
TTTGTTGTCAGAATAG-2,-0.488316,0.622193,0.566598,1.466732,-1.112772,-0.727710,0.229004,-0.341256,-0.398281,-0.561470,...,-0.619515,0.183298,-0.865580,-0.957726,0.783576,0.835150,0.113166,1.497642,0.639406,-0.117564


## Load prediction

In [4]:
# Tab-separated prediction table written by the model run; the first column
# holds the row labels and becomes the index.
prediction_path = os.path.join(
    outputs_dir,
    "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/sciPENN/test_protein_prediction.txt",
)
predicted_protein_expression = pd.read_table(prediction_path, sep="\t", index_col=0)
predicted_protein_expression

Unnamed: 0_level_0,ADT_CD102_A0104,ADT_CD103_A0201,ADT_CD106_A0226,ADT_CD115(CSF-1R)_A0105,ADT_CD117(c-Kit)_A0012,ADT_CD11a_A0595,ADT_CD11c_A0106,ADT_CD122(IL-2Rb)_A0227,ADT_CD127(IL-7Ra)_A0198,ADT_CD134(OX-40)_A0195,...,ADT_TCRVr1.1-Cr4_A0209,ADT_TCRVr2_A0211,ADT_TCRVr3_A0210,ADT_TCRbchain_A0120,ADT_TCRr-d_A0121,ADT_TER-119-ErythroidCells_A0122,ADT_Tim-4_A0567,ADT_XCR1_A0568,ADT_anti-P2RY12_A0415,ADT_integrinb7_A0214
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGAATCTAG-2,-0.075074,-0.480129,0.314208,-0.115879,-0.250983,-0.529469,-0.178850,-0.290278,-0.299035,-0.212885,...,0.146033,0.048426,-0.005435,-0.769430,0.216387,-0.038839,0.434090,0.035958,-0.275547,0.352861
AAACCCACACCGGAAA-2,2.070195,0.266736,0.577778,0.676200,0.448604,-1.921979,0.222479,0.173083,0.093942,0.989365,...,0.283938,0.974859,1.071386,-0.541235,1.082818,0.066703,0.969275,0.627288,0.849877,-0.683034
AAACCCACACTACTTT-2,0.115370,-0.626729,-0.219099,-0.348924,-0.692816,-0.538368,-0.355847,-0.490868,-0.591396,-0.681801,...,-0.193898,-0.422313,-0.511602,-0.866305,-0.478096,0.009231,-0.336346,-0.341177,-0.385641,-0.186743
AAACCCAGTAGGCAAC-2,0.130569,-0.453139,0.033271,-0.069973,-0.526858,-0.628178,-0.214568,-0.006709,-0.632176,-0.300095,...,-0.085706,-0.063461,-0.028688,-0.822515,-0.128905,-0.254644,-0.107969,-0.092040,0.012610,0.138125
AAACCCAGTCTCAGGC-2,0.007473,-0.193142,0.059421,0.181116,-0.135164,-0.593218,-0.057032,0.146777,-0.613469,0.141324,...,0.177557,0.229020,0.481565,-0.742699,0.135457,-0.177336,-0.013411,0.217457,0.455122,0.127714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGGTAAGATCA-2,0.150713,0.044235,-0.300959,-0.170286,-0.499647,-0.676415,-0.081885,-0.165093,-0.581892,-0.181092,...,-0.352278,-0.047560,-0.140448,-0.741950,-0.260319,0.320621,-0.243821,0.079172,0.137088,0.377301
TTTGTTGGTCAGACTT-2,-0.658673,-0.294298,-0.189475,-0.143172,0.529070,0.544871,-0.196329,-0.126351,0.927257,0.199352,...,-0.028141,0.145750,0.132429,1.455479,0.131354,-0.264727,-0.115566,0.005645,0.092516,-0.355014
TTTGTTGGTGTTTACG-2,1.526162,-0.064761,0.048496,0.044260,-0.099444,-1.181569,0.008174,-0.128753,-0.251232,0.148012,...,-0.220676,0.112171,0.257751,-0.686211,0.091789,0.014448,0.166384,0.145029,0.275247,-0.891063
TTTGTTGTCAGAATAG-2,0.805792,0.036285,0.201381,0.152380,-0.268955,-0.586130,-0.229748,0.134020,-0.539321,0.178207,...,0.011201,0.300608,0.417309,-0.690056,0.459855,-0.132069,0.189741,0.187593,0.592481,0.393760


## Get protein names and cell names

In [5]:
# Column labels (proteins) and row labels (cells) of the ground-truth frame,
# each copied into a NumPy array.
protein_names, cell_names = (
    np.array(labels)
    for labels in (true_protein_expression.columns, true_protein_expression.index)
)
protein_names, cell_names

(array(['ADT_CD102_A0104', 'ADT_CD103_A0201', 'ADT_CD106_A0226',
        'ADT_CD115(CSF-1R)_A0105', 'ADT_CD117(c-Kit)_A0012',
        'ADT_CD11a_A0595', 'ADT_CD11c_A0106', 'ADT_CD122(IL-2Rb)_A0227',
        'ADT_CD127(IL-7Ra)_A0198', 'ADT_CD134(OX-40)_A0195',
        'ADT_CD135_A0098', 'ADT_CD137_A0194', 'ADT_CD140a_A0573',
        'ADT_CD14_A0424', 'ADT_CD15(SSEA-1)_A0076',
        'ADT_CD150(SLAM)_A0203', 'ADT_CD16-32_A0109',
        'ADT_CD169(Siglec-1)_A0440', 'ADT_CD172a(SIRPa)_A0422',
        'ADT_CD183(CXCR3)_A0228', 'ADT_CD184(CXCR4)_A0444',
        'ADT_CD192(CCR2)_A0426', 'ADT_CD195(CCR5)_A0376',
        'ADT_CD196(CCR6)_A0225', 'ADT_CD197(CCR7)_A0377', 'ADT_CD19_A0093',
        'ADT_CD200(OX2)_A0079', 'ADT_CD201(EPCR)_A0439',
        'ADT_CD204(Msr1)_A0448', 'ADT_CD206(MMR)_A0173', 'ADT_CD20_A0192',
        'ADT_CD21-CD35(CR2-CR1)_A0107', 'ADT_CD223(LAG-3)_A0378',
        'ADT_CD23_A0108', 'ADT_CD24_A0212', 'ADT_CD25_A0097',
        'ADT_CD274(B7-H1_PD-L1)_A0190', 'ADT_CD278

## Calculate PCC at the protein level

In [6]:
# PCC between predicted and true expression, one value per protein
# (helper comes from `evaluation_utils`).
p_corrs_protein_level = calculate_PCC_array_protein_level(
    protein_names,
    predicted_protein_expression,
    true_protein_expression,
)
p_corrs_protein_level

Unnamed: 0,PCC
ADT_CD102_A0104,0.756204
ADT_CD103_A0201,0.724803
ADT_CD106_A0226,0.559213
ADT_CD115(CSF-1R)_A0105,0.535132
ADT_CD117(c-Kit)_A0012,0.593525
...,...
ADT_TER-119-ErythroidCells_A0122,0.105329
ADT_Tim-4_A0567,0.592028
ADT_XCR1_A0568,0.307506
ADT_anti-P2RY12_A0415,0.238258


## Perform Z-score transformation at the protein level

In [7]:
def _zscore_per_protein(expression):
    """Return `expression` with every column standardized by a fresh StandardScaler.

    The scaler is fitted on the frame it transforms; the row index and column
    labels of the input are carried over to the result.
    """
    standardized = StandardScaler().fit_transform(expression.values)
    return pd.DataFrame(standardized, index=expression.index, columns=expression.columns)


# Standardize ground truth and prediction independently of each other.
true_protein_expression_zscore = _zscore_per_protein(true_protein_expression)
predicted_protein_expression_zscore = _zscore_per_protein(predicted_protein_expression)
true_protein_expression_zscore, predicted_protein_expression_zscore

(protein_name        ADT_CD102_A0104  ADT_CD103_A0201  ADT_CD106_A0226  \
 index                                                                   
 AAACCCAAGAATCTAG-2         0.619161        -0.114171        -0.328085   
 AAACCCACACCGGAAA-2         1.234663        -0.050371         0.343617   
 AAACCCACACTACTTT-2        -0.162416        -0.066876        -0.481030   
 AAACCCAGTAGGCAAC-2         0.684442         0.394777         1.104700   
 AAACCCAGTCTCAGGC-2         0.443512        -0.748997         0.152631   
 ...                             ...              ...              ...   
 TTTGTTGGTAAGATCA-2         0.251877        -1.049512         0.254380   
 TTTGTTGGTCAGACTT-2        -0.174900         0.249382        -2.437806   
 TTTGTTGGTGTTTACG-2         2.367374        -0.579433         1.120251   
 TTTGTTGTCAGAATAG-2        -0.488349         0.622234         0.566636   
 TTTGTTGTCGAGTGGA-2        -0.174103        -1.540474        -0.460084   
 
 protein_name        ADT_CD115(CSF-1

## Calculate RMSE at the protein level

In [8]:
# RMSE per protein, computed on the z-scored frames
# (helper comes from `evaluation_utils`).
rmses_protein_level = calculate_RMSE_array_protein_level(
    protein_names,
    predicted_protein_expression_zscore,
    true_protein_expression_zscore,
)
rmses_protein_level

Unnamed: 0,RMSE
ADT_CD102_A0104,0.698278
ADT_CD103_A0201,0.741886
ADT_CD106_A0226,0.938922
ADT_CD115(CSF-1R)_A0105,0.964229
ADT_CD117(c-Kit)_A0012,0.901638
...,...
ADT_TER-119-ErythroidCells_A0122,1.337663
ADT_Tim-4_A0567,0.903296
ADT_XCR1_A0568,1.176855
ADT_anti-P2RY12_A0415,1.234295


## Calculate PCC at the cell level

In [9]:
# PCC between predicted and true expression, one value per cell
# (helper comes from `evaluation_utils`).
p_corrs_cell_level = calculate_PCC_array_cell_level(
    cell_names,
    predicted_protein_expression,
    true_protein_expression,
)
p_corrs_cell_level

Unnamed: 0,PCC
AAACCCAAGAATCTAG-2,0.449261
AAACCCACACCGGAAA-2,0.609972
AAACCCACACTACTTT-2,0.484842
AAACCCAGTAGGCAAC-2,0.371011
AAACCCAGTCTCAGGC-2,0.526625
...,...
TTTGTTGGTAAGATCA-2,0.348971
TTTGTTGGTCAGACTT-2,0.582157
TTTGTTGGTGTTTACG-2,0.504733
TTTGTTGTCAGAATAG-2,0.423039


## Perform $\ell_2$ normalization at the cell level

In [10]:
# Euclidean norm of every row (cell): square element-wise, sum across the
# proteins, take the square root.
l2norm_true_protein_expression = np.sqrt(
    (true_protein_expression * true_protein_expression).sum(axis=1)
)
# Only the predicted norms get the small 1e-08 offset before being used as a
# divisor; the ground-truth norms are used as-is.
l2norm_predicted_protein_expression = 1e-08 + np.sqrt(
    (predicted_protein_expression * predicted_protein_expression).sum(axis=1)
)

# Divide each row by its own norm; the norms are turned into a column vector
# so the division broadcasts across the protein columns.
true_protein_expression_l2norm = (
    true_protein_expression / l2norm_true_protein_expression.values[:, np.newaxis]
)
predicted_protein_expression_l2norm = (
    predicted_protein_expression / l2norm_predicted_protein_expression.values[:, np.newaxis]
)
true_protein_expression_l2norm, predicted_protein_expression_l2norm

(protein_name        ADT_CD102_A0104  ADT_CD103_A0201  ADT_CD106_A0226  \
 index                                                                   
 AAACCCAAGAATCTAG-2         0.071939        -0.013265        -0.038120   
 AAACCCACACCGGAAA-2         0.113341        -0.004624         0.031544   
 AAACCCACACTACTTT-2        -0.019292        -0.007943        -0.057136   
 AAACCCAGTAGGCAAC-2         0.083415         0.048113         0.134633   
 AAACCCAGTCTCAGGC-2         0.045271        -0.076454         0.015580   
 ...                             ...              ...              ...   
 TTTGTTGGTAAGATCA-2         0.028641        -0.119342         0.028926   
 TTTGTTGGTCAGACTT-2        -0.016706         0.023820        -0.232853   
 TTTGTTGGTGTTTACG-2         0.201816        -0.049396         0.095500   
 TTTGTTGTCAGAATAG-2        -0.050533         0.064388         0.058634   
 TTTGTTGTCGAGTGGA-2        -0.016098        -0.142437        -0.042541   
 
 protein_name        ADT_CD115(CSF-1

## Calculate RMSE at the cell level

In [11]:
# RMSE per cell, computed on the row-wise L2-normalized frames
# (helper comes from `evaluation_utils`).
rmses_cell_level = calculate_RMSE_array_cell_level(
    cell_names,
    predicted_protein_expression_l2norm,
    true_protein_expression_l2norm,
)
rmses_cell_level

Unnamed: 0,RMSE
AAACCCAAGAATCTAG-2,0.100312
AAACCCACACCGGAAA-2,0.082822
AAACCCACACTACTTT-2,0.096899
AAACCCAGTAGGCAAC-2,0.110144
AAACCCAGTCTCAGGC-2,0.103064
...,...
TTTGTTGGTAAGATCA-2,0.108942
TTTGTTGGTCAGACTT-2,0.087343
TTTGTTGGTGTTTACG-2,0.092894
TTTGTTGTCAGAATAG-2,0.110322


## Save evaluation results

In [12]:
# Write each metric table to its own Excel workbook next to the prediction
# file. The relative path is assembled by plain string concatenation so the
# resulting file paths are exactly the ones used before.
results_prefix = "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/sciPENN/"
for workbook_name, metric_table in (
    ("PCC_protein_level.xlsx", p_corrs_protein_level),
    ("RMSE_protein_level.xlsx", rmses_protein_level),
    ("PCC_cell_level.xlsx", p_corrs_cell_level),
    ("RMSE_cell_level.xlsx", rmses_cell_level),
):
    metric_table.to_excel(os.path.join(outputs_dir, results_prefix + workbook_name))