## Set path

In [1]:
import os
# Work from the project's `code/` sub-directory (presumably so that the local
# `evaluation_utils` module imported below can be found -- confirm).
# The guard keeps this cell idempotent: without it, re-running the cell (or
# starting the kernel already inside `code/`) would try to enter `code/code`.
if os.path.basename(os.getcwd()) != 'code':
    os.chdir(os.path.join(os.getcwd(), 'code/'))

# Absolute path of the `outputs` directory that sits next to `code/`.
outputs_dir = os.path.abspath(os.path.join(os.getcwd(), '../outputs'))

## Load necessary libraries

In [2]:
from evaluation_utils import *
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import anndata

## Load prediction and ground truth

In [3]:
# Locations of the two tab-separated tables, relative to `outputs_dir`.
groundtruth_path = os.path.join(
    outputs_dir,
    "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/Babel/test_protein_groundtruth.txt",
)
prediction_path = os.path.join(
    outputs_dir,
    "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/Babel/test_protein_prediction.txt",
)

# The first column of each file becomes the row index; the remaining columns
# are the protein (ADT) features.
true_protein_expression = pd.read_table(groundtruth_path, sep="\t", index_col=0)
predicted_protein_expression = pd.read_table(prediction_path, sep="\t", index_col=0)

# Show both tables as the cell output.
true_protein_expression, predicted_protein_expression

(      ADT_CD102_A0104  ADT_CD103_A0201  ADT_CD106_A0226  \
 0            2.335790         0.541323         0.358351   
 1            2.545600         0.497062         0.497062   
 2            2.014434         0.623903         0.359839   
 3            2.317535         0.720685         0.828758   
 4            2.432592         0.396730         0.594206   
 ...               ...              ...              ...   
 7559         2.144180         0.254494         0.544951   
 7560         2.108415         0.826914         0.000000   
 7561         3.195932         0.320597         0.757929   
 7562         1.784287         0.910916         0.688670   
 7563         2.249596         0.208992         0.458114   
 
       ADT_CD115(CSF-1R)_A0105  ADT_CD117(c-Kit)_A0012  ADT_CD11a_A0595  \
 0                    0.252556                0.252556         2.415874   
 1                    0.279102                0.279102         1.439698   
 2                    0.359839                0.35983

## Get protein names and cell names

In [4]:
# The AnnData file is read only to recover the cell barcodes (its `obs` index);
# the expression tables loaded above carry a plain integer row index instead.
# `anndata.read` is a deprecated alias that newer anndata releases no longer
# provide, so call `anndata.read_h5ad` explicitly.
test_protein_data = anndata.read_h5ad(
    os.path.join(outputs_dir, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/Babel/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_test_mod2.h5ad"))

# Protein names come from the ground-truth table's columns.
protein_names = np.array(true_protein_expression.columns)
cell_names = np.array(test_protein_data.obs.index)

# The barcodes are later used to label per-cell results, so they must line up
# one-to-one with the rows of the expression tables.
assert len(cell_names) == len(true_protein_expression), (
    "number of cells in the AnnData file does not match the ground-truth table"
)

protein_names, cell_names

(array(['ADT_CD102_A0104', 'ADT_CD103_A0201', 'ADT_CD106_A0226',
        'ADT_CD115(CSF-1R)_A0105', 'ADT_CD117(c-Kit)_A0012',
        'ADT_CD11a_A0595', 'ADT_CD11c_A0106', 'ADT_CD122(IL-2Rb)_A0227',
        'ADT_CD127(IL-7Ra)_A0198', 'ADT_CD134(OX-40)_A0195',
        'ADT_CD135_A0098', 'ADT_CD137_A0194', 'ADT_CD140a_A0573',
        'ADT_CD14_A0424', 'ADT_CD15(SSEA-1)_A0076',
        'ADT_CD150(SLAM)_A0203', 'ADT_CD16-32_A0109',
        'ADT_CD169(Siglec-1)_A0440', 'ADT_CD172a(SIRPa)_A0422',
        'ADT_CD183(CXCR3)_A0228', 'ADT_CD184(CXCR4)_A0444',
        'ADT_CD192(CCR2)_A0426', 'ADT_CD195(CCR5)_A0376',
        'ADT_CD196(CCR6)_A0225', 'ADT_CD197(CCR7)_A0377', 'ADT_CD19_A0093',
        'ADT_CD200(OX2)_A0079', 'ADT_CD201(EPCR)_A0439',
        'ADT_CD204(Msr1)_A0448', 'ADT_CD206(MMR)_A0173', 'ADT_CD20_A0192',
        'ADT_CD21-CD35(CR2-CR1)_A0107', 'ADT_CD223(LAG-3)_A0378',
        'ADT_CD23_A0108', 'ADT_CD24_A0212', 'ADT_CD25_A0097',
        'ADT_CD274(B7-H1_PD-L1)_A0190', 'ADT_CD278

## Calculate PCC at the protein level

In [5]:
# Per-protein Pearson correlation between prediction and ground truth, computed
# on the untransformed tables. The helper is presumably provided by the
# star-import of `evaluation_utils` -- confirm.
p_corrs_protein_level = calculate_PCC_array_protein_level(
    protein_names,
    predicted_protein_expression,
    true_protein_expression,
)

# Display the resulting table.
p_corrs_protein_level



Unnamed: 0,PCC
ADT_CD102_A0104,0.658516
ADT_CD103_A0201,0.755765
ADT_CD106_A0226,0.536617
ADT_CD115(CSF-1R)_A0105,0.562213
ADT_CD117(c-Kit)_A0012,0.562100
...,...
ADT_TER-119-ErythroidCells_A0122,
ADT_Tim-4_A0567,0.530526
ADT_XCR1_A0568,0.221788
ADT_anti-P2RY12_A0415,0.068426


## Perform Z-score transformation at the protein level

In [6]:
def _zscore_by_column(expression):
    """Return a new DataFrame holding `expression` standardized by a freshly fitted StandardScaler.

    The row index and column labels of the input are carried over unchanged.
    """
    scaled_values = StandardScaler().fit_transform(expression.values)
    return pd.DataFrame(scaled_values, index=expression.index, columns=expression.columns)


# Ground truth and prediction are each scaled with their own scaler.
true_protein_expression_zscore = _zscore_by_column(true_protein_expression)
predicted_protein_expression_zscore = _zscore_by_column(predicted_protein_expression)

true_protein_expression_zscore, predicted_protein_expression_zscore

(      ADT_CD102_A0104  ADT_CD103_A0201  ADT_CD106_A0226  \
 0            0.528275        -0.289040        -0.475452   
 1            0.932322        -0.380566        -0.078409   
 2           -0.090581        -0.118275        -0.471191   
 3            0.493122         0.081857         0.871030   
 4            0.714695        -0.588038         0.199653   
 ...               ...              ...              ...   
 7559         0.159279        -0.882162         0.058667   
 7560         0.090404         0.301524        -1.501184   
 7561         2.184710        -0.745470         0.668290   
 7562        -0.533792         0.475227         0.470046   
 7563         0.362286        -0.976254        -0.189892   
 
       ADT_CD115(CSF-1R)_A0105  ADT_CD117(c-Kit)_A0012  ADT_CD11a_A0595  \
 0                   -0.215540               -0.649119        -0.566809   
 1                   -0.130504               -0.569918        -1.989291   
 2                    0.128127               -0.32903

## Calculate RMSE at the protein level

In [7]:
# Per-protein RMSE, computed on the z-scored tables rather than the raw ones.
# The helper is presumably provided by the star-import of `evaluation_utils`
# -- confirm.
rmses_protein_level = calculate_RMSE_array_protein_level(
    protein_names,
    predicted_protein_expression_zscore,
    true_protein_expression_zscore,
)

# Display the resulting table.
rmses_protein_level

Unnamed: 0,RMSE
ADT_CD102_A0104,0.826419
ADT_CD103_A0201,0.698906
ADT_CD106_A0226,0.962687
ADT_CD115(CSF-1R)_A0105,0.935722
ADT_CD117(c-Kit)_A0012,0.935842
...,...
ADT_TER-119-ErythroidCells_A0122,1.000000
ADT_Tim-4_A0567,0.968993
ADT_XCR1_A0568,1.247567
ADT_anti-P2RY12_A0415,1.364971


## Calculate PCC at the cell level

In [8]:
# Per-cell Pearson correlation between prediction and ground truth, computed on
# the untransformed tables and labelled with the cell barcodes. The helper is
# presumably provided by the star-import of `evaluation_utils` -- confirm.
p_corrs_cell_level = calculate_PCC_array_cell_level(
    cell_names,
    predicted_protein_expression,
    true_protein_expression,
)

# Display the resulting table.
p_corrs_cell_level

Unnamed: 0,PCC
AAACCCAAGAATCTAG-2,0.959129
AAACCCACACCGGAAA-2,0.912893
AAACCCACACTACTTT-2,0.943490
AAACCCAGTAGGCAAC-2,0.957629
AAACCCAGTCTCAGGC-2,0.940130
...,...
TTTGTTGGTAAGATCA-2,0.945511
TTTGTTGGTCAGACTT-2,0.944862
TTTGTTGGTGTTTACG-2,0.886889
TTTGTTGTCAGAATAG-2,0.929211


## Perform $\ell_2$ normalization at the cell level

In [9]:
# Ground truth: Euclidean norm of every row, then divide each row by its norm.
l2norm_true_protein_expression = np.sqrt(np.square(true_protein_expression).sum(axis=1))
true_protein_expression_l2norm = (
    true_protein_expression / l2norm_true_protein_expression.values[:, np.newaxis]
)

# Prediction: same procedure, with a small constant added to the norm.
# NOTE(review): the 1e-08 offset is applied to the predicted norm only; a
# ground-truth row of all zeros would still divide by zero -- confirm this
# asymmetry is intended.
l2norm_predicted_protein_expression = (
    np.sqrt(np.square(predicted_protein_expression).sum(axis=1)) + 1e-08
)
predicted_protein_expression_l2norm = (
    predicted_protein_expression / l2norm_predicted_protein_expression.values[:, np.newaxis]
)

true_protein_expression_l2norm, predicted_protein_expression_l2norm

(      ADT_CD102_A0104  ADT_CD103_A0201  ADT_CD106_A0226  \
 0            0.176323         0.040863         0.027051   
 1            0.212431         0.041480         0.041480   
 2            0.150032         0.046467         0.026800   
 3            0.174176         0.054164         0.062286   
 4            0.174748         0.028499         0.042685   
 ...               ...              ...              ...   
 7559         0.164200         0.019489         0.041732   
 7560         0.157023         0.061584         0.000000   
 7561         0.259855         0.026067         0.061626   
 7562         0.137930         0.070416         0.053236   
 7563         0.148291         0.013777         0.030198   
 
       ADT_CD115(CSF-1R)_A0105  ADT_CD117(c-Kit)_A0012  ADT_CD11a_A0595  \
 0                    0.019065                0.019065         0.182368   
 1                    0.023291                0.023291         0.120143   
 2                    0.026800                0.02680

## Calculate RMSE at the cell level

In [10]:
# Per-cell RMSE, computed on the L2-normalized tables rather than the raw ones.
# The helper is presumably provided by the star-import of `evaluation_utils`
# -- confirm.
rmses_cell_level = calculate_RMSE_array_cell_level(
    cell_names,
    predicted_protein_expression_l2norm,
    true_protein_expression_l2norm,
)

# Display the resulting table.
rmses_cell_level

Unnamed: 0,RMSE
AAACCCAAGAATCTAG-2,0.020811
AAACCCACACCGGAAA-2,0.029171
AAACCCACACTACTTT-2,0.024445
AAACCCAGTAGGCAAC-2,0.020576
AAACCCAGTCTCAGGC-2,0.026073
...,...
TTTGTTGGTAAGATCA-2,0.023728
TTTGTTGGTCAGACTT-2,0.024813
TTTGTTGGTGTTTACG-2,0.033926
TTTGTTGTCAGAATAG-2,0.027349


## Save evaluation results

In [11]:
# Each metric table is written to its own Excel workbook under `outputs_dir`,
# in the same order as before: protein-level PCC/RMSE, then cell-level PCC/RMSE.
for result_table, relative_path in (
    (p_corrs_protein_level, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/Babel/PCC_protein_level.xlsx"),
    (rmses_protein_level, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/Babel/RMSE_protein_level.xlsx"),
    (p_corrs_cell_level, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/Babel/PCC_cell_level.xlsx"),
    (rmses_cell_level, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/Babel/RMSE_cell_level.xlsx"),
):
    result_table.to_excel(os.path.join(outputs_dir, relative_path))