## Set path

In [1]:
import os

# Enter the project's `code/` directory (presumably so that the local
# `evaluation_utils` module can be imported -- confirm) and derive the
# absolute path of the sibling `outputs/` directory from it.
#
# The basename guard makes this cell idempotent: without it, re-running the
# cell in a live kernel attempts to descend into `code/code/`, which either
# raises FileNotFoundError or silently shifts `outputs_dir`.
if os.path.basename(os.getcwd()) != 'code':
    os.chdir(os.path.join(os.getcwd(), 'code/'))
outputs_dir = os.path.abspath(os.path.join(os.getcwd(), '../outputs'))

## Load necessary libraries

In [2]:
from evaluation_utils import *
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import anndata

## Load prediction and ground truth

In [3]:
# Resolve both tab-separated result files first, then read them; the first
# column of each file is used as the row index.
groundtruth_path = os.path.join(outputs_dir, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/scMM/test_protein_groundtruth.txt")
prediction_path = os.path.join(outputs_dir, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/scMM/test_protein_prediction.txt")

true_protein_expression = pd.read_table(groundtruth_path, sep="\t", index_col=0)
predicted_protein_expression = pd.read_table(prediction_path, sep="\t", index_col=0)

# Show both tables as the cell output.
true_protein_expression, predicted_protein_expression

(      ADT_CD102_A0104  ADT_CD103_A0201  ADT_CD106_A0226  \
 0                65.0              5.0              3.0   
 1                73.0              4.0              4.0   
 2                45.0              6.0              3.0   
 3                78.0              9.0             11.0   
 4                64.0              3.0              5.0   
 ...               ...              ...              ...   
 7559             52.0              2.0              5.0   
 7560             45.0              8.0              0.0   
 7561            124.0              2.0              6.0   
 7562             30.0              9.0              6.0   
 7563             73.0              2.0              5.0   
 
       ADT_CD115(CSF-1R)_A0105  ADT_CD117(c-Kit)_A0012  ADT_CD11a_A0595  \
 0                         2.0                     2.0             71.0   
 1                         2.0                     2.0             20.0   
 2                         3.0                     3.

## Get protein names and cell names

In [4]:
# Load the test-set `.h5ad` file; only its `.obs` index is used below, as the
# source of the cell names.
# `anndata.read` is a deprecated alias, so the explicit `read_h5ad` reader is
# called instead.
test_protein_data = anndata.read_h5ad(
    os.path.join(outputs_dir, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/scMM/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_test_mod2.h5ad"))

# Protein names come from the ground-truth table's columns; cell names come
# from the AnnData observations.
protein_names = np.array(true_protein_expression.columns)
cell_names = np.array(test_protein_data.obs.index)

# The text tables are displayed with a plain integer row index, so the cell
# names can only be attached to their rows by position (assumed -- confirm
# against evaluation_utils). Fail early if the two sources disagree in length
# instead of producing mislabelled per-cell metrics.
assert len(cell_names) == len(true_protein_expression), (
    f"{len(cell_names)} cell names but {len(true_protein_expression)} ground-truth rows"
)

protein_names, cell_names

(array(['ADT_CD102_A0104', 'ADT_CD103_A0201', 'ADT_CD106_A0226',
        'ADT_CD115(CSF-1R)_A0105', 'ADT_CD117(c-Kit)_A0012',
        'ADT_CD11a_A0595', 'ADT_CD11c_A0106', 'ADT_CD122(IL-2Rb)_A0227',
        'ADT_CD127(IL-7Ra)_A0198', 'ADT_CD134(OX-40)_A0195',
        'ADT_CD135_A0098', 'ADT_CD137_A0194', 'ADT_CD140a_A0573',
        'ADT_CD14_A0424', 'ADT_CD15(SSEA-1)_A0076',
        'ADT_CD150(SLAM)_A0203', 'ADT_CD16-32_A0109',
        'ADT_CD169(Siglec-1)_A0440', 'ADT_CD172a(SIRPa)_A0422',
        'ADT_CD183(CXCR3)_A0228', 'ADT_CD184(CXCR4)_A0444',
        'ADT_CD192(CCR2)_A0426', 'ADT_CD195(CCR5)_A0376',
        'ADT_CD196(CCR6)_A0225', 'ADT_CD197(CCR7)_A0377', 'ADT_CD19_A0093',
        'ADT_CD200(OX2)_A0079', 'ADT_CD201(EPCR)_A0439',
        'ADT_CD204(Msr1)_A0448', 'ADT_CD206(MMR)_A0173', 'ADT_CD20_A0192',
        'ADT_CD21-CD35(CR2-CR1)_A0107', 'ADT_CD223(LAG-3)_A0378',
        'ADT_CD23_A0108', 'ADT_CD24_A0212', 'ADT_CD25_A0097',
        'ADT_CD274(B7-H1_PD-L1)_A0190', 'ADT_CD278

## Calculate PCC at the protein level

In [5]:
# One PCC value per protein, computed on the raw (untransformed) predicted
# and ground-truth tables. The helper is not defined in this notebook;
# presumably it comes from the star import of `evaluation_utils`.
p_corrs_protein_level = calculate_PCC_array_protein_level(
    protein_names,
    predicted_protein_expression,
    true_protein_expression,
)
p_corrs_protein_level

Unnamed: 0,PCC
ADT_CD102_A0104,0.289841
ADT_CD103_A0201,0.127997
ADT_CD106_A0226,0.197379
ADT_CD115(CSF-1R)_A0105,0.227959
ADT_CD117(c-Kit)_A0012,0.219482
...,...
ADT_TER-119-ErythroidCells_A0122,0.021239
ADT_Tim-4_A0567,0.150852
ADT_XCR1_A0568,0.056257
ADT_anti-P2RY12_A0415,0.015256


## Perform Z-score transformation at the protein level

In [6]:
def _standardize_columns(frame):
    """Return `frame` passed through a freshly fitted StandardScaler.

    The scaler is fitted on `frame.values` only, and the result is wrapped
    back into a DataFrame carrying the original index and column labels.
    """
    scaled_values = StandardScaler().fit_transform(frame.values)
    return pd.DataFrame(scaled_values, index=frame.index, columns=frame.columns)


# Each table is standardized independently, with its own scaler.
true_protein_expression_zscore = _standardize_columns(true_protein_expression)
predicted_protein_expression_zscore = _standardize_columns(predicted_protein_expression)
true_protein_expression_zscore, predicted_protein_expression_zscore

(      ADT_CD102_A0104  ADT_CD103_A0201  ADT_CD106_A0226  \
 0            0.205191        -0.313454        -0.235714   
 1            0.418915        -0.382112        -0.172739   
 2           -0.329120        -0.244796        -0.235714   
 3            0.552493        -0.038822         0.268084   
 4            0.178476        -0.450770        -0.109765   
 ...               ...              ...              ...   
 7559        -0.142111        -0.519428        -0.109765   
 7560        -0.329120        -0.107480        -0.424639   
 7561         1.781407        -0.519428        -0.046790   
 7562        -0.729852        -0.038822        -0.046790   
 7563         0.418915        -0.519428        -0.109765   
 
       ADT_CD115(CSF-1R)_A0105  ADT_CD117(c-Kit)_A0012  ADT_CD11a_A0595  \
 0                   -0.138495               -0.213718        -0.537360   
 1                   -0.138495               -0.213718        -0.893460   
 2                   -0.066630               -0.15367

## Calculate RMSE at the protein level

In [7]:
# One RMSE value per protein, computed on the z-scored tables rather than on
# the raw ones.
rmses_protein_level = calculate_RMSE_array_protein_level(
    protein_names,
    predicted_protein_expression_zscore,
    true_protein_expression_zscore,
)
rmses_protein_level

Unnamed: 0,RMSE
ADT_CD102_A0104,1.191771
ADT_CD103_A0201,1.320608
ADT_CD106_A0226,1.266981
ADT_CD115(CSF-1R)_A0105,1.242611
ADT_CD117(c-Kit)_A0012,1.249414
...,...
ADT_TER-119-ErythroidCells_A0122,1.399114
ADT_Tim-4_A0567,1.303187
ADT_XCR1_A0568,1.373858
ADT_anti-P2RY12_A0415,1.403384


## Calculate PCC at the cell level

In [8]:
# One PCC value per cell, computed on the raw (untransformed) predicted and
# ground-truth tables.
p_corrs_cell_level = calculate_PCC_array_cell_level(
    cell_names,
    predicted_protein_expression,
    true_protein_expression,
)
p_corrs_cell_level

Unnamed: 0,PCC
AAACCCAAGAATCTAG-2,0.628491
AAACCCACACCGGAAA-2,0.614829
AAACCCACACTACTTT-2,0.759845
AAACCCAGTAGGCAAC-2,0.747273
AAACCCAGTCTCAGGC-2,0.544027
...,...
TTTGTTGGTAAGATCA-2,0.703010
TTTGTTGGTCAGACTT-2,0.859920
TTTGTTGGTGTTTACG-2,0.759271
TTTGTTGTCAGAATAG-2,0.788569


## Perform $\ell_2$ normalization at the cell level

In [9]:
# Constant added to every row norm so that an all-zero row divides to zeros
# instead of 0/0 = NaN.
L2_NORM_EPS = 1e-08


def _row_l2_norms(frame, eps=L2_NORM_EPS):
    """Return the Euclidean norm of each row of `frame`, offset by `eps`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Numeric table with one row per cell.
    eps : float
        Offset added to every norm to keep later divisions finite.

    Returns
    -------
    pandas.Series
        One value per row, sharing the index of `frame`.
    """
    return np.sqrt(np.multiply(frame, frame).sum(axis=1)) + eps


# The epsilon guard was previously applied to the predicted norms only, so a
# ground-truth cell with all-zero counts produced NaN after the division.
# Both tables now go through the same guarded norm. For non-zero rows this
# shifts the ground-truth values only by a relative amount of eps / norm.
l2norm_true_protein_expression = _row_l2_norms(true_protein_expression)
l2norm_predicted_protein_expression = _row_l2_norms(predicted_protein_expression)

# Reshape the norms to a column vector so the division broadcasts across each
# row by position, without index alignment.
true_protein_expression_l2norm = true_protein_expression / l2norm_true_protein_expression.values.reshape(-1, 1)
predicted_protein_expression_l2norm = predicted_protein_expression / l2norm_predicted_protein_expression.values.reshape(-1, 1)
true_protein_expression_l2norm, predicted_protein_expression_l2norm

(      ADT_CD102_A0104  ADT_CD103_A0201  ADT_CD106_A0226  \
 0            0.078820         0.006063         0.003638   
 1            0.094102         0.005156         0.005156   
 2            0.037785         0.005038         0.002519   
 3            0.091423         0.010549         0.012893   
 4            0.061779         0.002896         0.004827   
 ...               ...              ...              ...   
 7559         0.055467         0.002133         0.005333   
 7560         0.039294         0.006986         0.000000   
 7561         0.244979         0.003951         0.011854   
 7562         0.027464         0.008239         0.005493   
 7563         0.043529         0.001193         0.002981   
 
       ADT_CD115(CSF-1R)_A0105  ADT_CD117(c-Kit)_A0012  ADT_CD11a_A0595  \
 0                    0.002425                0.002425         0.086096   
 1                    0.002578                0.002578         0.025781   
 2                    0.002519                0.00251

## Calculate RMSE at the cell level

In [10]:
# One RMSE value per cell, computed on the row-wise L2-normalized tables
# rather than on the raw ones.
rmses_cell_level = calculate_RMSE_array_cell_level(
    cell_names,
    predicted_protein_expression_l2norm,
    true_protein_expression_l2norm,
)
rmses_cell_level

Unnamed: 0,RMSE
AAACCCAAGAATCTAG-2,0.078780
AAACCCACACCGGAAA-2,0.079593
AAACCCACACTACTTT-2,0.063186
AAACCCAGTAGGCAAC-2,0.063116
AAACCCAGTCTCAGGC-2,0.087577
...,...
TTTGTTGGTAAGATCA-2,0.069875
TTTGTTGGTCAGACTT-2,0.048177
TTTGTTGGTGTTTACG-2,0.062357
TTTGTTGTCAGAATAG-2,0.059874


## Save evaluation results

In [11]:
# Each metric table is paired with its destination workbook (relative to
# `outputs_dir`) and written in the order listed.
evaluation_outputs = [
    (p_corrs_protein_level, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/scMM/PCC_protein_level.xlsx"),
    (rmses_protein_level, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/scMM/RMSE_protein_level.xlsx"),
    (p_corrs_cell_level, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/scMM/PCC_cell_level.xlsx"),
    (rmses_cell_level, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/scMM/RMSE_cell_level.xlsx"),
]
for result_table, relative_path in evaluation_outputs:
    result_table.to_excel(os.path.join(outputs_dir, relative_path))