## Set path

In [1]:
import os
# Switch the working directory to the project's `code/` folder (presumably so
# the local `evaluation_utils` module imported below can be found -- confirm)
# and derive the absolute path of the sibling `outputs/` folder.
#
# The guard makes this cell safe to re-run in a live kernel: once we are inside
# `code/` (and there is no nested `code/code/`), a second chdir would fail or
# point `outputs_dir` at the wrong place, so it is skipped. On a fresh kernel
# the behaviour is unchanged, including the FileNotFoundError when `code/` is
# missing.
_cwd = os.getcwd()
_code_dir = os.path.join(_cwd, 'code/')
_already_in_code_dir = os.path.basename(_cwd) == 'code' and not os.path.isdir(_code_dir)
if not _already_in_code_dir:
    os.chdir(_code_dir)
outputs_dir = os.path.abspath(os.path.join(os.getcwd(), '../outputs'))

## Load necessary libraries

In [2]:
from evaluation_utils import *
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

## Load prediction and ground truth
### Seurat v3 (CCA), Seurat v3 (PCA), Seurat v4 (CCA), Seurat v4 (PCA)

In [3]:
# Both files are space-delimited tables whose first column holds the row
# labels. They are transposed after loading so that, as the displayed output
# shows, rows are cell barcodes and columns are proteins.
true_protein_path = os.path.join(
    outputs_dir,
    "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/Seurat/test_protein_clr.txt",
)
predicted_protein_path = os.path.join(
    outputs_dir,
    "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/Seurat/test_protein_prediction (Seurat v3 (CCA)).txt",
)

true_protein_expression = pd.read_table(true_protein_path, sep=" ", index_col=0).transpose()
predicted_protein_expression = pd.read_table(predicted_protein_path, sep=" ", index_col=0).transpose()

# Display both frames as the cell output.
true_protein_expression, predicted_protein_expression

(                    ADT-CD102-A0104  ADT-CD103-A0201  ADT-CD106-A0226  \
 AAACCCAAGAATCTAG-2         2.335790         0.541323         0.358350   
 AAACCCACACCGGAAA-2         2.545600         0.497062         0.497062   
 AAACCCACACTACTTT-2         2.014434         0.623903         0.359839   
 AAACCCAGTAGGCAAC-2         2.317535         0.720685         0.828758   
 AAACCCAGTCTCAGGC-2         2.432592         0.396730         0.594206   
 ...                             ...              ...              ...   
 TTTGTTGGTAAGATCA-2         2.144180         0.254494         0.544950   
 TTTGTTGGTCAGACTT-2         2.108415         0.826915         0.000000   
 TTTGTTGGTGTTTACG-2         3.195932         0.320597         0.757929   
 TTTGTTGTCAGAATAG-2         1.784287         0.910916         0.688670   
 TTTGTTGTCGAGTGGA-2         2.249596         0.208992         0.458114   
 
                     ADT-CD115(CSF-1R)-A0105  ADT-CD117(c-Kit)-A0012  \
 AAACCCAAGAATCTAG-2                 0.

## Get protein names and cell names

In [4]:
# `DataFrame.axes` is [row labels, column labels]; copy each into a plain NumPy
# array. Rows are cells and columns are proteins in the ground-truth frame.
cell_names, protein_names = map(np.array, true_protein_expression.axes)

# Display both label arrays as the cell output.
protein_names, cell_names

(array(['ADT-CD102-A0104', 'ADT-CD103-A0201', 'ADT-CD106-A0226',
        'ADT-CD115(CSF-1R)-A0105', 'ADT-CD117(c-Kit)-A0012',
        'ADT-CD11a-A0595', 'ADT-CD11c-A0106', 'ADT-CD122(IL-2Rb)-A0227',
        'ADT-CD127(IL-7Ra)-A0198', 'ADT-CD134(OX-40)-A0195',
        'ADT-CD135-A0098', 'ADT-CD137-A0194', 'ADT-CD14-A0424',
        'ADT-CD140a-A0573', 'ADT-CD15(SSEA-1)-A0076',
        'ADT-CD150(SLAM)-A0203', 'ADT-CD16-32-A0109',
        'ADT-CD169(Siglec-1)-A0440', 'ADT-CD172a(SIRPa)-A0422',
        'ADT-CD183(CXCR3)-A0228', 'ADT-CD184(CXCR4)-A0444',
        'ADT-CD19-A0093', 'ADT-CD192(CCR2)-A0426', 'ADT-CD195(CCR5)-A0376',
        'ADT-CD196(CCR6)-A0225', 'ADT-CD197(CCR7)-A0377', 'ADT-CD20-A0192',
        'ADT-CD200(OX2)-A0079', 'ADT-CD201(EPCR)-A0439',
        'ADT-CD204(Msr1)-A0448', 'ADT-CD206(MMR)-A0173',
        'ADT-CD21-CD35(CR2-CR1)-A0107', 'ADT-CD223(LAG-3)-A0378',
        'ADT-CD23-A0108', 'ADT-CD24-A0212', 'ADT-CD25-A0097',
        'ADT-CD274(B7-H1-PD-L1)-A0190', 'ADT-CD278

## Calculate PCC at the protein level

In [5]:
# One PCC value per protein (see the displayed table), computed on the
# untransformed prediction vs. ground truth. The helper comes from the
# `evaluation_utils` star import; PCC is presumably the Pearson correlation
# coefficient -- confirm against that module.
p_corrs_protein_level = calculate_PCC_array_protein_level(
    protein_names,
    predicted_protein_expression,
    true_protein_expression,
)
p_corrs_protein_level

Unnamed: 0,PCC
ADT-CD102-A0104,0.701199
ADT-CD103-A0201,0.785138
ADT-CD106-A0226,0.583109
ADT-CD115(CSF-1R)-A0105,0.677170
ADT-CD117(c-Kit)-A0012,0.639016
...,...
ADT-TER-119-ErythroidCells-A0122,0.109484
ADT-Tim-4-A0567,0.597788
ADT-XCR1-A0568,0.399823
ADT-anti-P2RY12-A0415,0.037725


## Perform Z-score transformation at the protein level

In [6]:
def _zscore_per_protein(expression):
    """Return `expression` standardized column-wise, keeping its row/column labels.

    A fresh StandardScaler is fitted on the frame's own values, so every frame
    passed in is scaled using its own per-column statistics.
    """
    scaled_values = StandardScaler().fit_transform(expression.values)
    return pd.DataFrame(scaled_values, index=expression.index, columns=expression.columns)


# Ground truth and prediction are standardized independently of each other.
true_protein_expression_zscore = _zscore_per_protein(true_protein_expression)
predicted_protein_expression_zscore = _zscore_per_protein(predicted_protein_expression)

# Display both standardized frames as the cell output.
true_protein_expression_zscore, predicted_protein_expression_zscore

(                    ADT-CD102-A0104  ADT-CD103-A0201  ADT-CD106-A0226  \
 AAACCCAAGAATCTAG-2         0.528275        -0.289040        -0.475452   
 AAACCCACACCGGAAA-2         0.932322        -0.380566        -0.078409   
 AAACCCACACTACTTT-2        -0.090581        -0.118275        -0.471191   
 AAACCCAGTAGGCAAC-2         0.493122         0.081856         0.871030   
 AAACCCAGTCTCAGGC-2         0.714695        -0.588038         0.199653   
 ...                             ...              ...              ...   
 TTTGTTGGTAAGATCA-2         0.159279        -0.882162         0.058667   
 TTTGTTGGTCAGACTT-2         0.090404         0.301524        -1.501184   
 TTTGTTGGTGTTTACG-2         2.184711        -0.745470         0.668290   
 TTTGTTGTCAGAATAG-2        -0.533792         0.475227         0.470046   
 TTTGTTGTCGAGTGGA-2         0.362286        -0.976254        -0.189892   
 
                     ADT-CD115(CSF-1R)-A0105  ADT-CD117(c-Kit)-A0012  \
 AAACCCAAGAATCTAG-2                -0.

## Calculate RMSE at the protein level

In [7]:
# One RMSE value per protein (see the displayed table), computed on the
# z-scored frames rather than the raw ones. The helper comes from the
# `evaluation_utils` star import.
rmses_protein_level = calculate_RMSE_array_protein_level(
    protein_names,
    predicted_protein_expression_zscore,
    true_protein_expression_zscore,
)
rmses_protein_level

Unnamed: 0,RMSE
ADT-CD102-A0104,0.773047
ADT-CD103-A0201,0.655533
ADT-CD106-A0226,0.913116
ADT-CD115(CSF-1R)-A0105,0.803529
ADT-CD117(c-Kit)-A0012,0.849687
...,...
ADT-TER-119-ErythroidCells-A0122,1.334553
ADT-Tim-4-A0567,0.896897
ADT-XCR1-A0568,1.095606
ADT-anti-P2RY12-A0415,1.387282


## Calculate PCC at the cell level

In [8]:
# One PCC value per cell barcode (see the displayed table), computed on the
# untransformed prediction vs. ground truth. The helper comes from the
# `evaluation_utils` star import.
p_corrs_cell_level = calculate_PCC_array_cell_level(
    cell_names,
    predicted_protein_expression,
    true_protein_expression,
)
p_corrs_cell_level

Unnamed: 0,PCC
AAACCCAAGAATCTAG-2,0.970795
AAACCCACACCGGAAA-2,0.945821
AAACCCACACTACTTT-2,0.967086
AAACCCAGTAGGCAAC-2,0.958689
AAACCCAGTCTCAGGC-2,0.971983
...,...
TTTGTTGGTAAGATCA-2,0.960083
TTTGTTGGTCAGACTT-2,0.963319
TTTGTTGGTGTTTACG-2,0.928862
TTTGTTGTCAGAATAG-2,0.950427


## Perform $\ell_2$ normalization at the cell level

In [9]:
# Row-wise (per-cell) Euclidean norm: square every entry, sum across the
# proteins of each cell, take the square root.
l2norm_true_protein_expression = np.sqrt(
    (true_protein_expression * true_protein_expression).sum(axis=1)
)
# NOTE(review): the 1e-08 offset is applied to the predicted norms only. A
# ground-truth row of all zeros would therefore still divide by zero below --
# confirm whether this asymmetry is intentional.
l2norm_predicted_protein_expression = np.sqrt(
    (predicted_protein_expression * predicted_protein_expression).sum(axis=1)
) + 1e-08

# Turn each norm vector into a column so it broadcasts across the proteins of
# the matching row, scaling every cell's profile by its own norm.
true_protein_expression_l2norm = (
    true_protein_expression / l2norm_true_protein_expression.to_numpy()[:, np.newaxis]
)
predicted_protein_expression_l2norm = (
    predicted_protein_expression / l2norm_predicted_protein_expression.to_numpy()[:, np.newaxis]
)

# Display both normalized frames as the cell output.
true_protein_expression_l2norm, predicted_protein_expression_l2norm

(                    ADT-CD102-A0104  ADT-CD103-A0201  ADT-CD106-A0226  \
 AAACCCAAGAATCTAG-2         0.176323         0.040863         0.027051   
 AAACCCACACCGGAAA-2         0.212431         0.041480         0.041480   
 AAACCCACACTACTTT-2         0.150032         0.046467         0.026800   
 AAACCCAGTAGGCAAC-2         0.174176         0.054164         0.062286   
 AAACCCAGTCTCAGGC-2         0.174748         0.028499         0.042685   
 ...                             ...              ...              ...   
 TTTGTTGGTAAGATCA-2         0.164200         0.019489         0.041732   
 TTTGTTGGTCAGACTT-2         0.157023         0.061584         0.000000   
 TTTGTTGGTGTTTACG-2         0.259855         0.026067         0.061626   
 TTTGTTGTCAGAATAG-2         0.137930         0.070416         0.053236   
 TTTGTTGTCGAGTGGA-2         0.148291         0.013777         0.030198   
 
                     ADT-CD115(CSF-1R)-A0105  ADT-CD117(c-Kit)-A0012  \
 AAACCCAAGAATCTAG-2                 0.

## Calculate RMSE at the cell level

In [10]:
# One RMSE value per cell barcode (see the displayed table), computed on the
# L2-normalized frames rather than the raw ones. The helper comes from the
# `evaluation_utils` star import.
rmses_cell_level = calculate_RMSE_array_cell_level(
    cell_names,
    predicted_protein_expression_l2norm,
    true_protein_expression_l2norm,
)
rmses_cell_level

Unnamed: 0,RMSE
AAACCCAAGAATCTAG-2,0.017501
AAACCCACACCGGAAA-2,0.023091
AAACCCACACTACTTT-2,0.018681
AAACCCAGTAGGCAAC-2,0.020327
AAACCCAGTCTCAGGC-2,0.017959
...,...
TTTGTTGGTAAGATCA-2,0.020327
TTTGTTGGTCAGACTT-2,0.020313
TTTGTTGGTGTTTACG-2,0.027130
TTTGTTGTCAGAATAG-2,0.022913


## Save evaluation results

In [11]:
# Write each metric table to its own Excel workbook under `outputs_dir`, in the
# same order as before: protein-level PCC/RMSE first, then cell-level PCC/RMSE.
evaluation_outputs = [
    (p_corrs_protein_level, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/Seurat/PCC_protein_level (Seurat v3 (CCA)).xlsx"),
    (rmses_protein_level, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/Seurat/RMSE_protein_level (Seurat v3 (CCA)).xlsx"),
    (p_corrs_cell_level, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/Seurat/PCC_cell_level (Seurat v3 (CCA)).xlsx"),
    (rmses_cell_level, "different samples/CITE-SLN111-Gayoso-Mouse1toMouse2/Seurat/RMSE_cell_level (Seurat v3 (CCA)).xlsx"),
]
for metric_table, relative_path in evaluation_outputs:
    metric_table.to_excel(os.path.join(outputs_dir, relative_path))