In [None]:
import pandas as pd

In [None]:
# Load your PAM50 gene-expression table: a tab-separated file whose first
# column is used as the row index.
gene_expr = pd.read_csv(
    './path/to/pam50_genes.txt',
    sep='\t',
    index_col=0,
)
gene_expr.head()

## PAM50 genes
| CLID   |   Sample_1 |   Sample_2 |     Sample_3 |   Sample_4 |   Sample_5 |   Sample_6 |   Sample_7 |   Sample_8 |   Sample_9 |    Sample_10 |
|:-------|-----------:|-----------:|-------------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-------------:|
| CDC20  |    1.74169 |   0.658006 | -0.500287    |   1.59368  |   1.24371  |   3.09567  |   1.14402  |  -0.71377  |    2.79974 |  0.500286    |
| KIF2C  |    1.66693 |   0.476314 | -0.322913    |   1.08879  |   1.37673  |   1.2593   |   0.877581 |  -0.771949 |    2.96251 |  0.322913    |
| PHGDH  |    1.53828 |  -0.450438 |  9.52667e-10 |  -0.354776 |   2.08242  |   0.415371 |   2.36549  |   0.492217 |    5.6434  | -0.317556    |
| CDCA1  |    1.59396 |   2.34256  | -1.3695      |  -0.031503 |   0.788258 |   0.520124 |   0.151055 |  -1.03284  |    1.65673 |  9.45093e-11 |
| UBE2T  |    1.41267 |   2.12426  | -0.648622    |   0.539226 |   2.08272  |   1.18369  |   0.24693  |  -0.735035 |    1.16661 | -0.08231     |


In [None]:
# Load the IHC class of every sample; expected labels are
# ERneg_HER2neg, ERpos_HER2neg, HER2pos_ERneg, HER2pos_ERpos and TNBC.
# The file is tab-separated and its first column is used as the row index.
classes = pd.read_csv(
    './path/to/IHC_class.txt',
    sep='\t',
    index_col=0,
)
classes.head()

## IHC class
| CLID     | Zhao_class    |
|:---------|:--------------|
| Sample_1 | HER2pos_ERneg |
| Sample_2 | HER2pos_ERneg |
| Sample_3 | HER2pos_ERneg |
| Sample_4 | HER2pos_ERneg |
| Sample_5 | HER2pos_ERneg |

In [None]:
# Load the sigma file extracted from the UN323 PAM50 training set
# (tab-separated, first column used as the row index).
sigma = pd.read_csv(
    './SIGMA.txt',
    sep='\t',
    index_col=0,
)
sigma.head()

In [None]:
def quantile_centering(expr_matrix, gene_quantile):
    """Center each gene (row) on one of its own quantiles.

    For every row of ``expr_matrix`` the quantile level registered for that
    gene in ``gene_quantile`` is looked up, the corresponding quantile of the
    row is computed across its columns, and that value is subtracted from the
    whole row.

    :param expr_matrix: pandas.DataFrame where rows are genes and columns are
      samples.
    :param gene_quantile: pandas.Series (or single-column pandas.DataFrame)
      indexed by gene, holding the quantile level to use for each gene of
      ``expr_matrix``; e.g. 0.5 centers each row on its median.

    :return: New centered DataFrame with the same index and columns as
      ``expr_matrix``; the input is not modified.
    :raises KeyError: if a gene of ``expr_matrix`` has no entry in
      ``gene_quantile``.
    :raises ValueError: if ``gene_quantile`` is a DataFrame that does not have
      exactly one column.
    """
    # A DataFrame row lookup would yield a Series of levels instead of a
    # scalar, which silently turns the centered row into NaN. Reduce a
    # single-column frame to the Series it represents.
    if isinstance(gene_quantile, pd.DataFrame):
        if gene_quantile.shape[1] != 1:
            raise ValueError(
                "gene_quantile must be a Series or a single-column DataFrame, "
                "got a DataFrame with {} columns".format(gene_quantile.shape[1])
            )
        gene_quantile = gene_quantile.iloc[:, 0]

    # Fail early, naming every gene that lacks a quantile level.
    missing = expr_matrix.index.difference(gene_quantile.index, sort=False)
    if len(missing) > 0:
        raise KeyError(
            "no quantile level for gene(s): {}".format(list(missing))
        )

    # One offset per gene: the requested quantile of that gene's row.
    offsets = pd.Series(
        [
            row.quantile(gene_quantile.loc[gene])
            for gene, row in expr_matrix.iterrows()
        ],
        index=expr_matrix.index,
        dtype=float,
    )

    # Row-wise subtraction returns a new frame, so integer inputs are
    # promoted to float instead of being written into in place.
    return expr_matrix.sub(offsets, axis=0)

In [None]:
# Center each IHC class of the test set separately: pick the samples that
# belong to the class and center them with that class's column of `sigma`
# (the per-gene quantile levels from the train set).
zhao_labels = classes['Zhao_class']
centered = []
for class_ in zhao_labels.unique():
    samples = classes.index[zhao_labels == class_]
    centered.append(quantile_centering(gene_expr[samples], sigma[class_]))

# Stitch the per-class blocks back together side by side (one column per sample).
result = pd.concat(centered, axis=1, sort=False)
result