# Evaluate parallelTest.sh Results
```
Andrew E. Davidson
aedavids@ucsc.edu
11/6/22
```

CIBERSORTx took 3.5 days to run on the GTEx_TCGA training data set.

<span style="color:green">It is possible to split the training data set into shards, run cibersort concurrently on each shard, and re-assemble the results</span>

ref:
- extraCellularRNA/terra/cibersortx/cibersortParallelization.md 
- extraCellularRNA/terra/cibersortx/bin/parallelTest.sh

results:
- Fractions are the same
- 'Mixture', 'Correlation', and  'RMSE' are the same
- 16 out of 99 p-values are different (differences shown as full-run p-value minus combined-shard p-value):
    ```
    array([-0.01, -0.03, -0.01, -0.02, -0.03, -0.02, -0.02, -0.04, -0.01,
               -0.03, -0.02, -0.02, -0.03, -0.01, -0.02, -0.02])
    ```

TODO:
- estimate run time

In [1]:
from IPython.display import display
import numpy as np
import pandas as pd
import pathlib as pl

In [2]:
# List the parallelTest.sh output directories under /scratch/aedavids/cibersort.out
# (-d prints the directory names themselves rather than their contents).
! ls -d /scratch/aedavids/cibersort.out/parallelTest.sh-mixtureMa*

/scratch/aedavids/cibersort.out/parallelTest.sh-mixtureMatrix100.tvs-2022-11-06-14.55.05-PST
/scratch/aedavids/cibersort.out/parallelTest.sh-mixtureMatrixLeft50.tsv-2022-11-06-14.57.10-PST
/scratch/aedavids/cibersort.out/parallelTest.sh-mixtureMatrixRight50.tsv-2022-11-06-14.57.43-PST


In [3]:
# Build the path to the results file of the mixtureMatrix100 run.
# NOTE: the ".tvs" spelling matches the directory name on disk; do not "correct" it.
rootDir = pl.Path("/scratch/aedavids/cibersort.out")
mixtureMatrix100Dir = rootDir / "parallelTest.sh-mixtureMatrix100.tvs-2022-11-06-14.55.05-PST"
m100f = "CIBERSORTx_parallelTest.sh-mixtureMatrix100.tvs-2022-11-06-14.55.05-PST_Results.txt"
resultsMixture100File = mixtureMatrix100Dir / m100f

In [4]:
# Build the path to the results file of the mixtureMatrixLeft50 shard run.
mixtureMatrixLeft50Dir = rootDir / "parallelTest.sh-mixtureMatrixLeft50.tsv-2022-11-06-14.57.10-PST"
l50f = "CIBERSORTx_parallelTest.sh-mixtureMatrixLeft50.tsv-2022-11-06-14.57.10-PST_Results.txt"
resultsMixtureLeft50File = mixtureMatrixLeft50Dir / l50f

In [5]:
# Build the path to the results file of the mixtureMatrixRight50 shard run.
mixtureMatrixRight50Dir = rootDir / "parallelTest.sh-mixtureMatrixRight50.tsv-2022-11-06-14.57.43-PST"
r50f = "CIBERSORTx_parallelTest.sh-mixtureMatrixRight50.tsv-2022-11-06-14.57.43-PST_Results.txt"
resultsMixtureRight50File = mixtureMatrixRight50Dir / r50f

In [6]:
# Load the tab-separated results of the mixtureMatrix100 run.
results100DF = pd.read_csv(
    resultsMixture100File,
    sep="\t",
)
print(results100DF.shape)
# Preview only the sample id and the per-sample statistic columns.
results100DF[['Mixture', 'P-value', 'Correlation', 'RMSE']].head()

(99, 87)


Unnamed: 0,Mixture,P-value,Correlation,RMSE
0,GTEX-1117F-0226-SM-5GZZ7,0.0,0.985425,0.926455
1,GTEX-1117F-0526-SM-5EGHJ,0.0,0.979791,0.934695
2,GTEX-1117F-0726-SM-5GIEN,0.0,0.984906,0.447916
3,GTEX-1117F-2826-SM-5GZXL,0.0,0.988464,0.907675
4,GTEX-1117F-3226-SM-5N9CT,0.0,0.917052,0.949553


In [7]:
# Load the tab-separated results of the mixtureMatrixLeft50 shard run.
resultsLeft50DF = pd.read_csv(
    resultsMixtureLeft50File,
    sep="\t",
)
print(resultsLeft50DF.shape)
# Preview only the sample id and the per-sample statistic columns.
resultsLeft50DF[['Mixture', 'P-value', 'Correlation', 'RMSE']].head()

(49, 87)


Unnamed: 0,Mixture,P-value,Correlation,RMSE
0,GTEX-1117F-0226-SM-5GZZ7,0.0,0.985425,0.926455
1,GTEX-1117F-0526-SM-5EGHJ,0.0,0.979791,0.934695
2,GTEX-1117F-0726-SM-5GIEN,0.0,0.984906,0.447916
3,GTEX-1117F-2826-SM-5GZXL,0.0,0.988464,0.907675
4,GTEX-1117F-3226-SM-5N9CT,0.0,0.917052,0.949553


In [8]:
# Load the tab-separated results of the mixtureMatrixRight50 shard run.
resultsRight50DF = pd.read_csv(
    resultsMixtureRight50File,
    sep="\t",
)
print(resultsRight50DF.shape)
# 'Mixture' holds our sample id; preview it next to the per-sample statistic columns.
resultsRight50DF[['Mixture', 'P-value', 'Correlation', 'RMSE']].head()

(50, 87)


Unnamed: 0,Mixture,P-value,Correlation,RMSE
0,GTEX-1122O-0008-SM-5QGR2,0.08,0.119214,0.997317
1,GTEX-1122O-0126-SM-5GICA,0.0,0.982535,0.66133
2,GTEX-1122O-0226-SM-5N9DA,0.0,0.99779,0.120497
3,GTEX-1122O-0326-SM-5H124,0.0,0.95875,0.597504
4,GTEX-1122O-0526-SM-5N9DM,0.01,0.744702,0.934792


## combine top and bottom results
Make sure top and bottom (i.e. the left and right shards) do not share any mixtures (i.e. sample ids).

In [9]:
# Pull the sample ids (the 'Mixture' column) out of each result set and
# print how many ids each one contains.
mix100IdsNP = results100DF.loc[:, "Mixture"].to_numpy()
print(mix100IdsNP.shape)

l50IdsNP = resultsLeft50DF.loc[:, "Mixture"].to_numpy()
print(l50IdsNP.shape)

r50IdsNP = resultsRight50DF.loc[:, "Mixture"].to_numpy()
print(r50IdsNP.shape)

# The left and right shards must be disjoint: a shared id would appear twice
# once the two result sets are concatenated.
rightLeftIntersection = np.intersect1d(l50IdsNP, r50IdsNP)
print(rightLeftIntersection.size)
assert rightLeftIntersection.size == 0, "ERROR left and right have mixture (ie sample) ids in common"


(99,)
(49,)
(50,)
0


In [10]:
# Stack the left and right shard results into one frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of both shards would be repeated (each shard starts at 0),
# so any label-based comparison against results100DF would misalign.
testDF = pd.concat([resultsLeft50DF, resultsRight50DF], ignore_index=True)
print(results100DF.shape)
print(testDF.shape)
assert results100DF.shape == testDF.shape, "ERROR the shape of the combination of left and right does not equal results100"

(99, 87)
(99, 87)


## Do the combined resultsLeft50DF + resultsRight50DF data sets equal results100DF?

In [11]:
# make sure the mixture ids (ie samples) are identical and in the same order.
# np.array_equal checks shape and every element, so the check no longer depends
# on a hard-coded row count.
assert np.array_equal(
    testDF.loc[:, "Mixture"].to_numpy(),
    results100DF.loc[:, "Mixture"].to_numpy(),
), "ERROR mixture (ie sample) ids differ or are not in the same order"

In [12]:
# check cibersort results stats

# Compare only the numeric statistic columns ('Mixture' is an object column).
statCols = ['P-value', 'Correlation', 'RMSE']
expected = results100DF.loc[:, statCols].to_numpy()
ret = testDF.loc[:, statCols].to_numpy()

atol = 1e-08  # numpy default is 1e-08
rtol = 1e-05  # numpy default is 1e-05
isMetricClose = np.isclose(expected, ret, rtol=rtol, atol=atol)

# Number of rows that match, per column, in statCols order.
# A previous run with atol=1e-08, rtol=1e-05 gave [83 99 99].
print(isMetricClose.shape)
print("sum(isMetricClose): {}".format(isMetricClose.sum(axis=0)))

# Column 0 is 'P-value'; ~ negates the mask to select the rows that differ.
pValueDiffers = ~isMetricClose[:, 0]
print("{} out of {} p-values are different".format(pValueDiffers.sum(), pValueDiffers.size))

# Differences for the mismatching rows: results100DF p-value minus combined p-value.
expected[pValueDiffers, 0] - ret[pValueDiffers, 0]

(99, 3)
sum(isMetricClose): [83 99 99]
some p-values are diffent


array([-0.01, -0.03, -0.01, -0.02, -0.03, -0.02, -0.02, -0.04, -0.01,
       -0.03, -0.02, -0.02, -0.03, -0.01, -0.02, -0.02])

In [13]:
# Verify that the estimated fractions agree between the combined shards and
# the mixtureMatrix100 run.

# Names of the fraction columns, in the order used for the comparison.
frationComponentsList = [
    'ACC', 'Adipose_Subcutaneous', 'Adipose_Visceral_Omentum', 'Adrenal_Gland',
    'Artery_Aorta', 'Artery_Coronary', 'Artery_Tibial',
    'BLCA', 'BRCA', 'Bladder',
    'Brain_Amygdala', 'Brain_Anterior_cingulate_cortex_BA24',
    'Brain_Caudate_basal_ganglia', 'Brain_Cerebellar_Hemisphere',
    'Brain_Cerebellum', 'Brain_Cortex', 'Brain_Frontal_Cortex_BA9',
    'Brain_Hippocampus', 'Brain_Hypothalamus',
    'Brain_Nucleus_accumbens_basal_ganglia', 'Brain_Putamen_basal_ganglia',
    'Brain_Spinal_cord_cervical_c-1', 'Brain_Substantia_nigra',
    'Breast_Mammary_Tissue',
    'CESC', 'CHOL', 'COAD',
    'Cells_Cultured_fibroblasts', 'Cells_EBV-transformed_lymphocytes',
    'Cervix_Endocervix', 'Colon_Sigmoid', 'Colon_Transverse',
    'DLBC', 'ESCA',
    'Esophagus_Gastroesophageal_Junction', 'Esophagus_Mucosa',
    'Esophagus_Muscularis',
    'GBM', 'HNSC',
    'Heart_Atrial_Appendage', 'Heart_Left_Ventricle',
    'KICH', 'KIRC', 'KIRP', 'Kidney_Cortex',
    'LGG', 'LIHC', 'LUAD', 'LUSC', 'Liver', 'Lung',
    'MESO', 'Minor_Salivary_Gland', 'Muscle_Skeletal', 'Nerve_Tibial',
    'OV', 'Ovary',
    'PAAD', 'PCPG', 'PRAD', 'Pancreas', 'Pituitary', 'Prostate',
    'READ', 'SARC', 'SKCM', 'STAD',
    'Skin_Not_Sun_Exposed_Suprapubic', 'Skin_Sun_Exposed_Lower_leg',
    'Small_Intestine_Terminal_Ileum', 'Spleen', 'Stomach',
    'TGCT', 'THCA', 'THYM', 'Testis', 'Thyroid',
    'UCEC', 'UCS', 'UVM', 'Uterus', 'Vagina',
    'Whole_Blood',
]

# Same columns, same order, from both result sets.
retDF = testDF[frationComponentsList]
expectedDF = results100DF[frationComponentsList]

# Element-wise comparison using numpy's default tolerances.
assert np.allclose(retDF, expectedDF), "Error fractions are different"