# Create sample Test Integration data
We need to create count, colData, and estimated scaling factor files that match
the data we used in our upset plot unit test.

In [1]:
# use display() to print an HTML version of a data frame
# useful if the DataFrame output is not generated by the last line of the cell
from IPython.display import display

import numpy as np
import os
import pandas as pd

# Every output path in this notebook is anchored at the directory the notebook
# is executed from.
pwd = os.getcwd()
print(f'pwd: {pwd}')

# Location of the integration-test fixtures, relative to pwd. The joined path is
# not normalized, so the "../" segment remains visible in the printed value.
relativeOutdir = "../python/pipeline/dataFactory/test/data/testIntegration"
outdir = os.path.join(pwd, relativeOutdir)
print(f'\noutdir: {outdir}')

# exist_ok=True makes the cell safe to re-run when the directory already exists.
os.makedirs(outdir, exist_ok=True)

pwd: /private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/jupyterNotebooks

outdir: /private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/jupyterNotebooks/../python/pipeline/dataFactory/test/data/testIntegration


In [2]:
# Unique gene names in sorted order, taken from the 1vsAll results files, e.g.
# pipeline/dataFactory/test/data/testSignatureGenes/1vsAll/Whole_Blood_vs_all.results
# Commands used to list the gene column of each results file:
# (extraCellularRNA) aedavids@mustard $ tail -n +9 UVM_vs_all.results | cut -d , -f 1
# (extraCellularRNA) aedavids@mustard $ tail -n +9 Vagina_vs_all.results | cut -d , -f 1
# (extraCellularRNA) aedavids@mustard $ tail -n +9 Whole_Blood_vs_all.results | cut -d , -f 1
# sort | uniq

# The 25 names are kept as five rows of five purely for readability; each row
# becomes its own numpy string array.
geneRows = [
    ["AC090150.1", "AC104389.6", "AC114812.1", "AC244102.4", "ALAS2"],
    ["ANO1-AS1", "ARPP21", "C1orf61", "CXCR1", "HBD"],
    ["HBG2", "HBM", "LINC02612", "OLIG1", "OLIG2"],
    ["PCSEAT", "SILC1", "UGT2A3", "UVM_AAA", "UVM_V"],
    ["UVM_V_W", "UVM_V_W.1", "V_BBB", "V_W", "W_CCC"],
]
l1, l2, l3, l4, l5 = (np.array(row) for row in geneRows)

In [3]:
# Flatten the five rows into a single 1-D array of gene names, keeping row order.
geneRowArrays = (l1, l2, l3, l4, l5)
genesNP = np.concatenate(geneRowArrays)
genesNP

array(['AC090150.1', 'AC104389.6', 'AC114812.1', 'AC244102.4', 'ALAS2',
       'ANO1-AS1', 'ARPP21', 'C1orf61', 'CXCR1', 'HBD', 'HBG2', 'HBM',
       'LINC02612', 'OLIG1', 'OLIG2', 'PCSEAT', 'SILC1', 'UGT2A3',
       'UVM_AAA', 'UVM_V', 'UVM_V_W', 'UVM_V_W.1', 'V_BBB', 'V_W',
       'W_CCC'], dtype='<U10')

## Create knock out vectors

In [4]:
# Genes assigned to each category. "UVM_V" appears in both the UVM and Vagina
# lists, and "UVM_V_W" appears in both the Vagina and Whole_Blood lists; every
# other gene belongs to exactly one list.
uvmGenes = [
    "AC090150.1", "AC114812.1", "AC244102.4",
    "ANO1-AS1", "LINC02612", "PCSEAT",
    "UVM_AAA", "UVM_V", "UVM_V_W.1",
]
vaginaGenes = [
    "ARPP21", "C1orf61", "OLIG1",
    "OLIG2", "SILC1", "UGT2A3",
    "UVM_V", "UVM_V_W", "V_BBB",
]
wholeBloodGenes = [
    "AC104389.6", "ALAS2", "CXCR1",
    "HBD", "HBG2", "HBM",
    "UVM_V_W", "V_W", "W_CCC",
]

In [5]:
# Check for intersections: show which genes are shared between each pair of lists.
uvmGeneSet = set(uvmGenes)
vaginaGeneSet = set(vaginaGenes)
wholeBloodGeneSet = set(wholeBloodGenes)

print(f'uvm intersect vagina: {uvmGeneSet & vaginaGeneSet}' )
print(f'uvm intersect wholeBloodGenes: {uvmGeneSet & wholeBloodGeneSet}' )
print(f'vagina intersect wholeBloodGenes: {vaginaGeneSet & wholeBloodGeneSet}' )

uvm intersect vagina: {'UVM_V'}
uvm intersect wholeBloodGenes: set()
vagina intersect wholeBloodGenes: {'UVM_V_W'}


In [6]:
# np.isin() silently ignores names that do not occur in genesNP, so a typo in one
# of the gene lists would drop that gene from the fixture without any error.
# Fail fast if a list contains a name that is not in genesNP.
knownGenes = set(genesNP.tolist())
for listName, geneList in [("uvmGenes", uvmGenes),
                           ("vaginaGenes", vaginaGenes),
                           ("wholeBloodGenes", wholeBloodGenes)]:
    unknownGenes = sorted(set(geneList) - knownGenes)
    assert not unknownGenes, f'{listName} contains genes missing from genesNP: {unknownGenes}'

# Boolean masks aligned with genesNP: True where the gene is in the category's list.
uvmKnockOutNP        = np.isin(genesNP, uvmGenes)
print( f'uvmKnockOutNP\n{uvmKnockOutNP}' )
vaginaKnockOutNP     = np.isin(genesNP, vaginaGenes)
wholeBloodKnockOutNP = np.isin(genesNP, wholeBloodGenes)

uvmKnockOutNP
[ True False  True  True False  True False False False False False False
  True False False  True False False  True  True False  True False False
 False]


## Create count matrix

In [7]:
# (sample id, value, mask): multiplying a boolean mask by a number gives that
# number for genes in the category's list and 0 for every other gene.
sampleSpecs = [
    ("GTEX-1117F-0226-SM-UVM", 1,   uvmKnockOutNP),
    ("GTEX-1117F-0526-SM-UVM", 1.1, uvmKnockOutNP),
    ("GTEX-1117F-0726-SM-VAG", 3,   vaginaKnockOutNP),
    ("GTEX-1117F-0226-SM-VAG", 3.3, vaginaKnockOutNP),
    ("GTEX-1117F-0526-SM-WB",  5,   wholeBloodKnockOutNP),
    ("GTEX-1117F-0726-SM-WB",  5.5, wholeBloodKnockOutNP),
]

# geneId goes in first, followed by one column per sample in the order listed above.
dataDict = {"geneId" : genesNP}
for sampleName, value, knockOutNP in sampleSpecs:
    dataDict[sampleName] = value * knockOutNP

# Genes are the rows (index), samples are the columns.
countDF = pd.DataFrame(dataDict).set_index("geneId")

countDF

Unnamed: 0_level_0,GTEX-1117F-0226-SM-UVM,GTEX-1117F-0526-SM-UVM,GTEX-1117F-0726-SM-VAG,GTEX-1117F-0226-SM-VAG,GTEX-1117F-0526-SM-WB,GTEX-1117F-0726-SM-WB
geneId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AC090150.1,1,1.1,0,0.0,0,0.0
AC104389.6,0,0.0,0,0.0,5,5.5
AC114812.1,1,1.1,0,0.0,0,0.0
AC244102.4,1,1.1,0,0.0,0,0.0
ALAS2,0,0.0,0,0.0,5,5.5
ANO1-AS1,1,1.1,0,0.0,0,0.0
ARPP21,0,0.0,3,3.3,0,0.0
C1orf61,0,0.0,3,3.3,0,0.0
CXCR1,0,0.0,0,0.0,5,5.5
HBD,0,0.0,0,0.0,5,5.5


In [8]:
# Write the count matrix (geneId index included) to the integration-test directory.
countPath = os.path.join(outdir, "geneCounts.csv")
countDF.to_csv(countPath)

# Report the path only after the write has succeeded, so a failed to_csv() is
# never followed by a misleading "saved to" message.
print(f'\nsaved to : {countPath}')


saved to : /private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/jupyterNotebooks/../python/pipeline/dataFactory/test/data/testIntegration/geneCounts.csv


## Create Col Data

In [9]:
# One colData row per sample column of the count matrix.
sample_id = countDF.columns
numSamples = len(sample_id)

# The participant id is the first two dash-separated fields of the sample id,
# e.g. "GTEX-1117F-0226-SM-UVM" -> "GTEX-1117F".
participant_id = ["-".join(s.split("-")[:2]) for s in sample_id]

# Listed in the same order as the sample columns of countDF.
category = ["UVM", "UVM", "Vagina", "Vagina", "Whole_Blood", "Whole_Blood"]

# The remaining columns hold the same constant value for every sample.
gender  = ["Female"] * numSamples
age     = [66.0] * numSamples
dataSet = ['GTEx'] * numSamples

colDataDF = pd.DataFrame({
    "sample_id"      : sample_id,
    "participant_id" : participant_id,
    "category"       : category,
    "gender"         : gender,
    "age"            : age,
    "dataSet"        : dataSet,
})

display(colDataDF)

# index=False: the integer row index is not written to the file.
colDataPath = os.path.join(outdir, "colData.csv")
colDataDF.to_csv(colDataPath, index=False)
print(f'\nsaved to : {colDataPath}')

Unnamed: 0,sample_id,participant_id,category,gender,age,dataSet
0,GTEX-1117F-0226-SM-UVM,GTEX-1117F,UVM,Female,66.0,GTEx
1,GTEX-1117F-0526-SM-UVM,GTEX-1117F,UVM,Female,66.0,GTEx
2,GTEX-1117F-0726-SM-VAG,GTEX-1117F,Vagina,Female,66.0,GTEx
3,GTEX-1117F-0226-SM-VAG,GTEX-1117F,Vagina,Female,66.0,GTEx
4,GTEX-1117F-0526-SM-WB,GTEX-1117F,Whole_Blood,Female,66.0,GTEx
5,GTEX-1117F-0726-SM-WB,GTEX-1117F,Whole_Blood,Female,66.0,GTEx



saved to : /private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/jupyterNotebooks/../python/pipeline/dataFactory/test/data/testIntegration/colData.csv


## Create Estimated scaling factors

In [11]:
# Six scaling factors with the values 1..6, stored under the column name
# "sizeFactors(dds)".
sfSeries = pd.Series(np.arange(1,7, 1), name="sizeFactors(dds)")

# NOTE(review): this file is written to the testSignatureGenes/1vsAll directory,
# not to outdir (testIntegration) like the other files in this notebook —
# confirm that this is intended.
p = "../../deconvolutionAnalysis/python/pipeline/dataFactory/test/data/testSignatureGenes/1vsAll"
OnevsAllDir = os.path.join(pwd, p)

# Make sure the target directory exists, as is done for outdir, so to_csv() does
# not fail on a fresh checkout.
os.makedirs(OnevsAllDir, exist_ok=True)
estimatedScalingFactorsPath = os.path.join(OnevsAllDir, "estimatedSizeFactors.csv")

display( sfSeries )

# index=False: only the header and the values are written.
sfSeries.to_csv(estimatedScalingFactorsPath, index=False)

print(f'\nsaved to : {estimatedScalingFactorsPath}')

0    1
1    2
2    3
3    4
4    5
5    6
Name: sizeFactors(dds), dtype: int64


saved to : /private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/jupyterNotebooks/../../deconvolutionAnalysis/python/pipeline/dataFactory/test/data/testSignatureGenes/1vsAll/estimatedSizeFactors.csv
