# Create Test data
Create a small mock data set that we use to debug our Spark equivalents of the DESeq `tximport`, `DESeqDataSetFromTximport`, and `estimateSizeFactors` steps.

In [1]:
import pandas as pd
# show the working directory: the files created below are written with
# relative paths, so this is where they end up
!pwd

/private/home/aedavids/extraCellularRNA/juypterNotebooks/spark/testData/sparkDESeqTest2


# Create mock Salmon quant.sf files

In [2]:
sampleNames = [ "ctrl_1", "ctrl_2", "ctrl_3", "kras_1", "kras_2", "kras_3" ]

# ids are 1-based: txId_1 ... txId_<numTranscripts>
numTranscripts = 10
transcriptIds = [ "txId_{}".format( n ) for n in range( 1, numTranscripts + 1 ) ]
print(transcriptIds)

# two fewer genes than transcripts; names are 1-based like the transcript ids
numGenes = numTranscripts - 2
geneNames = [ "gene_{}".format( n ) for n in range( 1, numGenes + 1 ) ]
print(geneNames)


# Baseline transcript counts for each condition, chosen so they are easy
# to check by hand. salmon NumReads are floats, so every value is a float
ctrlNumReads = [ 10.0 + k for k in range( numTranscripts ) ]
krasNumReads = [ 100.0 + 10 * k for k in range( numTranscripts ) ]

# Zero the first transcript in both conditions so we can test prefiltering.
# Assume the gene it belongs to only has this single transcript
ctrlNumReads[0] = 0.0
krasNumReads[0] = 0.0

# The remaining columns are part of the salmon output but are not used
# by DESeq. Every column gets its own distinct values so that, if one of
# them is picked up by mistake, it is easy to spot while debugging
Length = list( range( 1500, 1500 + 10 * numTranscripts, 10 ) )
EffectiveLength = [ 1234.5 + 10 * k for k in range( numTranscripts ) ]
Tpm = [ 12.1 + k for k in range( numTranscripts ) ]

['txId_1', 'txId_2', 'txId_3', 'txId_4', 'txId_5', 'txId_6', 'txId_7', 'txId_8', 'txId_9', 'txId_10']
['gene_1', 'gene_2', 'gene_3', 'gene_4', 'gene_5', 'gene_6', 'gene_7', 'gene_8']


In [3]:
def createDFs( sampleNamesArg, numReads ) :
    """
    Build one mock salmon quant.sf table per sample.

    arguments:
        sampleNamesArg: sequence of sample names. Only its length is used:
                        one data frame is created per entry
        numReads:       baseline NumReads value for each transcript

    returns:
        a list of pandas data frames, one per sample, with the columns
        Name, Length, EffectiveLength, TPM and NumReads

    The Name, Length, EffectiveLength and TPM columns are read from the
    notebook level lists transcriptIds, Length, EffectiveLength and Tpm
    and are identical for every sample.
    """
    def buildSampleDF( sampleIdx ) :
        # make the samples slightly different: sample k has k/10 added to
        # every count. Note the shift is also added to counts that are 0.0
        shift = sampleIdx / 10.0
        return pd.DataFrame( data={
            'Name'           : transcriptIds,
            'Length'         : Length,
            'EffectiveLength': EffectiveLength,
            'TPM'            : Tpm,
            'NumReads'       : [ count + shift for count in numReads ],
        } )

    return [ buildSampleDF( idx ) for idx in range( len( sampleNamesArg ) ) ]
    
# the first three sample names are the controls, the remaining ones are kras
ctrlDFs = createDFs( sampleNames[:3], ctrlNumReads )
krasDFs = createDFs( sampleNames[3:], krasNumReads )

In [4]:
# Zero a single transcript count (row 2, txId_3) in the third control
# sample so we can test filtering in estimatedScalingFactors().
# kdf is the same object as ctrlDFs[2], so the frame in the list is
# changed in place. The frame is printed before and after the change
kdf = ctrlDFs[2]
print(kdf)

kdf.at[2, "NumReads"] = 0.0
print("", kdf, sep="\n")

      Name  Length  EffectiveLength   TPM  NumReads
0   txId_1    1500           1234.5  12.1       0.2
1   txId_2    1510           1244.5  13.1      11.2
2   txId_3    1520           1254.5  14.1      12.2
3   txId_4    1530           1264.5  15.1      13.2
4   txId_5    1540           1274.5  16.1      14.2
5   txId_6    1550           1284.5  17.1      15.2
6   txId_7    1560           1294.5  18.1      16.2
7   txId_8    1570           1304.5  19.1      17.2
8   txId_9    1580           1314.5  20.1      18.2
9  txId_10    1590           1324.5  21.1      19.2

      Name  Length  EffectiveLength   TPM  NumReads
0   txId_1    1500           1234.5  12.1       0.2
1   txId_2    1510           1244.5  13.1      11.2
2   txId_3    1520           1254.5  14.1       0.0
3   txId_4    1530           1264.5  15.1      13.2
4   txId_5    1540           1274.5  16.1      14.2
5   txId_6    1550           1284.5  17.1      15.2
6   txId_7    1560           1294.5  18.1      16.2
7   txId_8 

In [5]:
def saveMockQuantFiles( dfList, sampleNamesArg ) :
    """
    Write each data frame to its own mock salmon quant file.

    arguments:
        dfList:         list of pandas data frames
        sampleNamesArg: sample names; sampleNamesArg[k] names the file
                        for dfList[k]

    The data frame for sample <name> is written to "<name>.quant.sf",
    a path relative to the current working directory. Files are tab
    separated, have a header row, and do not contain the index column.
    """
    for position, quantDF in enumerate( dfList ) :
        quantPath = sampleNamesArg[position] + ".quant.sf"
        quantDF.to_csv( quantPath, sep="\t", index=False )
        
# ctrlDFs followed by krasDFs lines up one-to-one with sampleNames
# (controls first), so a single pass writes every <sampleName>.quant.sf
saveMockQuantFiles( ctrlDFs + krasDFs, sampleNames )

# Create colData

In [6]:
# Build the colData table: one row of metadata per sample.
# Column lengths are derived from sampleNames instead of being hard coded,
# so the columns cannot get out of sync with the list of samples.
sampleType = ["bulk"] * len( sampleNames )

# the first three samples are the controls, everything after them is kras.
# This is the same split used by the sampleNames[0:3] / sampleNames[3:] slices
treatment = ["ctrl"] * 3 + ["kras"] * ( len( sampleNames ) - 3 )

# day is stored as a string, not a number
day = ["5"] * len( sampleNames )

colDataDF = pd.DataFrame( {
    "sampleName": sampleNames,
    "sampleType": sampleType,
    "treatment" : treatment,
    "day"       : day
})

# written relative to the current working directory, without the index column
colDataDF.to_csv("colData.csv", index=False)

# Create txId2GeneId.csv
Map several transcripts (isoforms) to the same gene so we can test the group-by gene id count aggregation.

In [7]:
# The first numGenes transcripts each get their own gene. The transcripts
# left over are all assigned to the last gene, which gives that gene
# several isoforms
geneIds = list( geneNames )
geneIds.extend( [ geneNames[numGenes - 1] ] * (numTranscripts - numGenes) )

txId2GeneIdDF = pd.DataFrame( {
    'txId'  : transcriptIds,
    'geneId': geneIds,
} )
txId2GeneIdDF

Unnamed: 0,txId,geneId
0,txId_1,gene_1
1,txId_2,gene_2
2,txId_3,gene_3
3,txId_4,gene_4
4,txId_5,gene_5
5,txId_6,gene_6
6,txId_7,gene_7
7,txId_8,gene_8
8,txId_9,gene_8
9,txId_10,gene_8


In [8]:
# save the transcript id to gene id mapping as a csv file in the current
# working directory. index=False leaves out the data frame's index column
txId2GeneIdDF.to_csv("txId2GeneId.csv", index=False)