# Create TestDESeq2 master data sets

Create a master count matrix that we can use to test the DESeq scripts in /private/home/aedavids/extraCellularRNA/terra/deseq/R/.

ref: [sql-programming-guide.html](https://spark.apache.org/docs/latest/sql-programming-guide.html)

In [1]:
import findspark

# Spark distribution to use, given relative to the notebook's working directory.
SPARK_HOME = "../../sparkBin/spark-3.1.2-bin-hadoop3.2"
# Point findspark at that install before any pyspark import is attempted.
findspark.init(SPARK_HOME)

In [2]:
import pandas as pd
import pathlib as Path

In [3]:
from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise start one for this ETL job.
# Further settings can be added with extra .config("<key>", "<value>") calls.
spark = (
    SparkSession.builder
    .appName("DESeqMasterETL")
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

In [4]:
from pyspark.sql import functions as sqlFunc

## load our mock terra data model

In [5]:
# quant.sf files for the ctrl.* and kras.* sample directories, all of which
# share the same parent directory and the same per-sample sub-path.
_bulkDataDir = "/private/groups/kimlab/kras.ipsc/data/bulk.data/day.5"
_sampleDirs = ("ctrl.1", "ctrl.2", "ctrl.3", "kras.1", "kras.2", "kras.3")
fileList = [
    f"{_bulkDataDir}/{sampleDir}/gencode.salmon.out/quant.sf"
    for sampleDir in _sampleDirs
]

In [6]:
# Reserve one empty slot per quant.sf path; each slot will hold a Spark data frame.
quantSDFs = [None for _ in fileList]

In [7]:
# Explicit DDL schema for the quant.sf columns, so the types are declared up front.
quantSchema = "`Name` STRING, `Length` INT, `EffectiveLength` DOUBLE, `TPM` DOUBLE, `NumReads` DOUBLE "

# Read each tab-separated quant.sf file (first line is a header) into its slot.
for i, quantFile in enumerate(fileList):
    print(quantFile)
    quantSDFs[i] = spark.read.load(
        quantFile,
        format="csv",
        sep="\t",
        schema=quantSchema,
        header="true",
    )

/private/groups/kimlab/kras.ipsc/data/bulk.data/day.5/ctrl.1/gencode.salmon.out/quant.sf
/private/groups/kimlab/kras.ipsc/data/bulk.data/day.5/ctrl.2/gencode.salmon.out/quant.sf
/private/groups/kimlab/kras.ipsc/data/bulk.data/day.5/ctrl.3/gencode.salmon.out/quant.sf
/private/groups/kimlab/kras.ipsc/data/bulk.data/day.5/kras.1/gencode.salmon.out/quant.sf
/private/groups/kimlab/kras.ipsc/data/bulk.data/day.5/kras.2/gencode.salmon.out/quant.sf
/private/groups/kimlab/kras.ipsc/data/bulk.data/day.5/kras.3/gencode.salmon.out/quant.sf


In [8]:
# Sanity check: print the column names and types of the first sample's data frame.
quantSDFs[0].printSchema()

root
 |-- Name: string (nullable = true)
 |-- Length: integer (nullable = true)
 |-- EffectiveLength: double (nullable = true)
 |-- TPM: double (nullable = true)
 |-- NumReads: double (nullable = true)



## explore quant.sf file format

In [9]:
# Peek at the first rows of the first sample (show() prints 20 rows by default
# and truncates long strings, hence the "..." in the Name column).
quantSDFs[0].show()

+--------------------+------+---------------+---------+--------+
|                Name|Length|EffectiveLength|      TPM|NumReads|
+--------------------+------+---------------+---------+--------+
|ENST00000456328.2...|  1657|         1474.0|      0.0|     0.0|
|ENST00000450305.2...|   632|          449.0|      0.0|     0.0|
|ENST00000488147.1...|  1351|        875.269|23.712919| 383.794|
|ENST00000619216.1...|    68|            5.0|      0.0|     0.0|
|ENST00000473358.1...|   712|          529.0|      0.0|     0.0|
|ENST00000469289.1...|   535|          352.0|      0.0|     0.0|
|ENST00000607096.1...|   138|           17.0|      0.0|     0.0|
|ENST00000417324.1...|  1187|         1004.0|      0.0|     0.0|
|ENST00000461467.1...|   590|        408.837| 0.248295|   1.877|
|ENST00000606857.1...|   840|          657.0|      0.0|     0.0|
|ENST00000642116.1...|  1414|         1231.0|      0.0|     0.0|
|ENST00000492842.2...|   939|          756.0|      0.0|     0.0|
|ENST00000641515.2...|  2

# create master count spark data frame

In [10]:
# Derive one sample name per quant.sf path.
# The sample directory is the grandparent of quant.sf
# (<sample>/gencode.salmon.out/quant.sf), so it is located relative to the file
# rather than by a fixed index into the absolute path, which would pick the wrong
# component if the data lived at a different directory depth.
# Dots are replaced with underscores, presumably so the names can be used as
# plain Spark SQL column identifiers.
# `Path` is the pathlib module (imported under that alias).
sampleNamesLst = [
    Path.PurePosixPath(quantFile).parents[1].name.replace(".", "_")
    for quantFile in fileList
]

sampleNamesLst

['ctrl_1', 'ctrl_2', 'ctrl_3', 'kras_1', 'kras_2', 'kras_3']

In [11]:
# Seed the master count Spark data frame with the first sample: keep the "Name"
# key column and expose that sample's NumReads under the sample's own name.
firstSampleName = sampleNamesLst[0]
firstCountsSDF = quantSDFs[0].select(["Name", "NumReads"])
masterCountSDF = firstCountsSDF.withColumnRenamed("NumReads", firstSampleName)

# Make the data frame queryable from Spark SQL under the view name "masterCount".
masterCountSDF.createOrReplaceTempView("masterCount")

In [12]:
# Fold the remaining samples into the master count table, one inner join per
# sample. The first sample is skipped because it is already in "masterCount".
for sampleName, quantSDF in zip(sampleNamesLst[1:], quantSDFs[1:]):
    print(sampleName)

    # Keep only the join key and this sample's counts, renamed to the sample name.
    sampleSDF = quantSDF.select(["Name", "NumReads"]).withColumnRenamed("NumReads", sampleName)
    sampleSDF.createOrReplaceTempView("sample")

    # Back-quote the column so that sample names which are not bare SQL
    # identifiers (e.g. containing '-', '.' or spaces) still parse as a single
    # column reference; a literal back-quote in the name is escaped by doubling.
    quotedName = "`{}`".format(sampleName.replace("`", "``"))
    sqlStmt = (
        "SELECT mc.*, sample.{col}\n"
        "FROM masterCount AS mc\n"
        "INNER JOIN sample ON mc.Name = sample.Name\n"
    ).format(col=quotedName)

    # The result replaces both the variable and the view, so the next
    # iteration joins onto the table built so far.
    masterCountSDF = spark.sql(sqlStmt)
    masterCountSDF.createOrReplaceTempView("masterCount")

ctrl_2
ctrl_3
kras_1
kras_2
kras_3


# invoke an action
Calling an action such as `show()` causes the optimized query plan to run; the transformations defined so far are evaluated lazily.

In [13]:
#masterCountSDF.show(n=3)

# Save master count

In [14]:
# `Path` is the pathlib module (imported under that alias), hence Path.Path.
rootDir = Path.Path('../../terra/deseq/R')
# exist_ok=True: no error if the directory already exists. `parents` keeps its
# default (False), so missing intermediate directories are not created.
rootDir.mkdir(exist_ok=True)

## <span style="color:red">In practice this is bad: we coalesce twice, which results in a lot of needless network overhead.</span>

In [15]:
# Number of partitions of the joined result before anything is written.
masterCountSDF.rdd.getNumPartitions()

200

In [16]:
# Write the whole table through one partition so the output is a single part file.
# Tab-separated, with a header row; any earlier output at this path is replaced.
masterOutFile = rootDir / "masterCount.tsv"
(
    masterCountSDF
    .coalesce(1)
    .write
    .csv(masterOutFile.as_posix(), mode='overwrite', sep='\t', header=True)
)

In [17]:
# Re-check the partition count after the write: coalesce() returns a new
# DataFrame, so masterCountSDF itself keeps its original partitioning.
masterCountSDF.rdd.getNumPartitions()

200

In [18]:
# Write the same table again, this time coalesced to numFiles partitions so the
# output is split across that many part files.
numFiles = 3
masterPartsOutFile = rootDir / "masterCountParts.tsv"

partsSDF = masterCountSDF.coalesce(numFiles)
partsSDF.write.csv(masterPartsOutFile.as_posix(), mode='overwrite', sep='\t', header=True)

# Preview the first rows of the joined count table.
masterCountSDF.show()

+--------------------+--------+--------+--------+--------+--------+--------+
|                Name|  ctrl_1|  ctrl_2|  ctrl_3|  kras_1|  kras_2|  kras_3|
+--------------------+--------+--------+--------+--------+--------+--------+
|ENST00000209540.2...|     0.0|     0.0|     0.0|     0.0|     0.0|     0.0|
|ENST00000221899.7...|  51.224|   26.64|  31.421|     0.0|  24.985|     0.0|
|ENST00000222254.1...| 1813.79|2117.258|1947.022|2022.171|1605.718|1766.385|
|ENST00000222256.9...| 137.771| 132.202|   112.0|   133.0|  90.452|   90.85|
|ENST00000244296.6...|     0.0|     0.0|     0.0|     0.0|     0.0|     0.0|
|ENST00000247668.7...|1092.855|1213.941|1075.104|1173.128| 901.451| 952.704|
|ENST00000249071.1...|  83.865|  88.761|   77.46|   65.45|  59.054|  56.478|
|ENST00000249389.2...|     0.0|     2.0|     1.0|     0.0|     1.0|     0.0|
|ENST00000251074.5...| 726.215|  971.33|1008.867| 774.779| 683.967| 717.747|
|ENST00000253251.1...|1578.777|2016.713|1901.576|2159.728| 1645.72| 1631.07|

In [19]:
# Partition count of masterCountSDF after the multi-file write; coalesce() acts
# on a derived DataFrame, not on masterCountSDF itself.
masterCountSDF.rdd.getNumPartitions()

200