# concatenate Columns

In [2]:
# SPARK_HOME="../../sparkBin/spark-3.1.2-bin-hadoop3.2"
SPARK_HOME="../../terra/deseq/spark-3.1.2-bin-hadoop3.2"
import findspark
findspark.init( SPARK_HOME )

In [3]:
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("DESeqMasterETL") \
    .getOrCreate()

## load our mock terra data model

In [6]:
sampleFile = "testData/sample.tsv"
samplePDF = pd.read_csv(sampleFile, sep="\t")

fileLst = samplePDF.loc[:, 'quant.sf'].tolist()

In [11]:
# pre allocate slots to store data frames in
quantSDFs = [None] * len(fileLst)
quantSchema = "`Name` STRING, `Length` INT, `EffectiveLength` DOUBLE, `Tmp` DOUBLE, `NumReads` DOUBLE "

for i in range( len(quantSDFs) ):
    quantFile = "testData/{}".format( fileLst[i] )
    print(quantFile)
    df = spark.read.load( quantFile, format="csv", sep="\t", 
                             schema=quantSchema, header="true")
    quantSDFs[i] = df

testData/ctrl.1.quant.sf
testData/ctrl.2.quant.sf
testData/ctrl.3.quant.sf
testData/kras.1.quant.sf
testData/kras.2.quant.sf
testData/kras.3.quant.sf


In [12]:
quantSDFs[0].show()

+--------------------+------+---------------+---------+--------+
|                Name|Length|EffectiveLength|      Tmp|NumReads|
+--------------------+------+---------------+---------+--------+
|ENST00000456328.2...|  1657|         1530.0|      0.0|     0.0|
|ENST00000450305.2...|   632|          505.0|      0.0|     0.0|
|ENST00000488147.1...|  1351|         1224.0|      0.0|     0.0|
|ENST00000619216.1...|    68|           15.0|      0.0|     0.0|
|ENST00000473358.1...|   712|          585.0|      0.0|     0.0|
|ENST00000469289.1...|   535|          408.0|      0.0|     0.0|
|ENST00000607096.1...|   138|           55.0|      0.0|     0.0|
|ENST00000417324.1...|  1187|         1060.0|      0.0|     0.0|
|ENST00000461467.1...|   590|         885.84| 5.813169|    4.93|
|ENST00000606857.1...|   840|          713.0|      0.0|     0.0|
|ENST00000642116.1...|  1414|         1287.0|      0.0|     0.0|
|ENST00000492842.2...|   939|          812.0|      0.0|     0.0|
|ENST00000641515.2...|  2

# Test 1 can we use dataframe withColumn to add a column to another dataframe?
# <span style="color:red">NO </span>

In [52]:
df1 = quantSDFs[0].select("Name", "NumReads").limit(5)
df1.show()

+--------------------+--------+
|                Name|NumReads|
+--------------------+--------+
|ENST00000456328.2...|     0.0|
|ENST00000450305.2...|     0.0|
|ENST00000488147.1...|     0.0|
|ENST00000619216.1...|     0.0|
|ENST00000473358.1...|     0.0|
+--------------------+--------+



In [53]:
df2 = quantSDFs[-2].select("NumReads").limit(5)
df2.show()

+--------+
|NumReads|
+--------+
|     4.0|
|     0.0|
|     0.0|
|     0.0|
|     0.0|
+--------+



In [54]:
# df3 = df1.withColumn( df2.col("NumReads"))
# df3 = df1.withColumn( df2["NumReads"])
df3 = df1.withColumn( df2.NumReads )
df3.show()

TypeError: withColumn() missing 1 required positional argument: 'col'

In [37]:
df2.NumReads
print(type(df2.NumReads))

<class 'pyspark.sql.column.Column'>


# test 2 can we use pivot tables and union?
<span style="color:red">???</span>
- http://www.svds.com/pivoting-data-in-sparksql/
- https://databricks.com/blog/2016/02/09/reshaping-data-with-pivot-in-apache-spark.html
- https://spark.apache.org/docs/1.6.0/api/java/org/apache/spark/sql/GroupedData.html

try
- https://stackoverflow.com/questions/40892459/spark-transpose-dataframe-without-aggregating

take a look at coordinate matrix and block matrix
- https://spark.apache.org/docs/latest/api/java/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.html
- https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.mllib.linalg.distributed.BlockMatrix.html

In [55]:
df1 = quantSDFs[0].select("Name", "Tmp", "NumReads").limit(5)
df1.show()

+--------------------+---+--------+
|                Name|Tmp|NumReads|
+--------------------+---+--------+
|ENST00000456328.2...|0.0|     0.0|
|ENST00000450305.2...|0.0|     0.0|
|ENST00000488147.1...|0.0|     0.0|
|ENST00000619216.1...|0.0|     0.0|
|ENST00000473358.1...|0.0|     0.0|
+--------------------+---+--------+



In [56]:
df2 = quantSDFs[2].select("Name", "Tmp", "NumReads").limit(5)
df2.show()

+--------------------+--------+--------+
|                Name|     Tmp|NumReads|
+--------------------+--------+--------+
|ENST00000456328.2...|0.727248|   1.998|
|ENST00000450305.2...|     0.0|     0.0|
|ENST00000488147.1...|     0.0|     0.0|
|ENST00000619216.1...|     0.0|     0.0|
|ENST00000473358.1...|     0.0|     0.0|
+--------------------+--------+--------+



In [61]:
# df1p = df1.pivot("NumReads")
df2p = df2.groupBy("Name").pivot("NumReads").avg("NumReads")
df2p.show()

+--------------------+----+-----+
|                Name| 0.0|1.998|
+--------------------+----+-----+
|ENST00000456328.2...|null|1.998|
|ENST00000450305.2...| 0.0| null|
|ENST00000488147.1...| 0.0| null|
|ENST00000619216.1...| 0.0| null|
|ENST00000473358.1...| 0.0| null|
+--------------------+----+-----+

