# Spark Block Matrix Transform experiment

`terra/deseq/bin/createSalmonNumReadsTransposeMatrix.sh` creates a matrix with the following form:

```
Name,ENST00000456328.2|ENSG00000223972.5|OTTHUMG00000000961.2|OTTHUMT00000362751.1|DDX11L1-202|DDX11L1|1657|processed_transcript|,ENST00000450305.2|ENSG00000223972.5|OTTHUMG00000000961.2|OTTHUMT00000002844.2|DDX11L1-201|DDX11L1|632|transcribed_unprocessed_pseudogene|
GTEX-111YS-2226-SM-5987P,0.0,0.0
GTEX-1122O-0005-SM-5O99J,21.523,0.0
GTEX-1H1ZS-0526-SM-9WG5L,8.922,0.0
GTEX-1122O-0326-SM-5H124,0.0,0.0
```

It is not clear whether pandas will be able to transform this file. It has been running for a couple of days.
```
$ ll /scratch/aedavids/sortedGTExTrainSalmonNumReadsTransposeMatrix.csv 
-rw-r--r-- 1 aedavids prismuser 212G Jan 11 20:36 /scratch/aedavids/sortedGTExTrainSalmonNumReadsTransposeMatrix.csv
```

Can we use Spark?

- https://spark.apache.org/docs/latest/mllib-data-types.html#blockmatrix
- https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.mllib.linalg.distributed.BlockMatrix.html

In [1]:
# Relative path to the Spark 3.1.2 distribution used by this notebook.
SPARK_HOME="../deseq/spark-3.1.2-bin-hadoop3.2"
import findspark
# Must run before the pyspark imports in the next cell.
# NOTE(review): presumably this is what makes pyspark importable — confirm.
findspark.init( SPARK_HOME )

In [2]:
from pyspark.sql import SparkSession
from pyspark.mllib.linalg.distributed import BlockMatrix

In [3]:
# Get the existing SparkSession or create one; the application is named "transpose".
spark = (
    SparkSession.builder
    .appName("transpose")
    .getOrCreate()
)

In [4]:
# Small test matrix used to try the approach before touching the full-size file.
testMatrix = "../deseq/python/test/data/pythonTestSalmonNumReadsTransposeMatrix.csv"
# Read without header=True: the header line comes back as the first data row
# and the columns get the default names _c0, _c1, _c2 (see the output below).
df = spark.read.csv( testMatrix)
print( df.columns )
print()
df.show()

['_c0', '_c1', '_c2']

+--------------------+--------------------+--------------------+
|                 _c0|                 _c1|                 _c2|
+--------------------+--------------------+--------------------+
|                Name|ENST00000456328.2...|ENST00000450305.2...|
|GTEX-111YS-2226-S...|                 0.0|                 0.0|
|GTEX-1122O-0005-S...|              21.523|                 0.0|
|GTEX-1H1ZS-0526-S...|               8.922|                 0.0|
|GTEX-1122O-0326-S...|                 0.0|                 0.0|
+--------------------+--------------------+--------------------+



In [5]:
# Re-read with header=True so the first line of the CSV supplies the column names.
# The transcript column names are very long, so the show() table is hard to read.
# TODO: add an explicit schema; without one the values are read as strings.
df = spark.read.csv( testMatrix, header=True)
df.show()

+--------------------+----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
|                Name|ENST00000456328.2|ENSG00000223972.5|OTTHUMG00000000961.2|OTTHUMT00000362751.1|DDX11L1-202|DDX11L1|1657|processed_transcript||ENST00000450305.2|ENSG00000223972.5|OTTHUMG00000000961.2|OTTHUMT00000002844.2|DDX11L1-201|DDX11L1|632|transcribed_unprocessed_pseudogene||
+--------------------+----------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
|GTEX-111YS-2226-S...|                                                                                                                        

In [9]:
# https://stackoverflow.com/a/53149013/4586180
def fixColNames(df):
    '''
    Return df with its column headers normalized.

    Each header is lower-cased and then rewritten in a single pass:
    ' ' becomes '_', '.' becomes 'xxx', and the characters ,;{}()= are
    deleted. All other characters (for example '|' and '-') are kept.
    '''
    removedChars = ',;{}()='
    # One translation table covers both the replacements and the deletions;
    # a value of None means "delete this character".
    charTable = str.maketrans({' ': '_', '.': 'xxx', **dict.fromkeys(removedChars)})
    renamed = [header.lower().translate(charTable) for header in df.columns]
    return df.toDF(*renamed)

# Normalize the column names, then print the full list of new names.
fixedDF = fixColNames( df )
print(" ")
print(fixedDF.columns)
print()

# Show every column except the first one ('name').
fixedDF.select( fixedDF.columns[1:]).show()

 
['name', 'enst00000456328xxx2|ensg00000223972xxx5|otthumg00000000961xxx2|otthumt00000362751xxx1|ddx11l1-202|ddx11l1|1657|processed_transcript|', 'enst00000450305xxx2|ensg00000223972xxx5|otthumg00000000961xxx2|otthumt00000002844xxx2|ddx11l1-201|ddx11l1|632|transcribed_unprocessed_pseudogene|']

+------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+
|enst00000456328xxx2|ensg00000223972xxx5|otthumg00000000961xxx2|otthumt00000362751xxx1|ddx11l1-202|ddx11l1|1657|processed_transcript||enst00000450305xxx2|ensg00000223972xxx5|otthumg00000000961xxx2|otthumt00000002844xxx2|ddx11l1-201|ddx11l1|632|transcribed_unprocessed_pseudogene||
+------------------------------------------------------------------------------------------------------------------------------------+-------

In [11]:
# Drop the first column ('name') and keep only the transcript count columns.
realDF = fixedDF.select( fixedDF.columns[1:])

In [15]:
# Print each row of the test matrix, preceded by a blank line.
# collect() brings every row back to the driver, so this is only suitable
# for the small test file, not for the full-size matrix.
for row in realDF.rdd.collect() :
    print()
    print(row)


Row(enst00000456328xxx2|ensg00000223972xxx5|otthumg00000000961xxx2|otthumt00000362751xxx1|ddx11l1-202|ddx11l1|1657|processed_transcript|='0.0', enst00000450305xxx2|ensg00000223972xxx5|otthumg00000000961xxx2|otthumt00000002844xxx2|ddx11l1-201|ddx11l1|632|transcribed_unprocessed_pseudogene|='0.0')

Row(enst00000456328xxx2|ensg00000223972xxx5|otthumg00000000961xxx2|otthumt00000362751xxx1|ddx11l1-202|ddx11l1|1657|processed_transcript|='21.523', enst00000450305xxx2|ensg00000223972xxx5|otthumg00000000961xxx2|otthumt00000002844xxx2|ddx11l1-201|ddx11l1|632|transcribed_unprocessed_pseudogene|='0.0')

Row(enst00000456328xxx2|ensg00000223972xxx5|otthumg00000000961xxx2|otthumt00000362751xxx1|ddx11l1-202|ddx11l1|1657|processed_transcript|='8.922', enst00000450305xxx2|ensg00000223972xxx5|otthumg00000000961xxx2|otthumt00000002844xxx2|ddx11l1-201|ddx11l1|632|transcribed_unprocessed_pseudogene|='0.0')

Row(enst00000456328xxx2|ensg00000223972xxx5|otthumg00000000961xxx2|otthumt00000362751xxx1|ddx11l1-20

In [10]:
# Attempt to build a BlockMatrix directly from the DataFrame's RDD, using 1x1 blocks.
# NOTE: this fails with the TypeError shown in the traceback below. The RDD holds
# pyspark.sql.types.Row objects, which cannot be converted into the
# "sub-matrix block tuple" that BlockMatrix requires.
# NOTE(review): one possible route, untested here, is to build an IndexedRowMatrix
# from the rows and call toBlockMatrix() — confirm against the PySpark docs.
rowsPerBlock = 1
colsPerBlock = 1
bmat1 = BlockMatrix(realDF.rdd, rowsPerBlock, colsPerBlock )
bmat1

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 7.0 failed 1 times, most recent failure: Lost task 0.0 in stage 7.0 (TID 7) (192.168.68.110 executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Users/andrewdavidson/googleUCSC/kimLab/extraCellularRNA/terra/deseq/spark-3.1.2-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/Users/andrewdavidson/googleUCSC/kimLab/extraCellularRNA/terra/deseq/spark-3.1.2-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/Users/andrewdavidson/googleUCSC/kimLab/extraCellularRNA/terra/deseq/spark-3.1.2-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/serializers.py", line 259, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "../deseq/spark-3.1.2-bin-hadoop3.2/python/pyspark/rdd.py", line 1560, in takeUpToNumLeft
    yield next(iterator)
  File "/Users/andrewdavidson/googleUCSC/kimLab/extraCellularRNA/terra/deseq/spark-3.1.2-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "/Users/andrewdavidson/googleUCSC/kimLab/extraCellularRNA/terra/deseq/spark-3.1.2-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/mllib/linalg/distributed.py", line 1145, in _convert_to_matrix_block_tuple
    raise TypeError("Cannot convert type %s into a sub-matrix block tuple" % type(block))
TypeError: Cannot convert type <class 'pyspark.sql.types.Row'> into a sub-matrix block tuple

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:517)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:652)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:635)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:470)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:166)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2236)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:834)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2258)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2207)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2206)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1079)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2445)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2387)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2376)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Users/andrewdavidson/googleUCSC/kimLab/extraCellularRNA/terra/deseq/spark-3.1.2-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/Users/andrewdavidson/googleUCSC/kimLab/extraCellularRNA/terra/deseq/spark-3.1.2-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/Users/andrewdavidson/googleUCSC/kimLab/extraCellularRNA/terra/deseq/spark-3.1.2-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/serializers.py", line 259, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "../deseq/spark-3.1.2-bin-hadoop3.2/python/pyspark/rdd.py", line 1560, in takeUpToNumLeft
    yield next(iterator)
  File "/Users/andrewdavidson/googleUCSC/kimLab/extraCellularRNA/terra/deseq/spark-3.1.2-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "/Users/andrewdavidson/googleUCSC/kimLab/extraCellularRNA/terra/deseq/spark-3.1.2-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/mllib/linalg/distributed.py", line 1145, in _convert_to_matrix_block_tuple
    raise TypeError("Cannot convert type %s into a sub-matrix block tuple" % type(block))
TypeError: Cannot convert type <class 'pyspark.sql.types.Row'> into a sub-matrix block tuple

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:517)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:652)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:635)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:470)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:166)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2236)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
