In [60]:
from pyspark import SparkContext
from pyspark import SparkConf
import numpy as np
from numpy.linalg import svd,qr

spark = SparkSession.builder \
    .appName("Cluster-Spark-Job") \
    .master("spark://spark-master:7077") \
    .getOrCreate()
num_partitions = 2  # Eine Partition pro Worker

In [14]:
spark.stop()

In [44]:
A = np.random.rand(1000, 500)

In [52]:
def svd(arr):
    """
Function to calculate the SVD of a given nxm Matrix

@param Arr: Array that will be transformed
@param param2: this is a second param
@return: this is a description of what is returned
@raise keyError: raises an exception
"""
    # Calculate V and singular values  
    res = arr.T @ arr  
    eigval_V, V = np.linalg.eigh(res) #eigh zur numerischen Stabilität
    singval = np.sqrt(np.abs(eigval_V)) 
              
    # Berechnung von U
    U = np.zeros((arr.shape[0], min(arr.shape)))
    
    # Berechne U-Spalten direkt aus der Definition
    for i in range(len(singval)):
        if singval[i] > 1e-10:  # Numerische Stabilität
            U[:, i] = (arr @ V[:, i]) / singval[i]
      
    # Create the diagonal matrix for sigma  
    sigma = np.zeros((arr.shape[0], arr.shape[1]))  
    sigma[:len(singval), :len(singval)] = np.diag(singval)  
      
    return U, sigma, V  

In [4]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
import numpy as np
from numpy.linalg import svd,qr

def parallel_randomized_svd(A, n_iter=5):
    """
    Berechnet die randomisierte SVD einer Matrix A, parallelisiert mit Spark.
    
    Parameters:
    A : numpy.ndarray
        Die Eingabematrix mit Dimension m x n.
    k : int
        Die reduzierte Anzahl der Singulärwerte (Dimension des Unterraums).
    n_iter : int
        Anzahl der Iterationen zur Verbesserung der Projektion (optional).
    
    Returns:
    U : numpy.ndarray
        Die linke Singulärvektormatrix.
    Sigma : numpy.ndarray
        Die Diagonalmatrix der singulären Werte.
    Vt : numpy.ndarray
        Die transponierte rechte Singulärvektormatrix.
    """
    m, n = A.shape

    k = 3
    # Schritt 1: Zufällige Projektion
    Omega = np.random.randn(n, k)
    
    # Schritt 2: Parallele Matrixmultiplikation A @ Omega
    A_rdd = spark.sparkContext.parallelize(A, num_partitions)
    Y_rdd = A_rdd.map(lambda row: np.dot(row, Omega))
    
    # Schritt 3: QR-Zerlegung parallelisieren
    Y = np.array(Y_rdd.collect())
    Q, _ = qr(Y, mode='reduced')
    
    # Q als RDD halten für weitere parallele Berechnungen
    Q_rdd = spark.sparkContext.parallelize(Q, num_partitions)
    
    # Schritt 4: Parallele Berechnung von B = Q.T @ A
    A_rdd_transposed = spark.sparkContext.parallelize(A.T, num_partitions)
    B_rdd = A_rdd_transposed.map(lambda col: np.dot(Q.T, col))
    
    # B als numpy Matrix sammeln
    B = np.array(B_rdd.collect()).T
    
    # Schritt 5: SVD der kleineren Matrix B
    U_hat, Sigma, Vt = svd(B, full_matrices=False)
    
    # Schritt 6: Parallele Rekonstruktion von U = Q @ U_hat
    U_rdd = Q_rdd.map(lambda q_row: np.dot(q_row, U_hat))
    
    # U als numpy Array sammeln
    U = np.array(U_rdd.collect())
    
    return U, Sigma, Vt

def parallelinvers(U,sigma,V):
    return V.T @ np.diag(1/(sigma)) @ U.T

def betacalc(A):
    X = A[:,:-1]
    X = np.hstack([np.ones((X.shape[0], 1)), X])  #Einser Spalte für Konstante
    Y = A[:,-1]
    U,Sigma,Vt = parallel_randomized_svd(X.T @ X)
    parinv = parallelinvers(U,Sigma,Vt)
    return parinv @ X.T @ Y

In [None]:
# Beispielmatrix
from pyspark import SparkContext
from pyspark.sql import SparkSession
import numpy as np
from numpy.linalg import svd,qr
spark = SparkSession.builder \
    .appName("Cluster-Spark-Job") \
    .master("spark://spark-master:7077") \
    .getOrCreate()
num_partitions = 4  # Eine Partition pro Worker
A = np.random.rand(300000, 10000)
res = betacalc(A)
spark.stop()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/12 10:51:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [1]:
def parallel_randomized_svd(A, k, n_iter=5):
    m, n = A.shape
    # Adjust k to be smaller than n
    k = min(k, n)

    # Broadcast Omega
    Omega = np.random.randn(n, k)
    Omega_broadcast = spark.sparkContext.broadcast(Omega)

    # Step 2: Compute Y = A @ Omega in parallel
    A_rdd = spark.sparkContext.parallelize(A, num_partitions)
    Y_rdd = A_rdd.map(lambda row: np.dot(row, Omega_broadcast.value))
    
    # Collect Y_rdd (only feasible if k is small)
    Y = np.array(Y_rdd.collect())
    
    # Perform QR decomposition on Y
    Q, _ = np.linalg.qr(Y, mode='reduced')
    
    # Broadcast Q
    Q_broadcast = spark.sparkContext.broadcast(Q)

    # Step 4: Compute B = Q.T @ A in parallel
    A_rdd_T = spark.sparkContext.parallelize(A.T, num_partitions)
    B_rdd = A_rdd_T.map(lambda col: np.dot(Q_broadcast.value.T, col))
    B = np.array(B_rdd.collect()).T

    # Step 5: Compute SVD of B
    U_hat, Sigma, Vt = svd(B, full_matrices=False)

    # Step 6: Compute U = Q @ U_hat
    U = np.dot(Q, U_hat)

    return U, Sigma, Vt


In [None]:
import numpy as np
A = np.random.rand(300000, 10000)

from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Summarizer
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import DenseMatrix

def compute_svd_with_mllib(A_rdd, k):
    # Convert RDD to DataFrame
    df = A_rdd.map(lambda row: (Vectors.dense(row),)).toDF(["features"])

    # Perform PCA (which uses SVD internally)
    pca = PCA(k=k, inputCol="features", outputCol="pca_features")
    pca_model = pca.fit(df)

    # Extract components (V), singular values (s), and compute U if needed
    V = pca_model.pc.toArray()
    s = pca_model.explainedVariance.toArray()
    # Note: U is not directly available

    return V, s
compute_svd_with_mllib

In [93]:
# Convert the singular values (Sigma) to a diagonal matrix
Sigma_diag = np.diag(Sigma)

# Reconstruct the matrix A from U, Sigma, and Vt
A_reconstructed = np.dot(np.dot(U, Sigma_diag), Vt)

# Check if the reconstructed matrix is close to the original
if np.allclose(A, A_reconstructed):
    print("The SVD reconstruction was successful!")
else:
    print("The SVD reconstruction is not accurate.")


The SVD reconstruction was successful!


In [5]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("Cluster-Spark-Job") \
    .master("spark://spark-master:7077") \
    .getOrCreate()
spark.read.parquet("people.parquet")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/12 11:03:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/opt/bitnami/spark/notebooks/people.parquet.

In [40]:
A

array([[0.91530289, 0.95176931, 0.9888379 , ..., 0.43769061, 0.44734605,
        0.03962414],
       [0.69368828, 0.29421052, 0.997087  , ..., 0.11518715, 0.00172986,
        0.94186572],
       [0.07247462, 0.39327233, 0.5861451 , ..., 0.51036406, 0.10745809,
        0.23442968],
       ...,
       [0.76105317, 0.97112879, 0.2676072 , ..., 0.32741688, 0.19668947,
        0.8570138 ],
       [0.24067845, 0.10014896, 0.55088377, ..., 0.33856894, 0.39691969,
        0.94438011],
       [0.97386001, 0.67953161, 0.37735366, ..., 0.44469186, 0.27913325,
        0.67257998]])

In [12]:
import numpy as np
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Randomized-SVD-Spark-NumPy") \
    .master("spark://spark-master:7077") \
    .getOrCreate()

num_partitions = 2  # Adjust based on your cluster setup

def parallel_randomized_svd(A, k, n_iter=2):
    """
    Computes the randomized SVD of matrix A, parallelized with Spark and using NumPy.
    
    Parameters:
    A : numpy.ndarray
        Input matrix with dimensions m x n.
    k : int
        The number of singular values to compute (dimension of the subspace).
    n_iter : int
        Number of iterations for improving the projection (optional).
    
    Returns:
    U : numpy.ndarray
        Left singular vector matrix.
    s : numpy.ndarray
        Singular values.
    Vt : numpy.ndarray
        Transposed right singular vector matrix.
    """
    m, n = A.shape
    rank = min(m, n)
    k = min(k, rank)  # Ensure k doesn't exceed the matrix rank
    
    # Step 1: Random projection
    Omega = np.random.randn(n, k)
    
    # Step 2: Compute Y = A @ Omega
    A_rdd = spark.sparkContext.parallelize(A, num_partitions)
    Y_rdd = A_rdd.map(lambda row: np.dot(row, Omega))
    Y = np.array(Y_rdd.collect())
    
    # Step 3: Compute QR decomposition of Y
    Q, _ = np.linalg.qr(Y, mode='reduced')
    
    # Step 4: Power iteration to improve accuracy
    for _ in range(n_iter):
        Z_rdd = spark.sparkContext.parallelize(A.T, num_partitions)
        Z_rdd = Z_rdd.map(lambda col: np.dot(Q.T, np.dot(A, col)))
        Z = np.array(Z_rdd.collect()).T
        Q, _ = np.linalg.qr(Z, mode='reduced')
    
    # Step 5: Compute B = Q.T @ A
    B_rdd = spark.sparkContext.parallelize(A.T, num_partitions)
    B_rdd = B_rdd.map(lambda col: np.dot(Q.T, col))
    B = np.array(B_rdd.collect()).T
    
    # Step 6: Compute SVD of B
    Uhat, s, Vt = np.linalg.svd(B, full_matrices=False)
    
    # Step 7: Compute U = Q @ Uhat
    U = np.dot(Q, Uhat)
    
    # Step 8: If k equals the full rank, pad U and Vt to full size
    if k == rank:
        if m > n:  # Tall matrix
            U_full = np.zeros((m, m))
            U_full[:, :n] = U
            U_full[:, n:] = np.eye(m)[:, n:]
            U = U_full
        elif n > m:  # Wide matrix
            Vt_full = np.zeros((n, n))
            Vt_full[:m, :] = Vt
            Vt_full[m:, :] = np.eye(n)[m:, :]
            Vt = Vt_full
    
    return U, s, Vt

# Example usage
m, n = 1000, 500
A = np.random.rand(m, n)
k = 1000  # Number of singular values to compute (set to full rank)

U, s, Vt = parallel_randomized_svd(A, k)
print(f"U shape: {U.shape}, s shape: {s.shape}, Vt shape: {Vt.shape}")

spark.stop()

24/10/12 09:09:34 WARN TaskSetManager: Stage 11 contains a task of very large size (1974 KiB). The maximum recommended task size is 1000 KiB.
24/10/12 09:09:34 WARN TaskSetManager: Stage 12 contains a task of very large size (1970 KiB). The maximum recommended task size is 1000 KiB.
24/10/12 09:09:35 WARN TaskSetManager: Lost task 1.0 in stage 12.0 (TID 37) (172.19.0.2 executor 1): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 274, in dump_stream
    vs = list(itertools.islice(iterator, batch))
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/util.py", line 83, in wrapper
    retu

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 12.0 failed 4 times, most recent failure: Lost task 0.3 in stage 12.0 (TID 42) (172.19.0.3 executor 0): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 274, in dump_stream
    vs = list(itertools.islice(iterator, batch))
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/util.py", line 83, in wrapper
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_1637/1256250453.py", line 49, in <lambda>
ValueError: shapes (1000,500) and (1000,) not aligned: 500 (dim 1) != 1000 (dim 0)

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:784)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1049)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:840)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2458)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1049)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1048)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:195)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 274, in dump_stream
    vs = list(itertools.islice(iterator, batch))
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/util.py", line 83, in wrapper
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_1637/1256250453.py", line 49, in <lambda>
ValueError: shapes (1000,500) and (1000,) not aligned: 500 (dim 1) != 1000 (dim 0)

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:784)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1049)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	... 1 more


In [52]:
print(U @ Sigma @ Vt.T)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 500 is different from 1000)