In [None]:
!pip install -r ./requirements.txt

In [None]:
import numpy as np
import scipy as sc
import scipy.sparse as sp
import random
from collections import defaultdict
from pyspark.sql import SparkSession
from pyspark.mllib.linalg.distributed import MatrixEntry , CoordinateMatrix

# Build the Spark session for this notebook, or reuse one that already exists.
# The "local[5]" master runs Spark locally rather than on a cluster.
spark = SparkSession.builder \
    .appName("Cloud computing ENSAE project") \
    .master("local[5]") \
    .getOrCreate()

# NOTE: this rebinds `sc`, which the import cell bound to the scipy package
# (`import scipy as sc`). From here on `sc` is the SparkContext.
sc = spark.sparkContext

In [None]:
def mult(x):
    """Double ``x`` by multiplying it with a one-element NumPy array.

    Because the factor is the array ``[2]`` (shape ``(1,)``), a scalar input
    comes back as a one-element array; array inputs are doubled element-wise.
    """
    return x * np.array([2])
 
# Smoke test of the Spark context: distribute 10000 integers, double each one
# on the workers with `mult`, and gather the results back on the driver.
x = np.arange(0, 10000)
distData = sc.parallelize(x)

results = (
    distData
    .map(mult)
    .collect()
)
 

In [None]:
def generate_sparse_matrix_dict_repr(m,n,num_nonzero,seed=None):
    """Generate a random binary sparse matrix as a ``{(row, col): 1}`` dict.

    Parameters
    ----------
    m, n : int
        Number of rows and columns; row indices are drawn from ``range(m)``
        and column indices from ``range(n)``.
    num_nonzero : int
        Number of ``(row, col)`` positions to draw. Positions are drawn with
        replacement, so duplicate draws collapse into a single key and the
        result may hold fewer than ``num_nonzero`` entries.
    seed : int, optional
        When given, the draw uses a dedicated ``RandomState(seed)`` and is
        reproducible. When omitted, NumPy's global random state is used, as
        before, so a prior ``np.random.seed(...)`` is still honoured.

    Returns
    -------
    dict
        Maps each drawn ``(row, col)`` pair to the value ``1``.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Rows are drawn before columns so the random stream is consumed in the
    # same order as in the unseeded version.
    rows = rng.choice(m, num_nonzero, replace=True)
    cols = rng.choice(n, num_nonzero, replace=True)
    return dict.fromkeys(zip(rows, cols), 1)

In [None]:
def dot_product_with_dict_repr(dict_repr,m,n):
    """Compute the Gram matrix ``A.T @ A`` of a dict-encoded sparse matrix.

    Parameters
    ----------
    dict_repr : dict
        Sparse matrix ``A`` as ``{(row, col): value}``.
    m : int
        Number of rows of ``A``. Kept for interface compatibility; the product
        only depends on the column count.
    n : int
        Number of columns of ``A``. Only columns in ``range(n)`` are used.

    Returns
    -------
    dict
        ``{(i, j): value}`` holding the non-zero entries of the ``n x n``
        product, where ``value`` is the sum over rows ``x`` of
        ``A[x, i] * A[x, j]``.
    """
    # Index the entries by column once, instead of rescanning the whole dict
    # for every (i, j) pair.
    columns = defaultdict(dict)
    for (x, y), v in dict_repr.items():
        columns[y][x] = v

    multiplied_dict_repr = dict()
    for i in range(n):
        col_i = columns.get(i)
        if not col_i:
            continue
        # j is a column index, so it must range over the n columns. Ranging
        # over the m rows dropped results whenever m < n and made the loop
        # scale with the row count.
        for j in range(n):
            col_j = columns.get(j)
            if not col_j:
                continue
            result = 0
            for x, v in col_i.items():
                if x in col_j:
                    result += v * col_j[x]
            if result != 0:
                multiplied_dict_repr[(i, j)] = result
    return multiplied_dict_repr

In [None]:
def dict_repr_to_sp_csc(dict_rep,m,n):
    """Convert a ``{(row, col): value}`` dict into a SciPy CSC sparse matrix.

    Parameters
    ----------
    dict_rep : dict
        Sparse matrix entries keyed by integer ``(row, col)`` pairs.
    m, n : int
        Shape of the resulting matrix (rows, columns).

    Returns
    -------
    scipy.sparse.csc_matrix
        Matrix of shape ``(m, n)``. Values are stored as float64, as they were
        when the arrays were accumulated from empty float arrays.
    """
    # Build each array in one pass; growing arrays with np.append copies the
    # whole array on every iteration. Indices are kept as integers rather than
    # floats so large row numbers are represented exactly.
    row = np.array([x for x, _ in dict_rep], dtype=np.int64)
    col = np.array([y for _, y in dict_rep], dtype=np.int64)
    data = np.array(list(dict_rep.values()), dtype=np.float64)
    return sp.csc_matrix((data, (row, col)), shape=(m, n))

In [None]:
# Problem size used by the cells below.
M = 10 ** 7  # number of rows
N = 40       # number of columns
L = 520      # number of (row, column) positions drawn

In [None]:
# Random M x N binary matrix in {(row, col): 1} form, from L drawn positions.
a = generate_sparse_matrix_dict_repr(m=M, n=N, num_nonzero=L)

In [None]:
# The same matrix as a SciPy CSC sparse matrix of shape (M, N).
sparse_matrix = dict_repr_to_sp_csc(a, m=M, n=N)

In [None]:
# Reference result: the N x N Gram matrix A^T A computed with SciPy.
# The product is taken while both operands are still sparse and only the small
# N x N result is densified. Without the parentheses, .toarray() applies to the
# right-hand operand and materialises the full M x N matrix before multiplying.
(sparse_matrix.T @ sparse_matrix).toarray()

In [None]:
# Left disabled: this was not computable in practice at this problem size.
#dict_repr_to_sp_csc(dot_product_with_dict_repr(a,M,N),N,N).toarray()

In [None]:
# One MatrixEntry(row, col, value) per stored entry of the dict representation.
listMatrixEntry = [
    MatrixEntry(row, col, value)
    for (row, col), value in a.items()
]

In [None]:
# Distribute the list of matrix entries as a Spark RDD.
entries = sc.parallelize(listMatrixEntry)

In [None]:
# Wrap the entry RDD as a distributed M x N coordinate-format matrix.
mat = CoordinateMatrix(entries, numRows=M, numCols=N)

In [None]:
#https://stackoverflow.com/questions/45881580/pyspark-rdd-sparse-matrix-multiplication-from-scala-to-python
def coordinateMatrixMultiply(leftmatrix, rightmatrix):
    """Multiply two coordinate-format matrices through an RDD join.

    Both arguments must expose an ``entries`` RDD whose elements have ``i``,
    ``j`` and ``value`` attributes. Left entries are keyed by their column and
    right entries by their row, so the join pairs up exactly the terms that
    share the inner index of the product.

    Returns an RDD of ``(row, col, value)`` tuples (not a matrix object), one
    per output cell that received at least one term.
    """
    def key_left_by_column(entry):
        return entry.j, (entry.i, entry.value)

    def key_right_by_row(entry):
        return entry.i, (entry.j, entry.value)

    def to_partial_product(joined):
        # joined == (inner_index, ((row, left_value), (col, right_value)))
        _, ((row, left_value), (col, right_value)) = joined
        return (row, col), left_value * right_value

    def flatten(cell):
        (row, col), total = cell
        return row, col, total

    left_terms = leftmatrix.entries.map(key_left_by_column)
    right_terms = rightmatrix.entries.map(key_right_by_row)
    partial_products = left_terms.join(right_terms).map(to_partial_product)
    # Sum every partial product that lands in the same output cell.
    summed = partial_products.reduceByKey(lambda acc, term: acc + term)
    return summed.map(flatten)

In [None]:
# Distributed Gram matrix: transpose(mat) times mat, as an RDD of (i, j, value).
dotproduct = coordinateMatrixMultiply(leftmatrix=mat.transpose(), rightmatrix=mat)

In [None]:
# Run the Spark job and bring the (i, j, value) product entries to the driver.
dotproduct.collect()

In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()