# Applications de DIMSUM

In [None]:
! pip install -e ../

* Restart the kernel so that the freshly installed library can be imported (may be needed)

In [None]:
import numpy as np
import scipy as sc
import scipy.sparse as sp
import random
from collections import defaultdict
from pyspark.sql import SparkSession
from pyspark.mllib.linalg.distributed import MatrixEntry, CoordinateMatrix
from numpy import linalg as LA
from collections import Counter

In [None]:
from dimsum.utils.dimsum import *

In [None]:
# Create (or reuse) the Spark session for this notebook, using the "local[5]" master.
# NOTE: `sc` is rebound to the SparkContext below, which shadows the
# `import scipy as sc` alias declared in the import cell.
session_builder = SparkSession.builder.appName("Cloud computing ENSAE project")
session_builder = session_builder.master("local[5]")
spark = session_builder.getOrCreate()
sc = spark.sparkContext

## Régression linéaire en "haute dimension" (nombre de lignes >>> nombre de colonnes)

### Génération des données 

Nous allons générer des données de poids en fonction de la taille$^2$, avec un IMC de 25, auxquelles on ajoute des données bruitées.

$\text{poids (kg)} = \text{IMC} \times \text{taille (m)}^2 + \sum_{i=1}^{n} \text{Bruit}_i, \quad \text{IMC} = 25$

In [None]:
def generate_data(
    number_of_observation: int,
    number_of_perturbations: int,
    number_of_col: int,
    beta: int = 25,
):
    """Generate a noisy sparse dataset for the weight ~ height^2 regression.

    Weights are drawn uniformly between 10 and 100 and the squared height is
    derived from them as ``poids / beta``, so ``poids = beta * taille^2`` holds
    exactly on column 0. Random perturbation entries are then scattered over
    the other columns.

    Args:
        number_of_observation: Number of rows (one per simulated observation).
        number_of_perturbations: Number of random noise entries to draw.
        number_of_col: Total number of columns; noise entries land in columns
            ``1`` to ``number_of_col - 1``.
        beta: Coefficient (the BMI) linking squared height to weight.

    Returns:
        A ``(poids, entries)`` pair: ``poids`` is the NumPy array of weights,
        ``entries`` is a list of ``MatrixEntry`` made of column 0 followed by
        the perturbations. The same ``(row, column)`` cell may appear several
        times among the perturbations; the caller decides how to merge them.
    """
    poids = np.random.uniform(10, 100, number_of_observation)
    # Honor the ``beta`` argument (previously ignored in favor of a literal 25).
    taille_carre = poids / beta
    signal_entries = [
        MatrixEntry(row, 0, value) for row, value in enumerate(taille_carre)
    ]

    # Coordinates are drawn as uniform floats and truncated to integer indices;
    # the draw order (rows, columns, values) is kept so seeded runs are stable.
    noise_rows = np.random.uniform(
        0, number_of_observation, number_of_perturbations
    ).astype(np.int64)
    noise_cols = np.random.uniform(
        1, number_of_col, number_of_perturbations
    ).astype(np.int64)
    noise_values = np.random.rand(number_of_perturbations)
    perturbation_entries = [
        MatrixEntry(row, col, value)
        for row, col, value in zip(noise_rows, noise_cols, noise_values)
    ]
    return poids, signal_entries + perturbation_entries

In [None]:
# Problem dimensions used by the cells below.
M = 10**6  # number of rows (observations)
N = 10**3  # number of columns
L = 10**6  # number of random perturbation entries

In [None]:
# Y: weights (regression target); X: list of matrix entries of the design matrix.
Y, X = generate_data(
    number_of_observation=M,
    number_of_perturbations=L,
    number_of_col=N,
)

In [None]:
entries = sc.parallelize(X)
# Entries may share the same (row, column) key, e.g. (0, 1, 10) and (0, 1, 30);
# for demo purposes they are summed into a single entry (0, 1, 10 + 30).
# The aggregation stays distributed: the summed entries feed the matrix
# directly instead of being collected to the driver and re-parallelized.
summed_entries = (
    entries.map(lambda e: ((e.i, e.j), e.value))
    .reduceByKey(lambda x, y: x + y)
    .map(lambda kv: MatrixEntry(kv[0][0], kv[0][1], kv[1]))
)
mat = CoordinateMatrix(summed_entries, M, N)

## Cosine similarity

In [None]:
# Column similarities computed by Spark (no threshold passed), ranked from
# the most to the least similar pair; the ten best pairs are displayed.
column_similarities = mat.toRowMatrix().columnSimilarities().entries.collect()
column_similarities.sort(key=lambda me: me.value, reverse=True)
column_similarities[:10]

In [None]:
# Per-row scaling factor: the largest entry value of each row
# (despite the name, this is a maximum, not a vector norm).
row_magnitude = mat.entries.map(lambda e: (e.i, e.value)).reduceByKey(max)

# Attach each entry to its row's scaling factor and divide by it.
# The result is kept as an RDD: no collect()/parallelize round trip
# through the driver is needed to build the normalized matrix.
normalized_entries = (
    mat.entries.map(lambda e: (e.i, (e.j, e.value)))
    .join(row_magnitude)
    .map(lambda kv: MatrixEntry(kv[0], kv[1][0][0], kv[1][0][1] / kv[1][1]))
)
norm_mat = CoordinateMatrix(normalized_entries, M, N)

In [None]:
# Product of the transposed normalized matrix with itself, computed by the
# project helper `coordinateMatrixMultiply` (presumably returns an RDD, since
# the result is collected below -- verify against dimsum.utils.dimsum).
norm_mat_transposed = norm_mat.transpose()
dotproduct = coordinateMatrixMultiply(norm_mat_transposed, norm_mat)
collected_products = dotproduct.collect()
list_dotproduct = sorted(collected_products)

In [None]:
#sp.linalg.inv(list_repr_to_sp_csc(list_dotproduct,M,N))