In [None]:
#from collections import defaultdict
import collections
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as plt

In [None]:
raw_text = "apple banana apple strawberry banana lemon banana lemon."

# Strip sentence punctuation once, up front. Without this the final token is
# "lemon." and gets tallied separately from "lemon".
# `words` stays a plain string so that words.split() keeps working.
words = raw_text.translate(str.maketrans("", "", ".,;:!?"))

# defaultdict(int) yields 0 for a key it has not seen yet, so `+= 1` needs no
# "is the key already there?" check.
d = collections.defaultdict(int)
for word in words.split():
    d[word] += 1

d.keys()

In [None]:
# Tally the whitespace-separated tokens with Counter instead of a hand-written
# loop, then convert to a plain dict for display.
tokens = words.split()
dict(collections.Counter(tokens))

In [None]:
# An all-zero 3x4 CSC matrix: only the shape and the dtype are supplied.
mtx = sps.csc_matrix((3, 4), dtype=np.int8)

mtx.todense()

# Build from (data, (row, col)). Each (row, col, value) triplet below places
# `value` at position (row, col) of the matrix.
triplets = [(0, 0, 1), (0, 2, 2), (1, 2, 3), (2, 0, 4), (2, 1, 5), (2, 2, 6)]
row, col, data = (np.array(axis) for axis in zip(*triplets))
mtx = sps.csc_matrix((data, (row, col)), shape=(3, 3))

mtx.todense()

In [None]:
# The (row, col, value) triplets may be listed in any order. Here the entries
# of the last row are given right-to-left.
coords_and_values = np.array([
    [0, 0, 1],
    [0, 2, 2],
    [1, 2, 3],
    [2, 2, 6],
    [2, 1, 5],
    [2, 0, 4],
])
row, col, data = coords_and_values.T
mtx = sps.csc_matrix((data, (row, col)), shape=(3, 3))
mtx.todense()

In [None]:
# Compressed Sparse Column format (CSC)
#   + efficient column slicing and column-oriented operations
#   - slow row slicing; changes to the sparsity structure are expensive
#   use: actual computations (most linear solvers support this format)
#
# Build from the (data, indices, indptr) triple: the nonzero values of
# column i are data[indptr[i]:indptr[i+1]], and they sit at the row indices
# indices[indptr[i]:indptr[i+1]].
indptr = np.array([0, 2, 3, 6])
indices = np.array([0, 2, 2, 0, 1, 2])
data = np.array([1, 4, 5, 2, 3, 6])
#   column 0 -> data[0:2] = 1, 4     at rows indices[0:2] = 0, 2
#   column 1 -> data[2:3] = 5        at row  indices[2:3] = 2
#   column 2 -> data[3:6] = 2, 3, 6  at rows indices[3:6] = 0, 1, 2
mtx = sps.csc_matrix((data, indices, indptr), shape=(3, 3))
mtx.todense()

In [None]:
# Compressed Sparse Row format (CSR)
#
# A CSR matrix is described by three flat arrays:
#   data    - the nonzero values
#   indices - the column index of each value in `data`
#   indptr  - where each row starts in `indices` and `data`; it has
#             n_row + 1 entries and its last entry is the number of stored
#             values, i.e. the length of both `indices` and `data`
#
# The nonzero values of row i are data[indptr[i]:indptr[i+1]], at the column
# indices indices[indptr[i]:indptr[i+1]].  Item (i, j) is data[indptr[i] + k],
# where k is the position of j within indices[indptr[i]:indptr[i+1]].

# Construction 1: COO-style triplets, data[k] is placed at (row[k], col[k]).
row = np.array([0, 0, 1, 2, 2, 2])
col = np.array([0, 2, 2, 0, 1, 2])
data = np.array([1, 2, 3, 4, 5, 6])
mtx = sps.csr_matrix((data, (row, col)), shape=(3, 3))

# Printed explicitly: a bare expression in the middle of a cell is evaluated
# but not shown, only the last expression of the cell is echoed.
print(repr(mtx))
print(mtx.todense())
print("data:   ", mtx.data)
print("indices:", mtx.indices)
print("indptr: ", mtx.indptr)

# Construction 2: hand the three CSR arrays over directly.
data = np.array([1, 2, 3, 4, 5, 6])
indices = np.array([0, 2, 2, 0, 1, 2])
indptr = np.array([0, 2, 3, 6])
mtx = sps.csr_matrix((data, indices, indptr), shape=(3, 3))

mtx.todense()
 

In [None]:
# SciPy can operate directly on sparse matrices, but doing so may not be
# efficient unless the sparsity level is above roughly 80% (depending on the
# algorithm).

# Second alias for the module already imported as `sps`; the cells further
# down refer to it as `sp`.
import scipy.sparse as sp

values = [2, 4, 1, 3, 1, 1]
row_indices = [1, 3, 2, 0, 3, 1]
column_indices = [0, 0, 1, 2, 2, 3]
A = sp.coo_matrix((values, (row_indices, column_indices)), shape=(4, 4))
print(A.todense())

# The three parallel arrays behind the COO matrix: values, rows, columns.
for coo_part in (A.data, A.row, A.col):
    print(coo_part)

# From here on A is the dense ndarray form of the matrix ...
A = A.toarray()
A

# ... and B is that dense array compressed again, column-wise.
B = sp.csc_matrix(A)

B

B.todense()


# Very sparse test matrix: 1000 random nonzeros (density 0.001 of 1000 x 1000)
# plus the identity diagonal, so at most 2000 stored values and at least
# 99.8% zeros.
# Fixed seeds make A, B and x identical on every run, so timings taken on
# different runs measure the same inputs.
A = sp.rand(1000, 1000, density=0.001, random_state=0) + sp.eye(1000)
B = A.toarray()  # dense ndarray with the same entries as A
x = np.random.default_rng(0).standard_normal(1000)
# Time the same matrix-vector product twice: once with A and once with B.
%timeit A @ x
%timeit B @ x

In [None]:
# A and B are not assigned in this cell; they hold whatever an earlier cell
# left in them.
A
B

# Small matrices make the density argument tangible.
# density 0.1 on a 10 x 10 matrix -> 10 nonzero elements out of 100
A = sp.rand(10, 10, density=0.1)
A.todense()

# density 0.05 -> 5 out of 100 are non-zero
A = sp.rand(10, 10, density=0.05)
A.todense()

# Denser test matrix: 10% random nonzeros plus the identity diagonal.
# The claim being checked: at 10% density and above, the sparse format slows
# things down compared with a plain dense array.
# Fixed seeds make A, B and x identical on every run.
A = sp.rand(1000, 1000, density=0.1, random_state=0) + sp.eye(1000)
B = A.toarray()  # dense ndarray with the same entries as A
x = np.random.default_rng(0).standard_normal(1000)
# Time the same matrix-vector product twice: once with A and once with B.
%timeit A @ x
%timeit B @ x

In [None]:
# Time a linear solve with right-hand side x: SciPy's sparse solver on A
# versus NumPy's dense solver on B.
# A, B and x are not assigned in this cell; they are carried over from an
# earlier cell.
import scipy.sparse.linalg as spla
# Convert A to CSC before handing it to spsolve.
A = A.tocsc()
%timeit spla.spsolve(A,x)     # only works with CSC or CSR format
%timeit np.linalg.solve(B,x)


# Very sparse test matrix again: 1000 random nonzeros plus the identity
# diagonal, converted to CSC.
# Fixed seeds make A, B and x identical on every run, so timings taken on
# different runs measure the same inputs.
A = (sp.rand(1000, 1000, density=0.001, random_state=0) + sp.eye(1000)).tocsc()
B = A.toarray()  # dense ndarray with the same entries as A
x = np.random.default_rng(0).standard_normal(1000)
# Four timings on the same A, B and x: the matrix-vector product with A and
# with B, then a linear solve with SciPy's sparse solver (on A) and with
# NumPy's dense solver (on B).
# `spla` is not imported on these lines; it has to be in scope already.
%timeit A @ x
%timeit B @ x
%timeit spla.spsolve(A,x)     # only works with CSC or CSR format
%timeit np.linalg.solve(B,x)