## Imports

In [None]:
import numpy as np
from scipy import sparse
import pandas as pd
from scipy import stats


## Creating a Sparse Matrix

In [None]:
# Toy user-by-product interaction data in coordinate (COO) form:
# entry k says that user row[k] gave product col[k] the value data[k].
# Cells that never appear in these arrays are simply not stored.
row = np.array([0, 3, 1, 0])     # user index of each interaction
col = np.array([0, 3, 1, 2])     # product index of each interaction
data = np.array([4, 5, 7, 9])    # interaction value (e.g. a rating)

# Assemble the 4 users x 4 products matrix from the triplets
sparse_matrix = sparse.coo_matrix(
    (data, (row, col)),
    shape=(4, 4),
)

# Expand to a dense array purely so the layout can be inspected
print("Here's our sparse matrix as a regular array:")
print(sparse_matrix.toarray())


Here's our sparse matrix as a regular array:
[[4 0 9 0]
 [0 7 0 0]
 [0 0 0 0]
 [0 0 0 5]]


## Basic Statistical Analysis

In [None]:
def calculate_sparse_mean(sparse_matrix):
    """
    Calculate the mean of the non-zero elements in a sparse matrix.

    This is useful when zeros represent 'no data' rather than actual zeros.

    Parameters
    ----------
    sparse_matrix : scipy.sparse matrix
        Any sparse container that provides ``sum()`` and ``count_nonzero()``.

    Returns
    -------
    float
        Sum of all values divided by the number of non-zero values, or
        0.0 when the matrix holds no non-zero value.
    """
    # `nnz` is the number of *stored* entries: it includes explicitly stored
    # zeros and counts duplicate COO coordinates once per duplicate, so it can
    # overstate how many cells are actually non-zero. count_nonzero() counts
    # the values that are really non-zero.
    n_nonzero = sparse_matrix.count_nonzero()
    if n_nonzero == 0:
        return 0.0
    return sparse_matrix.sum() / n_nonzero

# Average over the interactions that exist; empty cells are treated as
# missing data rather than as zeros.
mean_value = calculate_sparse_mean(
    sparse_matrix
)
print(
    f"\nMean of non-zero elements: {mean_value:.2f}"
)



Mean of non-zero elements: 6.25


## Handling Row and Column Statistics

In [None]:
def analyze_row_patterns(sparse_matrix):
    """
    Summarise each row of a sparse matrix.

    Returns a dictionary of three 1-D arrays, one value per row:

    - 'activity_sum': sum of the row's values
    - 'interaction_count': number of entries stored for the row
    - 'average_value': activity_sum / interaction_count, with 0.0 for rows
      that store nothing
    """
    # CSR keeps each row's entries contiguous, which makes per-row work cheap
    by_row = sparse_matrix.tocsr()

    # indptr[k + 1] - indptr[k] is how many entries CSR stores for row k
    stored_per_row = np.diff(by_row.indptr)

    # sum(axis=1) comes back two-dimensional; collapse it to one value per row
    totals = np.array(by_row.sum(axis=1)).ravel()

    # Start from zeros and only divide where a row has entries, so empty
    # rows never trigger a division by zero
    averages = np.zeros(totals.shape, dtype=float)
    has_entries = stored_per_row > 0
    averages[has_entries] = totals[has_entries] / stored_per_row[has_entries]

    summary = {
        'activity_sum': totals,                 # Total activity per user
        'interaction_count': stored_per_row,    # Number of interactions per user
        'average_value': averages,              # Average value per user
    }
    return summary


In [None]:
# Bind the result to `row_stats`, not `stats`: reusing `stats` would replace
# the `scipy.stats` module imported at the top of the notebook with this
# dictionary for every cell that runs afterwards.
row_stats = analyze_row_patterns(sparse_matrix)
print("\nUser Statistics:")
# The three arrays are aligned by row index, so zip walks them user by user
for i, (sum_val, count, mean) in enumerate(zip(
    row_stats['activity_sum'],
    row_stats['interaction_count'],
    row_stats['average_value']
)):
    print(f"User {i}: {count} interactions, "
          f"total activity = {sum_val}, "
          f"average value = {mean:.2f}")



User Statistics:
User 0: 2 interactions, total activity = 13, average value = 6.50
User 1: 1 interactions, total activity = 7, average value = 7.00
User 2: 0 interactions, total activity = 0, average value = 0.00
User 3: 1 interactions, total activity = 5, average value = 5.00


## Correlation Analysis

In [None]:
def calculate_sparse_correlation(sparse_matrix, min_overlap=2):
    """
    Calculate Pearson correlation between columns, considering only the rows
    where both columns are non-zero.
    Like finding which products are often rated similarly.

    Parameters
    ----------
    sparse_matrix : scipy.sparse matrix
        Matrix whose columns are compared; it is converted to a dense array.
    min_overlap : int, default 2
        Minimum number of rows in which both columns must be non-zero for a
        correlation to be computed. Values below 2 are treated as 2, because
        a correlation needs at least two paired observations.

    Returns
    -------
    numpy.ndarray
        Symmetric (n_cols, n_cols) float array. Pairs (including a column
        with itself) that do not reach the required overlap are left at 0.0.
        If the overlapping values of a column are all equal, the correlation
        is undefined and SciPy's NaN result is stored unchanged.
    """
    # Imported here so the function does not depend on a module-level name
    # `stats`, which is easy to rebind to something else in a notebook.
    from scipy.stats import pearsonr

    # pearsonr raises for fewer than two points, so never go below 2
    required_overlap = max(min_overlap, 2)

    # Convert to dense format for this calculation
    # (For very large matrices, you'd want to do this differently)
    dense_cols = sparse_matrix.toarray().T
    n_cols = dense_cols.shape[0]
    correlations = np.zeros((n_cols, n_cols))

    # Non-zero pattern of every column, computed once rather than per pair
    observed = dense_cols != 0

    for i in range(n_cols):
        # Only the upper triangle is visited; the result is mirrored below
        for j in range(i, n_cols):
            # Rows where both columns have non-zero values
            mask = observed[i] & observed[j]
            if np.count_nonzero(mask) >= required_overlap:
                corr = pearsonr(dense_cols[i][mask],
                                dense_cols[j][mask])[0]
                correlations[i, j] = correlations[j, i] = corr

    return correlations

In [None]:
corr_matrix = calculate_sparse_correlation(sparse_matrix)
print("\nCorrelation matrix:")
print(corr_matrix)


Correlation matrix:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
