In [1]:
"""To profile the running time line-by-line, CUDA needs to run synchronously. Set this to 1 to do that"""
# Sets the CUDA_LAUNCH_BLOCKING environment variable for this kernel session.
# It is left at 0 for normal benchmark runs; switch it to 1 before line
# profiling, as described in the note above.
# NOTE(review): presumably this has to run before CUDA is first initialised in
# the kernel to take effect — confirm.
%env CUDA_LAUNCH_BLOCKING=0

env: CUDA_LAUNCH_BLOCKING=0


#### Install Optional Dependencies for Line Profiling

In [None]:
# # %%capture
# """For profiling how long each line takes, both on CPU and GPU (needs synchronization)"""
# !pip install line_profiler
# %load_ext line_profiler
# !git clone https://github.com/NVIDIA/PyProf.git
# !pip install ./PyProf

## Load Cython and Our Novel Efficient functions


The next cell loads the code we've implemented. You can call each function directly, or follow our example calls below.

Cython lets us call lower-level C code instead of pure Python, which can give a surprisingly large speedup.

In [1]:
"""This cell contains the code we've implemented. You should be able to call each function directly, or alternatively, see our example calls below"""

from main import run_benchmarks

# Problem dimensions kept in the notebook namespace.
# NOTE(review): run_benchmarks() is called without arguments further down and
# its printed settings report 10 features, so these values may not be the ones
# the benchmark actually uses — confirm against main.py.
n_components = 100
n_features = 100
n_nonzero_coefs = 17
n_samples = 50

## Benchmark Results

In [2]:
# Run the benchmark suite from main.py with no arguments, so whatever defaults
# run_benchmarks defines apply; the log printed under this cell comes from this call.
# The return value is kept in `execution_times` for later inspection.
execution_times = run_benchmarks()

Settings used for the test:

Number of Components: 100
Number of Features: 10
Number of Nonzero Coefficients: 17
GPU algorithms enabled: False
Number of Samples: 50

Testing problem size m = 16 (1/10)
Sklearn OMP runtime: 0.0106
Naive CPU runtime: 0.0103
V0 CPU runtime: 0.0045
Skipping GPU algorithms (run_gpu=False)

Sklearn reconstruction error: 2.257218e+01
V0 CPU reconstruction error: 2.257217e+01
Naive CPU reconstruction error: 2.257218e+01

Testing problem size m = 20 (2/10)
Sklearn OMP runtime: 0.0087
Naive CPU runtime: 0.0035
V0 CPU runtime: 0.0097
Skipping GPU algorithms (run_gpu=False)

Testing problem size m = 24 (3/10)
Sklearn OMP runtime: 0.0230
Naive CPU runtime: 0.0107
V0 CPU runtime: 0.0096
Skipping GPU algorithms (run_gpu=False)

Testing problem size m = 32 (4/10)
Sklearn OMP runtime: 0.0221
Naive CPU runtime: 0.0109
V0 CPU runtime: 0.0101
Skipping GPU algorithms (run_gpu=False)

Testing problem size m = 64 (5/10)
Sklearn OMP runtime: 0.0193
Naive CPU runtime: 0.0084
V0

#### Optional: Line Profiling

In [None]:
"""Use this cell to profile line-by-line the algorithms you want to run"""

from sklearn.datasets import make_sparse_coded_signal

# Alternative configuration (many samples, small dictionary):
# m = 64
# n_components, n_features = m*8, m
# n_nonzero_coefs = m//4
# n_samples = 200000


# Active configuration.
# 20000 x 8000 x 1600 x 10 is just within memory reach on GPU
n_components = 20000
n_features = 8000
n_nonzero_coefs = 1600
# Keep this above 1 not to mess with dimensions of y
n_samples = 3

# Out of memory on CPU if bigger than this
# n_components, n_features = 20000, 8000
# n_nonzero_coefs = 1000
# n_samples = 10

# Synthetic sparse-coded problem with a fixed seed so profiling runs are repeatable.
y, X, w = make_sparse_coded_signal(
    n_samples=n_samples,
    n_components=n_components,
    n_features=n_features,
    n_nonzero_coefs=n_nonzero_coefs,
    random_state=0,
)

# NOTE(review): this transpose assumes the array orientation returned by the
# scikit-learn release the notebook was written against; newer releases changed
# the default layout of make_sparse_coded_signal — confirm for the installed version.
y = y.T

# Profiling recipes: uncomment exactly one %lprun line (requires the
# line_profiler extension to be loaded).

# Naive CPU
# %lprun -f omp_naive run_omp(torch.as_tensor(X, device='cpu', dtype=torch.float), torch.as_tensor(y, device='cpu', dtype=torch.float), n_nonzero_coefs)
# Naive GPU
# %lprun -f omp_naive -f run_omp -f gpu_transfer_and_alg gpu_transfer_and_alg(X,y, "naive")


# # V0 CPU
# # %lprun -f omp_v0 run_omp(torch.as_tensor(X, device='cpu', dtype=torch.float), torch.as_tensor(y, device='cpu', dtype=torch.float), n_nonzero_coefs, alg="v0")
# # V0 GPU
# %lprun -f omp_v0 -f run_omp -f gpu_transfer_and_alg gpu_transfer_and_alg(X,y, "v0")

In [None]:
"""Use this cell to get execution time as function of problem size"""

# NOTE(review): this cell uses OrthogonalMatchingPursuit, elapsed_timer, run_omp,
# torch and gpu_transfer_and_alg, none of which are imported in the cells shown
# here — confirm they are in scope on a fresh kernel. It also replaces any
# `execution_times` produced by an earlier cell.

# One list of wall-clock measurements per implementation, appended once per problem size.
execution_times = {
    name: []
    for name in ("sklearn", "naive_cpu", "v0_cpu", "naive_gpu", "v0_gpu")
}


tol = 0.1
k = 0

# Big problems
n_samples = 100
m_arr = [16, 20, 24, 32, 64, 128, 256, 512, 1024, 2048]
# m_arr = [2048]

# Small problems
# n_samples = 75000
# m_arr = [8, 16, 24, 32, 64, 128]


for m in m_arr:
    # Dictionary size and sparsity level both scale with m.
    n_components = m * 8
    n_features = m
    n_nonzero_coefs = m // 4

    y, X, w = make_sparse_coded_signal(
        n_samples=n_samples,
        n_components=n_components,
        n_features=n_features,
        n_nonzero_coefs=n_nonzero_coefs,
        random_state=2,
    )

    y = y.T

    # NOTE(review): per the scikit-learn docs a non-None `tol` overrides
    # `n_nonzero_coefs`, so the sklearn run may stop on a different criterion
    # than the other implementations — confirm this is intended.
    # NOTE(review): `normalize` is deprecated in recent scikit-learn releases —
    # confirm the pinned version still accepts it.
    omp_args = {
        "tol": tol,
        "n_nonzero_coefs": n_nonzero_coefs - k,
        "precompute": False,
        "fit_intercept": True,
        "normalize": True,
    }

    # Single core
    print('Single core. Sklearn')
    omp = OrthogonalMatchingPursuit(**omp_args)
    with elapsed_timer() as elapsed:
        # y was transposed above, so transpose back for sklearn's fit().
        omp.fit(X, y.T)
    execution_times["sklearn"].append(elapsed())

    # CPU runs: the tensor conversion happens inside the timed region.
    with elapsed_timer() as elapsed:
        run_omp(
            torch.as_tensor(X, device='cpu', dtype=torch.float),
            torch.as_tensor(y, device='cpu', dtype=torch.float),
            n_nonzero_coefs,
        )
    execution_times["naive_cpu"].append(elapsed())

    with elapsed_timer() as elapsed:
        run_omp(
            torch.as_tensor(X, device='cpu', dtype=torch.float),
            torch.as_tensor(y, device='cpu', dtype=torch.float),
            n_nonzero_coefs,
            alg="v0",
        )
    execution_times["v0_cpu"].append(elapsed())

    # GPU runs go through gpu_transfer_and_alg, selected by algorithm name.
    with elapsed_timer() as elapsed:
        gpu_transfer_and_alg(X, y, "naive")
    execution_times["naive_gpu"].append(elapsed())

    with elapsed_timer() as elapsed:
        gpu_transfer_and_alg(X, y, "v0")
    execution_times["v0_gpu"].append(elapsed())


In [None]:
# Bare expression as the last line of the cell, so the notebook displays the
# current contents of `execution_times`.
execution_times