In [1]:
import linear_dag as ld
import numpy as np
import pandas as pd
import time
import os
import scipy.sparse as sp

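Benchmark linear ARG products, computed one vector at a time (matvec) and as a single block of vectors (matmat), against the same products with the full genotype matrix, across single and double precision and for varying numbers of vectors.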

In [2]:
# Load the linear ARG from its three on-disk files (.npz, .pvar.gz, .psam.gz).
# NOTE(review): this is an absolute, machine-specific path, and the In [4] cell
# reloads the same files through load_linarg — this cell looks redundant; confirm
# before removing.
linarg_dir = '/Users/ambershen/Desktop/linARG/dx_analysis/test_pipeline/linear_args/no_filter'
linarg = ld.LinearARG.read(f"{linarg_dir}/linear_arg.npz", f"{linarg_dir}/linear_arg.pvar.gz", f"{linarg_dir}/linear_arg.psam.gz")

In [3]:
def load_linarg(linarg_dir, partition_id):
    """Read one partition's linear ARG and report how long the read took.

    Args:
        linarg_dir: Directory containing one sub-directory per partition.
        partition_id: Name of the partition sub-directory to read from.

    Returns:
        A ``(linarg, seconds)`` tuple: the object returned by
        ``ld.LinearARG.read`` and the wall-clock time spent reading it.
    """
    t0 = time.time()
    # The three input files share a common "<dir>/<partition>/linear_arg" stem.
    stem = f'{linarg_dir}/{partition_id}/linear_arg'
    loaded = ld.LinearARG.read(
        f'{stem}.npz',
        f'{stem}.pvar.gz',
        f'{stem}.psam.gz',
    )
    elapsed = time.time() - t0
    return loaded, elapsed


def load_genotypes(linarg_dir, partition_id):
    """Load and horizontally stack a partition's sparse genotype matrices.

    Files are read from ``<linarg_dir>/<partition_id>/genotype_matrices/`` and
    stacked in increasing order of the integer that precedes the first ``_``
    in each file name. Directory entries without such an integer prefix
    (for example a macOS ``.DS_Store``) are skipped instead of aborting the
    load with a ``ValueError`` from ``int()``.

    Args:
        linarg_dir: Directory containing one sub-directory per partition.
        partition_id: Name of the partition sub-directory to read from.

    Returns:
        A ``(genotypes, seconds)`` tuple: the stacked sparse matrix and the
        wall-clock time spent listing, loading and stacking the files.

    Raises:
        ValueError: If the directory contains no ``<index>_...`` matrix file.
    """
    start = time.time()
    matrix_dir = f'{linarg_dir}/{partition_id}/genotype_matrices/'

    indexed_files = []
    for name in os.listdir(matrix_dir):
        try:
            indexed_files.append((int(name.split('_')[0]), name))
        except ValueError:
            # Not an "<index>_..." matrix file; ignore it.
            continue
    if not indexed_files:
        raise ValueError(f'no genotype matrices found in {matrix_dir}')

    indexed_files.sort()  # numeric order, so 2_* comes before 10_*
    genotypes = sp.hstack([sp.load_npz(f'{matrix_dir}{name}') for _, name in indexed_files])
    end = time.time()
    return genotypes, end - start


def matmat(X, v):
    """Time a single product between ``X`` and the whole block of vectors ``v``.

    The orientation is chosen from the shapes: ``X @ v`` when ``v`` has
    ``X.shape[1]`` rows, otherwise ``X.T @ v`` when ``v`` has ``X.shape[0]``
    rows. When ``X`` is square both conditions hold and ``X @ v`` is used.

    Args:
        X: Operator exposing ``shape``, ``@`` and ``.T``.
        v: Array whose leading dimension matches one side of ``X``.

    Returns:
        Elapsed seconds as a float, or ``None`` (after printing a message)
        when neither dimension of ``X`` matches ``v.shape[0]``.
    """
    if X.shape[1] == v.shape[0]:
        start = time.perf_counter()
        X @ v  # result discarded; only the elapsed time is of interest
        end = time.perf_counter()
    elif X.shape[0] == v.shape[0]:
        # Check against X itself rather than a notebook-global operator, so the
        # function is correct for any X it is handed.
        start = time.perf_counter()
        X.T @ v
        end = time.perf_counter()
    else:
        print('dimensions do not match')
        return None
    return end - start


def matvec(X, v):
    """Time the products of ``X`` with each column of ``v``, one at a time.

    The orientation is chosen from the shapes: ``X @ v[:, i]`` when ``v`` has
    ``X.shape[1]`` rows, otherwise ``X.T @ v[:, i]`` when ``v`` has
    ``X.shape[0]`` rows. When ``X`` is square both conditions hold and
    ``X @ v[:, i]`` is used.

    Args:
        X: Operator exposing ``shape``, ``@`` and ``.T``.
        v: 2-D array; its columns are multiplied one by one.

    Returns:
        Total elapsed seconds for all columns as a float, or ``None`` (after
        printing a message) when neither dimension of ``X`` matches
        ``v.shape[0]``.
    """
    if X.shape[1] == v.shape[0]:
        start = time.perf_counter()
        for i in range(v.shape[1]):
            X @ v[:, i]  # result discarded; only the elapsed time is of interest
        end = time.perf_counter()
        return end - start
    elif X.shape[0] == v.shape[0]:
        # Check against X itself rather than a notebook-global operator, so the
        # function is correct for any X it is handed.
        start = time.perf_counter()
        for i in range(v.shape[1]):
            X.T @ v[:, i]
        end = time.perf_counter()
        return end - start
    else:
        print('dimensions do not match')
        return None
       
    
def benchmark_dot_product(linarg, genotypes, n_vectors):
    """Time linear ARG and genotype-matrix products for several block sizes.

    For every ``n`` in ``n_vectors`` two random normal blocks are drawn:
    ``beta`` with ``linarg.shape[1]`` rows (the 'right' product) and ``y``
    with ``linarg.shape[0]`` rows (the 'left' product). Each is timed with
    ``matmat`` and ``matvec`` on ``linarg`` and with ``matmat`` on
    ``genotypes``.

    Args:
        linarg: Linear ARG operator; its shape sizes the random blocks.
        genotypes: Genotype matrix timed against the same blocks.
        n_vectors: Iterable of block widths (number of vectors) to benchmark.

    Returns:
        DataFrame with columns ``method``, ``dot_product_type``, ``n`` and
        ``time``; six rows per entry of ``n_vectors``, in run order.
    """
    columns = ['method', 'dot_product_type', 'n', 'time']
    # Collect rows in a list and build the frame once, instead of enlarging a
    # DataFrame one row at a time.
    records = []

    for n in n_vectors:
        y = np.random.normal(size=(linarg.shape[0], n))
        beta = np.random.normal(size=(linarg.shape[1], n))

        # (method label, timing function, operator), listed in run order.
        cases = [
            ('linarg_matmat', matmat, linarg),
            ('linarg_matvec', matvec, linarg),
            ('genotypes', matmat, genotypes),
        ]
        for method, timer, operator in cases:
            records.append((method, 'right', n, timer(operator, beta)))
            records.append((method, 'left', n, timer(operator, y)))

    return pd.DataFrame(records, columns=columns)

        

In [4]:
# Directory holding one sub-directory per partition.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
linarg_dir = '/Users/ambershen/Desktop/linARG/dx_analysis/test_pipeline/linear_args/'
partition_id = 'no_filter'

# Load both representations of the partition, keeping each wall-clock load time.
linarg, linarg_load_time = load_linarg(linarg_dir, partition_id)
genotypes, genotypes_load_time = load_genotypes(linarg_dir, partition_id)

In [10]:
# Numbers of random vectors (block widths) to time for each product.
n_vectors = [2, 5, 10, 100, 1000, 10000]
df = benchmark_dot_product(linarg, genotypes, n_vectors)

In [None]:
# Display the timing table returned by benchmark_dot_product.
df