In [18]:
import numpy as np
import linear_dag as ld
import os
import pandas as pd
import h5py

def get_size_of_mtx(A):
    """Estimate the compact in-memory footprint, in bytes, of a sparse matrix.

    The estimate assumes the index arrays are stored as ``uint32`` and that
    the values are stored as ``int8`` whenever every value lies inside the
    int8 range; otherwise the values are counted at their current dtype.
    Only the value *range* is checked, not whether the values are integral.

    Parameters
    ----------
    A : object exposing ``data``, ``indices`` and ``indptr`` NumPy arrays
        Presumably a ``scipy.sparse`` CSR/CSC matrix -- TODO confirm.

    Returns
    -------
    int
        Estimated number of bytes.
    """
    values = A.data
    int8_range = np.iinfo(np.int8)

    # An empty value array trivially fits in int8; checking the size first
    # also avoids np.min/np.max raising ValueError on zero-length input.
    fits_in_int8 = values.size == 0 or (
        np.min(values) >= int8_range.min and np.max(values) <= int8_range.max
    )

    # size * itemsize equals .astype(dtype).nbytes without materialising a copy.
    if fits_in_int8:
        data_size = values.size * np.dtype(np.int8).itemsize
    else:
        data_size = values.nbytes

    index_itemsize = np.dtype(np.uint32).itemsize
    index_size = (A.indices.size + A.indptr.size) * index_itemsize
    return data_size + index_size


def get_size_of_linarg(linarg):
    """Estimate the in-memory footprint, in bytes, of one linear ARG object.

    Adds the sparse-matrix estimate for ``linarg.A`` to the size of the
    per-variant arrays: ``variant_indices`` counted as ``uint32`` and
    ``flip`` counted at its current dtype.
    """
    matrix_bytes = get_size_of_mtx(linarg.A)
    variant_index_bytes = linarg.variant_indices.astype(np.uint32).nbytes
    flip_bytes = linarg.flip.nbytes
    return matrix_bytes + (variant_index_bytes + flip_bytes)
    

def get_ukb_linarg_size(linarg_path, blocks):
    """Return the estimated in-memory size, in GB (10**9 bytes), of all blocks.

    Each block is read from ``linarg_path`` with ``ld.LinearARG.read`` and
    sized with ``get_size_of_linarg``; the per-block byte counts are summed.
    """
    per_block_bytes = [
        get_size_of_linarg(ld.LinearARG.read(linarg_path, block=block_name))
        for block_name in blocks
    ]
    total_bytes = np.sum(per_block_bytes)
    return total_bytes / 10**9


def _sum_dataset_storage(linarg_path, blocks, dataset_names):
    """Sum the allocated on-disk storage, in bytes, of named datasets.

    For every group in ``blocks`` the storage size reported by HDF5 for each
    dataset in ``dataset_names`` is accumulated.
    """
    total_bytes = 0
    # Open the file once for all blocks; reopening it on every iteration is
    # loop-invariant work.
    with h5py.File(linarg_path, 'r') as f:
        for block in blocks:
            group = f[block]
            for name in dataset_names:
                total_bytes += group[name].id.get_storage_size()
    return total_bytes


def get_linarg_disk_size(linarg_path, blocks):
    """Return the on-disk size, in GB (10**9 bytes), of the linear ARG arrays.

    Counts the ``indptr``, ``indices``, ``data``, ``variant_indices`` and
    ``flip`` datasets of every block in the HDF5 file at ``linarg_path``.

    Parameters
    ----------
    linarg_path : str
        Path to the HDF5 file.
    blocks : iterable of str
        Names of the groups (blocks) to include.

    Returns
    -------
    float
        Total storage in units of 10**9 bytes.
    """
    dataset_names = ['indptr', 'indices', 'data', 'variant_indices', 'flip']
    return _sum_dataset_storage(linarg_path, blocks, dataset_names) / 10**9


def get_variant_metadata_disk_size(linarg_path, blocks):
    """Return the on-disk size, in GB (10**9 bytes), of the variant metadata.

    Counts the ``CHROM``, ``POS``, ``ID``, ``REF`` and ``ALT`` datasets of
    every block in the HDF5 file at ``linarg_path``.

    Parameters
    ----------
    linarg_path : str
        Path to the HDF5 file.
    blocks : iterable of str
        Names of the groups (blocks) to include.

    Returns
    -------
    float
        Total storage in units of 10**9 bytes.
    """
    dataset_names = ["CHROM", "POS", "ID", "REF", "ALT"]
    return _sum_dataset_storage(linarg_path, blocks, dataset_names) / 10**9


# Default location of the per-chromosome phased VCFs; ``{chrom}`` is filled in
# with each chromosome identifier.
_UKB_VCF_PATH_TEMPLATE = (
    "/mnt/project/Bulk/Previous WGS releases/GATK and GraphTyper WGS/"
    "SHAPEIT Phased VCFs/ukb20279_c{chrom}_b0_v1.vcf.gz"
)


def get_vcf_disk_size(chroms, vcf_path_template=_UKB_VCF_PATH_TEMPLATE):
    """Return the total on-disk size, in bytes, of per-chromosome VCF files.

    Note that the result is in bytes; it is not divided by 10**9 the way the
    other size helpers in this notebook are.

    Parameters
    ----------
    chroms : iterable
        Chromosome identifiers substituted into ``vcf_path_template``.
    vcf_path_template : str, optional
        ``str.format`` template containing a ``{chrom}`` placeholder.
        Defaults to the UK Biobank path originally hard-coded here.

    Returns
    -------
    int
        Sum of the file sizes in bytes (0 when ``chroms`` is empty).

    Raises
    ------
    OSError
        If one of the files does not exist or is inaccessible.
    """
    return sum(
        os.path.getsize(vcf_path_template.format(chrom=chrom))
        for chrom in chroms
    )
        
        
def get_nnz_ratio(blocks, linarg_path):
    """Compare genotype nonzeros with linear ARG nonzeros across blocks.

    For each block, the linear ARG is read from ``linarg_path``; a vector of
    ones is multiplied through it and the resulting entries are summed, which
    gives the sum of all entries of the represented matrix (this equals its
    nonzero count assuming the entries are 0/1 -- TODO confirm). That total
    is compared against the number of stored nonzeros in ``linarg.A``.

    Returns
    -------
    tuple
        ``(genotype_total / linarg_nnz_total, n_variants)`` where
        ``n_variants`` is the sum of ``shape[1]`` over all blocks.
    """
    total_linarg_nnz = 0
    total_genotype_nnz = 0
    total_variants = 0

    for block_name in blocks:
        block_linarg = ld.LinearARG.read(linarg_path, block=block_name)
        ones_row = np.ones(block_linarg.shape[0])

        total_linarg_nnz += block_linarg.A.nnz
        column_sums = ones_row @ block_linarg
        total_genotype_nnz += np.sum(column_sums)
        total_variants += block_linarg.shape[1]

    compression_ratio = total_genotype_nnz / total_linarg_nnz
    return compression_ratio, total_variants


def get_all_paths(file_path):
    """List the name of every object visited in an HDF5 file.

    Opens ``file_path`` read-only and collects each name passed to the
    callback by ``File.visit``, in visiting order.
    """
    visited_names = []
    with h5py.File(file_path, 'r') as h5_file:
        # list.append returns None, so the traversal never stops early.
        h5_file.visit(visited_names.append)
    return visited_names

In [7]:
# NOTE: absolute, machine-specific path -- point this at your local copy.
linarg_path = '/Users/ambershen/Desktop/linARG/data/1kg/1kg_chromosomes.h5'
# Block names are the first path component of every object in the file.
# sorted() (rather than list(set(...))) keeps the order identical across
# kernel restarts, since set iteration order of strings is not stable.
blocks = sorted({path.split('/')[0] for path in get_all_paths(linarg_path)})

In [13]:
# On-disk size (in units of 10**9 bytes) of the CHROM/POS/ID/REF/ALT datasets, summed over all blocks.
get_variant_metadata_disk_size(linarg_path, blocks)

0.498627687

In [10]:
# On-disk size (in units of 10**9 bytes) of the linear ARG datasets, summed over all blocks.
get_linarg_disk_size(linarg_path, blocks)

2.159486619

In [17]:
# Estimated in-memory size (in units of 10**9 bytes), summed over all blocks.
# Despite the "ukb" in the function name, it is applied here to the 1kg file set in linarg_path.
get_ukb_linarg_size(linarg_path, blocks)

5.694802372

In [19]:
# Returns (genotypes_nnz / linarg_nnz, n_variants) accumulated over all blocks.
# Note the argument order: blocks first, then the file path.
get_nnz_ratio(blocks, linarg_path)

(22.856828224117738, 70692015)