In [9]:
import numpy as np
import webdataset as wds
import glob
import os
import torch
import pandas as pd
import yaml

# Load the two dimension tables used by the estimate below. Each file is opened in a
# `with` block so its handle is closed as soon as parsing finishes instead of being
# left for the garbage collector.
with open('../conf/datadims/embedding_dims.yaml', 'r') as embedding_dims_file:
    # Indexed by model name further down to obtain that model's embedding width.
    embedding_dims = yaml.safe_load(embedding_dims_file)
with open('../conf/datadims/seq_lengths.yaml', 'r') as seq_lengths_file:
    # Keyed by '<task>_length' style names; the '_length' part is stripped when looping.
    sequence_lengths = yaml.safe_load(seq_lengths_file)

# Only these models and tasks are included in the memory estimate.
selected_models = ['awdlstm', 'nt_transformer_ms', 'nt_transformer_1000g', 'nt_transformer_human_ref', 'hyenadna-tiny-1k', 'hyenadna-large-1m', 'resnetlm', 'dnabert2']
# Misspelled legacy name, kept as an alias of the same list so existing cells keep working.
selected_modesl = selected_models
selected_tasks = ['cpg_methylation', 'gene_finding', 'histone_modification','enhancer_annotation', 'chromatin_accessibility']

# Root folder that holds one sub-directory (with a '<task>.bed' file) per task.
DATA_DIR = '../data'

# Collects one record (dict) per selected (task, model) pair.
requested_memory_df = []

# Estimate the uncompressed float32 footprint of the embeddings for every selected
# (task, model) pair and gather the results in one table.
for length_key, n_embeddings in sequence_lengths.items():
    # Strip the '_length' marker from the config key to obtain the task name.
    task = length_key.replace('_length', '')

    if task in selected_tasks:
        # Each row of the task's tab-separated annotation table counts as one sample.
        bed_path = os.path.join(DATA_DIR, task, f'{task}.bed')
        task_annotations = pd.read_csv(bed_path, sep='\t', engine='python')
        n_samples = len(task_annotations)

        for model in embedding_dims:
            if model in selected_modesl:
                embedding_dim = embedding_dims[model]

                # bytes = float32 item size * embedding width * embeddings per sample * samples,
                # then expressed in units of 1024**3 bytes.
                total_bytes = np.float32().itemsize * embedding_dim * n_embeddings * n_samples
                requested_memory = total_bytes / (1024 ** 3)

                requested_memory_df.append(dict(
                    task=task,
                    model=model,
                    embedding_dim=embedding_dim,
                    n_embeddings=n_embeddings,
                    n_samples=n_samples,
                    requested_memory_GB=requested_memory,
                ))

# Turn the accumulated records into a DataFrame and report how many pairs were covered.
requested_memory_df = pd.DataFrame(requested_memory_df)
print(requested_memory_df.shape[0], 'total rows')

40 total rows


In [10]:
# Order rows by task name (A-Z) and, within a task, from the largest to the smallest
# estimate; the index is renumbered from 0 after sorting.
requested_memory_df = requested_memory_df.sort_values(
    by=['task', 'requested_memory_GB'],
    ascending=[True, False],
    ignore_index=True,
)
requested_memory_df

Unnamed: 0,task,model,embedding_dim,n_embeddings,n_samples,requested_memory_GB
0,chromatin_accessibility,nt_transformer_ms,2560,512,2062129,10068.989258
1,chromatin_accessibility,nt_transformer_1000g,2560,512,2062129,10068.989258
2,chromatin_accessibility,nt_transformer_human_ref,1280,512,2062129,5034.494629
3,chromatin_accessibility,dnabert2,768,512,2062129,3020.696777
4,chromatin_accessibility,resnetlm,256,512,2062129,1006.898926
5,chromatin_accessibility,hyenadna-large-1m,256,512,2062129,1006.898926
6,chromatin_accessibility,hyenadna-tiny-1k,128,512,2062129,503.449463
7,chromatin_accessibility,awdlstm,64,512,2062129,251.724731
8,cpg_methylation,nt_transformer_ms,2560,512,959039,4682.807617
9,cpg_methylation,nt_transformer_1000g,2560,512,959039,4682.807617


In [11]:
# Sum the per-pair estimates and divide by 1024 to go from the GB column to the TB figure.
total_requested_tb = requested_memory_df['requested_memory_GB'].sum() / 1024
print('Total memory required for storing embeddings (not compressed): ', total_requested_tb, 'TB')

Total memory required for storing embeddings (not compressed):  56.337196089327335 TB


In [12]:
# Summary statistics (count, mean, std, quartiles, extremes) of the estimate, per task.
per_task_memory = requested_memory_df.groupby('task')['requested_memory_GB']
per_task_memory.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
chromatin_accessibility,8.0,3870.267746,4136.122628,251.724731,881.03656,2013.797852,6293.118286,10068.989258
cpg_methylation,8.0,1799.954178,1923.595909,117.07019,409.745667,936.561523,2926.754761,4682.807617
enhancer_annotation,8.0,104.572334,111.755575,6.801453,23.805084,54.411621,170.036316,272.058105
gene_finding,8.0,262.917638,280.977871,17.100334,59.85117,136.802673,427.508354,684.013367
histone_modification,8.0,1173.449203,1254.055306,76.321899,267.126648,610.575195,1908.047485,3052.875977


In [13]:
# Total estimate per task, added up over all selected models.
memory_grouped_by_task = requested_memory_df.groupby('task')['requested_memory_GB']
memory_grouped_by_task.sum()

task
chromatin_accessibility    30962.141968
cpg_methylation            14399.633423
enhancer_annotation          836.578674
gene_finding                2103.341103
histone_modification        9387.593628
Name: requested_memory_GB, dtype: float64