In [7]:
import pandas as pd
import numpy as np
import os

ESTIMATED_EMBEDDING_TIME_PATH = '../results/embedding_times.csv'
DATA_DIR = '../data'

In [31]:
df = pd.read_csv(ESTIMATED_EMBEDDING_TIME_PATH)
df = df.sort_values(by=['task', 'model'])
df.rename(columns={'time': 'batch_time_sec', 'n_samples':'batch_samples'}, inplace=True)
df = df[~df['model'].str.contains('gena')]
# df = df[~df['model'].str.contains('awdlstm')]
df

Unnamed: 0,task,model,batch_time_sec,batch_samples
25,chromatin_accessibility,awdlstm,203.093478,256
19,chromatin_accessibility,dnabert2,15.290493,256
13,chromatin_accessibility,hyenadna-large-1m,14.018939,256
9,chromatin_accessibility,hyenadna-tiny-1k,9.767648,256
41,chromatin_accessibility,nt_transformer_1000g,38.805929,256
34,chromatin_accessibility,nt_transformer_human_ref,20.636343,256
29,chromatin_accessibility,nt_transformer_ms,38.240175,256
46,chromatin_accessibility,nt_transformer_v2_500m,22.511534,256
22,chromatin_accessibility,resnetlm,25.030536,256
26,cpg_methylation,awdlstm,197.537044,256


In [32]:
df['n_total_samples'] = np.nan

for task in df['task'].unique():

    df_bed = pd.read_csv(os.path.join(DATA_DIR, task, f'{task}.bed'), sep='\t', low_memory=False)
    n_total_samples = len(df_bed)
    df['n_total_samples'] = np.where(df['task'] == task, n_total_samples, df['n_total_samples'])

df['est_sec_per_sample'] = df['batch_time_sec']/df['batch_samples']
df['est_total_hours'] = df['est_sec_per_sample']*df['n_total_samples']/360
df['est_total_days'] = df['est_total_hours']/24

df

Unnamed: 0,task,model,batch_time_sec,batch_samples,n_total_samples,est_sec_per_sample,est_total_hours,est_total_days
25,chromatin_accessibility,awdlstm,203.093478,256,2062129.0,0.793334,4544.324546,189.346856
19,chromatin_accessibility,dnabert2,15.290493,256,2062129.0,0.059728,342.132906,14.255538
13,chromatin_accessibility,hyenadna-large-1m,14.018939,256,2062129.0,0.054761,313.681204,13.07005
9,chromatin_accessibility,hyenadna-tiny-1k,9.767648,256,2062129.0,0.038155,218.556328,9.106514
41,chromatin_accessibility,nt_transformer_1000g,38.805929,256,2062129.0,0.151586,868.303303,36.179304
34,chromatin_accessibility,nt_transformer_human_ref,20.636343,256,2062129.0,0.080611,461.749136,19.239547
29,chromatin_accessibility,nt_transformer_ms,38.240175,256,2062129.0,0.149376,855.644237,35.651843
46,chromatin_accessibility,nt_transformer_v2_500m,22.511534,256,2062129.0,0.087936,503.707542,20.987814
22,chromatin_accessibility,resnetlm,25.030536,256,2062129.0,0.097776,560.071555,23.336315
26,cpg_methylation,awdlstm,197.537044,256,959039.0,0.771629,2055.617719,85.650738


In [30]:
df.groupby('task')['est_total_days'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
chromatin_accessibility,9.0,40.13042,56.734447,9.106514,14.255538,20.987814,35.651843,189.346856
cpg_methylation,9.0,16.676861,26.258101,2.297636,4.828772,7.198714,15.13945,85.650738
enhancer_annotation,9.0,1.047351,2.043869,0.104341,0.181932,0.312365,0.782417,6.456668
gene_finding,9.0,1.59732,3.011039,0.197558,0.293477,0.501739,1.216564,9.555357
histone_modification,9.0,11.388786,17.383138,1.661157,4.622913,5.702103,9.714964,57.12663
