In [17]:
import pandas as pd
import os
import yaml

# Project root, expressed relative to the directory this notebook runs from.
WORK_DIR = '../'
# Directory holding the benchmark estimate CSVs: <WORK_DIR>/results/estimates.
results_dir = os.path.join(os.path.join(WORK_DIR, 'results'), 'estimates')

In [29]:
# Load the embedding benchmark table and relabel its raw columns with
# explicit per-chunk names (units are carried in the column labels).
emb_stats = pd.read_csv(
    os.path.join(results_dir, 'embeddings_stats.csv')
).rename(
    columns={
        'time': 'chunk emb time (seconds)',
        'size (bytes)': 'chunk size (bytes)',
        'n_samples': 'samples per chunk',
    }
)
emb_stats

Unnamed: 0,task,model,chunk emb time (seconds),samples per chunk,chunk size (bytes)
0,gene_finding,hyenadna-tiny-1k,1136.719639,4783,16974564567
1,enhancer_annotation,hyenadna-tiny-1k,83.364254,28,1331168444
2,histone_modification,hyenadna-tiny-1k,878.459177,50000,12167916103
3,chromatin_accessibility,hyenadna-tiny-1k,876.718117,50000,12161888991
4,cpg_methylation,hyenadna-tiny-1k,874.093837,50000,12177768906
5,gene_finding,hyenadna-large-1m,1979.431083,4783,33897712652
6,enhancer_annotation,hyenadna-large-1m,148.898188,28,2656704110
7,histone_modification,hyenadna-large-1m,1992.29405,50000,24278876951
8,chromatin_accessibility,hyenadna-large-1m,2031.097446,50000,24272783417
9,cpg_methylation,hyenadna-large-1m,2050.380581,50000,24276876162


In [40]:
# Load the downstream-training benchmark table, keep only the rows recorded
# for epoch 1, and relabel the timing column as a per-chunk training time.
downstream_stats = (
    pd.read_csv(os.path.join(results_dir, 'downstream_stats.csv'))
    .loc[lambda stats: stats['epoch'] == 1]
    # The epoch column is constant after the filter, so it is dropped.
    .drop(columns=['epoch'])
    .rename(columns={'time': 'chunk training time (seconds)'})
)
downstream_stats

Unnamed: 0,task,model,chunk training time (seconds)
0,gene_finding,hyenadna-tiny-1k,162.424445
2,enhancer_annotation,hyenadna-tiny-1k,14.042472
4,histone_modification,hyenadna-tiny-1k,120.56379
6,chromatin_accessibility,hyenadna-tiny-1k,121.548147
8,cpg_methylation,hyenadna-tiny-1k,122.220143
10,gene_finding,hyenadna-large-1m,318.331048
12,enhancer_annotation,hyenadna-large-1m,26.340381
14,histone_modification,hyenadna-large-1m,218.308265
16,chromatin_accessibility,hyenadna-large-1m,219.620006
18,cpg_methylation,hyenadna-large-1m,220.473587


In [51]:
# Combine embedding and downstream-training measurements into one row per
# (model, task). The outer join keeps pairs that appear in only one table;
# their missing measurements show up as NaN.
df_stats = emb_stats.merge(
    downstream_stats,
    how='outer',
    on=['model', 'task'],
    suffixes=('_emb', '_downstream'),
).sort_values(by=['task', 'model'])
df_stats

Unnamed: 0,task,model,chunk emb time (seconds),samples per chunk,chunk size (bytes),chunk training time (seconds)
0,chromatin_accessibility,hyenadna-large-1m,2031.097446,50000,24272783417,219.620006
5,chromatin_accessibility,hyenadna-tiny-1k,876.718117,50000,12161888991,121.548147
11,chromatin_accessibility,resnetlm,4382.098835,50000,24269158218,221.764109
1,cpg_methylation,hyenadna-large-1m,2050.380581,50000,24276876162,220.473587
6,cpg_methylation,hyenadna-tiny-1k,874.093837,50000,12177768906,122.220143
12,cpg_methylation,resnetlm,4454.044875,50000,24274541079,219.585632
2,enhancer_annotation,hyenadna-large-1m,148.898188,28,2656704110,26.340381
7,enhancer_annotation,hyenadna-tiny-1k,83.364254,28,1331168444,14.042472
13,enhancer_annotation,resnetlm,152.586243,28,2655463731,26.809431
3,gene_finding,hyenadna-large-1m,1979.431083,4783,33897712652,318.331048


In [52]:
# Number of rows in each task's BED file at <WORK_DIR>/data/<task>/<task>.bed,
# keyed by task name.
# NOTE(review): read_csv is called with its default header handling, so the
# first line of each file is consumed as column names and is not counted --
# confirm the .bed files actually carry a header row.
task_len = {
    task_name: len(
        pd.read_csv(
            os.path.join(WORK_DIR, 'data', task_name, f'{task_name}.bed'),
            sep='\t',
            low_memory=False,
        )
    )
    for task_name in df_stats['task'].unique()
}

# Attach the per-task row count to every (task, model) row.
df_stats['total samples'] = df_stats['task'].map(task_len)

df_stats

Unnamed: 0,task,model,chunk emb time (seconds),samples per chunk,chunk size (bytes),chunk training time (seconds),total samples
0,chromatin_accessibility,hyenadna-large-1m,2031.097446,50000,24272783417,219.620006,2062129
5,chromatin_accessibility,hyenadna-tiny-1k,876.718117,50000,12161888991,121.548147,2062129
11,chromatin_accessibility,resnetlm,4382.098835,50000,24269158218,221.764109,2062129
1,cpg_methylation,hyenadna-large-1m,2050.380581,50000,24276876162,220.473587,959039
6,cpg_methylation,hyenadna-tiny-1k,874.093837,50000,12177768906,122.220143,959039
12,cpg_methylation,resnetlm,4454.044875,50000,24274541079,219.585632,959039
2,enhancer_annotation,hyenadna-large-1m,148.898188,28,2656704110,26.340381,285
7,enhancer_annotation,hyenadna-tiny-1k,83.364254,28,1331168444,14.042472,285
13,enhancer_annotation,resnetlm,152.586243,28,2655463731,26.809431,285
3,gene_finding,hyenadna-large-1m,1979.431083,4783,33897712652,318.331048,5977


In [53]:
# Number of epochs used to project the total training cost.
total_epochs = 100

# Ratio of the full dataset to one benchmarked chunk. It is a plain division,
# so it can be fractional; it is used as a scale factor for the per-chunk
# measurements below.
n_chunks = df_stats['total samples'] / df_stats['samples per chunk']

# Scale per-chunk embedding cost to the full dataset (seconds -> hours,
# bytes -> GB using 1024 ** 3 bytes per GB).
emb_hours = (df_stats['chunk emb time (seconds)'] * n_chunks) / 3600
emb_size_gb = df_stats['chunk size (bytes)'] * n_chunks / (1024 ** 3)

# Scale per-chunk training cost to one full epoch, then to all epochs.
epoch_train_hours = df_stats['chunk training time (seconds)'] * n_chunks / 3600
total_train_hours = epoch_train_hours * total_epochs

# Column order follows the dict order, so the new columns are appended in the
# sequence listed here.
df_stats = df_stats.assign(**{
    'n chunks': n_chunks,
    'total emb time (hours)': emb_hours,
    'total emb size (GB)': emb_size_gb,
    'epoch training time (hours)': epoch_train_hours,
    'total training time (hours)': total_train_hours,
    'total training time (days)': total_train_hours / 24,
})

df_stats

Unnamed: 0,task,model,chunk emb time (seconds),samples per chunk,chunk size (bytes),chunk training time (seconds),total samples,n chunks,total emb time (hours),total emb size (GB),epoch training time (hours),total training time (hours),total training time (days)
0,chromatin_accessibility,hyenadna-large-1m,2031.097446,50000,24272783417,219.620006,2062129,41.24258,23.268805,932.32115,2.516027,251.602657,10.483444
5,chromatin_accessibility,hyenadna-tiny-1k,876.718117,50000,12161888991,121.548147,2062129,41.24258,10.043921,467.139929,1.392489,139.248866,5.802036
11,chromatin_accessibility,resnetlm,4382.098835,50000,24269158218,221.764109,2062129,41.24258,50.202517,932.181905,2.54059,254.059001,10.585792
1,cpg_methylation,hyenadna-large-1m,2050.380581,50000,24276876162,220.473587,959039,19.18078,10.924416,433.669817,1.174682,117.468205,4.894509
6,cpg_methylation,hyenadna-tiny-1k,874.093837,50000,12177768906,122.220143,959039,19.18078,4.657167,217.537495,0.651188,65.118824,2.713284
12,cpg_methylation,resnetlm,4454.044875,50000,24274541079,219.585632,959039,19.18078,23.731126,433.628105,1.169951,116.995103,4.874796
2,enhancer_annotation,hyenadna-large-1m,148.898188,28,2656704110,26.340381,285,10.178571,0.420992,25.184315,0.074474,7.447429,0.31031
7,enhancer_annotation,hyenadna-tiny-1k,83.364254,28,1331168444,14.042472,285,10.178571,0.235703,12.618856,0.039703,3.970342,0.165431
13,enhancer_annotation,resnetlm,152.586243,28,2655463731,26.809431,285,10.178571,0.431419,25.172557,0.0758,7.580047,0.315835
3,gene_finding,hyenadna-large-1m,1979.431083,4783,33897712652,318.331048,5977,1.249634,0.687101,39.450581,0.110499,11.049926,0.460414


In [50]:
# Estimated embedding storage summed over all tasks for each model,
# ordered from largest to smallest.
(
    df_stats
    .groupby('model')['total emb size (GB)']
    .sum()
    .sort_values(ascending=False)
)

model
hyenadna-large-1m    1713.372751
resnetlm             1713.033909
hyenadna-tiny-1k      858.756578
nt_transformer_ms      69.616249
Name: total emb size (GB), dtype: float64