In [12]:
import boto3
import argparse
from botocore.config import Config
import os

from dotenv import load_dotenv
load_dotenv()

DATA_DIR = '../'


In [15]:
'''This cell allows to download all objects in a specified S3 bucket into a local directory.'''

config = Config(
    request_checksum_calculation="when_required",
    response_checksum_validation="when_required",
)

bucket_name = os.getenv('bucket_name')

s3_client = boto3.client('s3', config=config, aws_access_key_id=os.getenv('aws_access_key_id'), aws_secret_access_key=os.getenv('aws_secret_access_key'), endpoint_url=os.getenv('endpoint_url'))

objects = s3_client.list_objects_v2(Bucket=bucket_name).get('Contents')

for obj in objects:
    print(f"Object Key: {obj['Key']}, Size: {obj['Size']} bytes")

    output_path = os.path.join(DATA_DIR,obj['Key'])
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    s3_client.download_file(bucket_name, obj['Key'], output_path)


Object Key: results/variant_effects_disease/awdlstm.h5, Size: 247169636 bytes
Object Key: results/variant_effects_disease/awdlstm_dataloader.h5, Size: 231686477 bytes
Object Key: results/variant_effects_disease/convnet.h5, Size: 1001769260 bytes
Object Key: results/variant_effects_disease/dnabert2.h5, Size: 2816341233 bytes
Object Key: results/variant_effects_disease/hyenadna-large-1m-seqlen.h5, Size: 1002778326 bytes
Object Key: results/variant_effects_disease/hyenadna-medium-160k-seqlen.h5, Size: 988004109 bytes
Object Key: results/variant_effects_disease/hyenadna-medium-450k-seqlen.h5, Size: 998736372 bytes
Object Key: results/variant_effects_disease/hyenadna-small-32k-seqlen.h5, Size: 993991180 bytes
Object Key: results/variant_effects_disease/hyenadna-tiny-1k-seqlen.h5, Size: 534335117 bytes
Object Key: results/variant_effects_disease/nucleotide-transformer-2.5b-1000g.h5, Size: 9074443300 bytes
Object Key: results/variant_effects_disease/nucleotide-transformer-2.5b-multi-species.h

In [19]:
import h5py
import pandas as pd
import numpy as np
import os
import seaborn as sns

EMBEDDING_SIZES = {
    'nucleotide-transformer-500m-1000g': 1280, # not in list
    'nucleotide-transformer-2.5b-multi-species': 2560, # 'nt_transformer_ms': 2560,
    'nucleotide-transformer-2.5b-1000g':2560, # 'nt_transformer_1000g': 2560,
    'nucleotide-transformer-500m-human-ref': 1280, # 'nt_transformer_human_ref': 1280,
    'nucleotide-transformer-v2-500m-multi-species': 1024, # 'nt_transformer_v2_500m': 1024,
    'convnet': 256,
    'awdlstm': 64,
    'dnabert2': 768,
    'hyenadna-large-1m-seqlen': 256, # 'hyenadna-large-1m': 256,
    'hyenadna-tiny-1k-seqlen': 128, # 'hyenadna-tiny-1k': 128,
    'hyenadna-small-32k-seqlen': 256, # 'hyenadna-small-32k': 256,
    'hyenadna-medium-160k-seqlen': 256, # 'hyenadna-medium-160k': 256,
    'hyenadna-medium-450k-seqlen': 256, # 'hyenadna-medium-450k': 256,
}

RESULTS_DIR = os.path.join(DATA_DIR, 'results')

def collect_results(experiment_name):

    results_df = []

    for file in os.listdir(os.path.join(RESULTS_DIR, experiment_name)):
        if file.endswith('.h5'):
            print(f'Processing file: {experiment_name}/{file}')
            with h5py.File(os.path.join(RESULTS_DIR, experiment_name, file), 'r') as f:
                for version in f.keys():

                    assert f[version]['embeddings_alt'].shape[0] == f[version]['embeddings_ref'].shape[0]
                    assert f[version]['embeddings_alt'].shape[1] == f[version]['embeddings_ref'].shape[1]
                    
                    assert f[version]['embeddings_alt'].shape[1] == EMBEDDING_SIZES[version]
                    assert f[version]['embeddings_ref'].shape[1] == EMBEDDING_SIZES[version]

                    annotation = f[version]['annotation']
                    annotation = pd.read_hdf(os.path.join(RESULTS_DIR, experiment_name, file), key=f'{version}/annotation')
                    
                    assert len(annotation) == f[version]['embeddings_alt'].shape[0]
                    assert len(annotation) == f[version]['embeddings_ref'].shape[0]
                    
                    assert False == np.any(np.all(f[version]['embeddings_alt'][()] == 0, axis=1))
                    assert False == np.any(np.all(f[version]['embeddings_ref'][()] == 0, axis=1))

                    results_df.append({'version': version,
                                    'rocauc_score': f[version].attrs['rocauc_score'],
                                    'running_time': f[version].attrs['running_time'],
                                    'embeddings_dim': f[version]['embeddings_alt'].shape[1]})
                    
    results_df = pd.DataFrame(results_df)

    return results_df

In [27]:
results_df = None

for experiment in  ['expression', 'disease']:

    if results_df is None:
        results_df = collect_results(f'variant_effects_{experiment}')
        results_df['experiment'] = experiment
    else:
        results_df_temp=collect_results(f'variant_effects_{experiment}')
        results_df_temp['experiment'] = experiment
        results_df = pd.concat([results_df, results_df_temp], ignore_index=True)
        del results_df_temp
        
    
    results_df['running_time (hours)'] = results_df['running_time'] / 3600
    
    results_df['version'] = results_df['version'].replace({
        'nucleotide-transformer-500m-1000g': 'nt_transformer_500m_1000g',
        'nucleotide-transformer-2.5b-multi-species': 'nt_transformer_ms',
        'nucleotide-transformer-2.5b-1000g': 'nt_transformer_1000g',
        'nucleotide-transformer-500m-human-ref': 'nt_transformer_human_ref',
        'nucleotide-transformer-v2-500m-multi-species': 'nt_transformer_v2_500m',
        'hyenadna-large-1m-seqlen': 'hyenadna_large_1m',
        'hyenadna-tiny-1k-seqlen': 'hyenadna_tiny_1k',
        'hyenadna-small-32k-seqlen': 'hyenadna_small_32k',
        'hyenadna-medium-160k-seqlen': 'hyenadna_medium_160k',
        'hyenadna-medium-450k-seqlen': 'hyenadna_medium_450k',
    })
    

Processing file: variant_effects_expression/dnabert2.h5
Processing file: variant_effects_expression/hyenadna-small-32k-seqlen.h5
Processing file: variant_effects_expression/nucleotide-transformer-2.5b-1000g.h5
Processing file: variant_effects_expression/hyenadna-large-1m-seqlen.h5
Processing file: variant_effects_expression/convnet.h5
Processing file: variant_effects_expression/nucleotide-transformer-2.5b-multi-species.h5
Processing file: variant_effects_expression/hyenadna-medium-450k-seqlen.h5
Processing file: variant_effects_expression/hyenadna-tiny-1k-seqlen.h5
Processing file: variant_effects_expression/nucleotide-transformer-500m-human-ref.h5
Processing file: variant_effects_expression/hyenadna-medium-160k-seqlen.h5
Processing file: variant_effects_expression/nucleotide-transformer-500m-1000g.h5
Processing file: variant_effects_expression/nucleotide-transformer-v2-500m-multi-species.h5
Processing file: variant_effects_expression/awdlstm.h5
Processing file: variant_effects_disease

In [32]:
results_df.sort_values(by='running_time (hours)', ascending=False, inplace=True)
results_df

Unnamed: 0,version,rocauc_score,running_time,embeddings_dim,experiment,running_time (hours)
12,awdlstm,0.528155,153282.511492,64,expression,42.578475
22,nt_transformer_human_ref,0.481363,36854.628603,1280,disease,10.237397
25,nt_transformer_v2_500m,0.475146,34559.918712,1024,disease,9.599977
15,nt_transformer_1000g,0.490184,34064.111537,2560,disease,9.462253
18,nt_transformer_ms,0.770924,33044.683999,2560,disease,9.179079
5,nt_transformer_ms,0.544748,21699.136028,2560,expression,6.027538
24,nt_transformer_500m_1000g,0.454699,16993.703686,1280,disease,4.720473
8,nt_transformer_human_ref,0.545587,13045.727227,1280,expression,3.623813
11,nt_transformer_v2_500m,0.481761,12298.463809,1024,expression,3.41624
2,nt_transformer_1000g,0.445022,11866.15231,2560,expression,3.296153


In [33]:
expression_df = results_df[results_df['experiment']=='expression'].sort_values(by='rocauc_score', ascending=False, inplace=False)
expression_df

Unnamed: 0,version,rocauc_score,running_time,embeddings_dim,experiment,running_time (hours)
4,convnet,0.554684,3050.762704,256,expression,0.847434
8,nt_transformer_human_ref,0.545587,13045.727227,1280,expression,3.623813
5,nt_transformer_ms,0.544748,21699.136028,2560,expression,6.027538
12,awdlstm,0.528155,153282.511492,64,expression,42.578475
3,hyenadna_large_1m,0.506909,3001.749707,256,expression,0.833819
6,hyenadna_medium_450k,0.499602,3080.028716,256,expression,0.855564
10,nt_transformer_500m_1000g,0.490628,6042.815266,1280,expression,1.67856
9,hyenadna_medium_160k,0.484846,2110.877263,256,expression,0.586355
11,nt_transformer_v2_500m,0.481761,12298.463809,1024,expression,3.41624
7,hyenadna_tiny_1k,0.470906,842.249206,128,expression,0.233958


In [34]:
disease_df = results_df[results_df['experiment']=='disease'].sort_values(by='rocauc_score', ascending=False, inplace=False)
disease_df

Unnamed: 0,version,rocauc_score,running_time,embeddings_dim,experiment,running_time (hours)
18,nt_transformer_ms,0.770924,33044.683999,2560,disease,9.179079
17,convnet,0.547044,8036.965109,256,disease,2.23249
15,nt_transformer_1000g,0.490184,34064.111537,2560,disease,9.462253
13,dnabert2,0.483299,10669.326907,768,disease,2.963702
22,nt_transformer_human_ref,0.481363,36854.628603,1280,disease,10.237397
25,nt_transformer_v2_500m,0.475146,34559.918712,1024,disease,9.599977
24,nt_transformer_500m_1000g,0.454699,16993.703686,1280,disease,4.720473
21,awdlstm,0.454502,962.388486,64,disease,0.26733
16,hyenadna_large_1m,0.445812,8419.811007,256,disease,2.338836
19,hyenadna_medium_450k,0.445232,8348.137631,256,disease,2.318927
