Run the benchmark once for each executor count from 1 to 10: `seq 1 1 10 | xargs -i papermill /data/work/home/rmalanij/bdg-seqtender/performance/bdg_perf/pipeline-benchmark-manual.ipynb /data/work/home/rmalanij/bdg-seqtender/performance/bdg_perf/pipeline-benchmark-manual_run.ipynb -p executor_num {} -k seq-edu`

In [None]:
# Default benchmark parameters. The papermill command in the notebook header
# overrides executor_num via `-p executor_num`.

# Passed to VEP as --fork and stored with each record as exec_total_cores.
executor_num = 4
# Stored with each record as exec_mem; the unit is not stated in this notebook — TODO confirm.
executor_mem  = 2
# NOTE(review): not referenced by any of the cells shown here — verify it is still needed.
min_partitions = 40

In [None]:
import os
import warnings

# Connection settings for the performance-results database, read from the
# environment so that no credentials are stored in the notebook.
bdg_perf_pass = os.environ.get("BDG_PERF_PASS")
bdg_perf_db = os.environ.get("BDG_PERF_DB")
bdg_perf_user = os.environ.get("BDG_PERF_USER")
bdg_perf_table = "bdg_perf_tests"

# The user and password are interpolated into the connection URL built in
# time(). An unset variable would end up there as the literal text "None" and
# only fail after the (long) benchmark has already run, so warn up front.
_missing_perf_env = [
    env_name
    for env_name, env_value in (("BDG_PERF_USER", bdg_perf_user),
                                ("BDG_PERF_PASS", bdg_perf_pass))
    if env_value is None
]
if _missing_perf_env:
    warnings.warn(
        "Environment variable(s) not set: " + ", ".join(_missing_perf_env)
        + "; writing benchmark results to the database will fail."
    )

In [None]:
# Alternative input/output locations, currently disabled and kept for reference.
#vcf_path= "/igap/all/split/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz"
#anno_vcf_path = '/edugen/vcf/NA12878_anno.vcf'

# Input VCF handed to VEP and the file VEP writes its annotated output to.
# Both live under /data/work/home/rmalanij, which the docker command below
# bind-mounts into the container at the same path.
vcf_path= "/data/work/home/rmalanij/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz"
anno_vcf_path = '/data/work/home/rmalanij/NA12878_anno.vcf'

# Host directory mounted into the container at /opt/vep/.vep (the --dir VEP is pointed at).
cache_dir = "/data/samples/vep_data/vep/95"
# Tag of the biodatageeks/bdg-vep image to run.
vep_version="95"
# Shell command that runs VEP inside Docker, written one option per line for
# readability. The trailing .replace() removes every newline together with the
# first three spaces of the following line's indentation; the remaining
# indentation spaces are what separate the arguments, so the result is a
# single-line command. executor_num (set in the parameters cell) is passed as --fork.
annotate_cmd = f"""docker run --rm -i -v /data/work/home/rmalanij:/data/work/home/rmalanij -v {cache_dir}:/opt/vep/.vep biodatageeks/bdg-vep:{vep_version} 
        vep
        --dir /opt/vep/.vep
        --pick_allele
        --format vcf
        --no_stats
        --force_overwrite
        --everything
        -cache
        --vcf
        -offline
        --input_file {vcf_path}
        --output_file {anno_vcf_path} 
        --fork {executor_num} """.replace("\n   ", "") 
# Uncomment to inspect the final single-line command.
#print(annotate_cmd)


In [None]:
import timeit
import hashlib
import re
import datetime
import os
import pandas as pd
from sqlalchemy import create_engine




def time(command, 
         tag: str = None, 
         executor_name: str = None,
         tool_name: str = None,
         tool_version: str = None,
         docker_image: str = None,
         num = 1, 
         executor_num = 1, 
         executor_mem = 1, 
         global_vars = None,
         docker_command: str = None,
         input_file: str = None):
    """Time ``command`` and append one record per run to the perf database.

    ``command`` is executed ``num`` times through ``timeit.timeit`` (one
    execution per measurement). Every measurement becomes one row; all rows
    are appended to the ``bdg_perf_tests`` table and also returned.

    Note: this function shares its name with the standard-library ``time``
    module, so that module must not be imported under the same name in this
    notebook.

    Parameters
    ----------
    command : str
        Python source to time. It is also stored verbatim in the ``command``
        column and hashed to produce ``test_id``.
    tag, executor_name, tool_name, tool_version, docker_image,
    docker_command, input_file : str, optional
        Descriptive metadata stored unchanged in the columns of the same name.
    num : int
        Number of measurements to take. With ``num <= 0`` nothing is executed
        or written and an empty DataFrame is returned.
    executor_num, executor_mem
        Stored as ``exec_total_cores`` and ``exec_mem`` respectively.
    global_vars : dict, optional
        Namespace in which ``command`` is executed (passed to ``timeit`` as
        ``globals``).

    Returns
    -------
    pandas.DataFrame
        One row per measurement, with the columns written to the database.
    """
    columns = ['test_id',
               'tag',
               'executor_name',
               'tool_name',
               'tool_version',
               'docker_image',
               'time_stamp',
               'command',
               'docker_command',
               'input_file',
               'exec_total_cores',
               'exec_mem',
               'wall_time']

    # test_id is the MD5 of the command with every non-word character removed,
    # so edits that only touch whitespace or punctuation keep the same id.
    # It does not depend on the run, so compute it once.
    command_hash = hashlib.md5(re.sub(r'\W', '', command).encode()).hexdigest()

    results = []
    for _ in range(num):
        wall_time = timeit.timeit(command, number=1, globals=global_vars)
        results.append([command_hash,
                        tag,
                        executor_name,
                        tool_name,
                        tool_version,
                        docker_image,
                        datetime.datetime.now(),
                        command,
                        docker_command,
                        input_file,
                        executor_num,
                        executor_mem,
                        wall_time])

    # Build the frame once, after the loop, so it exists even when num is 0.
    dfw = pd.DataFrame(results, columns=columns)
    if dfw.empty:
        # Nothing was measured, so there is nothing to record.
        return dfw

    # NOTE(review): the database name and table are hard-coded here although
    # bdg_perf_db / bdg_perf_table are defined in the settings cell — confirm
    # which one is meant to be authoritative.
    engine = create_engine(f'postgresql://{bdg_perf_user}:{bdg_perf_pass}@cdh00:5435/bdg_perf')
    try:
        # The context manager closes the connection even if the insert fails.
        with engine.connect() as connection:
            dfw.to_sql('bdg_perf_tests', con=connection, if_exists='append', index=False)
    finally:
        # A new engine is created on every call; release its connection pool.
        engine.dispose()

    return dfw

In [None]:
#!hdfs dfs -rm -r -skipTrash /edugen/vcf/NA12878_anno.vcf*
!yes | rm -r {anno_vcf_path}

In [None]:
%%time
# Label stored with every record produced by this cell.
tag = 'vcf_annotation'

# Statement that gets timed: it shells out to the prepared docker/VEP command.
# NOTE(review): the exit status of os.system is discarded, so a failed VEP run
# is still timed and recorded. Changing this string would also change the
# recorded command and its test_id hash — confirm how failures should be handled.
anno_code = """import os
os.system(annotate_cmd)"""

# Namespace in which the timed statement is executed.
global_vars = dict(
    vcf_path=vcf_path,
    annotate_cmd=annotate_cmd,
    anno_vcf_path=anno_vcf_path,
)

df = time(
    anno_code,
    # what is being measured
    tag=tag,
    tool_name='vep',
    tool_version=vep_version,
    input_file=vcf_path,
    # how it is executed
    executor_name='docker',
    docker_image=f'biodatageeks/bdg-vep:{vep_version}',
    docker_command=annotate_cmd,
    executor_num=executor_num,
    executor_mem=executor_mem,
    # measurement settings
    num=1,
    global_vars=global_vars,
)
df

In [None]:
# from sqlalchemy.sql import text
# engine = create_engine(f'postgresql://{bdg_perf_user}:{bdg_perf_pass}@cdh00:5435/bdg_perf')
# con = engine.connect()
# statement = text("select * from bdg_perf_tests where time_stamp > to_date('20200203','YYYYMMDD')")
# #statement = text("delete from bdg_perf_tests where time_stamp > to_date('20200203','YYYYMMDD')")

# rs = con.execute(statement)
# for row in rs:
#         print (row)