## Run times in interactive session
On a `mem1_hdd1_v2_x8` instance (node count not recorded, possibly 4):

- 1 file, ~1100 rows, 38s
- 2 files, 2489 rows, 148s
- 4 files, 5705 rows, 501s

In [None]:
import os
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
import hail as hl

# Create (or reuse, if one already exists) a Spark session with Hive support.
builder = SparkSession.builder.enableHiveSupport()
spark = builder.getOrCreate()
# Initialise Hail on top of that session's SparkContext instead of letting
# Hail start its own.
hl.init(sc=spark.sparkContext)

In [None]:
from hail.plot import show
from pprint import pprint

# Configure Hail plotting to emit its output into notebook cells.
hl.plot.output_notebook()

In [None]:
# Locate the "split_paths*" manifest in the current working directory and read
# the VCF paths it lists (one path per line) into `file_list`.
files = [f for f in os.listdir(".") if os.path.isfile(f)]

# os.listdir() returns entries in arbitrary order; sort the candidates so the
# same manifest is picked every time when more than one is present.
split_candidates = sorted(f for f in files if f.startswith("split_paths"))
if not split_candidates:
    raise FileNotFoundError(
        "No file starting with 'split_paths' found in the current directory"
    )
split_file = split_candidates[0]
print(split_file)

# Use a context manager so the manifest handle is closed, and skip blank lines
# (e.g. a trailing newline) so no empty path is passed on to the VCF import.
with open(split_file) as manifest:
    file_list = [line.strip() for line in manifest if line.strip()]

In [None]:
# Example inputs for ad-hoc runs. All of them live in the same project
# directory, so the prefix is spelled out once and every URL is derived from
# it; the lists are composed from the single-file names so they cannot drift.
_VCF_DIR = (
    "file:///mnt/project/Bulk/Exome sequences_Alternative exome processing/"
    "Exome variant call files (gnomAD) (VCFs)"
)

file_url = f"{_VCF_DIR}/ukb24068_c1_b244_v1.vcf.gz"
second_file = f"{_VCF_DIR}/ukb24068_c1_b245_v1.vcf.gz"

two_files = [file_url, second_file]
# A fresh list (not an alias of two_files) with the third batch appended.
three_files = [*two_files, f"{_VCF_DIR}/ukb24068_c1_b246_v1.vcf.gz"]

In [None]:
# For iterative processing (disabled): the cell body is kept inside a string
# literal so it does not run. It loops over `file_list`, imports each VCF on
# its own, annotates every row with the approximate median of DP, and exports
# that column to "<basename>_dp.tsv" for each file.
"""
%%time
for file in file_list:
    mt = hl.import_vcf(f"file://{file}", force_bgz=True, reference_genome="GRCh38", array_elements_required=False)
    mt = mt.annotate_rows(median_dp = hl.agg.approx_median(mt.DP))
    dp_stats = mt.rows().select("median_dp")
    name = file.split("/")[-1].split(".")[0]
    %time dp_stats.export(f"{name}_dp.tsv", delimiter="\t")
"""

In [None]:
%%time
# For collective processing

name = f"ukb24068_{split_file}"

mt = hl.import_vcf(
    [f"file://{file}" for file in file_list],
    force_bgz=True,
    reference_genome="GRCh38",
    array_elements_required=False,
)
mt = mt.annotate_rows(median_dp=hl.agg.approx_median(mt.DP))
dp_stats = mt.rows().select("median_dp")
name = file.split("/")[-1].split(".")[0]
%time dp_stats.export(f"{name}_dp.tsv", delimiter="\t", header=False)

%%bash
# Copy everything matching ukb24068_* from HDFS into the local working
# directory. The pattern is quoted so that HDFS expands it; unquoted, the
# local shell would expand it first against any matching local files (for
# example leftovers from a previous run) and pass those names instead.
hdfs dfs -get 'ukb24068_*'

In [None]:
%%bash
# Upload every local file matching ukb24068_* to the
# /outputs/gnomad_coverage/split_coverage/ destination folder.
dx upload --destination /outputs/gnomad_coverage/split_coverage/ ukb24068_*