## This notebook focuses on adding the END of each SV and extracting high-confidence SVs using the GT2 recommended filters

In [1]:
%%configure -f
{"driverMemory": "6000M"}

In [2]:
import hail as hl
# Initialise Hail on the existing Spark context `sc`. `sc` is not defined in
# this notebook; presumably it is injected by the Spark kernel — TODO confirm.
hl.init(sc)

Starting Spark application


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 3.1.2-amzn-0
SparkUI available at
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.80-4ccfae1ff293
LOGGING: writing to 

In [3]:
# Input resources used by this notebook.
# Location of the SG10K-SV release 1.4 matrix table that is read below.
release14_mt_uri = "SG10K-SV-Release-1.4.mt"


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Load the release matrix table and report its dimensions.
mt = hl.read_matrix_table(release14_mt_uri)

n_samples = mt.count_cols()
n_variants = mt.count_rows()
# entries() flattens the matrix table into a table of its entries, which is
# then counted.
n_entries = mt.entries().count()
print("Samples: %d; Variants: %d; Entries: %d" % (n_samples, n_variants, n_entries))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Samples: 5487; Variants: 93284; Entries: 511849308
2024-04-17 02:39:56 Hail: WARN: entries(): Resulting entries table is sorted by '(row_key, col_key)'.
    To preserve row-major matrix table order, first unkey columns with 'key_cols_by()'

In [5]:
# Derive a single row-level info.END from the per-caller END fields.
# Each caller's END (manta first, then survindel2) is used only when it is
# strictly positive; hl.coalesce then picks the first non-missing candidate,
# so info.END stays missing when neither caller supplies a positive END.
caller_ends = [mt.info.manta.END, mt.info.survindel2.END]
mt = mt.annotate_rows(
    info=mt.info.annotate(
        END=hl.coalesce(*[hl.or_missing(end > 0, end) for end in caller_ends])
    )
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Display the first 10 rows of the rows table.
mt.rows().show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------+----------------------------------------+
| locus         | alleles                                |
+---------------+----------------------------------------+
| locus<GRCh38> | array<str>                             |
+---------------+----------------------------------------+
| chr1:10546    | ["N","<DEL:SVSIZE=170464:AGGREGATED>"] |
| chr1:54720    | ["C","<DUP>"]                          |
| chr1:66224    | ["N","<DEL:SVSIZE=51:AGGREGATED>"]     |
| chr1:66534    | ["T","<DUP>"]                          |
| chr1:83963    | ["A","<DUP>"]                          |
| chr1:99061    | ["C","<DUP>"]                          |
| chr1:109468   | ["N","<DEL:SVSIZE=141:AGGREGATED>"]    |
| chr1:187139   | ["C","<INS:ME:LINE1>"]                 |
| chr1:600342   | ["T","<DUP>"]                          |
| chr1:600743   | ["T","<DUP>"]                          |
+---------------+----------------------------------------+

+----------------------------------+-----------+-------

In [7]:
# Keep the variants that pass the GT2 filter. A row is retained when
# info.manta.PASS_GT2_filter is either missing or equal to "PASS".
gt2_flag = mt.info.manta.PASS_GT2_filter
passes_gt2 = hl.is_missing(gt2_flag) | (gt2_flag == "PASS")
mt2 = mt.filter_rows(passes_gt2)

n_samples = mt2.count_cols()
n_variants = mt2.count_rows()
n_entries = mt2.entries().count()
print("Samples: %d; Variants: %d; Entries: %d" % (n_samples, n_variants, n_entries))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Samples: 5487; Variants: 73036; Entries: 400748532

In [11]:
# Check for a duplicated duplication: list every row carrying this rsid.
mt2.filter_rows(mt2.rsid == "SG10K_SV_DUP_chr18_76585529_284").rows().show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+---------------+-----------------------------------+
| locus          | alleles       | rsid                              |
+----------------+---------------+-----------------------------------+
| locus<GRCh38>  | array<str>    | str                               |
+----------------+---------------+-----------------------------------+
| chr18:76585529 | ["C","<DUP>"] | "SG10K_SV_DUP_chr18_76585529_284" |
| chr18:76585529 | ["C","<DUP>"] | "SG10K_SV_DUP_chr18_76585529_284" |
+----------------+---------------+-----------------------------------+

+-----------+----------+------------------+------------------+
|      qual | filters  | info.manta.ABHet | info.manta.ABHom |
+-----------+----------+------------------+------------------+
|   float64 | set<str> |          float64 |          float64 |
+-----------+----------+------------------+------------------+
| -1.00e+01 | {}       |               NA |               NA |
| -1.00e+01 | {}       |               NA |          

In [12]:
# Remove the duplicated duplication: among the rows sharing this rsid, drop the
# one whose survindel2 N_SAMPLES equals 7.
duplicated_rsid = "SG10K_SV_DUP_chr18_76585529_284"
is_unwanted_copy = (mt2.rsid == duplicated_rsid) & (mt2.info.survindel2.N_SAMPLES == 7)
# filter_rows removes rows whose predicate evaluates to missing regardless of
# `keep`, so a missing rsid or N_SAMPLES could silently drop unrelated variants.
# Coalescing to False ensures only rows that positively match are removed.
mt2 = mt2.filter_rows(hl.coalesce(is_unwanted_copy, False), keep=False)
print("Samples: %d; Variants: %d; Entries: %d" % (mt2.count_cols(), mt2.count_rows(), mt2.entries().count()))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Samples: 5487; Variants: 73035; Entries: 400743045

In [13]:
# Tally the number of variants per info.SVTYPE in the filtered matrix table (mt2).
svtype_counter = hl.agg.counter(mt2.info.SVTYPE)
svtype_stats = mt2.aggregate_rows(hl.struct(svtype_stat=svtype_counter))
print(svtype_stats.svtype_stat)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

frozendict({'DEL': 11560, 'DUP': 32464, 'INS': 29011})

In [14]:
# For comparison, tally the variants per info.SVTYPE in `mt`, i.e. before the
# GT2 filters were applied.
svtype_counter = hl.agg.counter(mt.info.SVTYPE)
svtype_stats = mt.aggregate_rows(hl.struct(svtype_stat=svtype_counter))
print(svtype_stats.svtype_stat)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

frozendict({'DEL': 24561, 'DUP': 32465, 'INS': 36258})

In [15]:
# Persist the filtered matrix table (mt2). overwrite=True replaces any existing
# output at the destination path.
release14_hc_mt_uri = "SG10K-SV-Release-1.4-HighConfidenceSV.mt"
mt2.write(release14_hc_mt_uri, overwrite=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2024-04-17 02:44:53 Hail: INFO: wrote matrix table with 73035 rows and 5487 columns in 91 partitions to SG10K-SV-Release-1.4-HighConfidenceSV.mt