In [1]:
# Daniel Marten 05-03-23
# Notebook to update column names from GA4GH VRS-Python Branch 0.8.1 to 0.8.2 convention
# No changes to contents, only: VRS_Allele -> VRS_Allele_Id VRS_Alt to VRS_State
# See: https://github.com/ga4gh/vrs-python/releases/tag/0.8.2 

import hail as hl

In [2]:
# Read in Hail Table to have its GA4GH VRS Schema updated from VRS_Python Branch 0.8.1 to 0.8.2
import_path = 'gs://gnomad-vrs-io-finals/ht-outputs/0421_1miltest_v3.1.2-Full-ht-release-output.ht'
ht = hl.read_table(import_path)
ht = ht.sample(0.01)

Initializing Hail with default parameters...
Running on Apache Spark version 3.3.0
SparkUI available at http://dm2-m.c.broad-mpg-gnomad.internal:37963
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.113-cf32652c5077
LOGGING: writing to /home/hail/hail-20230504-1514-0.2.113-cf32652c5077.log


In [3]:
# Code which actually changes naming schema from VRS-Python Branch 0.8.1 to 0.8.2

ht = ht.annotate(
        info = ht.info.annotate(
                vrs = ht.info.vrs.rename({'VRS_Allele':'VRS_Allele_IDs',
                                          'VRS_Start':'VRS_Starts',
                                          'VRS_End':'VRS_Ends',
                                          'VRS_Alt':'VRS_States'})
        )
    )

In [4]:
ht_reformat = ht.annotate(
        info = ht.info.annotate(
                vrs = ht.info.vrs.annotate(
                    VRS_Allele_IDs = ht.info.vrs.VRS_Allele_IDs.split(','),
                    VRS_Starts = [hl.int(ht.info.vrs.VRS_Starts.split(',')[0]),hl.int(ht.info.vrs.VRS_Starts.split(',')[1])],
                    VRS_Ends = [hl.int(ht.info.vrs.VRS_Ends.split(',')[0]),hl.int(ht.info.vrs.VRS_Ends.split(',')[1])],
                    VRS_States = ht.info.vrs.VRS_States.split(',')
                )
        )
    )

ht_reformat.info.vrs.show()

Unnamed: 0_level_0,Unnamed: 1_level_0,<expr>,<expr>,<expr>,<expr>
locus,alleles,VRS_Allele_IDs,VRS_Starts,VRS_Ends,VRS_States
locus<GRCh38>,array<str>,array<str>,array<int32>,array<int32>,array<str>
chr1:181359,"[""G"",""A""]","[""ga4gh:VA.eEh6g89W5T6BYEBfqr5iWlCzstrr4prv"",""ga4gh:VA.B39ZMi5XUoXDpQtD99ybfWd1yHVvWbS1""]","[181358,181358]","[181359,181359]","[""G"",""A""]"
chr1:188025,"[""CT"",""C""]","[""ga4gh:VA.HKkRXLry1zagNqsl87kCnE8GtpZAwlEk"",""ga4gh:VA.1yuc7shZw62uIMSnSyEhs2o8024o3Lgk""]","[188024,188025]","[188026,188026]","[""CT"",""""]"
chr1:656991,"[""C"",""A""]","[""ga4gh:VA.5e-OUKRjOOQDsY-s4VM4REi7QlWYxXH7"",""ga4gh:VA.FP-E9i9g1JWQYY4s_dOrd_SYzzUvdjTu""]","[656990,656990]","[656991,656991]","[""C"",""A""]"
chr1:784850,"[""T"",""C""]","[""ga4gh:VA.xvz-aCVre6vRzUWwqgprqk_IaX8dod3f"",""ga4gh:VA.6_yOdU2NGy0m059esikx9YmWZtn8kgWb""]","[784849,784849]","[784850,784850]","[""T"",""C""]"
chr1:1195500,"[""C"",""CTTTTTTTTTTTTTTTTTTTTTTTTTTT""]","[""ga4gh:VA.zuu1f6UjSdHuOfgpUnsDjr8Hp1IZR3ZR"",""ga4gh:VA.GDE0YwThYWkYr9I7TDCfAI1EYzH1EjAN""]","[1195499,1195500]","[1195500,1195518]","[""C"",""TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT""]"
chr1:1437566,"[""T"",""A""]","[""ga4gh:VA.MGa22tLs4xWOumLLFextoncdocaNPqd9"",""ga4gh:VA.pLE3-w_dyB3B9fYJNVM5ot7kYZ-gCu6p""]","[1437565,1437565]","[1437566,1437566]","[""T"",""A""]"
chr1:1565352,"[""G"",""C""]","[""ga4gh:VA.DMBcDb_RePs2pqpQjkU_ceFOOzqMq2b4"",""ga4gh:VA.FuA1UgSggrG3cjNbXamC_YZqK0tmK1ek""]","[1565351,1565351]","[1565352,1565352]","[""G"",""C""]"
chr1:1609797,"[""C"",""A""]","[""ga4gh:VA.OaVZm5YGXcfStVXavWNfuRTrBd1DDkOE"",""ga4gh:VA.hdvh-eOrjr2zE22vE2wwfisCFyDqIJLs""]","[1609796,1609796]","[1609797,1609797]","[""C"",""A""]"
chr1:1956929,"[""C"",""T""]","[""ga4gh:VA.oFBHgCTUxlabhFGJoIJUUb2Wc06dm8Wq"",""ga4gh:VA.nc_gOGG1k3w-smD9heq-rGsNZiygWLmg""]","[1956928,1956928]","[1956929,1956929]","[""C"",""T""]"
chr1:2050192,"[""G"",""T""]","[""ga4gh:VA.0HH5nj2aKhq42gXIGck_qsel5txfVeGG"",""ga4gh:VA.WXXb8CGqm53oi-V5iWrQosSI6pCkXAvu""]","[2050191,2050191]","[2050192,2050192]","[""G"",""T""]"


In [None]:
# Outputting updated table with path appended to include and state the update 

ht_output_path = ''.join(import_path.split('.')[:-1]) +'_ga4gh0_8_2_schema.ht'
print('Outputting to: ',ht_output_path)
ht_reformat = ht_reformat.checkpoint(ht_output_path,overwrite=True)

Outputting to:  gs://gnomad-vrs-io-finals/ht-outputs/0421_1miltest_v312-Full-ht-release-output_ga4gh0_8_2_schema.ht
