## This notebook focuses on correcting the repeated RSIDs for SVs

In [1]:
%%configure -f
{"driverMemory": "6000M"}

In [2]:
import hail as hl
# `sc` is not defined anywhere in this notebook; presumably it is the SparkContext
# injected by the notebook's remote Spark session — TODO confirm.
hl.init(sc)

Starting Spark application


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 3.1.2-amzn-0
SparkUI available at
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.80-4ccfae1ff293
LOGGING: writing to 

In [7]:
# Location of the input Hail MatrixTable; it is passed to hl.read_matrix_table() below.
release14_hc_meta_mt_uri = "SG10K-SV-Release-1.4-HighConfidenceSV-WithMetadata.mt"

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Read the matrix table and report its dimensions.
mt = hl.read_matrix_table(release14_hc_meta_mt_uri)
# MatrixTable.count() returns (n_rows, n_cols) in one call. The number of entries
# is derived as rows x columns instead of materialising the entries() table
# (hundreds of millions of records) just to count it.
# NOTE(review): the product equals entries().count() only while no entries have
# been filtered out, which should hold for a table that was just read — confirm.
n_variants, n_samples = mt.count()
print("Samples: %d; Variants: %d; Entries: %d" % (n_samples, n_variants, n_variants * n_samples))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Samples: 5487; Variants: 73035; Entries: 400743045
2024-05-06 05:38:21 Hail: WARN: entries(): Resulting entries table is sorted by '(row_key, col_key)'.
    To preserve row-major matrix table order, first unkey columns with 'key_cols_by()'

In [9]:
def rename_duplicate_rows(dataset, field='rsid', name=None, override=False):
    """Make the values of a row field unique by mangling duplicated values.

    Inspired by https://hail.is/docs/0.2/methods/misc.html#hail.methods.rename_duplicates,
    but applied to a row field instead of the column key.

    Parameters
    ----------
    dataset
        Dataset whose rows are annotated; it must provide ``aggregate_rows``
        and ``annotate_rows`` and be indexable by ``field``.
    field : str
        Name of the row field whose values should be made unique.
    name : str, optional
        Name of the row field receiving the de-duplicated values. Defaults to
        ``'uniq_' + field`` when ``override`` is false.
    override : bool
        If true, write the de-duplicated values back to ``field`` itself.
        Cannot be combined with ``name``.

    Returns
    -------
    The dataset with the de-duplicated values stored in the target row field.

    Raises
    ------
    ValueError
        If both ``name`` and ``override`` are given.
    """

    if override and name is not None:
        hl.utils.java.info('rename_duplicate_rows cannnot specfiy both a new field name and overrride the exisiting one')
        raise ValueError('rename_duplicate_rows cannnot specfiy both a new field name and overrride the exisiting one')

    if name is None:
        # Overriding means the target is the source field itself; previously `name`
        # stayed None here and annotate_rows(**{None: ...}) failed with a TypeError.
        name = field if override else 'uniq_' + field

    # Pull every value of the field to the driver and compute unique replacements there.
    ids = dataset.aggregate_rows(hl.agg.collect(dataset[field]))
    mapping, new_ids = hl.utils.deduplicate(ids)
    if mapping:
        hl.utils.java.info(
            f'Renamed {len(mapping)} duplicate `{field}` {hl.utils.misc.plural("field", len(mapping))} as `{name}`. Mangled IDs as follows:'
            + ''.join(f'\n  "{pre}" => "{post}"' for pre, post in mapping)
        )
    else:
        hl.utils.java.info('No duplicate field found.')

    # Each row picks its new value by position: hl.scan.count() is the number of
    # rows seen before the current one, i.e. the row's index into `new_ids`.
    # NOTE(review): this assumes the collected `ids` are in the same order as the
    # row scan; verify against the hl.agg.collect ordering guarantees.
    return dataset.annotate_rows(**{name: hl.literal(new_ids)[hl.int(hl.scan.count())]})

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# Keep a handle on the table as it was before any field is renamed.
ori_mt = mt

# Old field name -> new field name.
field_renames = {
    'rsid': '_old_sv_id',
    'info': 'callers_info',
}

# Rename the fields, then store de-duplicated copies of the old IDs in `sv_id`.
mt = rename_duplicate_rows(
    mt.rename(field_renames),
    field='_old_sv_id',
    name='sv_id',
)
mt.describe()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'metadata': struct {
        Multiplex_Pool_ID: str, 
        Supplier_ID: str, 
        GIS_Internal_Sample_ID: str, 
        Site_Supplying_Sample: str, 
        Year_Of_Birth: int32, 
        Supplied_Gender: str, 
        Self_Reported_Ethnicity: str, 
        Extraction_Kit: str, 
        Date_Of_DNA_Extraction: str, 
        Plate_Position: str, 
        Plate_Name: str, 
        Version_Of_Consent_Form_Signed: str, 
        Sequencing_Depth: str, 
        NPM_Research_ID_Created_By_Username: str, 
        NPM_Research_ID_Creation_Date: str, 
        Comments_Entered_When_NPM_Research_ID_Created: str, 
        Description_Entered_When_NPM_Research_ID_Created: str, 
        ELM_Project_ID: str, 
        ELM_Project_Title: str, 
        ELM_Project_PI: str, 
        Species_Of_Sample_Sequenced: str, 
        Tehcnique_For_Sequencing: str, 
      

In [12]:
# Sanity check: display the table dimensions as (rows, columns) after the renaming step.
mt.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(73035, 5487)

In [15]:
# Write the resulting matrix table (now carrying the `sv_id` row field) to a new path.
# overwrite=True is passed, so an existing table at this path is replaced.
release14_hc_mt_uri = "SG10K-SV-Release-1.4-HighConfidenceSV-WithMetadata-correctrsid.mt"
mt.write(release14_hc_mt_uri, overwrite=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2024-05-06 06:56:06 Hail: INFO: wrote matrix table with 73035 rows and 5487 columns in 91 partitions to SG10K-SV-Release-1.4-HighConfidenceSV-WithMetadata-correctrsid.mt
    Total size: 3.42 GiB
    * Rows/entries: 3.42 GiB
    * Columns: 785.10 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  1391 rows (77.84 MiB)