# Redo partitions

In this notebook, we rewrite any matrix table that is too sparse so that it has fewer partitions.

# Setup 

In [None]:
from datetime import datetime
import hail as hl
import os
import time

## Define constants

In [None]:
# Path of the matrix table to be rewritten.
MT = "gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210621/merged-filtered-chr2.mt"

# Partition count requested when the new partition intervals are calculated.
SMALLER_NUM_MT_PARTITIONS = 1_000

In [None]:
RESULT_BUCKET = os.getenv("WORKSPACE_BUCKET")
DATESTAMP = time.strftime('%Y%m%d')
TIMESTAMP = time.strftime('%Y%m%d_%H%M%S')
WORK_DIR = !pwd

# Outputs
FEWER_PARTITIONS_MT = f'{os.getenv("WORKSPACE_BUCKET")}/data/merged/{DATESTAMP}/{os.path.basename(MT)}'
HAIL_LOG = f'{WORK_DIR[0]}/hail-redo-partitions-{TIMESTAMP}.log'
HAIL_LOG_DIR_FOR_PROVENANCE = f'{os.getenv("WORKSPACE_BUCKET")}/hail-logs/{DATESTAMP}/'

## Check access

In [None]:
# List the input matrix table to confirm that this runtime can access it.
!gsutil ls {MT}

In [None]:
# In general, this should not exist: it is the destination of the rewritten matrix table,
# and the write step later in this notebook is called without an overwrite option.
print(FEWER_PARTITIONS_MT)
!gsutil ls {FEWER_PARTITIONS_MT}

## Start Hail 

In [None]:
# Extra Spark settings handed to hl.init(). Each value below overrides the Spark default noted beside it.
# Setting descriptions: https://spark.apache.org/docs/2.4.7/configuration.html
# Background on fetch failures:
# https://towardsdatascience.com/fetch-failed-exception-in-apache-spark-decrypting-the-most-common-causes-b8dff21075c

EXTRA_SPARK_CONFIG = {
    # Speculative execution: tasks that run slowly within a stage are re-launched.
    # Spark default: false.
    "spark.speculation": "true",

    # Fraction of a stage's tasks that must have completed before speculation is enabled for that stage.
    # Spark default: 0.75.
    "spark.speculation.quantile": "0.95",

    # Default timeout for all network interactions. It stands in for
    # spark.core.connection.ack.wait.timeout, spark.storage.blockManagerSlaveTimeoutMs,
    # spark.shuffle.io.connectionTimeout, spark.rpc.askTimeout and spark.rpc.lookupTimeout
    # whenever those are not configured.
    # Spark default: 120s.
    "spark.network.timeout": "180s",

    # (Netty only) Automatic retries for fetches that fail with IO-related exceptions. A non-zero value
    # helps stabilize large shuffles through long GC pauses or transient network connectivity issues.
    # Spark default: 3.
    "spark.shuffle.io.maxRetries": "10",

    # (Netty only) Wait between fetch retries. The maximum delay caused by retrying is
    # maxRetries * retryWait: 15 seconds with the Spark defaults, 10 * 15s = 150s with the values here.
    # Spark default: 5s.
    "spark.shuffle.io.retryWait": "15s",

    # Failures of one particular task tolerated before the job is given up. Failures spread across
    # different tasks do not count together; a single task has to fail this many attempts.
    # Must be >= 1; the number of allowed retries is this value - 1.
    # Spark default: 4.
    "spark.task.maxFailures": "10",

    # Consecutive stage attempts allowed before a stage is aborted.
    # Spark default: 4.
    "spark.stage.maxConsecutiveAttempts": "10",
}

In [None]:
# Start Hail with the extra Spark settings, GRCh38 as the default reference genome,
# and the Hail log written to the run-specific HAIL_LOG path.
# NOTE(review): min_block_size is presumably expressed in MB — confirm against the Hail docs.
hl.init(spark_conf=EXTRA_SPARK_CONFIG,
        min_block_size=50,
        default_reference='GRCh38',
        log=HAIL_LOG)

Check the configuration.

In [None]:
# Show every Spark setting of the running context, sorted by key, so the overrides can be verified.
sc = hl.spark_context()
# Use the public getConf() accessor rather than reaching into the private sc._conf attribute.
config = sorted(sc.getConf().getAll())
config

# Read AoU matrix table

In [None]:
# Load the input matrix table; it is inspected below and used to calculate the new partition intervals.
mt = hl.read_matrix_table(MT)

In [None]:
# Partition count before the rewrite, for comparison with SMALLER_NUM_MT_PARTITIONS.
mt.n_partitions()

In [None]:
# Print a description of the input matrix table.
mt.describe()

# Re-partition the matrix table

The approach below is taken from https://discuss.hail.is/t/improving-pipeline-performance/1344

In [None]:
# Note the start time so that the elapsed time of the rewrite can be reported afterwards.
start = datetime.now()
print(start)

In [None]:
# Technique from https://discuss.hail.is/t/improving-pipeline-performance/1344
# Step 1: ask the already-loaded table for partition intervals matching the smaller target count.
intervals = mt._calculate_new_partitions(SMALLER_NUM_MT_PARTITIONS)

# Step 2: re-read the input with those intervals, then write the result to the output path.
repartitioned_mt = hl.read_matrix_table(MT, _intervals=intervals)
repartitioned_mt.write(FEWER_PARTITIONS_MT)

In [None]:
# Note the finish time and report it together with the time elapsed since `start`.
end = datetime.now()
elapsed = end - start
print(end, elapsed, sep='\n')

# Provenance

In [None]:
# Copy the Hail log from the local working directory to the dated hail-logs folder
# in the workspace bucket so that we can retain it.
!gsutil cp {HAIL_LOG} {HAIL_LOG_DIR_FOR_PROVENANCE}

In [None]:
# Record the time at which this provenance section was run.
print(datetime.now())

In [None]:
!pip3 freeze