# Create Shards

## Purpose: 

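* Read the reference FASTA index (`.fai`) to get each chromosome's length
* Split each chromosome into shards of at most `shard_size` bases
* Write the shard list to a text file, one shard per line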

## Packages & Options

In [1]:
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one (the shard preview cell further down relies on this to show two tables).
InteractiveShell.ast_node_interactivity = "all"

# Remove pandas' display truncation limits for columns, rows and cell width.
for display_option in (
    "display.max_columns",
    "display.max_rows",
    "display.max_colwidth",
):
    pd.set_option(display_option, None)

## Literals and Read Data

In [2]:
# Maximum number of bases placed in a single shard.
# IMPORTANT: choose this value by looking at
#   1. the total size of the input data, and
#   2. the maximum memory allowed for a node on the chosen partition.
shard_size = 100_000_000

In [3]:
# Load the reference's FASTA index (.fai). The file is tab-separated and has no
# header row, so the columns are labelled 0..4; the sharding loop below uses
# column 0 as the sequence name and column 1 as its length.
# NOTE: despite the variable name, this is a FASTA index, not FASTQ data.
indexed_fastq = pd.read_table(
    "/home/yraghav/Joint-Genotyping-Fall-2021/inputs/indexed_fasta/GRCh38_full_analysis_set_plus_decoy_hla.chr1-chrY.fa.fai",
    header=None,
)

In [4]:
# Sanity check: show up to the first 25 index rows.
indexed_fastq.head(25)

Unnamed: 0,0,1,2,3,4
0,chr1,248956422,112,70,71
1,chr2,242193529,252513167,70,71
2,chr3,198295559,498166716,70,71
3,chr4,190214555,699295181,70,71
4,chr5,181538259,892227221,70,71
5,chr6,170805979,1076358996,70,71
6,chr7,159345973,1249605173,70,71
7,chr8,145138636,1411227630,70,71
8,chr9,138394717,1558439788,70,71
9,chr10,133797422,1698811686,70,71


In [5]:
# Number of sequences (rows) listed in the index.
len(indexed_fastq.index)

24

## Create Shards with Chosen Shard Size

In [6]:
# Build the list of shards, each formatted as "<name>:<start>-<end>" with
# 1-based, inclusive coordinates.
#
# Every sequence in the index is cut into consecutive windows of `shard_size`
# bases; the last window is clamped to the end of the sequence, so a sequence
# shorter than `shard_size` becomes a single whole-sequence shard.
#
# Behaviour changes compared with the previous hand-rolled while-loop:
#   * a sequence whose length is k * shard_size + 1 no longer loses its final
#     base (the old `start_position < chromosome_length` test stopped one
#     position too early),
#   * a non-positive `shard_size` raises instead of looping forever,
#   * a zero-length sequence yields no shard instead of the invalid "1-0".

# a window must contain at least one base
if shard_size < 1:
    raise ValueError(
        "shard_size must be a positive integer, got {}".format(shard_size)
    )

# list of final output shards
shard_list = []

# per each row of the index: column 0 is the name, column 1 the length
for index, row in indexed_fastq.iterrows():

    # save chromosome length as int
    chromosome_length = int(row[1])

    # window starts are 1, 1 + shard_size, 1 + 2 * shard_size, ...; the stop
    # value is length + 1 so a window starting exactly on the last base is kept
    for start_position in range(1, chromosome_length + 1, shard_size):

        # if a full window would cross the chromosome boundary, stop at the
        # last base instead
        end_position = min(start_position + shard_size - 1, chromosome_length)

        shard_list.append(
            "{}:{}-{}".format(
                row[0],
                start_position,
                end_position
            )
        )
            
        


In [7]:
# Show the first and the last 20 shards as a quick visual check.
shard_preview = pd.Series(shard_list)
shard_preview.head(20)
shard_preview.tail(20)

0             chr1:1-100000000
1     chr1:100000001-200000000
2     chr1:200000001-248956422
3             chr2:1-100000000
4     chr2:100000001-200000000
5     chr2:200000001-242193529
6             chr3:1-100000000
7     chr3:100000001-198295559
8             chr4:1-100000000
9     chr4:100000001-190214555
10            chr5:1-100000000
11    chr5:100000001-181538259
12            chr6:1-100000000
13    chr6:100000001-170805979
14            chr7:1-100000000
15    chr7:100000001-159345973
16            chr8:1-100000000
17    chr8:100000001-145138636
18            chr9:1-100000000
19    chr9:100000001-138394717
dtype: object

22            chr11:1-100000000
23    chr11:100000001-135086622
24            chr12:1-100000000
25    chr12:100000001-133275309
26            chr13:1-100000000
27    chr13:100000001-114364328
28            chr14:1-100000000
29    chr14:100000001-107043718
30            chr15:1-100000000
31    chr15:100000001-101991189
32             chr16:1-90338345
33             chr17:1-83257441
34             chr18:1-80373285
35             chr19:1-58617616
36             chr20:1-64444167
37             chr21:1-46709983
38             chr22:1-50818468
39             chrX:1-100000000
40     chrX:100000001-156040895
41              chrY:1-57227415
dtype: object

## Write to File

In [8]:
# Write the shards to a text file, one shard per line (each line, including
# the last, ends with a newline).
with open(
    "/home/yraghav/Joint-Genotyping-Fall-2021/sentieon_joint_genotyping/shards/output/shards.txt",
    'w'
) as out_file:
    out_file.writelines(shard + "\n" for shard in shard_list)