# Create Shards

## Purpose: 

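* Read the reference FASTA index (`.fai`) to get the length of each chromosome
* Split each chromosome into shards of a chosen size, written as `chr:start-end` intervals
* Write the shard list to a text file, one shard per line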

## Packages & Options

In [1]:
import pandas as pd

# Make every top-level expression in a cell produce output, not only the last
# one. Cells in this notebook that end with two bare expressions (a .head()
# followed by a .tail()) depend on this setting to display both results.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Lift pandas' display limits (None = no limit) so previews are never truncated.
for display_option in (
    'display.max_columns',
    'display.max_rows',
    'display.max_colwidth',
):
    pd.set_option(display_option, None)

## Literals and Read Data

In [2]:
# Number of positions covered by one full shard.
# IMPORTANT: pick this value by weighing
#   1. the total size of the input data, and
#   2. the maximum memory a node is allowed on the chosen partition.
shard_size = 1_000_000

In [3]:
# Tab-separated index file with no header row, so pandas labels the columns
# with the integers 0..4. Later cells read column 0 as the sequence name and
# column 1 as its length.
fasta_index_path = "/home/yraghav/Joint-Genotyping-Fall-2021/inputs/indexed_fasta/GRCh38_full_analysis_set_plus_decoy_hla.chr1-chrY.fa.fai"

indexed_fastq = pd.read_csv(fasta_index_path, sep="\t", header=None)

In [4]:
# Sanity check: show the first and last five rows of the index.
indexed_fastq.head(n=5)
indexed_fastq.tail(n=5)

Unnamed: 0,0,1,2,3,4
0,chr1,248956422,112,70,71
1,chr2,242193529,252513167,70,71
2,chr3,198295559,498166716,70,71
3,chr4,190214555,699295181,70,71
4,chr5,181538259,892227221,70,71


Unnamed: 0,0,1,2,3,4
19,chr20,64444167,2751788762,70,71
20,chr21,46709983,2817153685,70,71
21,chr22,50818468,2864531079,70,71
22,chrX,156040895,2916075638,70,71
23,chrY,57227415,3074345836,70,71


## Create Shards with Chosen Shard Size

In [5]:
def build_shards(contigs, shard_size):
    """Split each contig into consecutive, non-overlapping shards.

    Coordinates are 1-based and inclusive. Every shard spans `shard_size`
    positions except the last shard of a contig, which stops at the contig's
    final position. A contig shorter than `shard_size` yields a single shard
    covering the whole contig; a contig of length 0 yields no shard.

    Parameters
    ----------
    contigs : iterable of (name, length) pairs
        Contig name and contig length; the length is converted with int().
    shard_size : int
        Number of positions in a full shard. Must be at least 1.

    Returns
    -------
    list of str
        Shards formatted as "name:start-end", in input order.

    Raises
    ------
    ValueError
        If `shard_size` is smaller than 1.
    """
    if shard_size < 1:
        raise ValueError("shard_size must be a positive integer")

    shards = []
    for contig_name, contig_length in contigs:
        contig_length = int(contig_length)

        # Iterating up to and including the final position guarantees the last
        # base is covered even when the length is k * shard_size + 1.
        for start_position in range(1, contig_length + 1, shard_size):
            # Clip the last shard at the contig boundary.
            end_position = min(start_position + shard_size - 1, contig_length)
            shards.append(
                "{}:{}-{}".format(contig_name, start_position, end_position)
            )

        # A short contig simply produces one shard above; the loop then moves
        # on to the next contig instead of abandoning the remaining ones.

    return shards


# list of final output shards
# column 0 of the index holds the contig name, column 1 its length
shard_list = build_shards(
    zip(indexed_fastq[0], indexed_fastq[1]),
    shard_size,
)
            
        


In [6]:
# Inspect both ends of the shard list: the first 20 and the last 20 entries.
shard_series = pd.Series(shard_list)
shard_series.head(n=20)
shard_series.tail(n=20)

0             chr1:1-1000000
1       chr1:1000001-2000000
2       chr1:2000001-3000000
3       chr1:3000001-4000000
4       chr1:4000001-5000000
5       chr1:5000001-6000000
6       chr1:6000001-7000000
7       chr1:7000001-8000000
8       chr1:8000001-9000000
9      chr1:9000001-10000000
10    chr1:10000001-11000000
11    chr1:11000001-12000000
12    chr1:12000001-13000000
13    chr1:13000001-14000000
14    chr1:14000001-15000000
15    chr1:15000001-16000000
16    chr1:16000001-17000000
17    chr1:17000001-18000000
18    chr1:18000001-19000000
19    chr1:19000001-20000000
dtype: object

3082    chrY:38000001-39000000
3083    chrY:39000001-40000000
3084    chrY:40000001-41000000
3085    chrY:41000001-42000000
3086    chrY:42000001-43000000
3087    chrY:43000001-44000000
3088    chrY:44000001-45000000
3089    chrY:45000001-46000000
3090    chrY:46000001-47000000
3091    chrY:47000001-48000000
3092    chrY:48000001-49000000
3093    chrY:49000001-50000000
3094    chrY:50000001-51000000
3095    chrY:51000001-52000000
3096    chrY:52000001-53000000
3097    chrY:53000001-54000000
3098    chrY:54000001-55000000
3099    chrY:55000001-56000000
3100    chrY:56000001-57000000
3101    chrY:57000001-57227415
dtype: object

## Write to File

In [7]:
# Write one shard per line; the file is created or overwritten.
shards_output_path = "/home/yraghav/Joint-Genotyping-Fall-2021/sentieon_joint_genotyping/shards/output/shards.txt"

with open(shards_output_path, 'w') as out_file:
    out_file.writelines(shard + "\n" for shard in shard_list)