# Define Input Regions

## Purpose: 

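* Split each consensus peak region into consecutive 200 base pair regions (the last piece of a peak may be shorter)
* Make sure region identifiers are unique
* Create an intervals file (one row per chromosome with its range of region indices)
* Output the regions and intervals files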

## Packages and Options

In [1]:
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
# (cells below rely on this to show .head() and .tail() together).
InteractiveShell.ast_node_interactivity = "all"

# Allow up to 500 rows to be rendered before pandas truncates a displayed frame.
pd.set_option('display.max_rows', 500)

## Read in Data

In [2]:
# Location of the regions table inside the 1_get_consensus_peaks output directory.
consensus_regions_path = "/home/yraghav/CHDI-NeuroLINCS-Multi-Epigenomics-Annotation/analysis/1_get_consensus_peaks/output/regions.tsv"

# Read the tab-separated table first, then cast both coordinate columns to int32.
regions_table = pd.read_csv(consensus_regions_path, sep="\t")
all_regions = regions_table.astype({'START': 'int32', 'END': 'int32'})

all_regions

Unnamed: 0,CHR,START,END
0,chr1,0,50536
1,chr1,56033,56529
2,chr1,60536,82500
3,chr1,92846,126395
4,chr1,136395,177709
...,...,...,...
190926,chrY,56872741,56873254
190927,chrY,56873557,56875175
190928,chrY,56877477,56878064
190929,chrY,56878522,56878896


In [3]:
# Pull the three columns out as separate Series for the windowing step below.
chromosomes, start_coordinates, end_coordinates = (
    all_regions[column_name] for column_name in ("CHR", "START", "END")
)

## Create 200 Base Pair Regions from the Input Regions

In [4]:
# Width (in base pairs) of each output region; the last piece of a peak may be shorter.
WINDOW_SIZE = 200

new_chr_list = []
new_start_list = []
new_end_list = []
identifier_list = []

# 1-based running number shared by all chromosomes, so every "R<n>" identifier
# is globally unique and follows the order in which regions are emitted.
counter = 1

# Iterate positionally over plain Python values (.tolist()) instead of looking each
# row up with series[i]: this does not depend on the Series having a default
# RangeIndex, avoids per-element pandas lookups, and keeps the arithmetic in
# Python ints rather than fixed-width int32 scalars.
for chrom, region_start, region_end in zip(
    chromosomes.tolist(),
    start_coordinates.tolist(),
    end_coordinates.tolist(),
):
    # range() steps through the window starts and is empty when
    # region_end <= region_start, so degenerate peaks emit nothing.
    for window_start in range(region_start, region_end, WINDOW_SIZE):
        new_chr_list.append(chrom)
        new_start_list.append(window_start)
        # Clamp the final window to the end of the peak.
        new_end_list.append(min(window_start + WINDOW_SIZE, region_end))
        identifier_list.append("R{}".format(counter))

        counter += 1
            

In [5]:
# Build the frame column-by-column. Passing the four lists as rows and transposing
# would first materialise a 4 x N object-dtype frame and leave every column as
# object; a dict keeps the integer columns numeric and avoids the transpose.
# Column labels stay 0..3 (chromosome, start, end, identifier) for the cells below.
input_regions = pd.DataFrame(
    {
        0: new_chr_list,
        1: new_start_list,
        2: new_end_list,
        3: identifier_list,
    }
)

In [6]:
# Preview the first and last rows; both are displayed because
# InteractiveShell.ast_node_interactivity is set to "all" above.
input_regions.head()
input_regions.tail()

Unnamed: 0,0,1,2,3
0,chr1,0,200,R1
1,chr1,200,400,R2
2,chr1,400,600,R3
3,chr1,600,800,R4
4,chr1,800,1000,R5


Unnamed: 0,0,1,2,3
11454527,chrY,56879230,56879430,R11454528
11454528,chrY,56879430,56879630,R11454529
11454529,chrY,56879630,56879830,R11454530
11454530,chrY,56879830,56880030,R11454531
11454531,chrY,56880030,56880082,R11454532


## Make Sure Region Identifiers Are Unique

In [7]:
# Enforce the invariant instead of relying on someone reading the tally below:
# stop the notebook here if any region identifier occurs more than once.
assert input_regions[3].is_unique, "Region identifiers are not unique"

# Tally of occurrence counts: a single row with key 1 means every identifier
# appears exactly once.
input_regions[3].value_counts().value_counts()

1    11454532
Name: 3, dtype: int64

## Create Interval File

In [8]:
# Distinct chromosome names in order of first appearance.
# NOTE: this rebinds `chromosomes`, which earlier held the CHR column Series,
# to a plain list; re-running the windowing cell after this one will not work
# without re-running the cell that extracts the columns first.
chromosomes = pd.unique(input_regions[0]).tolist()
chromosomes

['chr1',
 'chr10',
 'chr11',
 'chr12',
 'chr13',
 'chr14',
 'chr15',
 'chr16',
 'chr17',
 'chr18',
 'chr19',
 'chr2',
 'chr20',
 'chr21',
 'chr22',
 'chr3',
 'chr4',
 'chr5',
 'chr6',
 'chr7',
 'chr8',
 'chr9',
 'chrX',
 'chrY']

In [9]:
# One record per chromosome: [name, index before its first region, index of its last region].
# Records are collected in a list and the frame is built once; growing a frame
# with DataFrame.append inside a loop is quadratic and the method no longer
# exists in pandas 2.x.
interval_records = []

for chromosome in chromosomes:

    # Identifiers ("R<n>") of this chromosome's regions, in output order.
    # Filter once and reuse the result for both ends of the interval.
    chromosome_ids = input_regions.loc[input_regions[0] == chromosome, 3]

    interval_records.append(
        [
            chromosome,
            # First identifier number minus one gives the 0-based start of the interval.
            int(chromosome_ids.iloc[0].split("R")[1]) - 1,
            # Last identifier number is used as the end of the interval.
            int(chromosome_ids.iloc[-1].split("R")[1]),
        ]
    )

# Integer column labels 0, 1, 2 match what the following astype cell expects.
intervals = pd.DataFrame(interval_records, columns=[0, 1, 2])


In [10]:
# Target dtype per column: chromosome name as string, both bounds as int32.
interval_dtypes = {0: 'str', 1: 'int32', 2: 'int32'}
intervals = intervals.astype(interval_dtypes)

In [11]:
# Display the per-chromosome intervals (columns: chromosome, start index, end index).
intervals

Unnamed: 0,0,1,2
0,chr1,0,944532
1,chr10,944532,1434914
2,chr11,1434914,1927542
3,chr12,1927542,2418282
4,chr13,2418282,2853557
5,chr14,2853557,3264439
6,chr15,3264439,3667083
7,chr16,3667083,3970501
8,chr17,3970501,4279795
9,chr18,4279795,4577532


## Output Regions & Intervals to File

In [12]:
# Destination of the region table (chromosome, start, end, identifier).
regions_output_path = "/home/yraghav/CHDI-NeuroLINCS-Multi-Epigenomics-Annotation/analysis/2_define_input_regions/output/all_regions.bed"

# Written space-separated, with neither a header row nor the index column.
input_regions.to_csv(regions_output_path, sep=" ", header=False, index=False)

In [13]:
# Destination of the per-chromosome interval table.
intervals_output_path = "/home/yraghav/CHDI-NeuroLINCS-Multi-Epigenomics-Annotation/analysis/2_define_input_regions/output/all_regions.bed.inv"

# Same layout as the regions file: space-separated, no header, no index.
intervals.to_csv(intervals_output_path, sep=" ", header=False, index=False)