In [1]:
import sys, os, re
import pyranges as pr
import pandas as pd
import logging
import argparse
import math

In [4]:
genome_fa = "AmexG_v6.0-DD.fa"
out_prefix = "ladeda"
max_chunk_size = 500000000
gene_annot_gtf = "AmexT_v47-AmexG_v6.0-DD.gtf"
gene_spans_file = "AmexT_v47-AmexG_v6.0-DD.gtf.gene_spans"
N_regions_file = "AmexG_v6.0-DD.fa.N_ranges"
   

In [35]:
genome_fai_file = genome_fa + ".fai"
genome_contig_info = pd.read_csv(genome_fai_file, sep="\t", names=["Chromosome", "Length", "Offset", "Linebases", "Linewidth"])
    
chromosomes_require_chunking = genome_contig_info[ genome_contig_info['Length'] > max_chunk_size]['Chromosome'].values.tolist()

gene_spans_info = pd.read_csv(gene_spans_file, sep="\t", names=["gene_id", "Chromosome", "Start", "End", "Strand", "gene_sym", "gene_type"]) 

N_regions_info = pd.read_csv(N_regions_file, sep="\t", names=["Chromosome", "Start", "End"])
N_regions_info['Length'] = N_regions_info['End'] - N_regions_info['Start'] + 1
N_regions_info = N_regions_info[N_regions_info['Length'] > 10 ]
    
pr_N_regions = pr.PyRanges(N_regions_info)

# merge gene spans
pr_gene_spans = pr.PyRanges(gene_spans_info)
pr_gene_spans = pr_gene_spans.merge()
df_gene_spans_merged = pr_gene_spans.as_df()



In [114]:
# want intergenic regions:

df_intergenic = df_gene_spans_merged.copy()

df_intergenic['intergenic_lend'] = df_intergenic['End'] + 1
df_intergenic['intergenic_rend'] = df_intergenic.groupby('Chromosome').Start.shift(-1) -1






In [115]:
df_intergenic

Unnamed: 0,Chromosome,Start,End,Strand,intergenic_lend,intergenic_rend
0,C0000568,109816,110194,+,110195,
1,C0000570,39532,39934,+,39935,128077.0
2,C0000570,128078,128467,-,128468,
3,C0000571,72845,78966,+,78967,
4,C0000572,73068,80532,+,80533,93873.0
...,...,...,...,...,...,...
93878,chr14q,432916569,432921442,-,432921443,435821472.0
93879,chr14q,435821473,435824892,-,435824893,435827708.0
93880,chr14q,435827709,435828046,-,435828047,435889952.0
93881,chr14q,435889953,435915694,-,435915695,436410137.0


In [116]:
df_intergenic = df_intergenic.drop(['Start','End', 'Strand'], axis=1)
df_intergenic = df_intergenic.rename(columns={'intergenic_lend' : 'Start', 'intergenic_rend' : 'End'})
df_intergenic = df_intergenic[ ~ df_intergenic['End'].isna() ]

In [117]:
df_intergenic

Unnamed: 0,Chromosome,Start,End
1,C0000570,39935,128077.0
4,C0000572,80533,93873.0
7,C0000575,107064,107676.0
8,C0000575,108082,108853.0
9,C0000575,110808,112307.0
...,...,...,...
93877,chr14q,432263607,432916568.0
93878,chr14q,432921443,435821472.0
93879,chr14q,435824893,435827708.0
93880,chr14q,435828047,435889952.0


In [118]:
# find intergenic regions that encompass N regions.
pr_intergenic = pr.PyRanges(df_intergenic)




pr_intergenic_N = pr_N_regions.join(pr_intergenic)

pr_intergenic_N



Unnamed: 0,Chromosome,Start,End,Length,Start_b,End_b
0,chr1p,109908,109927,20,93301,145253
1,chr1p,140582,140601,20,93301,145253
2,chr1p,172125,172144,20,156130,186984
3,chr1p,172125,172144,20,157662,768306
4,chr1p,197336,197355,20,157662,768306
...,...,...,...,...,...,...
327369,chr14q,436027940,436027959,20,436017712,436072863
327370,chr14q,436127140,436127159,20,435915695,436410137
327371,chr14q,436127140,436127159,20,436120504,436141246
327372,chr14q,436232102,436232121,20,435915695,436410137


In [119]:
genome_contigs_to_chunk_info = genome_contig_info[ genome_contig_info['Chromosome'].isin(chromosomes_require_chunking)]

genome_contigs_to_chunk_info_itertuples = genome_contigs_to_chunk_info.itertuples()



In [120]:

contig_row = next(genome_contigs_to_chunk_info_itertuples)

In [121]:


chromosome = contig_row.Chromosome
chr_len = contig_row.Length
        
# estimate number of chunks:
num_chunks = math.ceil(chr_len/max_chunk_size)
chunk_size = round(chr_len/num_chunks)
chunk_brkpts = [ chunk_size * i for i in range(1, num_chunks) ]

chunk_brkpts


[360758634, 721517268]

In [122]:
pr_brkpts = pr.from_dict({ 'Chromosome' : chromosome, 
                          'Start' : chunk_brkpts, 
                          'End' : chunk_brkpts })

pr_brkpts

Unnamed: 0,Chromosome,Start,End
0,chr10p,360758634,360758634
1,chr10p,721517268,721517268


In [123]:
pr_intergenic_N = pr_intergenic_N.drop(['Start_b', 'End_b'])

In [124]:
pr_brkpts_nearest = pr_brkpts.nearest(pr_intergenic_N)

In [125]:
pr_brkpts_nearest

Unnamed: 0,Chromosome,Start,End,Start_b,End_b,Length,Distance
0,chr10p,360758634,360758634,360766489,360766508,20,7856
1,chr10p,721517268,721517268,721689004,721689023,20,171737


In [126]:
df_brkpts_nearest = pr_brkpts_nearest.as_df().copy()

df_brkpts_nearest['brkpt'] = df_brkpts_nearest.apply(lambda x: int( (x['Start_b'] + x['End_b'])/2), axis=1)

df_brkpts_nearest

Unnamed: 0,Chromosome,Start,End,Start_b,End_b,Length,Distance,brkpt
0,chr10p,360758634,360758634,360766489,360766508,20,7856,360766498
1,chr10p,721517268,721517268,721689004,721689023,20,171737,721689013


In [128]:
df_brkpts_nearest['chunksize_offset_frac'] = df_brkpts_nearest['Distance'] / max_chunk_size

df_brkpts_nearest

Unnamed: 0,Chromosome,Start,End,Start_b,End_b,Length,Distance,brkpt,chunksize_offset_frac
0,chr10p,360758634,360758634,360766489,360766508,20,7856,360766498,1.6e-05
1,chr10p,721517268,721517268,721689004,721689023,20,171737,721689013,0.000343
