In [1]:
import sys, os, re
import pyranges as pr
import pandas as pd

In [2]:
# --- Run configuration ------------------------------------------------------
# Genome FASTA; the cell that loads contig info reads "<genome_fa>.fai".
genome_fa = "AmexG_v6.0-DD.fa"
# Prefix intended for output files (not referenced by the cells shown here).
out_prefix = "ladeda"
# Tab-separated table of breakpoints per chromosome.
chunks_file = "chunks.tsv"

In [6]:
                                                                                                      
# The FASTA index is expected to sit next to the genome FASTA with a ".fai" suffix.
genome_fai_file = f"{genome_fa}.fai"

# The index has no header row, so column names are supplied explicitly.
fai_columns = ["Chromosome", "Length", "Offset", "Linebases", "Linewidth"]
genome_contig_info = pd.read_csv(genome_fai_file, sep="\t", names=fai_columns)

# All sequence names, in index order.
chromosomes = genome_contig_info["Chromosome"].tolist()

genome_contig_info.head()
   

Unnamed: 0,Chromosome,Length,Offset,Linebases,Linewidth
0,chr10p,1082275901,8,80,81
1,chr10q,525957290,1095804366,80,81
2,chr11p,305732125,1628336131,80,81
3,chr11q,1079741261,1937889916,80,81
4,chr12p,289110169,3031127951,80,81


In [37]:

# Lookup table: sequence name -> sequence length, taken from the contig info table.
chr_lengths = {
    contig.Chromosome: contig.Length
    for contig in genome_contig_info.itertuples()
}


In [8]:
                                                                                                                                                     
# Breakpoint table: tab-separated, first line is used as the header.
chunks = pd.read_csv(
    chunks_file,
    sep="\t",
)

# Preview the first rows.
chunks.head()

Unnamed: 0,Chromosome,brkpt,chunksize_offset_frac
0,chr10p,360766498,1.6e-05
1,chr10p,721257583,0.000519
2,chr10q,263007889,5.8e-05
3,chr11q,358409797,0.003008
4,chr11q,719828585,2e-06


In [19]:
# Column names for the header-less, tab-separated annotation file.
# NOTE(review): the eighth column is called "dot" here; in the GTF format this
# field is the frame/phase, so it may hold 0/1/2 on CDS rows — confirm before
# relying on it being ".".
gtf_columns = [
    "Chromosome",
    "Source",
    "ev_type",
    "Start",
    "End",
    "Score",
    "Strand",
    "dot",
    "info",
]
annotation_gtf = pd.read_csv("AmexT_v47-AmexG_v6.0-DD.gtf", sep="\t", names=gtf_columns)

In [20]:
# Distinct chromosome names listed in the chunks table, in order of first appearance.
chromosomes_need_chunking = chunks["Chromosome"].drop_duplicates().tolist()
chromosomes_need_chunking

['chr10p',
 'chr10q',
 'chr11q',
 'chr12q',
 'chr1p',
 'chr1q',
 'chr2p',
 'chr2q',
 'chr3p',
 'chr3q',
 'chr4p',
 'chr4q',
 'chr5p',
 'chr5q',
 'chr6p',
 'chr6q',
 'chr7p',
 'chr7q',
 'chr8p',
 'chr8q',
 'chr9q']

In [21]:
# Select the first chromosome from the chunks table as the one to partition;
# the downstream cells visible here operate on this single chromosome only.
chromosome_to_chunk = chromosomes_need_chunking[0]

In [22]:
# Keep only the annotation rows located on the chromosome being partitioned.
on_selected_chromosome = annotation_gtf["Chromosome"] == chromosome_to_chunk
chr_gtf = annotation_gtf.loc[on_selected_chromosome]

chr_gtf.head()

Unnamed: 0,Chromosome,Source,ev_type,Start,End,Score,Strand,dot,info
0,chr10p,ambMex60DD,gene,313039,315424,1000,+,.,"gene_id ""AMEX60DD000001""; gene_name ""ZFP37 [nr..."
1,chr10p,ambMex60DD,transcript,313039,315424,1000,+,.,"gene_id ""AMEX60DD000001""; transcript_id ""ZFP37..."
2,chr10p,ambMex60DD,exon,313039,314183,1000,+,.,"gene_id ""AMEX60DD000001""; transcript_id ""ZFP37..."
3,chr10p,ambMex60DD,exon,315024,315424,1000,+,.,"gene_id ""AMEX60DD000001""; transcript_id ""ZFP37..."
4,chr10p,ambMex60DD,transcript,313800,315023,1000,+,.,"gene_id ""AMEX60DD000001""; transcript_id ""ZNF79..."


In [38]:
# Breakpoints recorded for the selected chromosome in the chunks table.
is_selected = chunks["Chromosome"] == chromosome_to_chunk
internal_breakpoints = chunks.loc[is_selected, "brkpt"].values

# Full boundary list: 0, the sorted internal breakpoints, then the chromosome length.
breakpoints = [0, *sorted(internal_breakpoints), chr_lengths[chromosome_to_chunk]]

breakpoints

[0, 360766498, 721257583, 1082275901]

In [66]:


# Split the selected chromosome's annotation into one sub-annotation per chunk.
#
# Chunk i lies between breakpoints[i] and breakpoints[i+1] and is named
# "<chromosome>^c<i>^o<offset>", where <offset> is the left breakpoint that is
# subtracted from Start/End to move features into chunk-local coordinates.
#
# Boundary handling: Start/End are treated as 1-based, inclusive coordinates
# (GTF convention), so a chunk with offset `lend_brkpt` owns positions
# lend_brkpt+1 .. rend_brkpt:
#   * Start must be strictly greater than lend_brkpt, otherwise the shifted
#     Start would be 0, which is not a valid 1-based coordinate.
#   * End may equal rend_brkpt (e.g. a feature ending on the last base of the
#     chromosome), so the upper bound is inclusive.
# NOTE(review): this assumes the genome sequence is cut at the same breakpoints
# as 0-based half-open intervals [lend, rend) — confirm against the FASTA
# chunking step.
#
# Features that are not fully contained in one chunk are left out of the
# result; their number is reported after the loop instead of being lost silently.
partitions = []
for i, (lend_brkpt, rend_brkpt) in enumerate(zip(breakpoints[:-1], breakpoints[1:])):
    print(lend_brkpt, rend_brkpt)

    chrom_partition_name = chromosome_to_chunk + f"^c{i}^o{lend_brkpt}"

    in_partition = (chr_gtf.Start > lend_brkpt) & (chr_gtf.End <= rend_brkpt)
    # .copy() so the column assignments below never touch chr_gtf itself.
    chr_gtf_partition = chr_gtf[in_partition].copy()
    print(chr_gtf_partition.shape)

    # Rename the sequence and shift coordinates into the chunk's own frame.
    # Subtracting an offset of 0 is a no-op, so the first chunk needs no special case.
    chr_gtf_partition['Chromosome'] = chrom_partition_name
    chr_gtf_partition['Start'] = chr_gtf_partition.Start - lend_brkpt
    chr_gtf_partition['End'] = chr_gtf_partition.End - lend_brkpt

    partitions.append(chr_gtf_partition)

# Concatenate once at the end rather than growing a frame inside the loop.
# With fewer than two breakpoints there are no chunks; fall back to an empty
# frame that still carries the annotation columns.
partitioned_chrom_df = pd.concat(partitions) if partitions else chr_gtf.iloc[0:0].copy()

n_unassigned = len(chr_gtf) - len(partitioned_chrom_df)
if n_unassigned:
    print(
        f"WARNING: {n_unassigned} feature(s) on {chromosome_to_chunk} are not fully "
        "contained in a single chunk (e.g. they span a breakpoint) and were excluded"
    )

partitioned_chrom_df

0 360766498
(30691, 9)
360766498 721257583
(22588, 9)
721257583 1082275901
(17884, 9)


Unnamed: 0,Chromosome,Source,ev_type,Start,End,Score,Strand,dot,info
0,chr10p^c0^o0,ambMex60DD,gene,313039,315424,1000,+,.,"gene_id ""AMEX60DD000001""; gene_name ""ZFP37 [nr..."
1,chr10p^c0^o0,ambMex60DD,transcript,313039,315424,1000,+,.,"gene_id ""AMEX60DD000001""; transcript_id ""ZFP37..."
2,chr10p^c0^o0,ambMex60DD,exon,313039,314183,1000,+,.,"gene_id ""AMEX60DD000001""; transcript_id ""ZFP37..."
3,chr10p^c0^o0,ambMex60DD,exon,315024,315424,1000,+,.,"gene_id ""AMEX60DD000001""; transcript_id ""ZFP37..."
4,chr10p^c0^o0,ambMex60DD,transcript,313800,315023,1000,+,.,"gene_id ""AMEX60DD000001""; transcript_id ""ZNF79..."
...,...,...,...,...,...,...,...,...,...
71158,chr10p^c2^o721257583,ambMex60DD,CDS,360598108,360598226,1000,+,.,"gene_id ""AMEX60DD002204""; transcript_id ""LOC10..."
71159,chr10p^c2^o721257583,ambMex60DD,CDS,360685489,360685531,1000,+,.,"gene_id ""AMEX60DD002204""; transcript_id ""LOC10..."
71160,chr10p^c2^o721257583,ambMex60DD,gene,360695766,360695965,1000,+,.,"gene_id ""AMEX60DD002205""; gene_name ""AMEX60DD0..."
71161,chr10p^c2^o721257583,ambMex60DD,transcript,360695766,360695965,1000,+,.,"gene_id ""AMEX60DD002205""; transcript_id ""AMEX6..."


In [69]:
# Sequences from the genome index that do not appear in the chunks table.
[
    seq_name
    for seq_name in chromosomes
    if seq_name not in chromosomes_need_chunking
]

['chr11p',
 'chr12p',
 'chr13p',
 'chr13q',
 'chr14p',
 'chr14q',
 'chr9p',
 'C0128467',
 'C0128569',
 'C0128574',
 'C0128687',
 'C0128715',
 'C0128790',
 'C0128851',
 'C0128909',
 'C0128923',
 'C0129016',
 'C0129038',
 'C0129066',
 'C0129093',
 'C0129146',
 'C0129181',
 'C0129217',
 'C0129219',
 'C0129252',
 'C0129279',
 'C0129288',
 'C0129302',
 'C0129304',
 'C0129315',
 'C0129323',
 'C0129458',
 'C0129529',
 'C0129583',
 'C0129600',
 'C0129681',
 'C0129705',
 'C0129855',
 'C0129944',
 'C0130288',
 'C0130458',
 'C0130541',
 'C0130758',
 'C0130922',
 'C0131115',
 'C0131119',
 'C0131185',
 'C0131218',
 'C0131234',
 'C0131257',
 'C0131269',
 'C0131338',
 'C0131357',
 'C0131362',
 'C0131400',
 'C0131426',
 'C0131549',
 'C0131557',
 'C0131725',
 'C0131753',
 'C0131828',
 'C0131834',
 'C0131869',
 'C0131871',
 'C0131996',
 'C0132009',
 'C0132040',
 'C0132054',
 'C0132069',
 'C0132119',
 'C0132131',
 'C0132252',
 'C0132311',
 'C0132394',
 'C0132416',
 'C0132423',
 'C0132441',
 'C0132452',
 