In [2]:
import numpy as np
import pandas as pd
from copy import deepcopy
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

In [3]:
# Input files, given as paths relative to the notebook's working directory.
# NOTE(review): the file names look like an NCBI RefSeq assembly download
# (GCF_000046845.1) -- confirm provenance and record it here.
fasta_path = 'GCF_000046845.1_ASM4684v1_cds_from_genomic.fna'  # not read by any cell visible in this notebook export
genbank_path = 'GCF_000046845.1_ASM4684v1_genomic.gbff'  # parsed with SeqIO.read(..., 'genbank') further down

In [4]:
# Load the TPM matrix and keep the `tag` column as a plain Python list.
df = pd.read_csv('TPM_matrix.csv')
locus_tags = df['tag'].tolist()
df.head()

Unnamed: 0,gene,tag,target_id,LB0R1,LB0R2,LB0R3,LB1R1,LB1R2,LB1R3,M9LQR1,M9LQR2,M9LQR3,M9SFR1,M9SFR2,M9SFR3
0,dnaA,ACIAD_RS00005,lcl|NC_005966.1_cds_WP_004930068.1_1,254.471,195.265,258.473,275.958,240.025,217.334,56.1754,55.1445,45.2804,147.637,127.761,157.642
1,dnaN,ACIAD_RS00010,lcl|NC_005966.1_cds_WP_004930066.1_2,257.947,222.565,246.455,334.218,211.942,223.863,27.4375,51.0805,53.3345,181.774,142.968,218.187
2,recF,ACIAD_RS00015,lcl|NC_005966.1_cds_WP_004930061.1_3,217.036,206.512,152.505,141.987,180.946,144.018,81.6377,64.7067,70.099,48.584,52.2221,57.3814
3,gyrB,ACIAD_RS00020,lcl|NC_005966.1_cds_WP_004930060.1_4,496.668,476.138,357.317,399.349,494.216,271.789,84.4136,91.607,105.723,143.101,141.51,151.854
4,,ACIAD_RS00025,lcl|NC_005966.1_cds_WP_004930057.1_5,47.7092,56.7863,48.9246,133.345,55.1851,44.6848,1739.33,2147.75,1420.14,292.355,458.401,280.924


In [5]:
# get_interregions

# Hand the path to SeqIO.read so Biopython opens *and closes* the file itself
# (the previous open(...) handle was never closed).
# SeqIO.read expects exactly one record in the file and raises ValueError otherwise.
seq_record = SeqIO.read(genbank_path, 'genbank')
genome_size = len(seq_record)
cds_list = []
# Loop over the genome and collect every CDS feature (either strand) as
# [start, end, strand, locus_tag].
for feature in seq_record.features:
    if feature.type == 'CDS':
        # int(position) and location.strand are the supported accessors;
        # `.position` and `feature.strand` are legacy spellings of the same values.
        mystart = int(feature.location.start)
        myend = int(feature.location.end)
        strand = feature.location.strand
        locus_tag = feature.qualifiers['locus_tag'][0] # each cds should have only a single tag
        cds_list.append([mystart,myend,strand,locus_tag])

In [6]:
def get_fwd_start_and_end(this_start,curr_cds,strand,cds=None,genome_length=None,min_gap=40):
    """Locate the gap upstream of a CDS by walking towards lower list indices.

    Entries of ``cds`` are ``[start, end, strand, locus_tag]``.  Starting at
    index ``curr_cds`` and moving down to index 0, only entries whose strand
    equals ``strand`` are considered.  If the distance from the end of such an
    entry to ``this_start`` exceeds ``min_gap`` the search stops there.
    Otherwise the entry is treated as belonging to the same closely spaced run:
    ``this_start`` moves to that entry's start and the walk continues.

    When ``curr_cds`` is the last index of ``cds`` (the way the first CDS in
    the list is looked up) the search wraps around the end of the sequence and
    the gap is measured as ``genome_length - last_end + this_start``.  If that
    wrapped gap is too short the entry is skipped without moving
    ``this_start``.

    Parameters
    ----------
    this_start : int
        Start coordinate of the CDS whose upstream region is wanted.
    curr_cds : int
        Index in ``cds`` at which to begin the walk.
    strand : int
        Strand value to match against element 2 of each entry.
    cds : list, optional
        CDS table to search; defaults to the notebook-level ``cds_list``.
    genome_length : int, optional
        Sequence length used for the wrap-around gap; defaults to the
        notebook-level ``genome_size``.
    min_gap : int, optional
        A gap must be strictly larger than this to be accepted (default 40).

    Returns
    -------
    tuple of (int, int)
        ``(this_start, last_end)``.  ``this_start < last_end`` only for a
        wrap-around result.  If index 0 is passed without finding a large
        enough gap, ``last_end`` is 0, i.e. the region runs from the beginning
        of the record; a tuple is always returned so callers can unpack it.

    Notes
    -----
    Assumes ``cds`` is ordered by coordinate -- TODO confirm for the input used.
    """
    if cds is None:
        cds = cds_list
    if genome_length is None:
        genome_length = genome_size

    # Only a search that begins at the very last entry is a wrap-around search.
    wrap_search = curr_cds == len(cds) - 1

    for ii in range(curr_cds,-1,-1): # nearest upstream cds on same strand first
        if cds[ii][2] != strand:
            continue
        last_end = cds[ii][1]
        if wrap_search and this_start < last_end:
            if genome_length - last_end + this_start > min_gap:
                return this_start, last_end
            # Wrapped gap too short: keep this_start and try the next entry.
        elif not wrap_search and this_start - last_end > min_gap:
            return this_start, last_end
        else:
            # Too close (or overlapping): absorb this entry and keep walking
            # from its start.  From here on the search is an ordinary one.
            this_start = cds[ii][0]
            wrap_search = False

    # No same-strand entry left: fall back to the start of the record.
    return this_start, 0
                
def get_rev_start_and_end(this_start,curr_cds,strand,cds=None,genome_length=None,min_gap=40):
    """Locate the gap following a CDS by walking towards higher list indices.

    Entries of ``cds`` are ``[start, end, strand, locus_tag]``.  Starting at
    index ``curr_cds`` and moving up to the end of the list, only entries whose
    strand equals ``strand`` are considered.  If the distance from
    ``this_start`` to the start of such an entry exceeds ``min_gap`` the search
    stops there.  Otherwise the entry is treated as belonging to the same
    closely spaced run: ``this_start`` moves to that entry's end and the walk
    continues.

    Parameters
    ----------
    this_start : int
        End coordinate of the CDS whose neighbouring region is wanted.
    curr_cds : int
        Index in ``cds`` at which to begin the walk.
    strand : int
        Strand value to match against element 2 of each entry.
    cds : list, optional
        CDS table to search; defaults to the notebook-level ``cds_list``.
    genome_length : int, optional
        Sequence length, used only when ``cds`` has no entry on ``strand``;
        defaults to the notebook-level ``genome_size``.
    min_gap : int, optional
        A gap must be strictly larger than this to be accepted (default 40).

    Returns
    -------
    tuple of (int, int)
        ``(this_start, last_end)``.  If the end of the list is reached without
        finding a large enough gap, ``last_end`` is the start of the first
        entry on ``strand`` in ``cds``; callers that see
        ``last_end < this_start`` can treat the region as wrapping around the
        end of the sequence.  The wrapped gap is not checked against
        ``min_gap``.  A tuple is always returned so callers can unpack it.

    Notes
    -----
    Assumes ``cds`` is ordered by coordinate -- TODO confirm for the input used.
    """
    if cds is None:
        cds = cds_list
    if genome_length is None:
        genome_length = genome_size

    for ii in range(curr_cds,len(cds)): # nearest downstream cds on same strand first
        if cds[ii][2] != strand:
            continue
        last_end = cds[ii][0]
        if last_end - this_start > min_gap:
            return this_start, last_end
        # Too close (or overlapping): absorb this entry and keep walking from its end.
        this_start = cds[ii][1]

    # Ran past the last entry: wrap to the first entry on this strand so the
    # boundary is a real CDS start rather than the start of an absorbed entry.
    for entry in cds:
        if entry[2] == strand:
            return this_start, entry[0]

    # No entry on this strand at all: the region runs to the end of the record.
    return this_start, genome_length

In [7]:
intergenic_records = []

# Forward strand: the region of interest lies before each CDS start.
for ii, cds in enumerate(cds_list):
    strand = cds[2]
    if strand != 1:
        continue

    this_start = cds[0]
    if ii == 0:
        # First CDS in the list: search from the last CDS, so the region may
        # span the end of the sequence and continue from position 0.
        this_start, last_end = get_fwd_start_and_end(this_start,len(cds_list)-1,strand)
        if this_start < last_end:
            intergene_seq = seq_record.seq[last_end:genome_size] + seq_record.seq[0:this_start]
        else:
            intergene_seq = seq_record.seq[last_end:this_start]
    else:
        # Search the entries below this one for the nearest acceptable gap on the same strand.
        this_start, last_end = get_fwd_start_and_end(this_start,ii-1,strand)
        intergene_seq = seq_record.seq[last_end:this_start]

    # Keep at most the 1000 bases closest to the CDS start.
    intergene_seq = intergene_seq[-1000:]
    intergenic_records.append(SeqRecord(intergene_seq,id=cds[3],
                     description='%d,%d,%d'%(last_end,this_start,strand)))


# STILL NEED TO TAKE CARE OF BOUNDARIES
# REVERSE MODE NOT WORKING PROPERLY
# NOTE(review): a reverse-strand CDS sitting at the last list index gets no record at all.

# Reverse strand: the region of interest lies after each CDS end; visited from
# the last index to the first.
for ii, features in reversed(list(enumerate(cds_list))):
    strand = features[2]
    if strand != -1 or ii >= len(cds_list)-1:
        continue

    this_start = features[1]
    this_start, last_end = get_rev_start_and_end(this_start,ii+1,strand)

    if last_end < this_start:
        # Region spans the end of the sequence and continues from position 0.
        intergene_seq = seq_record.seq[this_start:genome_size] + seq_record.seq[0:last_end]
    else:
        intergene_seq = seq_record.seq[this_start:last_end]

    # Keep at most the 1000 bases closest to the CDS end, then flip to the coding orientation.
    intergene_seq = intergene_seq[0:1000]
    intergenic_records.append(SeqRecord(intergene_seq.reverse_complement(),id=features[3],
                     description='%d,%d,%d'%(last_end,this_start,strand)))


In [8]:
# Flatten the SeqRecords into per-column lists, ready to become a dataframe.

intergenic_record_dict = {column: [] for column in ('locus_tag','start','end','strand','seq')}
for record in intergenic_records:
    intergenic_record_dict['locus_tag'].append(record.id)
    # The description carries three comma-separated integers; the first is
    # stored under 'start', the second under 'end', the third under 'strand'.
    mystart, myend, strand = (int(field) for field in record.description.split(','))
    for column, value in (('start', mystart), ('end', myend),
                          ('strand', strand), ('seq', str(record.seq))):
        intergenic_record_dict[column].append(value)

In [9]:
# One row per intergenic record; columns follow the keys of intergenic_record_dict.
intergenic_record_df = pd.DataFrame(data=intergenic_record_dict)

In [10]:
# intergenic_record_df.to_csv('intergenic_records.csv',index=False)