In [3]:
import numpy as np
import pandas as pd
from copy import deepcopy
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

In [4]:
# Input files for assembly GCA_000046845.1 (ASM4684v1), resolved relative to
# the notebook's working directory.
fasta_path = "GCA_000046845.1_ASM4684v1_cds_from_genomic.fna"  # CDS nucleotide FASTA
genbank_path = "GCA_000046845.1_ASM4684v1_genomic.gbff"  # annotated GenBank flat file

In [5]:
# Load the TPM expression matrix; one row per gene, with a `tag` column
# holding the locus tag used to identify each gene.
df = pd.read_csv('TPM_matrix.csv')
locus_tags = df['tag'].tolist()
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,gene,tag,target_id,LB0R1,LB0R2,LB0R3,LB1R1,LB1R2,LB1R3,M9LQR1,M9LQR2,M9LQR3,M9SFR1,M9SFR2,M9SFR3
0,dnaA,ACIAD0001,lcl|CR543861.1_cds_CAG66986.1_1,298.274,222.816,306.261,302.868,300.466,307.041,60.4627,59.4571,48.8814,154.061,135.488,164.693
1,dnaN,ACIAD0002,lcl|CR543861.1_cds_CAG66987.1_2,302.359,253.969,292.017,366.812,265.31,316.267,29.5314,55.0744,57.5752,189.685,151.612,227.946
2,recF,ACIAD0003,lcl|CR543861.1_cds_CAG66988.1_3,254.409,235.65,180.698,155.834,226.509,203.465,87.8677,69.7656,75.6724,50.6984,55.3793,59.9481
3,gyrB,ACIAD0004,lcl|CR543861.1_cds_CAG66989.1_4,582.122,543.316,423.388,438.286,618.666,383.969,90.857,98.774,114.133,149.325,150.072,158.646
4,,ACIAD0005,lcl|CR543861.1_cds_CAG66990.1_5,48.0218,59.8179,57.9693,146.44,60.4595,63.1651,1780.57,2163.66,1451.08,269.374,476.839,277.298


In [6]:
# get_interregions
#
# Parse the GenBank record and collect one [start, end, strand, locus_tag]
# entry per CDS feature, in the order the features appear in the record.

# Hand the path straight to SeqIO.read so Biopython opens *and closes* the
# file itself; the previous explicit open(...) handle was never closed.
seq_record = SeqIO.read(genbank_path, 'genbank')
genome_size = len(seq_record)
cds_list = []
# Loop over the genome, get CDS features on each of the strands (locations)
for feature in seq_record.features:
    if feature.type == 'CDS':
        # Positions are int-like; int(...) avoids the legacy `.position`
        # attribute, which Biopython marks obsolete.
        mystart = int(feature.location.start)
        myend = int(feature.location.end)
        # Read the strand from the location rather than the deprecated
        # SeqFeature.strand alias.
        strand = feature.location.strand
        locus_tag = feature.qualifiers['locus_tag'][0]  # each cds should have only a single tag
        cds_list.append([mystart, myend, strand, locus_tag])

In [7]:
def get_fwd_start_and_end(this_start, curr_cds, strand, min_gap=40):
    """Find the upstream intergenic boundary for a forward-strand CDS.

    Scans ``cds_list`` backwards from index ``curr_cds`` looking at CDSs on
    ``strand``. When the gap between ``this_start`` and the end of such a CDS
    exceeds ``min_gap``, the search stops. Otherwise that CDS is treated as
    part of the same closely spaced run and ``this_start`` moves to its start
    before the scan continues.

    When the scan starts at the last entry of ``cds_list`` (the caller does
    this for the first CDS), a CDS whose end lies after ``this_start`` is
    measured across the origin using ``genome_size``. If that wrapped gap is
    too small the scan simply moves on without changing ``this_start``.

    Parameters
    ----------
    this_start : int
        Start coordinate of the CDS whose upstream region is wanted.
    curr_cds : int
        Index in ``cds_list`` at which to begin scanning (inclusive).
    strand : int
        Strand value that candidate CDSs must match.
    min_gap : int, optional
        A gap must be strictly larger than this to count as intergenic.

    Returns
    -------
    tuple
        ``(this_start, last_end)``: the (possibly shifted) start coordinate
        and the end coordinate of the bounding upstream CDS.

    Raises
    ------
    ValueError
        If the scan reaches the beginning of ``cds_list`` without finding a
        sufficiently large gap. Previously this case returned ``None``
        implicitly and failed later with an unpacking ``TypeError``.
    """
    last_idx = len(cds_list) - 1
    # Wrap-around measurement only applies while the scan is still the one
    # that began at the final CDS; once a CDS is merged it is switched off.
    wrap_search = curr_cds == last_idx
    for ii in range(curr_cds, -1, -1):  # nearest upstream CDS first
        if cds_list[ii][2] != strand:
            continue
        last_end = cds_list[ii][1]
        if wrap_search and this_start < last_end:
            if genome_size - last_end + this_start > min_gap:
                return this_start, last_end
            # Wrapped gap too small: keep scanning, this_start unchanged.
        elif not wrap_search and this_start - last_end > min_gap:
            return this_start, last_end
        else:
            # Too close: merge this CDS and continue from its start. A single
            # pass replaces the old recursive call whose result was assigned
            # but never returned, which made the loop redo the same search.
            this_start = cds_list[ii][0]
            wrap_search = False
    raise ValueError(
        'no upstream CDS on strand %r leaves a gap larger than %d before position %d'
        % (strand, min_gap, this_start)
    )
                
def get_rev_start_and_end(this_start, curr_cds, strand, min_gap=40):
    """Find the downstream intergenic boundary for a reverse-strand CDS.

    Scans ``cds_list`` forwards from index ``curr_cds`` looking at CDSs on
    ``strand``. When the gap between ``this_start`` and the start of such a
    CDS exceeds ``min_gap``, the search stops. Otherwise that CDS is treated
    as part of the same closely spaced run and ``this_start`` moves to its
    end before the scan continues.

    Parameters
    ----------
    this_start : int
        End coordinate of the CDS whose flanking region is wanted.
    curr_cds : int
        Index in ``cds_list`` at which to begin scanning (inclusive).
    strand : int
        Strand value that candidate CDSs must match.
    min_gap : int, optional
        A gap must be strictly larger than this to count as intergenic.

    Returns
    -------
    tuple
        ``(this_start, last_end)``. If the final entry of ``cds_list`` is
        merged, the pair is that CDS's ``(end, start)``, so ``last_end`` is
        smaller than ``this_start`` (unchanged from the original behaviour).

    Raises
    ------
    ValueError
        If no CDS on ``strand`` is found before the end of ``cds_list``.
        Previously this case returned ``None`` implicitly and failed later
        with an unpacking ``TypeError``.
    """
    n_cds = len(cds_list)
    for ii in range(curr_cds, n_cds):  # nearest downstream CDS first
        if cds_list[ii][2] != strand:
            continue
        last_end = cds_list[ii][0]
        if last_end - this_start > min_gap:
            return this_start, last_end
        # Too close: merge this CDS and continue from its end. A single pass
        # replaces the old recursive call whose result was assigned but never
        # returned, which made the loop redo the same search.
        this_start = cds_list[ii][1]
        if ii + 1 == n_cds:
            # Downstream CDSs exhausted while merging the last entry.
            return this_start, last_end
    raise ValueError(
        'no downstream CDS on strand %r found after position %d'
        % (strand, this_start)
    )

In [8]:
# get putative promoter region for each CDS
#
# Every record carries the locus tag as its id and "last_end,this_start,strand"
# as its description. Regions are capped at 1000 bases, keeping the bases
# closest to the CDS.
intergenic_records = []

# forward strand: the region lies before the CDS start
for ii, cds_entry in enumerate(cds_list):
    strand = cds_entry[2]
    if strand != 1:
        continue

    if ii == 0:
        # The first CDS has nothing before it in the list, so the search
        # begins at the last CDS and the region may span the origin.
        this_start, last_end = get_fwd_start_and_end(cds_entry[0], len(cds_list) - 1, strand)
        if this_start < last_end:
            intergene_seq = seq_record.seq[last_end:genome_size] + seq_record.seq[0:this_start]
        else:
            intergene_seq = seq_record.seq[last_end:this_start]
    else:
        # search for intergenic region on same strand, starting one CDS back
        this_start, last_end = get_fwd_start_and_end(cds_entry[0], ii - 1, strand)
        intergene_seq = seq_record.seq[last_end:this_start]

    # keep at most the 1000 bases nearest the CDS start
    intergene_seq = intergene_seq[-1000:]
    intergenic_records.append(
        SeqRecord(
            intergene_seq,
            id=cds_entry[3],
            description='%d,%d,%d' % (last_end, this_start, strand),
        )
    )

# reverse strand: the region lies after the CDS end; walk the list backwards
for ii in range(len(cds_list) - 1, -1, -1):
    strand = cds_list[ii][2]
    # The final CDS in the list is skipped: there is nothing after it to scan.
    if strand != -1 or ii >= len(cds_list) - 1:
        continue

    this_start, last_end = get_rev_start_and_end(cds_list[ii][1], ii + 1, strand)

    if last_end < this_start:
        # boundary precedes the region start: join across the origin
        intergene_seq = seq_record.seq[this_start:genome_size] + seq_record.seq[0:last_end]
    else:
        intergene_seq = seq_record.seq[this_start:last_end]

    # keep at most the 1000 bases nearest the CDS end
    intergene_seq = intergene_seq[:1000]
    intergenic_records.append(
        SeqRecord(
            intergene_seq.reverse_complement(),
            id=cds_list[ii][3],
            description='%d,%d,%d' % (last_end, this_start, strand),
        )
    )


In [9]:
# place records in dataframe

# Column-oriented accumulator; the key order here fixes the column order.
intergenic_record_dict = {'locus_tag': [], 'start': [], 'end': [], 'strand': [], 'seq': []}
for record in intergenic_records:
    # The description holds three comma-separated integers.
    mystart, myend, strand = (int(field) for field in record.description.split(','))
    row = (record.id, mystart, myend, strand, str(record.seq))
    # Dict iteration follows insertion order, so values line up with the keys.
    for column, value in zip(intergenic_record_dict, row):
        intergenic_record_dict[column].append(value)

In [10]:
# One row per intergenic record; columns follow the dict's key order.
intergenic_record_df = pd.DataFrame(data=intergenic_record_dict)

In [40]:
# intergenic_record_df.to_csv('intergenic_records.csv',index=False)