### Parsing Code

This script fetches the PhiX-174 GenBank record and parses it to extract the gene and promoter locations on the circular DNA. It then recalculates the genomic coordinates for a linearized version of the genome.

In [58]:
from Bio import Entrez, SeqIO
import pandas as pd

#### Parse genomic coordinates from GenBank file

In [59]:
Entrez.email = "tanvi.ingle@utexas.edu"

# Fetch the GenBank record as text. The handle is closed as soon as parsing
# finishes, even if SeqIO.read raises.
handle = Entrez.efetch(db="nuccore",
                       id=["MN385565.1"], rettype="gb", retmode="text")
try:
    record = SeqIO.read(handle, "genbank")
finally:
    handle.close()

# The loop below turns every 'gene' and 'regulatory' feature of the record
# into one table row. A feature whose location is a join of two segments
# fills start1/end1 with the first segment and start2/end2 with the second;
# otherwise start2/end2 stay None. An example:
#
# type     | name | start1 | end1 | start2 | end2
# ---------|------|--------|------|--------|-----
# promoter | D    | 312    | 357  | None   | None
# gene     | A    | 3980   | 5386 | 0      | 136
# ...

coord_columns = ['type', 'name', 'start1', 'end1', 'start2', 'end2']

# Rows are collected in a plain list and the DataFrame is built once at the
# end: DataFrame.append no longer exists in pandas >= 2.0, and appending row
# by row copies the whole frame on every iteration.
rows = []

for feature in record.features:
    feature_type = feature.type
    start2 = None
    end2 = None

    if feature_type == 'gene':
        row_type = 'gene'
        # A feature without a /gene qualifier gets an empty name instead of
        # raising a TypeError on None[0].
        name = feature.qualifiers.get('gene', [""])[0]
        if feature.location_operator == 'join':
            # Only the first two segments of the join are recorded.
            first = feature.location.parts[0]
            second = feature.location.parts[1]
            start1 = int(first.start)
            end1 = int(first.end)
            start2 = int(second.start)
            end2 = int(second.end)
        else:
            start1 = int(feature.location.start)
            end1 = int(feature.location.end)

    elif feature_type == 'regulatory':
        # we assume that regulatory (promoter) feature types do not
        # have CompoundLocation ``location'' fields
        row_type = 'promoter'
        # remove Promoter from the name; a missing /note qualifier yields
        # an empty name
        name = feature.qualifiers.get('note', [""])[0]
        name = name.replace('Promoter ', '')
        if feature.location_operator == 'join':
            print('join not implemented for regulatory ',
                  'feature type!')
            continue
        start1 = int(feature.location.start)
        end1 = int(feature.location.end)

    else:
        # Every other feature type is ignored.
        continue

    rows.append({'type': row_type, 'name': name,
                 'start1': start1, 'end1': end1,
                 'start2': start2, 'end2': end2})

# dtype=object keeps the coordinates as plain Python ints; a numeric dtype
# would turn columns with missing start2/end2 values into floats.
df = pd.DataFrame(rows, columns=coord_columns, dtype=object)

display(df)

Unnamed: 0,type,name,start1,end1,start2,end2
0,gene,A,3980,5386,0.0,136.0
1,gene,A*,4496,5386,0.0,136.0
2,gene,B,5074,5386,0.0,51.0
3,gene,K,50,221,,
4,gene,C,132,393,,
5,promoter,D,312,357,,
6,gene,D,389,848,,
7,gene,E,567,843,,
8,gene,J,847,964,,
9,gene,F,1000,2284,,


#### Linearize genomic positions

In [60]:
# Output columns, filled row by row in the loop below.
df["new_start"] = ""
df["new_end"] = ""

# Reference coordinates for the linearized layout, taken from fixed row
# positions of the table.
# NOTE(review): row 12 is presumably promoter A (the new origin) and row 0 is
# gene A, whose first segment ends at the end of the circular sequence --
# TODO confirm against the full table.
index = df.loc[12, 'start1']
end_index = df.loc[0, 'end1']

# Amount added to features that do not wrap around or sit at the origin.
shift = end_index - index

for pos in range(len(df)):
    row = df.loc[pos]
    kind = row['type']
    label = row['name']
    span1 = row['end1'] - row['start1']

    if kind == "promoter" and label == "A":
        # Promoter A is placed at the very start of the linear genome.
        new_start = 1
        new_end = span1 + 1

    elif kind == "promoter" and label in ["B1", "B2"]:
        # Displaced promoters: measured relative to the new origin.
        new_start = row['start1'] - index
        new_end = span1 + new_start

    elif kind == "gene" and label in ["A", "A*", "B"]:
        # Looping genes: both segments contribute to the linear length.
        new_start = row['start1'] - index
        new_end = span1 + (row['end2'] - row['start2']) + new_start

    else:
        # Remaining features: moved behind the part that was shifted to the front.
        new_start = row['start1'] + shift
        new_end = span1 + new_start

    df.at[pos, 'new_start'] = new_start
    df.at[pos, 'new_end'] = new_end

display(df)

Unnamed: 0,type,name,start1,end1,start2,end2,new_start,new_end
0,gene,A,3980,5386,0.0,136.0,63,1605
1,gene,A*,4496,5386,0.0,136.0,579,1605
2,gene,B,5074,5386,0.0,51.0,1157,1520
3,gene,K,50,221,,,1519,1690
4,gene,C,132,393,,,1601,1862
5,promoter,D,312,357,,,1781,1826
6,gene,D,389,848,,,1858,2317
7,gene,E,567,843,,,2036,2312
8,gene,J,847,964,,,2316,2433
9,gene,F,1000,2284,,,2469,3753


In [61]:
# Write the table, including the linearized coordinates, to a CSV file.
coords_csv_path = r"/Users/t/Documents/Wilke/phix174/output/genomic_coords.csv"
df.to_csv(coords_csv_path)

