#### Read GenBank files in Python

- Genbank files store information that include
  - DNA/Protein sequence.
  - Annotations of the sequences
- Contains more information than fasta format
- Genbank files can be read using Biopython package

In [1]:
from Bio import SeqIO

In [2]:
file_path = "D:/Bionome Internship/Bioinformatics Practicals/BioPython/sequence.gb"

In [3]:
# Read GenBank file

genbank_object = SeqIO.read(file_path, 'gb')

In [4]:
record_id = genbank_object.id
print(record_id)

CP000325.1


In [5]:
record_name = genbank_object.name
print(record_name)

CP000325


In [6]:
record_seq = genbank_object.seq
sequence_length = len(record_seq)
print(sequence_length)

5631606


In [7]:
description = genbank_object.description
print(description)

Mycobacterium ulcerans Agy99, complete genome


In [8]:
annotations = genbank_object.annotations
print(annotations)

{'molecule_type': 'DNA', 'topology': 'circular', 'data_file_division': 'BCT', 'date': '30-JAN-2014', 'accessions': ['CP000325'], 'sequence_version': 1, 'keywords': [''], 'source': 'Mycobacterium ulcerans Agy99', 'organism': 'Mycobacterium ulcerans Agy99', 'taxonomy': ['Bacteria', 'Actinomycetota', 'Actinomycetes', 'Mycobacteriales', 'Mycobacteriaceae', 'Mycobacterium', 'Mycobacterium ulcerans group'], 'references': [Reference(title='Reductive evolution and niche adaptation inferred from the genome of Mycobacterium ulcerans, the causative agent of Buruli ulcer', ...), Reference(title='Direct Submission', ...)], 'comment': 'Source DNA is available from Dr Tim Stinear, Department\nMicrobiology, Monash University, Melbourne, Australia,\ntim.stinear@med.monash.edu.au; Bacteria are available from Dr Janet\nFyfe, Mycobacterial Reference Laboratory, Victorian Infectious\nDiseases Reference Laboratory, Melbourne, Australia,\nJanet.Fyfe@mh.org.au.'}


In [9]:
features = genbank_object.features
feature_types = [ feature.type for feature in features ]
feature_types = set(feature_types)
feature_types = list(feature_types)

In [11]:
print(feature_types)

['gene', 'rRNA', 'mobile_element', 'repeat_region', 'misc_feature', 'misc_RNA', 'source', 'CDS', 'tRNA']


In [12]:
feature_types = ['CDS', 'gene', 'mobile_element', 'tRNA' , 'rRNA']

In [13]:
print(feature_types)

['CDS', 'gene', 'mobile_element', 'tRNA', 'rRNA']


In [15]:
for feature in feature_types:
    all_features = [ i for i in features if i.type == feature ]
    number_of_fearures = len(all_features)
    print('Number of Features:', number_of_fearures)

Number of Features: 4160
Number of Features: 4981
Number of Features: 292
Number of Features: 45
Number of Features: 3


In [17]:
for feature in feature_types:
    all_features = [ i for i in features if i.type == feature ]
    number_of_fearures = len(all_features)
    print('%s : %d'%(feature,number_of_fearures))

CDS : 4160
gene : 4981
mobile_element : 292
tRNA : 45
rRNA : 3
