#### Count features in a single GenBank file

- Libraries
    - Biopython (pip install biopython --user)
    - Pandas (pip install pandas --user)
- Publicly available data will be used

In [18]:
from Bio import SeqIO # type: ignore
import pandas as pd
from collections import Counter

In [3]:
# Location of the GenBank file that SeqIO.read() parses later in the notebook.
# NOTE(review): hard-coded absolute path on a D: drive — adjust before running elsewhere.
file_path = "D:/Bionome Internship/Bioinformatics Practicals/BioPython/M48.gb"

In [5]:
# Parse the file as a single GenBank record ('gb' is the format name passed to SeqIO).
genbank_object = SeqIO.read(
    file_path,
    format='gb',
)

In [6]:
# Collect the type label of every feature on the record, in record order
# (duplicates are kept so they can be counted later).
all_features_types = [
    annotated_feature.type
    for annotated_feature in genbank_object.features
]

In [7]:
# Total number of features on the record (one list entry per feature, all types).
len(all_features_types)

6513

In [9]:
# Reduce the full list to the distinct feature types and show them.
feature_types = {*all_features_types}
print(feature_types)

{'CDS', 'regulatory', 'ncRNA', 'tRNA', 'source', 'tmRNA', 'rRNA', 'gene'}


In [10]:
# Tally how many times each feature type occurs, then list the types seen.
feature_counts = Counter()
feature_counts.update(all_features_types)
feature_counts.keys()

dict_keys(['source', 'gene', 'CDS', 'regulatory', 'tRNA', 'ncRNA', 'rRNA', 'tmRNA'])

In [14]:
# Look up the tally for one feature type, here 'gene'.
feature_counts['gene']

3251

In [12]:
# Print one "<feature type> <count>" line per entry in the tally.
for key in feature_counts:
    value = feature_counts[key]
    print(key, value)

source 1
gene 3251
CDS 3171
regulatory 10
tRNA 60
ncRNA 3
rRNA 16
tmRNA 1


In [15]:
# Remove the feature types that should not appear in the exported summary.
# pop(key, None) is used instead of `del` so that this cell can be re-run and
# also works on records that lack one of these types: `del` would raise
# KeyError in both situations. The loop keeps the cell from displaying the
# popped value as its output.
for unwanted_type in ('source', 'regulatory'):
    feature_counts.pop(unwanted_type, None)

In [16]:
# Show which feature types remain in the tally after the removals above.
feature_counts.keys()

dict_keys(['gene', 'CDS', 'tRNA', 'ncRNA', 'rRNA', 'tmRNA'])

In [19]:
# Arrange the (feature type, count) pairs as a two-column table so the
# result can be written to an output file.
dataframe = pd.DataFrame(
    data=list(feature_counts.items()),
    columns=['Feature', 'Count'],
)

In [20]:
# (rows, columns) of the table: one row per remaining feature type.
dataframe.shape

(6, 2)

In [21]:
# Display the table to check it before writing it out.
dataframe

Unnamed: 0,Feature,Count
0,gene,3251
1,CDS,3171
2,tRNA,60
3,ncRNA,3
4,rRNA,16
5,tmRNA,1


In [22]:
# Destination of the CSV written by the to_csv() call in the next cell.
# NOTE(review): hard-coded absolute path on a D: drive — adjust before running elsewhere.
outputfile = "D:/Bionome Internship/Bioinformatics Practicals/BioPython/feature_count.csv"

In [23]:
# Write the table as CSV; index=False leaves the row index out of the file.
dataframe.to_csv(
    path_or_buf=outputfile,
    index=False,
)