#### Count features in a single GenBank Files

- Libraries
    - Biopython (pip install biopython --user)
    - Pandas (pip install pandas -- user)
- A publicly available data will be used

In [18]:
from Bio import SeqIO # type: ignore
import pandas as pd
from collections import Counter

In [3]:
file_path = "D:/Bionome Internship/Bioinformatics Practicals/BioPython/M48.gb"

In [5]:
genbank_object = SeqIO.read(file_path, 'gb')

In [6]:
# Get the feature types

all_features_types = [feature.type for feature in genbank_object.features]

In [7]:
len(all_features_types)

6513

In [9]:
feature_types = set(all_features_types)
print(feature_types)

{'CDS', 'regulatory', 'ncRNA', 'tRNA', 'source', 'tmRNA', 'rRNA', 'gene'}


In [10]:
# Count the feature types

feature_counts = Counter(all_features_types)
feature_counts.keys()

dict_keys(['source', 'gene', 'CDS', 'regulatory', 'tRNA', 'ncRNA', 'rRNA', 'tmRNA'])

In [14]:
feature_counts['gene']

3251

In [12]:
for key,value in feature_counts.items():
    print(key,value)

source 1
gene 3251
CDS 3171
regulatory 10
tRNA 60
ncRNA 3
rRNA 16
tmRNA 1


In [15]:
del feature_counts['source']
del feature_counts['regulatory']

In [16]:
feature_counts.keys()

dict_keys(['gene', 'CDS', 'tRNA', 'ncRNA', 'rRNA', 'tmRNA'])

In [19]:
# Save a result to output file

dataframe = pd.DataFrame(feature_counts.items(),columns=['Feature', 'Count'])

In [20]:
dataframe.shape

(6, 2)

In [21]:
dataframe

Unnamed: 0,Feature,Count
0,gene,3251
1,CDS,3171
2,tRNA,60
3,ncRNA,3
4,rRNA,16
5,tmRNA,1


In [22]:
outputfile = "D:/Bionome Internship/Bioinformatics Practicals/BioPython/feature_count.csv"

In [23]:
dataframe.to_csv(outputfile, index=False)

#### Count features in multipe GenBank Files

Activities
- Download genbank files
- Put all files in the same folder/directory
- Count features using Biopython
- Save the result for individual files to their respective output files
- Combine all results and store in a single dataframe and save to a single output file

Libraries
- Biopython (pip install biopython --user)
- Pandas (pip install pandas --user)

In [24]:
import glob
from Bio import SeqIO
from collections import Counter
import pandas as pd
import os

In [29]:
file_directory = "D:/Bionome Internship/Bioinformatics Practicals/BioPython/GB Files"

In [30]:
gfiles =  glob.glob("%s/*.gb"%file_directory)

In [31]:
gfiles

['D:/Bionome Internship/Bioinformatics Practicals/BioPython/GB Files\\AR465.gb',
 'D:/Bionome Internship/Bioinformatics Practicals/BioPython/GB Files\\M48.gb',
 'D:/Bionome Internship/Bioinformatics Practicals/BioPython/GB Files\\P10.gb',
 'D:/Bionome Internship/Bioinformatics Practicals/BioPython/GB Files\\R50.gb',
 'D:/Bionome Internship/Bioinformatics Practicals/BioPython/GB Files\\V521.gb']

In [32]:
print(len(gfiles))

5


In [33]:
gfiles[0]

'D:/Bionome Internship/Bioinformatics Practicals/BioPython/GB Files\\AR465.gb'

In [34]:
def count_features(gfile):
    genbank_object = SeqIO.read(gfile, "gb")
    features = genbank_object.features
    feature_types = [feature.type for feature in features]
    feature_count = Counter(feature_types)
    print('Features have been counted')

    dataframe = pd.DataFrame(feature_count.items(), columns=['Feature' , 'Count'])

    directory, filename = os.path.split(gfile)
    filename = filename.strip('.gb')

    basedir = "D:/Bionome Internship/Bioinformatics Practicals/BioPython/GB Files"

    outputfile = "%s/%s.csv"%(basedir, filename)

    dataframe.to_csv(outputfile, index=False)

    print('Count data has been saved')

In [35]:
# Use 'for' loop to iterate over the files and count the features

for gfile in gfiles:
    count_features(gfile)

Features have been counted
Count data has been saved
Features have been counted
Count data has been saved
Features have been counted
Count data has been saved
Features have been counted
Count data has been saved
Features have been counted
Count data has been saved


#### Combine output file in a single CSV file

In [1]:
# import libraries

import glob
from Bio import SeqIO
from collections import Counter
import pandas as pd
import os

In [2]:
file_directory = "D:/Bionome Internship/Bioinformatics Practicals/BioPython/GB Files"

In [3]:
# Get the filenames
gfiles =  glob.glob("%s/*.gb"%file_directory)
gfiles

['D:/Bionome Internship/Bioinformatics Practicals/BioPython/GB Files\\AR465.gb',
 'D:/Bionome Internship/Bioinformatics Practicals/BioPython/GB Files\\M48.gb',
 'D:/Bionome Internship/Bioinformatics Practicals/BioPython/GB Files\\P10.gb',
 'D:/Bionome Internship/Bioinformatics Practicals/BioPython/GB Files\\R50.gb',
 'D:/Bionome Internship/Bioinformatics Practicals/BioPython/GB Files\\V521.gb']

In [5]:
# function to read genbank files and get the features
def read_file(gfile):
    genbank_object = SeqIO.read(gfile, 'gb')
    features = genbank_object.features
    feature_types = [feature.type for feature in features]
    return feature_types

In [6]:
# Count features of the genbank files
def count_features(feature_types):
    feature_count = Counter(feature_types)
    print('Features have been counted')
    return feature_count

In [7]:
# function to identify features in the genbank 
def scan_all_features(files):
    allfeatures = []
    for gfile in gfiles:
        feature_types = read_file(gfile)
        allfeatures.extend(feature_types)

    allfeatures = set(allfeatures)
    allfeatures = list(allfeatures)
    print('All features have been identified')
    return allfeatures   

In [8]:
# Get all features
 
allfeatures = scan_all_features(gfiles)

All features have been identified


In [9]:
print(allfeatures)

['CDS', 'regulatory', 'misc_binding', 'rRNA', 'tRNA', 'ncRNA', 'gene', 'tmRNA', 'source']


In [10]:
# Create an empty list to store the counts

allfeature_count = []

In [11]:
# for loop to iterate over the files and count the features

for gfile in gfiles:
    directory,filename = os.path.split(gfile)
    filename = filename.strip('.gb')
    feature_types = read_file(gfile)
    feature_count = count_features(feature_types)
    temp_count=[]

    temp_count.append(filename)

    for feature in allfeatures:
        if feature in feature_count.keys():
            temp_count.append(feature_count[feature])
        else:
            temp_count.append(0)

    allfeature_count.append(temp_count)


Features have been counted
Features have been counted
Features have been counted
Features have been counted
Features have been counted


In [12]:
len(allfeature_count)

5

In [13]:
allfeature_count[0]

['AR465', 2740, 0, 0, 19, 60, 0, 2819, 0, 1]

In [14]:
allfeature_count[1]

['M48', 3171, 10, 0, 16, 60, 3, 3251, 1, 1]

In [15]:
print(allfeatures)

['CDS', 'regulatory', 'misc_binding', 'rRNA', 'tRNA', 'ncRNA', 'gene', 'tmRNA', 'source']


In [16]:
allfeature_count

[['AR465', 2740, 0, 0, 19, 60, 0, 2819, 0, 1],
 ['M48', 3171, 10, 0, 16, 60, 3, 3251, 1, 1],
 ['P10', 3179, 10, 0, 8, 59, 3, 3250, 1, 1],
 ['R50', 2935, 10, 0, 10, 59, 3, 3008, 1, 1],
 ['V521', 3114, 0, 10, 16, 60, 3, 3194, 1, 1]]

In [17]:
# Create Columns for the dataframe

columns = []
columns.append('File')
columns.extend(allfeatures)

In [18]:
columns

['File',
 'CDS',
 'regulatory',
 'misc_binding',
 'rRNA',
 'tRNA',
 'ncRNA',
 'gene',
 'tmRNA',
 'source']

In [19]:
# Create dataframe using pandas

dataframe = pd.DataFrame(allfeature_count, columns=columns)

In [21]:
dataframe

Unnamed: 0,File,CDS,regulatory,misc_binding,rRNA,tRNA,ncRNA,gene,tmRNA,source
0,AR465,2740,0,0,19,60,0,2819,0,1
1,M48,3171,10,0,16,60,3,3251,1,1
2,P10,3179,10,0,8,59,3,3250,1,1
3,R50,2935,10,0,10,59,3,3008,1,1
4,V521,3114,0,10,16,60,3,3194,1,1


In [22]:
# set index to 'File'

final_data = dataframe.set_index('File')

In [23]:
final_data

Unnamed: 0_level_0,CDS,regulatory,misc_binding,rRNA,tRNA,ncRNA,gene,tmRNA,source
File,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AR465,2740,0,0,19,60,0,2819,0,1
M48,3171,10,0,16,60,3,3251,1,1
P10,3179,10,0,8,59,3,3250,1,1
R50,2935,10,0,10,59,3,3008,1,1
V521,3114,0,10,16,60,3,3194,1,1


In [24]:
# removing some columns from dataframe

columns_to_delete = ['source', 'misc_binding', 'regulatory']
final_data.drop(columns=columns_to_delete, inplace=True)

In [25]:
final_data

Unnamed: 0_level_0,CDS,rRNA,tRNA,ncRNA,gene,tmRNA
File,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AR465,2740,19,60,0,2819,0
M48,3171,16,60,3,3251,1
P10,3179,8,59,3,3250,1
R50,2935,10,59,3,3008,1
V521,3114,16,60,3,3194,1


In [26]:
final_data.loc['V521',:]

CDS      3114
rRNA       16
tRNA       60
ncRNA       3
gene     3194
tmRNA       1
Name: V521, dtype: int64

In [27]:
final_data.loc['V521', 'gene']

3194

In [28]:
# To save final_data dataframe in outputfile define the output filename

outputfile = "D:/Bionome Internship/Bioinformatics Practicals/BioPython/GB Files/FeatureCount.csv"



In [29]:
# save the final_data to a csv file format

final_data.to_csv(outputfile,index=True)