#### Calculate GC Content of a sequence in a single fasta file

**Activities**
- Download fasta file from NCBI Database
- Read fasta file
- Calculate GC Content

**Python Library**
- Biopython

In [1]:
filename = "D:/Bionome Internship/Bioinformatics Practicals/BioPython/sequence_mycobacterium.fasta"

In [3]:
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction

In [4]:
seq_object = SeqIO.read(filename, "fasta")
sequence = seq_object.seq

In [5]:
len(sequence)

5631606

In [7]:
print(sequence[0:20])

TTGACCAATGACCCCGGTTC


In [13]:
gc_content = gc_fraction(sequence)
print(gc_content)

0.6547269109380166


In [15]:
print(gc_content * 100)

65.47269109380166


In [17]:
round(gc_content * 100, 2)

65.47

In [25]:
# Create our own custom sequence
sequence2 = "AGCCTAC"
gc_fraction(sequence2)

0.5714285714285714

#### Calculate GC Content of sequences in a multi-FASTA file

**Python Libraries**
- Biopython ( pip install biopython --user )
- Pandas ( pip install pandas --user )

In [65]:
# Import Libraries

from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
import pandas as pd

In [66]:
# Set file path

filepath = "D:/Bionome Internship/Bioinformatics Practicals/BioPython/multi-fasta.fasta"

In [67]:
# Read Fasta File

seq_objects = SeqIO.parse(filepath, 'fasta')

In [68]:
# Parse straight from the file instead of draining `seq_objects`: that iterator
# is exhausted after one pass, so re-running this cell on its own would
# otherwise produce an empty list.
sequences = list(SeqIO.parse(filepath, 'fasta'))

In [69]:
number_of_sequences = len(sequences)
print(number_of_sequences)

3


In [74]:
# Report the GC percentage (two decimals) of every record in the multi-FASTA file.
for seq in sequences:
    seq_id, sequence = seq.id, seq.seq
    gc_content = 100 * gc_fraction(sequence)
    print(seq_id, round(gc_content, 2))

SeqID_01 34.58
SeqID_02 34.21
SeqID_03 40.91


In [75]:
# Using Pandas to store analysis result

# Parallel lists: seq_ids[i] is the record ID whose GC percentage is gc_contents[i].
seq_ids, gc_contents = [], []

for seq in sequences:
    seq_id, sequence = seq.id, seq.seq
    # GC fraction expressed as a percentage, rounded to two decimals.
    gc_content = round(gc_fraction(sequence) * 100, 2)

    seq_ids.append(seq_id)
    gc_contents.append(gc_content)
    print("GC Content has been computed")

GC Content has been computed
GC Content has been computed
GC Content has been computed


In [76]:
print(seq_ids)

['SeqID_01', 'SeqID_02', 'SeqID_03']


In [77]:
print(gc_contents)

[34.58, 34.21, 40.91]


In [78]:
print(seq_ids[0])
print(gc_contents[0])

SeqID_01
34.58


In [79]:
# Create pandas dataframe

# Build the two-column results table in a single constructor call.
dataframe = pd.DataFrame({
    'Sequence_ID': seq_ids,
    'GC_Content': gc_contents,
})

In [80]:
print(dataframe.shape)

(3, 2)


In [81]:
print(dataframe)

  Sequence_ID  GC_Content
0    SeqID_01       34.58
1    SeqID_02       34.21
2    SeqID_03       40.91


In [84]:
# Save dataframe to output file

output_file = "D:/Bionome Internship/Bioinformatics Practicals/BioPython/gc_content.csv"
dataframe.to_csv(output_file, index=False)

#### GC Content of sequences in multiple fasta files

- Download multiple fasta files
- Use loops to
    - read files
    - calculate GC content
- Save result to a file

- Required Libraries
    - Biopython
    - Pandas

In [4]:
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
import pandas as pd
import os
import glob

In [5]:
# Set directory of fasta files

file_directory = "D:/Bionome Internship/Bioinformatics Practicals/BioPython/FastaFiles"

In [6]:
# Sort the matches: glob returns them in arbitrary, filesystem-dependent order,
# and the rows of the results table are produced by iterating over this list.
fasta_files = sorted(glob.glob(os.path.join(file_directory, '*.fasta')))

In [7]:
len(fasta_files)

5

In [8]:
fasta_files[0]

'D:/Bionome Internship/Bioinformatics Practicals/BioPython/FastaFiles\\AR465.fasta'

In [9]:
print(fasta_files)

['D:/Bionome Internship/Bioinformatics Practicals/BioPython/FastaFiles\\AR465.fasta', 'D:/Bionome Internship/Bioinformatics Practicals/BioPython/FastaFiles\\M48.fasta', 'D:/Bionome Internship/Bioinformatics Practicals/BioPython/FastaFiles\\P10.fasta', 'D:/Bionome Internship/Bioinformatics Practicals/BioPython/FastaFiles\\R50.fasta', 'D:/Bionome Internship/Bioinformatics Practicals/BioPython/FastaFiles\\V521.fasta']


In [11]:
# Create a function for the sequence analysis codes

def calculate_gc(fasta):
    """Print the name and GC percentage of the record in a FASTA file.

    Parameters
    ----------
    fasta : str
        Path to a FASTA file. It is read with ``SeqIO.read``, so it is
        expected to contain a single record.

    Prints the file's base name without its extension, then the GC content
    as a percentage rounded to two decimals. Returns ``None``.
    """
    seq_obj = SeqIO.read(fasta, 'fasta')
    sequence = seq_obj.seq

    gc_content = round(gc_fraction(sequence) * 100, 2)

    # str.strip('.fasta') removes any of the characters ".fast" from BOTH ends
    # (e.g. "sta_01.fasta" -> "_01"); splitext drops only the extension.
    filename = os.path.splitext(os.path.basename(fasta))[0]

    # Without these prints the computed values are discarded and calling the
    # function has no visible effect.
    print(filename)
    print(gc_content)

In [12]:
for fasta in fasta_files:
    calculate_gc(fasta)

AR465
32.92
M48
32.81
P10
32.71
R50
32.91
V521
32.81


In [14]:
filenames = []
gc_contents = []

In [15]:
# Create a function for the sequence analysis codes

def calculate_gc(fasta):
    """Compute the GC percentage of the record in a FASTA file.

    Parameters
    ----------
    fasta : str
        Path to a FASTA file. It is read with ``SeqIO.read``, so it is
        expected to contain a single record.

    Returns
    -------
    tuple
        ``(filename, gc_content)``: the file's base name without its
        extension, and the GC content as a percentage rounded to two decimals.
    """
    seq_obj = SeqIO.read(fasta, 'fasta')
    sequence = seq_obj.seq

    gc_content = round(gc_fraction(sequence) * 100, 2)

    # str.strip('.fasta') removes any of the characters ".fast" from BOTH ends
    # (e.g. "sta_01.fasta" -> "_01"); splitext drops only the extension.
    filename = os.path.splitext(os.path.basename(fasta))[0]

    print("GC Content has been calculated")
    return filename, gc_content

In [16]:
# Start from empty lists so that re-running this cell does not append a
# second copy of every result.
filenames = []
gc_contents = []

for fasta in fasta_files:
    filename, gc_content = calculate_gc(fasta)

    filenames.append(filename)
    gc_contents.append(gc_content)

GC Content has been calculated
GC Content has been calculated
GC Content has been calculated
GC Content has been calculated
GC Content has been calculated


In [17]:
len(filenames)

5

In [18]:
len(gc_contents)

5

In [19]:
filenames[0]

'AR465'

In [20]:
gc_contents[0]

32.92

In [21]:
# Using Pandas to store analysis result

# Build the per-file results table in a single constructor call.
dataframe = pd.DataFrame({
    'filename': filenames,
    'gc_content': gc_contents,
})

In [23]:
dataframe.shape

(5, 2)

In [24]:
dataframe.head()

Unnamed: 0,filename,gc_content
0,AR465,32.92
1,M48,32.81
2,P10,32.71
3,R50,32.91
4,V521,32.81


In [25]:
# Save dataframe to the output file

outputfile = "D:/Bionome Internship/Bioinformatics Practicals/BioPython/FastaFiles/gc_content.csv"


In [27]:
dataframe.to_csv(outputfile, index=False)