# Biopython demo
Biopython is a set of tools for bioinformaticians.

More info here: http://biopython.org/wiki/Main_Page

You can download Biopython here: http://biopython.org/wiki/Download

Documentation: http://biopython.org/wiki/Documentation

Tutorial is available here: http://biopython.org/DIST/docs/tutorial/Tutorial.html or here: http://biopython.org/wiki/Getting_Started

In [None]:
# to install biopython in anaconda
!conda install biopython

In [1]:
# you can import Biopython by including the following line in your code
import Bio

## Working with sequence data

In [23]:
import Bio.Seq

# Build a sequence object through the fully qualified class name.
my_seq = Bio.Seq.Seq('CATGTAGACTAG')
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(my_seq)

CATGTAGACTAG


In [2]:
from Bio.Seq import Seq
# Create a sequence object using the directly imported Seq class.
my_seq = Seq('CATGTAGACTAG')
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(my_seq)

CATGTAGACTAG


In [3]:
# Show which class the sequence object is an instance of.
type(my_seq)

Bio.Seq.Seq

In [4]:
# Print some details about the sequence: its length, its reverse
# complement and its protein translation.
# print(...) with one formatted string works on both Python 2 and Python 3.
print('seq %s is %i bases long' % (my_seq, len(my_seq)))
print('reverse complement is %s' % my_seq.reverse_complement())
print('protein translation is %s' % my_seq.translate())

seq CATGTAGACTAG is 12 bases long
reverse complement is CTAGTCTACATG
protein translation is HVD*


## Working with files in FASTA format

![caption](orchid.jpg)

In [5]:
from Bio import SeqIO
# For each record in the FASTA file print: 1. sequence ID, 2. sequence length.
# Both lines use the print(...) call form so the cell is consistent and
# runs on Python 2 and Python 3 alike.
for seq_record in SeqIO.parse("ls_orchid.fasta", "fasta"):
    print(seq_record.id)
    print(len(seq_record))

gi|2765658|emb|Z78533.1|CIZ78533
740
gi|2765657|emb|Z78532.1|CCZ78532
753
gi|2765656|emb|Z78531.1|CFZ78531
748
gi|2765655|emb|Z78530.1|CMZ78530
744
gi|2765654|emb|Z78529.1|CLZ78529
733
gi|2765652|emb|Z78527.1|CYZ78527
718
gi|2765651|emb|Z78526.1|CGZ78526
730
gi|2765650|emb|Z78525.1|CAZ78525
704
gi|2765649|emb|Z78524.1|CFZ78524
740
gi|2765648|emb|Z78523.1|CHZ78523
709
gi|2765647|emb|Z78522.1|CMZ78522
700
gi|2765646|emb|Z78521.1|CCZ78521
726
gi|2765645|emb|Z78520.1|CSZ78520
753
gi|2765644|emb|Z78519.1|CPZ78519
699
gi|2765643|emb|Z78518.1|CRZ78518
658
gi|2765642|emb|Z78517.1|CFZ78517
752
gi|2765641|emb|Z78516.1|CPZ78516
726
gi|2765640|emb|Z78515.1|MXZ78515
765
gi|2765639|emb|Z78514.1|PSZ78514
755
gi|2765638|emb|Z78513.1|PBZ78513
742
gi|2765637|emb|Z78512.1|PWZ78512
762
gi|2765636|emb|Z78511.1|PEZ78511
745
gi|2765635|emb|Z78510.1|PCZ78510
750
gi|2765634|emb|Z78509.1|PPZ78509
731
gi|2765633|emb|Z78508.1|PLZ78508
741
gi|2765632|emb|Z78507.1|PLZ78507
740
gi|2765631|emb|Z78506.1|PLZ78506
727
g

In [6]:
# Read every FASTA record into a list so individual records can be indexed.
# The with-block closes the file even if parsing raises. The "U" mode flag
# is dropped: Python 3 text mode already uses universal newlines and
# Python 3.11+ rejects "rU" outright.
with open("ls_orchid.fasta") as handle:
    records = list(SeqIO.parse(handle, "fasta"))
print(type(records))
print(type(records[0]))

<type 'list'>
<class 'Bio.SeqRecord.SeqRecord'>


In [7]:
# IDs of the first and the last record in the list.
# print(...) call form runs on both Python 2 and Python 3.
print(records[0].id)   # first record
print(records[-1].id)  # last record

gi|2765658|emb|Z78533.1|CIZ78533
gi|2765564|emb|Z78439.1|PBZ78439


## Working with files in GenBank format

In [8]:
from Bio import SeqIO
# Print every record of the GenBank file.
# The with-block guarantees the file is closed even if parsing or printing
# raises. The "U" mode flag is dropped because Python 3 text mode already
# uses universal newlines and Python 3.11+ rejects "rU".
with open("ls_orchid.gbk") as input_handle:
    for record in SeqIO.parse(input_handle, "genbank"):
        print(record)

ID: Z78533.1
Name: Z78533
Description: C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA.
Number of features: 5
/sequence_version=1
/source=Cypripedium irapeanum
/taxonomy=['Eukaryota', 'Viridiplantae', 'Streptophyta', 'Embryophyta', 'Tracheophyta', 'Spermatophyta', 'Magnoliophyta', 'Liliopsida', 'Asparagales', 'Orchidaceae', 'Cypripedioideae', 'Cypripedium']
/keywords=['5.8S ribosomal RNA', '5.8S rRNA gene', 'internal transcribed spacer', 'ITS1', 'ITS2']
/references=[Reference(title='Phylogenetics of the slipper orchids (Cypripedioideae: Orchidaceae): nuclear rDNA ITS sequences', ...), Reference(title='Direct Submission', ...)]
/accessions=['Z78533']
/data_file_division=PLN
/date=30-NOV-2006
/organism=Cypripedium irapeanum
/gi=2765658
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC', IUPACAmbiguousDNA())
ID: Z78532.1
Name: Z78532
Description: C.californicum 5.8S rRNA gene and ITS1 and ITS2 DNA.
Number of features: 5
/sequence_version=1
/source=Cypripedium californicum

In [9]:
# We can convert from GenBank format to FASTA.
# SeqIO.convert returns a count, which is reported below.
count = SeqIO.convert("ls_orchid.gbk", "genbank", "ls_orchid_converted.fasta", "fasta")
# print(...) call form runs on both Python 2 and Python 3.
print("Converted %i records" % count)

Converted 94 records


In [10]:
# Load one record from a GenBank file with SeqIO.read.
record = SeqIO.read("NC_005816.gb", "genbank")
# print(...) call form runs on both Python 2 and Python 3.
print(record)

ID: NC_005816.1
Name: NC_005816
Description: Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence.
Database cross-references: Project:58037
Number of features: 41
/comment=PROVISIONAL REFSEQ: This record has not yet been subject to final
NCBI review. The reference sequence was derived from AE017046.
COMPLETENESS: full length.
/sequence_version=1
/source=Yersinia pestis biovar Microtus str. 91001
/taxonomy=['Bacteria', 'Proteobacteria', 'Gammaproteobacteria', 'Enterobacteriales', 'Enterobacteriaceae', 'Yersinia']
/keywords=['']
/references=[Reference(title='Genetics of metabolic variations between Yersinia pestis biovars and the proposal of a new biovar, microtus', ...), Reference(title='Complete genome sequence of Yersinia pestis strain 91001, an isolate avirulent to humans', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)]
/accessions=['NC_005816']
/data_file_division=BCT
/date=21-JUL-2008
/organism=Yersinia pestis biov

In [11]:
# Individual fields of the loaded record: sequence, identifier, description.
# print(...) call form runs on both Python 2 and Python 3.
print(record.seq)
print(record.id)
print(record.description)

TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGGGGGTAATCTGCTCTCCTGATTCAGGAGAGTTTATGGTCACTTTTGAGACAGTTATGGAAATTAAAATCCTGCACAAGCAGGGAATGAGTAGCCGGGCGATTGCCAGAGAACTGGGGATCTCCCGCAATACCGTTAAACGTTATTTGCAGGCAAAATCTGAGCCGCCAAAATATACGCCGCGACCTGCTGTTGCTTCACTCCTGGATGAATACCGGGATTATATTCGTCAACGCATCGCCGATGCTCATCCTTACAAAATCCCGGCAACGGTAATCGCTCGCGAGATCAGAGACCAGGGATATCGTGGCGGAATGACCATTCTCAGGGCATTCATTCGTTCTCTCTCGGTTCCTCAGGAGCAGGAGCCTGCCGTTCGGTTCGAAACTGAACCCGGACGACAGATGCAGGTTGACTGGGGCACTATGCGTAATGGTCGCTCACCGCTTCACGTGTTCGTTGCTGTTCTCGGATACAGCCGAATGCTGTACATCGAATTCACTGACAATATGCGTTATGACACGCTGGAGACCTGCCATCGTAATGCGTTCCGCTTCTTTGGTGGTGTGCCGCGCGAAGTGTTGTATGACAATATGAAAACTGTGGTTCTGCAACGTGACGCATATCAGACCGGTCAGCACCGGTTCCATCCTTCGCTGTGGCAGTTCGGCAAGGAGATGGGCTTCTCTCCCCGACTGTGTCGCCCCTTCAGGGCACAGACTAAAGGTAAGGTGGAACGGATGGTGCAGTACACCCGTAACAGTTTTTACATCCCACTAATGACTCGCCTGCGCCCGATGGGGATCACTGTCGATGTTGAAACAGCCAACCGCCACGGTCTGCGCTGGCTGCACGATGTCGCTAACCAACGAAAGCATGAAACAATCCAGGCCCGTCCCTGCGATCGCTGGCTCGAAGAGCAGCAGTCCATGCTGGCACTGCCTCCGGA

## Example: count GC%

In [12]:
# GC percentage using only built-in string methods (no Biopython)
my_seq = 'GATCGATGGGCCTATATAGGATCGAAAATCGC'
# Count G and C occurrences, then express them as a percentage of the length.
100 * float(sum(my_seq.count(base) for base in "GC")) / len(my_seq)

46.875

In [13]:
# with Biopython: the GC helper from Bio.SeqUtils (SeqIO is not involved here)
from Bio.Seq import Seq
# NOTE(review): Bio.Alphabet appears to have been removed in newer Biopython
# releases (1.78+) — confirm against the installed version.
from Bio.Alphabet import IUPAC
from Bio.SeqUtils import GC
my_seq = Seq('GATCGATGGGCCTATATAGGATCGAAAATCGC', IUPAC.unambiguous_dna)
GC(my_seq)

46.875

## Example: Transcription

In [14]:
# Transcription is done from template DNA.
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna)
# print(...) call form runs on both Python 2 and Python 3.
print(coding_dna)
# The template strand is the reverse complement of the coding strand.
template_dna = coding_dna.reverse_complement()
print(template_dna)

ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG
CTATCGGGCACCCTTTCAGCGGCCCATTACAATGGCCAT


In [15]:
# Transcribe DNA from the coding strand.
messenger_rna_coding = coding_dna.transcribe()
# print(...) call form runs on both Python 2 and Python 3.
print(messenger_rna_coding)
# Transcribe DNA as it is done in the cell - from the template strand.
# The template is reverse-complemented back to the coding strand first.
messenger_rna_template = template_dna.reverse_complement().transcribe()
print(messenger_rna_template)

AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG
AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG


In [None]:
import inspect
# Display the source code of the transcribe method.
# inspect.getsource returns the source as one string, replacing the manual
# join over getsourcelines(); print(...) runs on Python 2 and Python 3.
print(inspect.getsource(coding_dna.transcribe))

## Example: Translation

In [16]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", IUPAC.unambiguous_rna)
# Translate RNA using different translation tables: the default table first,
# then two tables selected by name.
# print(...) call form runs on both Python 2 and Python 3.
print(messenger_rna.translate())
print(messenger_rna.translate(table="Vertebrate Mitochondrial"))
print(messenger_rna.translate(table="Bacterial"))

MAIVMGR*KGAR*
MAIVMGRWKGAR*
MAIVMGR*KGAR*


In [17]:
# Translate directly from coding DNA (no explicit transcription step).
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna)
# print(...) call form runs on both Python 2 and Python 3.
print(coding_dna.translate())

MAIVMGR*KGAR*


In [18]:
# Customize the stop symbol in the output via the stop_symbol argument.
# print(...) call form runs on both Python 2 and Python 3.
print(coding_dna.translate(table="Vertebrate Mitochondrial", stop_symbol="STOP"))

MAIVMGRWKGARSTOP


## Translation tables

In [19]:
# Load codon tables from the Bio.Data module, looked up by table name
from Bio.Data import CodonTable
standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
mito_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]

In [20]:
# Print the standard codon table
print(standard_table)

Table 1 Standard, SGC0

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I   | ACT T   | AAT N   | AGT S   | T
A | ATC I   | ACC T   | AAC N   | AGC S   | C
A | ATA I   | ACA T   | AAA K   | AGA R   | A
A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V   | GCG A   | GAG E   | GGG G   | G
--+---------

In [21]:
# Print the codon table for vertebrate mitochondria
print(mito_table)

Table 2 Vertebrate Mitochondrial, SGC1

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA W   | A
T | TTG L   | TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L   | CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I(s)| ACT T   | AAT N   | AGT S   | T
A | ATC I(s)| ACC T   | AAC N   | AGC S   | C
A | ATA M(s)| ACA T   | AAA K   | AGA Stop| A
A | ATG M(s)| ACG T   | AAG K   | AGG Stop| G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V(s)| GCG A   | GAG E   | GGG G   