In [13]:
import Bio
from Bio import SeqIO
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

print("Biopython version: ", Bio.__version__)

Biopython version:  1.73


# Tutorial

- Files in the supported formats can be iterated over record by record or indexed and accessed via a Dictionary interface.
- A standard sequence class that deals with sequences, ids on sequences, and sequence features.
- Tools for performing common operations on sequences, such as translation, transcription and weight calculations.
- GUI-based programs to do basic sequence manipulations, translations, BLASTing, etc.
- Integration with [BioSQL](https://biosql.org/wiki/Main_Page), a [sequence database schema](https://biosql.org/wiki/Schema_Overview) also supported by the BioPerl and BioJava projects.
- See also [genbank schema for accession, GI and version](https://www.ncbi.nlm.nih.gov/Class/MLACourse/Modules/Format/exercises/qa_accession_vs_gi.html)

# Bio.SeqIO and Bio.AlignIO
- What file formats do Bio.SeqIO and Bio.AlignIO read and write?
Check the built in docstrings (from Bio import SeqIO, then help(SeqIO)), or see http://biopython.org/wiki/SeqIO and http://biopython.org/wiki/AlignIO on the wiki for the latest listing.

# Bio.Blast
- Why doesn’t Bio.Blast work with the latest **plain text NCBI blast output**?
The NCBI keep tweaking the plain text output from the BLAST tools, and keeping our parser up to date is/was an ongoing struggle. If you aren’t using the latest version of Biopython, you could try upgrading. However, we (and the NCBI) recommend you use the **XML output** instead, which is designed to be read by a computer program.

# Quick Start – What can you do with Biopython?
## Working with sequences

### Seq object

Sequences are essentially strings of letters like AGTACACTGGT, which seems very natural since this is the most common way that sequences are seen in biological file formats.

Seq - has an alphabet type and methods

In [3]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq("AGTACACTGGT")
print(my_seq)
print(my_seq.alphabet)
print(my_seq.complement())
print(my_seq.reverse_complement(), "\n")

my_seq = Seq("AGTACACTGGT", IUPAC.unambiguous_dna)
print(my_seq)
print(my_seq.alphabet)
print(my_seq.complement())
print(my_seq.reverse_complement())

AGTACACTGGT
Alphabet()
TCATGTGACCA
ACCAGTGTACT 

AGTACACTGGT
IUPACUnambiguousDNA()
TCATGTGACCA
ACCAGTGTACT


## sequences act like strings

Note that just like a normal Python string, the Seq object is in some ways **“read-only”**. If you need to edit your sequence, for example simulating a point mutation, look at the Section 3.12 below which talks about the MutableSeq object.

In [12]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq("GATCG", IUPAC.unambiguous_dna)

for index, letter in enumerate (my_seq):
    print("%i %s" %(index, letter))
    
print("\nlength", len(my_seq))

print("my_seq[0]: ", my_seq[0], sep = "")

0 G
1 A
2 T
3 C
4 G

length 5
my_seq[0]: G


## count method

In [19]:
print(Seq("AAAA").count("AA"))
print(Seq("AAAA").count("A"))

my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna)
print("sequence length:", len(my_seq))
GC_count = 100 * float(my_seq.count("G") + my_seq.count("C")) / len(my_seq)
print(GC_count)

from Bio.SeqUtils import GC
GC_count = GC(my_seq)
print(GC_count)

2
4
sequence length: 32
46.875
46.875


## slicing a sequence

In [25]:
print(my_seq)
print("positions 4 to 12:", my_seq[4:12])

# pegar o primeiro codon:
print("all the first codons:", my_seq[0::3])

# inverter a sequência:
print("inverse sequence:", my_seq[::-1])

# another way to write the reverse:
print("inverse sequence:", my_seq.reverse_complement().complement())

GATCGATGGGCCTATATAGGATCGAAAATCGC
positions 4 to 12: GATGGGCC
all the first codons: GCTGTAGTAAG
inverse sequence: CGCTAAAAGCTAGGATATATCCGGGTAGCTAG
inverse sequence: CGCTAAAAGCTAGGATATATCCGGGTAGCTAG


## concatenating and adding sequences

In [31]:
#protein_seq = Seq("EVRNAK", IUPAC.protein)
#dna_seq = Seq("ACGT", IUPAC.unambiguous_dna)
#protein_seq + dna_seq

# alphabets must be the same
from Bio.Alphabet import generic_alphabet
protein_seq.alphabet = generic_alphabet
dna_seq.alphabet = generic_alphabet
protein_seq + dna_seq
Seq('EVRNAKACGT')

Seq('EVRNAKACGT')

In [35]:
from Bio.Alphabet import generic_dna
list_of_seqs = [Seq("ACGT", generic_dna), Seq("AACC", generic_dna), Seq("GGTT", generic_dna)]
concatenated = Seq("", generic_dna)

for s in list_of_seqs:
    concatenated += s
    
concatenated

Seq('ACGTAACCGGTT', DNAAlphabet())

## changing case

In [36]:
dna_seq = Seq("acgtACGT", generic_dna)
dna_seq

Seq('acgtACGT', DNAAlphabet())

In [37]:
dna_seq.upper()

Seq('ACGTACGT', DNAAlphabet())

In [38]:
dna_seq.lower()

Seq('acgtacgt', DNAAlphabet())

#### These are useful for doing case insensitive matching:

In [39]:
"GTAC" in dna_seq

False

In [40]:
"GTAC" in dna_seq.upper()

True

## Transcription!

biopyton works with the coding strand. This means it just switches U's for T's.

If you want to simulate the real transcription process using the template strand, you can use "transcribe reverse complement" on the template.
- Reverse: put it in 5'-3' direction.
- Complement: transform from template to coding strand.
- Transcribe: T's to U's.

In [42]:
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna)
coding_dna

Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())

In [43]:
template_dna = coding_dna.reverse_complement()
template_dna

Seq('CTATCGGGCACCCTTTCAGCGGCCCATTACAATGGCCAT', IUPACUnambiguousDNA())

In [44]:
messenger_rna = coding_dna.transcribe()
messenger_rna

Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())

In [45]:
template_dna.reverse_complement().transcribe()

Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())

### SeqRecord 
Holds a sequence (Seq object) with additional annotation (ID, name and description).
The **Bio.SeqIO** module for reading and writing sequence file formats works with **SeqRecord** objects.

In [22]:
from Bio import SeqIO
for seq_record in SeqIO.parse("ls_orchid.fasta", "fasta"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))

gi|2765658|emb|Z78533.1|CIZ78533
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC', SingleLetterAlphabet())
740
gi|2765657|emb|Z78532.1|CCZ78532
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAG...GGC', SingleLetterAlphabet())
753
gi|2765656|emb|Z78531.1|CFZ78531
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGCAG...TAA', SingleLetterAlphabet())
748
gi|2765655|emb|Z78530.1|CMZ78530
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAAACAACAT...CAT', SingleLetterAlphabet())
744
gi|2765654|emb|Z78529.1|CLZ78529
Seq('ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAA...AAA', SingleLetterAlphabet())
733
gi|2765652|emb|Z78527.1|CYZ78527
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...CCC', SingleLetterAlphabet())
718
gi|2765651|emb|Z78526.1|CGZ78526
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...TGT', SingleLetterAlphabet())
730
gi|2765650|emb|Z78525.1|CAZ78525
Seq('TGTTGAGATAGCAGAATATACATCGAGTGAATCCGGAGGACCTGTGGTTATTCG...GC

# Now testing with yellow fever sequences

In [23]:
for seq_record in SeqIO.parse("../DATA/sequencias_github_paper_Science/65_outbreak_YFV_all_edited.final.aln", "fasta"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))

NC|ES504|NHPrimate|DomingosMartins|EspiritoSanto|20-02-2017
Seq('ATGTCTGGTCGTAAAGCTCAGGGAAAAACCCTGGGCGTCAATATGGTTCGACGA...TGA', SingleLetterAlphabet())
10236
NC|ES505|NHPrimate|DomingosMartins|EspiritoSanto|22-02-2017
Seq('ATGTCTGGTCGTAAAGCTCAGGGAAAAACCCTGGGCGTCAATATGGTTCGACGA...TGA', SingleLetterAlphabet())
10236
FioRJ|Library9YIBRA_M218_2176|Primate|Bahia|10-03-2017
Seq('NNNNNNNNNNNNNNNNNNNNNNNNAAAACCCTGGGCGTCAATATGGTTCGACGA...---', SingleLetterAlphabet())
10236
FioRJ|460|Human|MinasGerais_NovoCruzeiro|30-01-2017
Seq('------------------------AAAACCCTGGGCGTCAATATGGTTCGACGA...---', SingleLetterAlphabet())
10236
FioRJ|1818|Human|EspiritoSanto_Cariacia|10-03-2017
Seq('------------------------AAAACCCTGGGCGTCAATATGGTTCGACGA...---', SingleLetterAlphabet())
10236
FioRJ|2115|Monkey|EspiritoSanto_Cariacia|09-03-2017
Seq('-----------------------AAAAACCCTGGGCGTCAATATGGTTCGACGA...---', SingleLetterAlphabet())
10236
FioRJ|3919|Human|EspiritoSanto_DomingosMartins|10-04-2017
Seq('-------------------

In [24]:
for seq_record in SeqIO.parse("ls_orchid.gbk", "genbank"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))

Z78533.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC', IUPACAmbiguousDNA())
740
Z78532.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAG...GGC', IUPACAmbiguousDNA())
753
Z78531.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGCAG...TAA', IUPACAmbiguousDNA())
748
Z78530.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAAACAACAT...CAT', IUPACAmbiguousDNA())
744
Z78529.1
Seq('ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAA...AAA', IUPACAmbiguousDNA())
733
Z78527.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...CCC', IUPACAmbiguousDNA())
718
Z78526.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...TGT', IUPACAmbiguousDNA())
730
Z78525.1
Seq('TGTTGAGATAGCAGAATATACATCGAGTGAATCCGGAGGACCTGTGGTTATTCG...GCA', IUPACAmbiguousDNA())
704
Z78524.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGATAGTAG...AGC', IUPACAmbiguousDNA())
740
Z78523.1
Seq('CGTAACCAGGTTTCCGTAGGTGAACCTGCGGCAGGATCATTGTTGAGACAGCAG...AAG', IUPAC

You have to parse the fasta file in order to read the sequences one by one.

How to store them in other data structures?

In [27]:
seqs = list(SeqIO.parse("../DATA/sequencias_github_paper_Science/65_outbreak_YFV_all_edited.final.aln", "fasta"))
print("Found %i records" % len(seqs))
#index = [rec.id for rec in seqs.rec]
seqs

Found 65 records


[SeqRecord(seq=Seq('ATGTCTGGTCGTAAAGCTCAGGGAAAAACCCTGGGCGTCAATATGGTTCGACGA...TGA', SingleLetterAlphabet()), id='NC|ES504|NHPrimate|DomingosMartins|EspiritoSanto|20-02-2017', name='NC|ES504|NHPrimate|DomingosMartins|EspiritoSanto|20-02-2017', description='NC|ES504|NHPrimate|DomingosMartins|EspiritoSanto|20-02-2017', dbxrefs=[]),
 SeqRecord(seq=Seq('ATGTCTGGTCGTAAAGCTCAGGGAAAAACCCTGGGCGTCAATATGGTTCGACGA...TGA', SingleLetterAlphabet()), id='NC|ES505|NHPrimate|DomingosMartins|EspiritoSanto|22-02-2017', name='NC|ES505|NHPrimate|DomingosMartins|EspiritoSanto|22-02-2017', description='NC|ES505|NHPrimate|DomingosMartins|EspiritoSanto|22-02-2017', dbxrefs=[]),
 SeqRecord(seq=Seq('NNNNNNNNNNNNNNNNNNNNNNNNAAAACCCTGGGCGTCAATATGGTTCGACGA...---', SingleLetterAlphabet()), id='FioRJ|Library9YIBRA_M218_2176|Primate|Bahia|10-03-2017', name='FioRJ|Library9YIBRA_M218_2176|Primate|Bahia|10-03-2017', description='FioRJ|Library9YIBRA_M218_2176|Primate|Bahia|10-03-2017', dbxrefs=[]),
 SeqRecord(seq=Seq('-----

# parei aqui




In [6]:
print("The last record")
last_record = seqs[-1] #using Python's list tricks
print(last_record.id)
print(repr(last_record.seq))
print(len(last_record))



The last record
MF170971|Monkey|MinasGerais_SaoRoqueDeMinas|NA|2017-01-30
Seq('ATGTCTGGTCGTAAAGCTCAGGGAAAAACCCTGGGCGTCAATATGGTTCGACGA...ATT', SingleLetterAlphabet())
10233


In [7]:
print("The first record")
first_record = seqs[0] #remember, Python counts from zero
print(first_record.id)
print(repr(first_record.seq))
print(len(first_record))

The first record
NC|ES504|NHPrimate|DomingosMartins|EspiritoSanto|20-02-2017
Seq('ATGTCTGGTCGTAAAGCTCAGGGAAAAACCCTGGGCGTCAATATGGTTCGACGA...TGA', SingleLetterAlphabet())
10236


In [8]:

seqs = pd.DataFrame(SeqIO.parse("65_outbreak_YFV_all_edited.final.aln", "fasta"))
print("Found %i records" % len(seqs))



Found 65 records


In [9]:
seqs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10226,10227,10228,10229,10230,10231,10232,10233,10234,10235
0,A,T,G,T,C,T,G,G,T,C,...,G,C,T,C,A,T,T,T,G,A
1,A,T,G,T,C,T,G,G,T,C,...,G,C,T,C,A,T,T,T,G,A
2,N,N,N,N,N,N,N,N,N,N,...,N,N,N,N,N,N,N,-,-,-
3,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
4,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
5,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
6,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
7,-,-,-,T,C,T,G,G,T,C,...,-,-,-,-,-,-,-,-,-,-
8,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
9,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-


In [10]:
seqs.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10226,10227,10228,10229,10230,10231,10232,10233,10234,10235
count,65,65,65,65,65,65,65,65,65,65,...,65,65,65,65,65,65,65,52,52,52
unique,4,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,2,2,2
top,N,N,G,T,C,T,G,G,T,C,...,N,N,N,N,N,N,N,-,-,-
freq,36,36,38,39,39,41,39,42,42,42,...,36,36,36,36,36,36,36,50,50,50


# Usage example
            
<p>Hi all, I need some help. In fact in the contexte of my work I have a dataframe of seq names paired together, in each row there is one column for the seq_id of the sp1 and one fore the seq _idof the sp2. In another hand I have two fasta files which contain all these sequences (same seq Id) + the sequences in fasta format. But in these files sequences are totaly mixed and what I need to do is to reorganize two new fasta file by  parsing my dataframe and say, ok for each row, put the seqx_A in fasta file 1 and seqx_B in fasta file 2. By keeping the order in the dataframe. Here is an exemple: </p>

<p>I actually have one dataframe with sequences in order such :</p>

<pre class="pre"><code class="language-bash">Seq_1.id    Seq_2.id
seq1_A     seq8_B
seq2_A     Seq9_B
seq3_A     Seq10_B
seq4_A     Seq11_B
</code></pre>

<p>and two fasta files such :
<code>first one</code>:</p>

<pre class="pre"><code class="language-bash">&gt;Seq11_B
ACTG
&gt;seq8_B
ATGC
&gt;seq3_A
ACTG
&gt;seq2_A
ATGC

second one: 
&gt;seq4_A
 ACTG
&gt;seq1_A
 ACTG
&gt;Seq10_B
 ATGC
&gt;Seq9_B
 ATCG
</code></pre>

<p>As you can see _A and _B are mixed in bot fasta file but I would like to order my fastafiles by creating a new ones and put all seq A in a file and all seqB in another file in the same order as in the dataframe (paires sequence as always to be added in the same time in the file).
here would be the output of the exemple:</p>

<p>fasta1:</p>

<pre class="pre"><code class="language-bash">&gt;seq1_A
ATGC
&gt;seq2_A 
ATGG
&gt;seq3_A 
ATGC
&gt;seq4_A 
ATGC
</code></pre>

<p>and fasta2:</p>

<pre class="pre"><code class="language-bash">&gt;seq8_B
ATGc
&gt;Seq9_B
ATGC
&gt;Seq10_B
ATGC
&gt;Seq11_B
ATGC
</code></pre>

<p>Here would be the name of the files: </p>

<pre class="pre"><code class="language-bash">candidate_df.read_csv("dn_ds.out_test",sep='\t')

#--------------------------------------
#Load the sequences comming from the cluster filtering and range them into ordered files per species

#here is the two columns of the dataframe
seq1_id=candidate_df["seq1_id"]
seq2_id=candidate_df["seq2_id"]

#Here is the output desired files:
output_aa_sp1 = open('candidates_aa_0042.fasta','w')
output_aa_sp2 = open('candidates_aa_0035.fasta','w')


#Here are the 2 fasta file to be modified
record_dict_sp1_aa = SeqIO.to_dict(SeqIO.parse("result1_aa.fasta", "fasta"))
record_dict_sp2_aa = SeqIO.to_dict(SeqIO.parse("result2_aa.fasta", "fasta"))
</code></pre>

<p>Does someone have an idea?</p>

<p>Thank you :)</p>
</span>




In [None]:
import pandas as pd
from Bio import SeqIO

output_handle1 = open("new_fasta1.fasta", "a")
output_handle2 = open("new_fasta2.fasta", "a")

records1 = SeqIO.index("fasta1.fasta", "fasta")
records2 = SeqIO.index("fasta2.fasta", "fasta")

candidate_df=pd.read_csv("dispatch.csv",sep='\t')

for i in candidate_df['Seq_1.id']:
    if i in records1:
        SeqIO.write(records1[i], output_handle1, 'fasta')
    elif i in records2:
        SeqIO.write(records2[i], output_handle1, 'fasta')

for i in candidate_df['Seq_2.id']:
    if i in records1:
        SeqIO.write(records1[i], output_handle2, 'fasta')
    elif i in records2:
        SeqIO.write(records2[i], output_handle2, 'fasta')

# Another parsing and putting in dataframe

In [None]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
with open('sequences.fasta') as fasta_file:  # Will close handle cleanly
    identifiers = []
    lengths = []
    for title, sequence in SimpleFastaParser(fasta_file):
        identifiers.append(title.split(None, 1)[0])  # First word is ID
        lengths.append(len(sequence))

In [None]:
with open('sequences.fasta') as fasta_file:  # Will close handle cleanly
    identifiers = []
    lengths = []
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # (generator)
        identifiers.append(seq_record.id)
        lengths.append(len(seq_record.seq))