# Chapters 2 and 3

## Reading fasta files

In [8]:
from Bio import SeqIO
# SeqIO.parse yields records lazily; wrapping it in list() keeps every record
# around so it can be sliced here and reused afterwards.
fastas = list(SeqIO.parse("files/ls_orchid.fasta","fasta"))

# Preview the first five records: identifier, repr of the sequence, and
# record length, each on its own line.
for seq_record in fastas[:5]:
    print(seq_record.id, repr(seq_record.seq), len(seq_record), sep="\n")

gi|2765658|emb|Z78533.1|CIZ78533
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC', SingleLetterAlphabet())
740
gi|2765657|emb|Z78532.1|CCZ78532
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAG...GGC', SingleLetterAlphabet())
753
gi|2765656|emb|Z78531.1|CFZ78531
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGCAG...TAA', SingleLetterAlphabet())
748
gi|2765655|emb|Z78530.1|CMZ78530
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAAACAACAT...CAT', SingleLetterAlphabet())
744
gi|2765654|emb|Z78529.1|CLZ78529
Seq('ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAA...AAA', SingleLetterAlphabet())
733


## Sequence objects

In [11]:
from Bio.Seq import Seq
# Build a Seq from a plain string; no alphabet argument is supplied here.
my_seq = Seq("AGTACACTGGT")
# Bare last expression so the notebook displays the object's repr.
my_seq

Seq('AGTACACTGGT')

In [12]:
from Bio.Alphabet import IUPAC
# Same letters as my_seq, but explicitly tagged with the IUPAC protein alphabet.
my_prot = Seq("AGTACACTGGT", IUPAC.protein)
# Bare last expression so the notebook displays the object's repr.
my_prot

Seq('AGTACACTGGT', IUPACProtein())

### Sequences as strings

In [14]:
# A Seq can be iterated like a string: print each zero-based index next to
# the letter found at that position.
for index, letter in enumerate(my_seq):
    print("%i %s" % (index, letter))

0 A
1 G
2 T
3 A
4 C
5 A
6 C
7 T
8 G
9 G
10 T


In [15]:
# len() works on a Seq just as it does on a string.
print(len(my_seq))

11


In [17]:
# Count how many times the dinucleotide "GT" occurs in my_seq.
my_seq.count("GT")

2

In [20]:
from Bio.Alphabet import IUPAC
from Bio.SeqUtils import GC
# Rebind my_seq to a longer sequence tagged as unambiguous DNA; the cells
# that follow operate on this value.
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna)
# GC content as a percentage (15 of the 32 letters are G or C).
GC(my_seq)

46.875

### Slicing sequence

In [22]:
# Slice indices 4 through 11 (the end index is exclusive); the result is a
# new Seq that keeps the alphabet of my_seq.
my_seq[4:12]

Seq('GATGGGCC', IUPACUnambiguousDNA())

The code below returns every third nucleotide, starting from the first one (index 0).

In [24]:
# Start at index 0 and take a step of 3: letters at indices 0, 3, 6, ...
my_seq[0::3]

Seq('GCTGTAGTAAG', IUPACUnambiguousDNA())

The Python slicing idiom `[::-1]` returns the reverse of a string (or, in our case, of a sequence).

In [26]:
# A step of -1 walks the sequence backwards. This is a plain reversal,
# not a reverse complement.
my_seq[::-1]

Seq('CGCTAAAAGCTAGGATATATCCGGGTAGCTAG', IUPACUnambiguousDNA())

### Concatenation

Protein and DNA sequences cannot be concatenated, because their alphabets are incompatible.

In [28]:
# Adding a protein Seq to a DNA Seq raises TypeError. The error is the point of
# this cell, so catch it and print it in the same "TypeError: ..." form instead
# of leaving an uncaught exception that would halt a Restart & Run All.
try:
    my_prot + my_seq
except TypeError as err:
    print("TypeError: %s" % err)

TypeError: Incompatible alphabets IUPACProtein() and IUPACUnambiguousDNA()

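`sum` is not misbehaving here: its signature is `sum(iterable, start)`, so `sum(seq1, seq2)` uses `seq2` as the start value and then adds each letter of `seq1` to it, which gives `seq2 + seq1`. To join two sequences, `+` is the better choice. (*In the book, `sum` is applied to a list of sequences, so the order of the elements is preserved.*)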

In [30]:
seq1 = Seq("AGTC")
seq2 = Seq("TTTT")
# sum(iterable, start): seq2 is the *start* value and seq1 is the iterable, so
# each letter of seq1 is appended to seq2 in turn (TTTT + A + G + T + C).
# The result is therefore seq2 followed by seq1, not seq1 followed by seq2.
sum(seq1,seq2)

Seq('TTTTAGTC')

### Case and (reverse) complement

In [34]:
# Lower-case copy of the sequence; my_seq itself is not modified. The result
# is displayed with a more generic alphabet than the original.
my_seq.lower()

Seq('gatcgatgggcctatataggatcgaaaatcgc', DNAAlphabet())

In [37]:
# Complement every base and reverse the order, giving the opposite strand.
my_seq.reverse_complement()

Seq('GCGATTTTCGATCCTATATAGGCCCATCGATC', IUPACUnambiguousDNA())

In [38]:
# Methods chain: lower-case first, then complement each base. The order is
# kept (no reversal) and the letters stay lower-case.
my_seq.lower().complement()

Seq('ctagctacccggatatatcctagcttttagcg', DNAAlphabet())

## Transcription and translation

In [41]:
# Coding strand, tagged as unambiguous DNA.
coding_dna = Seq(
    "ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG",
    IUPAC.unambiguous_dna,
)
# Transcribing the coding strand yields the mRNA.
messenger_rna = coding_dna.transcribe()
# The template strand is the reverse complement of the coding strand.
template_dna = coding_dna.reverse_complement()
# Show coding strand, template strand and mRNA, one per line.
print(
    "%s\n%s\n%s"
    % (coding_dna, template_dna, messenger_rna)
)

ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG
CTATCGGGCACCCTTTCAGCGGCCCATTACAATGGCCAT
AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG


In [43]:
# Starting from the template strand: reverse-complement it to get back the
# coding strand, then transcribe that into mRNA.
template_dna.reverse_complement().transcribe()

Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())

Or, equivalently, transcribe the coding strand directly:

In [45]:
# Transcribing the coding strand directly gives the same mRNA.
coding_dna.transcribe()

Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())

Translation can likewise be applied directly to the coding DNA:

In [50]:
# Translate the coding DNA to protein; stop codons appear as "*" in the result.
coding_dna.translate()    # no need for coding_dna.transcribe().translate()

Seq('MAIVMGR*KGAR*', HasStopCodon(IUPACProtein(), '*'))

In [51]:
# to_stop=True ends translation at the first stop codon and leaves the "*"
# out of the result.
coding_dna.translate(to_stop=True)

Seq('MAIVMGR', IUPACProtein())

## MutableSeq objects

Seq objects are immutable: assigning to an item raises a `TypeError`.

In [53]:
# Item assignment on an immutable Seq raises TypeError. The error is the point
# of this cell, so catch it and print it in the same "TypeError: ..." form
# instead of leaving an uncaught exception that would halt a Restart & Run All.
try:
    my_seq[5] = "G"
except TypeError as err:
    print("TypeError: %s" % err)

TypeError: 'Seq' object does not support item assignment

In [56]:
# Get an editable copy of the sequence; my_seq itself stays untouched.
mutable_seq = my_seq.tomutable()
# Item assignment is allowed on the mutable copy.
mutable_seq[5] = "C"
# Print original and edited copy on separate lines; they differ at index 5.
print("%s\n%s" % (my_seq,mutable_seq))

GATCGATGGGCCTATATAGGATCGAAAATCGC
GATCGCTGGGCCTATATAGGATCGAAAATCGC


In-place methods such as `remove()` and `reverse()` can be used with MutableSeq objects.

A MutableSeq object can be converted back to an immutable Seq object with the `toseq()` method (e.g. `new_seq = mutable_seq.toseq()`).