diff --git a/Doc/Tutorial.tex b/Doc/Tutorial.tex index 46b0d0c3cf6..5fd9d2322aa 100644 --- a/Doc/Tutorial.tex +++ b/Doc/Tutorial.tex @@ -80,7 +80,7 @@ \author{Jeff Chang, Brad Chapman, Iddo Friedberg, Thomas Hamelryck, \\ Michiel de Hoon, Peter Cock, Tiago Antao, Eric Talevich, Bartek Wilczy\'{n}ski} -\date{Last Update -- 5 September 2013 (Biopython 1.62+)} +\date{Last Update -- 7 September 2013 (Biopython 1.62+)} %Hack to get the logo at the start of the HTML front page: %(hopefully this isn't going to be too wide for most people) @@ -223,7 +223,7 @@ \section{Frequently Asked Questions (FAQ)} Use this: \begin{verbatim} >>> import Bio - >>> print Bio.__version__ + >>> print(Bio.__version__) ... \end{verbatim} If the ``\verb|import Bio|'' line fails, Biopython is not installed. @@ -350,7 +350,7 @@ \section{Working with sequences} >>> my_seq = Seq("AGTACACTGGT") >>> my_seq Seq('AGTACACTGGT', Alphabet()) ->>> print my_seq +>>> print(my_seq) AGTACACTGGT >>> my_seq.alphabet Alphabet() @@ -423,9 +423,9 @@ \subsection{Simple FASTA parsing example} \begin{verbatim} from Bio import SeqIO for seq_record in SeqIO.parse("ls_orchid.fasta", "fasta"): - print seq_record.id - print repr(seq_record.seq) - print len(seq_record) + print(seq_record.id) + print(repr(seq_record.seq)) + print(len(seq_record)) \end{verbatim} \noindent You should get something like this on your screen: @@ -449,9 +449,9 @@ \subsection{Simple GenBank parsing example} \begin{verbatim} from Bio import SeqIO for seq_record in SeqIO.parse("ls_orchid.gbk", "genbank"): - print seq_record.id - print repr(seq_record.seq) - print len(seq_record) + print(seq_record.id) + print(repr(seq_record.seq)) + print(len(seq_record)) \end{verbatim} \noindent This should give: @@ -570,13 +570,13 @@ \section{Sequences act like strings} >>> from Bio.Alphabet import IUPAC >>> my_seq = Seq("GATCG", IUPAC.unambiguous_dna) >>> for index, letter in enumerate(my_seq): -... print index, letter +... 
print(index, letter) 0 G 1 A 2 T 3 C 4 G ->>> print len(my_seq) +>>> print(len(my_seq)) 5 \end{verbatim} @@ -584,11 +584,11 @@ \section{Sequences act like strings} %cont-doctest \begin{verbatim} ->>> print my_seq[0] #first letter +>>> print(my_seq[0]) #first letter G ->>> print my_seq[2] #third letter +>>> print(my_seq[2]) #third letter T ->>> print my_seq[-1] #last letter +>>> print(my_seq[-1]) #last letter G \end{verbatim} @@ -687,10 +687,11 @@ \section{Turning Seq objects into strings} Since calling \verb|str()| on a \verb|Seq| object returns the full sequence as a string, you often don't actually have to do this conversion explicitly. -Python does this automatically with a print statement: +Python does this automatically in the print function +(and the print statement under Python 2): %cont-doctest \begin{verbatim} ->>> print my_seq +>>> print(my_seq) GATCGATGGGCCTATATAGGATCGAAAATCGC \end{verbatim} @@ -698,7 +699,7 @@ \section{Turning Seq objects into strings} %cont-doctest \begin{verbatim} >>> fasta_format_string = ">Name\n%s\n" % my_seq ->>> print fasta_format_string +>>> print(fasta_format_string) >Name GATCGATGGGCCTATATAGGATCGAAAATCGC @@ -1070,7 +1071,7 @@ \section{Translation Tables} You can compare the actual tables visually by printing them: %TODO - handle automatically in doctest? \begin{verbatim} ->>> print standard_table +>>> print(standard_table) Table 1 Standard, SGC0 | T | C | A | G | @@ -1098,7 +1099,7 @@ \section{Translation Tables} \end{verbatim} \noindent and: \begin{verbatim} ->>> print mito_table +>>> print(mito_table) Table 2 Vertebrate Mitochondrial, SGC1 | T | C | A | G | @@ -1292,7 +1293,7 @@ \section{UnknownSeq objects} >>> unk = UnknownSeq(20) >>> unk UnknownSeq(20, alphabet = Alphabet(), character = '?') ->>> print unk +>>> print(unk) ???????????????????? 
>>> len(unk) 20 @@ -1308,7 +1309,7 @@ \section{UnknownSeq objects} >>> unk_dna = UnknownSeq(20, alphabet=IUPAC.ambiguous_dna) >>> unk_dna UnknownSeq(20, alphabet = IUPACAmbiguousDNA(), character = 'N') ->>> print unk_dna +>>> print(unk_dna) NNNNNNNNNNNNNNNNNNNN \end{verbatim} @@ -1328,7 +1329,7 @@ \section{UnknownSeq objects} >>> unk_protein = unk_dna.translate() >>> unk_protein UnknownSeq(6, alphabet = ProteinAlphabet(), character = 'X') ->>> print unk_protein +>>> print(unk_protein) XXXXXX >>> len(unk_protein) 6 @@ -1438,7 +1439,7 @@ \subsection{SeqRecord objects from scratch} '' >>> simple_seq_r.id = "AC12345" >>> simple_seq_r.description = "Made up sequence I wish I could write a paper about" ->>> print simple_seq_r.description +>>> print(simple_seq_r.description) Made up sequence I wish I could write a paper about >>> simple_seq_r.seq Seq('GATC', Alphabet()) @@ -1461,9 +1462,9 @@ \subsection{SeqRecord objects from scratch} %cont-doctest \begin{verbatim} >>> simple_seq_r.annotations["evidence"] = "None. I just made it up." ->>> print simple_seq_r.annotations +>>> print(simple_seq_r.annotations) {'evidence': 'None. I just made it up.'} ->>> print simple_seq_r.annotations["evidence"] +>>> print(simple_seq_r.annotations["evidence"]) None. I just made it up. 
\end{verbatim} @@ -1474,9 +1475,9 @@ \subsection{SeqRecord objects from scratch} %cont-doctest \begin{verbatim} >>> simple_seq_r.letter_annotations["phred_quality"] = [40,40,38,30] ->>> print simple_seq_r.letter_annotations +>>> print(simple_seq_r.letter_annotations) {'phred_quality': [40, 40, 38, 30]} ->>> print simple_seq_r.letter_annotations["phred_quality"] +>>> print(simple_seq_r.letter_annotations["phred_quality"]) [40, 40, 38, 30] \end{verbatim} @@ -1801,7 +1802,7 @@ \subsubsection{Fuzzy Positions} %cont-doctest \begin{verbatim} ->>> print my_location +>>> print(my_location) [>5:(8^9)] \end{verbatim} @@ -1811,11 +1812,11 @@ \subsubsection{Fuzzy Positions} \begin{verbatim} >>> my_location.start AfterPosition(5) ->>> print my_location.start +>>> print(my_location.start) >5 >>> my_location.end BetweenPosition(9, left=8, right=9) ->>> print my_location.end +>>> print(my_location.end) (8^9) \end{verbatim} @@ -1849,7 +1850,7 @@ \subsubsection{Fuzzy Positions} %cont-doctest \begin{verbatim} >>> exact_location = SeqFeature.FeatureLocation(5, 9) ->>> print exact_location +>>> print(exact_location) [5:9] >>> exact_location.start ExactPosition(5) @@ -1881,7 +1882,7 @@ \subsubsection{Location testing} >>> record = SeqIO.read("NC_005816.gb", "genbank") >>> for feature in record.features: ... if my_snp in feature: -... print feature.type, feature.qualifiers.get('db_xref') +... print(feature.type, feature.qualifiers.get('db_xref')) ... 
source ['taxon:229193'] gene ['GeneID:2767712'] @@ -1911,7 +1912,7 @@ \subsection{Sequence described by a feature or location} %cont-doctest \begin{verbatim} >>> feature_seq = example_parent[example_feature.location.start:example_feature.location.end].reverse_complement() ->>> print feature_seq +>>> print(feature_seq) AGCCTTTGCCGTC \end{verbatim} @@ -1920,7 +1921,7 @@ \subsection{Sequence described by a feature or location} %cont-doctest \begin{verbatim} >>> feature_seq = example_feature.extract(example_parent) ->>> print feature_seq +>>> print(feature_seq) AGCCTTTGCCGTC \end{verbatim} @@ -1929,13 +1930,13 @@ \subsection{Sequence described by a feature or location} %cont-doctest \begin{verbatim} ->>> print example_feature.extract(example_parent) +>>> print(example_feature.extract(example_parent)) AGCCTTTGCCGTC ->>> print len(example_feature.extract(example_parent)) +>>> print(len(example_feature.extract(example_parent))) 13 ->>> print len(example_feature) +>>> print(len(example_feature)) 13 ->>> print len(example_feature.location) +>>> print(len(example_feature.location)) 13 \end{verbatim} @@ -1974,7 +1975,7 @@ \section{The format method} id="gi|14150838|gb|AAK54648.1|AF376133_1", description="chalcone synthase [Cucumis sativus]") -print record.format("fasta") +print(record.format("fasta")) \end{verbatim} \noindent which should give: \begin{verbatim} @@ -2032,7 +2033,7 @@ \section{Slicing a SeqRecord} %cont-doctest \begin{verbatim} ->>> print record.features[20] +>>> print(record.features[20]) type: gene location: [4342:4780](+) qualifiers: @@ -2043,7 +2044,7 @@ \section{Slicing a SeqRecord} \end{verbatim} %This one is truncated so can't use for doctest \begin{verbatim} ->>> print record.features[21] +>>> print(record.features[21]) type: CDS location: [4342:4780](+) qualifiers: @@ -2085,7 +2086,7 @@ \section{Slicing a SeqRecord} %cont-doctest \begin{verbatim} ->>> print sub_record.features[0] +>>> print(sub_record.features[0]) type: gene location: [42:480](+) 
qualifiers: @@ -2095,7 +2096,7 @@ \section{Slicing a SeqRecord} \end{verbatim} \begin{verbatim} ->>> print sub_record.features[20] +>>> print(sub_record.features[20]) type: CDS location: [42:480](+) qualifiers: @@ -2146,7 +2147,7 @@ \section{Slicing a SeqRecord} \begin{verbatim} >>> sub_record.description = "Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, partial." ->>> print sub_record.format("genbank") +>>> print(sub_record.format("genbank")) ... \end{verbatim} @@ -2172,12 +2173,12 @@ \section{Adding SeqRecord objects} >>> record = SeqIO.parse("example.fastq", "fastq").next() >>> len(record) 25 ->>> print record.seq +>>> print(record.seq) CCCTTCTTGTCTTCAGCGTTTCTCC \end{verbatim} %TODO - doctest wrapping \begin{verbatim} ->>> print record.letter_annotations["phred_quality"] +>>> print(record.letter_annotations["phred_quality"]) [26, 26, 18, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 26, 26, 26, 26, 26, 26, 26, 23, 23] \end{verbatim} @@ -2190,14 +2191,14 @@ \section{Adding SeqRecord objects} %cont-doctest \begin{verbatim} >>> left = record[:20] ->>> print left.seq +>>> print(left.seq) CCCTTCTTGTCTTCAGCGTT ->>> print left.letter_annotations["phred_quality"] +>>> print(left.letter_annotations["phred_quality"]) [26, 26, 18, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 26, 26, 26, 26] >>> right = record[21:] ->>> print right.seq +>>> print(right.seq) CTCC ->>> print right.letter_annotations["phred_quality"] +>>> print(right.letter_annotations["phred_quality"]) [26, 26, 23, 23] \end{verbatim} @@ -2208,11 +2209,11 @@ \section{Adding SeqRecord objects} >>> edited = left + right >>> len(edited) 24 ->>> print edited.seq +>>> print(edited.seq) CCCTTCTTGTCTTCAGCGTTCTCC \end{verbatim} \begin{verbatim} ->>> print edited.letter_annotations["phred_quality"] +>>> print(edited.letter_annotations["phred_quality"]) [26, 26, 18, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 26, 26, 26, 26, 26, 26, 23, 23] \end{verbatim} @@ -2339,7 +2340,7 @@ 
\section{Reverse-complementing SeqRecord objects} \begin{verbatim} >>> from Bio import SeqIO >>> record = SeqIO.read("NC_005816.gb", "genbank") ->>> print record.id, len(record), len(record.features), len(record.dbxrefs), len(record.annotations) +>>> print(record.id, len(record), len(record.features), len(record.dbxrefs), len(record.annotations)) NC_005816.1 9609 41 1 11 \end{verbatim} @@ -2349,7 +2350,7 @@ \section{Reverse-complementing SeqRecord objects} %cont-doctest \begin{verbatim} >>> rc = record.reverse_complement(id="TESTING") ->>> print rc.id, len(rc), len(rc.features), len(rc.dbxrefs), len(rc.annotations) +>>> print(rc.id, len(rc), len(rc.features), len(rc.dbxrefs), len(rc.annotations)) TESTING 9609 41 0 0 \end{verbatim} @@ -2390,9 +2391,9 @@ \subsection{Reading Sequence Files} \begin{verbatim} from Bio import SeqIO for seq_record in SeqIO.parse("ls_orchid.fasta", "fasta"): - print seq_record.id - print repr(seq_record.seq) - print len(seq_record) + print(seq_record.id) + print(repr(seq_record.seq)) + print(len(seq_record)) \end{verbatim} The above example is repeated from the introduction in Section~\ref{sec:sequence-parsing}, and will load the orchid DNA sequences in the FASTA format file \href{http://biopython.org/DIST/docs/tutorial/examples/ls_orchid.fasta}{ls\_orchid.fasta}. 
If instead you wanted to load a GenBank format file like \href{http://biopython.org/DIST/docs/tutorial/examples/ls_orchid.gbk}{ls\_orchid.gbk} then all you need to do is change the filename and the format string: @@ -2400,9 +2401,9 @@ \subsection{Reading Sequence Files} \begin{verbatim} from Bio import SeqIO for seq_record in SeqIO.parse("ls_orchid.gbk", "genbank"): - print seq_record.id - print seq_record.seq - print len(seq_record) + print(seq_record.id) + print(seq_record.seq) + print(len(seq_record)) \end{verbatim} Similarly, if you wanted to read in a file in another file format, then assuming \verb|Bio.SeqIO.parse()| supports it you would just need to change the format string as appropriate, for example ``swiss'' for SwissProt files or ``embl'' for EMBL text files. There is a full listing on the wiki page (\url{http://biopython.org/wiki/SeqIO}) and in the built in documentation (also \href{http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html}{online}). @@ -2435,12 +2436,12 @@ \subsection{Iterating over the records in a sequence file} record_iterator = SeqIO.parse("ls_orchid.fasta", "fasta") first_record = record_iterator.next() -print first_record.id -print first_record.description +print(first_record.id) +print(first_record.description) second_record = record_iterator.next() -print second_record.id -print second_record.description +print(second_record.id) +print(second_record.description) \end{verbatim} Note that if you try and use \verb|.next()| and there are no more results, you'll get the special \verb|StopIteration| exception. 
@@ -2464,19 +2465,19 @@ \subsection{Getting a list of the records in a sequence file} from Bio import SeqIO records = list(SeqIO.parse("ls_orchid.gbk", "genbank")) -print "Found %i records" % len(records) +print("Found %i records" % len(records)) -print "The last record" +print("The last record") last_record = records[-1] #using Python's list tricks -print last_record.id -print repr(last_record.seq) -print len(last_record) +print(last_record.id) +print(repr(last_record.seq)) +print(len(last_record)) -print "The first record" +print("The first record") first_record = records[0] #remember, Python counts from zero -print first_record.id -print repr(first_record.seq) -print len(first_record) +print(first_record.id) +print(repr(first_record.seq)) +print(len(first_record)) \end{verbatim} \noindent Giving: @@ -2504,7 +2505,7 @@ \subsection{Extracting data} from Bio import SeqIO record_iterator = SeqIO.parse("ls_orchid.gbk", "genbank") first_record = record_iterator.next() -print first_record +print(first_record) \end{verbatim} \noindent That should give something like this: @@ -2532,15 +2533,15 @@ \subsection{Extracting data} The contents of this annotations dictionary were shown when we printed the record above. You can also print them out directly: \begin{verbatim} -print first_record.annotations +print(first_record.annotations) \end{verbatim} \noindent Like any Python dictionary, you can easily get a list of the keys: \begin{verbatim} -print first_record.annotations.keys() +print(first_record.annotations.keys()) \end{verbatim} \noindent or values: \begin{verbatim} -print first_record.annotations.values() +print(first_record.annotations.values()) \end{verbatim} In general, the annotation values are strings, or lists of strings. One special case is any references in the file get stored as reference objects. 
@@ -2548,14 +2549,14 @@ \subsection{Extracting data} Suppose you wanted to extract a list of the species from the \href{http://biopython.org/DIST/docs/tutorial/examples/ls_orchid.gbk}{ls\_orchid.gbk} GenBank file. The information we want, \emph{Cypripedium irapeanum}, is held in the annotations dictionary under `source' and `organism', which we can access like this: \begin{verbatim} ->>> print first_record.annotations["source"] +>>> print(first_record.annotations["source"]) Cypripedium irapeanum \end{verbatim} \noindent or: \begin{verbatim} ->>> print first_record.annotations["organism"] +>>> print(first_record.annotations["organism"]) Cypripedium irapeanum \end{verbatim} @@ -2570,7 +2571,7 @@ \subsection{Extracting data} all_species = [] for seq_record in SeqIO.parse("ls_orchid.gbk", "genbank"): all_species.append(seq_record.annotations["organism"]) -print all_species +print(all_species) \end{verbatim} Another way of writing this code is to use a list comprehension: @@ -2579,7 +2580,7 @@ \subsection{Extracting data} from Bio import SeqIO all_species = [seq_record.annotations["organism"] for seq_record in \ SeqIO.parse("ls_orchid.gbk", "genbank")] -print all_species +print(all_species) \end{verbatim} \noindent In either case, the result is: @@ -2607,7 +2608,7 @@ \subsection{Extracting data} all_species = [] for seq_record in SeqIO.parse("ls_orchid.fasta", "fasta"): all_species.append(seq_record.description.split()[1]) -print all_species +print(all_species) \end{verbatim} \noindent This gives: @@ -2622,7 +2623,7 @@ \subsection{Extracting data} from Bio import SeqIO all_species = [seq_record.description.split()[1] for seq_record in \ SeqIO.parse("ls_orchid.fasta", "fasta")] -print all_species +print(all_species) \end{verbatim} In general, extracting information from the FASTA description line is not very nice. 
@@ -2644,7 +2645,7 @@ \section{Parsing sequences from compressed files} %doctest examples \begin{verbatim} >>> from Bio import SeqIO ->>> print sum(len(r) for r in SeqIO.parse("ls_orchid.gbk", "gb")) +>>> print(sum(len(r) for r in SeqIO.parse("ls_orchid.gbk", "gb"))) 67518 \end{verbatim} @@ -2656,7 +2657,7 @@ \section{Parsing sequences from compressed files} \begin{verbatim} >>> from Bio import SeqIO >>> with open("ls_orchid.gbk") as handle: -... print sum(len(r) for r in SeqIO.parse(handle, "gb")) +... print(sum(len(r) for r in SeqIO.parse(handle, "gb"))) 67518 \end{verbatim} @@ -2667,7 +2668,7 @@ \section{Parsing sequences from compressed files} \begin{verbatim} >>> from Bio import SeqIO >>> handle = open("ls_orchid.gbk") ->>> print sum(len(r) for r in SeqIO.parse(handle, "gb")) +>>> print(sum(len(r) for r in SeqIO.parse(handle, "gb"))) 67518 >>> handle.close() \end{verbatim} @@ -2681,7 +2682,7 @@ \section{Parsing sequences from compressed files} >>> import gzip >>> from Bio import SeqIO >>> handle = gzip.open("ls_orchid.gbk.gz", "r") ->>> print sum(len(r) for r in SeqIO.parse(handle, "gb")) +>>> print(sum(len(r) for r in SeqIO.parse(handle, "gb"))) 67518 >>> handle.close() \end{verbatim} @@ -2694,7 +2695,7 @@ \section{Parsing sequences from compressed files} >>> import bz2 >>> from Bio import SeqIO >>> handle = bz2.BZ2File("ls_orchid.gbk.bz2", "r") ->>> print sum(len(r) for r in SeqIO.parse(handle, "gb")) +>>> print(sum(len(r) for r in SeqIO.parse(handle, "gb"))) 67518 >>> handle.close() \end{verbatim} @@ -2740,7 +2741,7 @@ \subsection{Parsing GenBank records from the net} handle = Entrez.efetch(db="nucleotide", rettype="fasta", retmode="text", id="6273291") seq_record = SeqIO.read(handle, "fasta") handle.close() -print "%s with %i features" % (seq_record.id, len(seq_record.features)) +print("%s with %i features" % (seq_record.id, len(seq_record.features))) \end{verbatim} \noindent Expected output: @@ -2765,7 +2766,7 @@ \subsection{Parsing GenBank records from 
the net} handle = Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id="6273291") seq_record = SeqIO.read(handle, "gb") #using "gb" as an alias for "genbank" handle.close() -print "%s with %i features" % (seq_record.id, len(seq_record.features)) +print("%s with %i features" % (seq_record.id, len(seq_record.features))) \end{verbatim} \noindent The expected output of this example is: @@ -2819,12 +2820,12 @@ \subsection{Parsing SwissProt sequences from the net} handle = ExPASy.get_sprot_raw("O23729") seq_record = SeqIO.read(handle, "swiss") handle.close() -print seq_record.id -print seq_record.name -print seq_record.description -print repr(seq_record.seq) -print "Length %i" % len(seq_record) -print seq_record.annotations["keywords"] +print(seq_record.id) +print(seq_record.name) +print(seq_record.description) +print(repr(seq_record.seq)) +print("Length %i" % len(seq_record)) +print(seq_record.annotations["keywords"]) \end{verbatim} \noindent Assuming your network connection is OK, you should get back: @@ -2894,7 +2895,7 @@ \subsection{Sequence files as Dictionaries -- In memory} \end{verbatim} %Can't use following for doctest due to abbreviation \begin{verbatim} ->>> print orchid_dict.keys() +>>> orchid_dict.keys() ['Z78484.1', 'Z78464.1', 'Z78455.1', 'Z78442.1', 'Z78532.1', 'Z78453.1', ..., 'Z78471.1'] \end{verbatim} @@ -2909,9 +2910,9 @@ \subsection{Sequence files as Dictionaries -- In memory} %cont-doctest \begin{verbatim} >>> seq_record = orchid_dict["Z78475.1"] ->>> print seq_record.description +>>> print(seq_record.description) P.supardii 5.8S rRNA gene and ITS1 and ITS2 DNA. 
->>> print repr(seq_record.seq) +>>> print(repr(seq_record.seq)) Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGATCACAT...GGT', IUPACAmbiguousDNA()) \end{verbatim} @@ -2927,7 +2928,7 @@ \subsubsection{Specifying the dictionary keys} \begin{verbatim} from Bio import SeqIO orchid_dict = SeqIO.to_dict(SeqIO.parse("ls_orchid.fasta", "fasta")) -print orchid_dict.keys() +print(orchid_dict.keys()) \end{verbatim} \noindent This time the keys are: @@ -2957,13 +2958,13 @@ \subsubsection{Specifying the dictionary keys} \begin{verbatim} from Bio import SeqIO orchid_dict = SeqIO.to_dict(SeqIO.parse("ls_orchid.fasta", "fasta"), key_function=get_accession) -print orchid_dict.keys() +print(orchid_dict.keys()) \end{verbatim} \noindent Finally, as desired, the new dictionary keys: \begin{verbatim} ->>> print orchid_dict.keys() +>>> print(orchid_dict.keys()) ['Z78484.1', 'Z78464.1', 'Z78455.1', 'Z78442.1', 'Z78532.1', 'Z78453.1', ..., 'Z78471.1'] \end{verbatim} @@ -2979,7 +2980,7 @@ \subsubsection{Indexing a dictionary using the SEGUID checksum} from Bio import SeqIO from Bio.SeqUtils.CheckSum import seguid for record in SeqIO.parse("ls_orchid.gbk", "genbank"): - print record.id, seguid(record.seq) + print(record.id, seguid(record.seq)) \end{verbatim} \noindent This should give: @@ -3000,9 +3001,9 @@ \subsubsection{Indexing a dictionary using the SEGUID checksum} >>> seguid_dict = SeqIO.to_dict(SeqIO.parse("ls_orchid.gbk", "genbank"), ... lambda rec : seguid(rec.seq)) >>> record = seguid_dict["MN/s0q9zDoCVEEc+k/IFwCNF2pY"] ->>> print record.id +>>> print(record.id) Z78532.1 ->>> print record.description +>>> print(record.description) C.californicum 5.8S rRNA gene and ITS1 and ITS2 DNA. \end{verbatim} @@ -3041,7 +3042,7 @@ \subsection{Sequence files as Dictionaries -- Indexed files} %cont-doctest \begin{verbatim} >>> seq_record = orchid_dict["Z78475.1"] ->>> print seq_record.description +>>> print(seq_record.description) P.supardii 5.8S rRNA gene and ITS1 and ITS2 DNA. 
>>> seq_record.seq Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGATCACAT...GGT', IUPACAmbiguousDNA()) @@ -3094,7 +3095,7 @@ \subsubsection{Specifying the dictionary keys} \begin{verbatim} >>> from Bio import SeqIO >>> orchid_dict = SeqIO.index("ls_orchid.fasta", "fasta", key_function=get_acc) ->>> print orchid_dict.keys() +>>> print(orchid_dict.keys()) ['Z78484.1', 'Z78464.1', 'Z78455.1', 'Z78442.1', 'Z78532.1', 'Z78453.1', ..., 'Z78471.1'] \end{verbatim} @@ -3164,7 +3165,7 @@ \subsection{Sequence files as Dictionaries -- Database indexed files} >>> from Bio import SeqIO >>> files = ["gbvrl%i.seq" % (i+1) for i in range(16)] >>> gb_vrl = SeqIO.index_db("gbvrl.idx", files, "genbank") ->>> print "%i sequences indexed" % len(gb_vrl) +>>> print("%i sequences indexed" % len(gb_vrl)) 958086 sequences indexed \end{verbatim} @@ -3174,7 +3175,7 @@ \subsection{Sequence files as Dictionaries -- Database indexed files} about which file the sequence comes from, e.g. \begin{verbatim} ->>> print gb_vrl["GQ333173.1"].description +>>> print(gb_vrl["GQ333173.1"].description) HIV-1 isolate F12279A1 from Uganda gag protein (gag) gene, partial cds. \end{verbatim} @@ -3185,7 +3186,7 @@ \subsubsection{Getting the raw data for a record} get at the raw text of each record: \begin{verbatim} ->>> print gb_vrl.get_raw("GQ333173.1") +>>> print(gb_vrl.get_raw("GQ333173.1")) LOCUS GQ333173 459 bp DNA linear VRL 21-OCT-2009 DEFINITION HIV-1 isolate F12279A1 from Uganda gag protein (gag) gene, partial cds. @@ -3417,7 +3418,7 @@ \subsection{Converting between sequence file formats} from Bio import SeqIO records = SeqIO.parse("ls_orchid.gbk", "genbank") count = SeqIO.write(records, "my_example.fasta", "fasta") -print "Converted %i records" % count +print("Converted %i records" % count) \end{verbatim} Still, that is a little bit complicated. 
So, because file conversion is such a @@ -3426,7 +3427,7 @@ \subsection{Converting between sequence file formats} \begin{verbatim} from Bio import SeqIO count = SeqIO.convert("ls_orchid.gbk", "genbank", "my_example.fasta", "fasta") -print "Converted %i records" % count +print("Converted %i records" % count) \end{verbatim} The \verb|Bio.SeqIO.convert()| function will take handles \emph{or} filenames. @@ -3464,8 +3465,8 @@ \subsection{Converting a file of sequences to their reverse complements} \begin{verbatim} >>> from Bio import SeqIO >>> for record in SeqIO.parse("ls_orchid.gbk", "genbank"): -... print record.id -... print record.seq.reverse_complement() +... print(record.id) +... print(record.seq.reverse_complement()) \end{verbatim} Now, if we want to save these reverse complements to a file, we'll need to make \verb|SeqRecord| objects. @@ -3526,7 +3527,7 @@ \subsection{Getting your SeqRecord objects as formatted strings} out_handle = StringIO() SeqIO.write(records, out_handle, "fasta") fasta_data = out_handle.getvalue() -print fasta_data +print(fasta_data) \end{verbatim} This isn't entirely straightforward the first time you see it! On the bright side, for the special case where you would like a string containing a \emph{single} record in a particular file format, use the \verb|SeqRecord| class' \verb|format()| method (see Section~\ref{sec:SeqRecord-format}). 
@@ -3632,7 +3633,7 @@ \subsection{Single Alignments} %cont-doctest \begin{verbatim} ->>> print alignment +>>> print(alignment) SingleLetterAlphabet() alignment with 7 rows and 52 columns AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIRL...SKA COATB_BPIKE/30-81 AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIKL...SRA Q9T0Q8_BPIKE/1-52 @@ -3649,10 +3650,10 @@ \subsection{Single Alignments} \begin{verbatim} >>> from Bio import AlignIO >>> alignment = AlignIO.read("PF05371_seed.sth", "stockholm") ->>> print "Alignment length %i" % alignment.get_alignment_length() +>>> print("Alignment length %i" % alignment.get_alignment_length()) Alignment length 52 >>> for record in alignment: -... print "%s - %s" % (record.seq, record.id) +... print("%s - %s" % (record.seq, record.id)) AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIRLFKKFSSKA - COATB_BPIKE/30-81 AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIKLFKKFVSRA - Q9T0Q8_BPIKE/1-52 DGTSTATSYATEAMNSLKTQATDLIDQTWPVVTSVAVAGLAIRLFKKFSSKA - COATB_BPI22/32-83 @@ -3670,7 +3671,7 @@ \subsection{Single Alignments} \begin{verbatim} >>> for record in alignment: ... if record.dbxrefs: -... print record.id, record.dbxrefs +... print(record.id, record.dbxrefs) COATB_BPIKE/30-81 ['PDB; 1ifl ; 1-52;'] COATB_BPM13/24-72 ['PDB; 2cpb ; 1-49;', 'PDB; 2cps ; 1-49;'] Q9T0Q9_BPFD/1-49 ['PDB; 1nh4 A; 1-49;'] @@ -3681,7 +3682,7 @@ \subsection{Single Alignments} \begin{verbatim} >>> for record in alignment: -... print record +... print(record) \end{verbatim} Sanger provide a nice web interface at \url{http://pfam.sanger.ac.uk/family?acc=PF05371} which will actually let you download this alignment in several other formats. This is what the file looks like in the FASTA file format: @@ -3708,7 +3709,7 @@ \subsection{Single Alignments} \begin{verbatim} from Bio import AlignIO alignment = AlignIO.read("PF05371_seed.faa", "fasta") -print alignment +print(alignment) \end{verbatim} All that has changed in this code is the filename and the format string. 
You'll get the same output as before, the sequences and record identifiers are the same. @@ -3771,12 +3772,13 @@ \subsection{Multiple Alignments} If you wanted to read this in using \verb|Bio.AlignIO| you could use: +%TODO - Replace the print blank line with print()? \begin{verbatim} from Bio import AlignIO alignments = AlignIO.parse("resampled.phy", "phylip") for alignment in alignments: - print alignment - print + print(alignment) + print("") \end{verbatim} \noindent This would give the following output, again abbreviated for display: @@ -3888,12 +3890,13 @@ \subsection{Ambiguous Alignments} To interpret these FASTA examples as several separate alignments, we can use \verb|Bio.AlignIO.parse()| with the optional \verb|seq_count| argument which specifies how many sequences are expected in each alignment (in these examples, 3, 2 and 2 respectively). For example, using the third example as the input data: +%TODO - Replace the print blank line with print()? \begin{verbatim} for alignment in AlignIO.parse(handle, "fasta", seq_count=2): - print "Alignment length %i" % alignment.get_alignment_length() + print("Alignment length %i" % alignment.get_alignment_length()) for record in alignment: - print "%s - %s" % (record.seq, record.id) - print + print("%s - %s" % (record.seq, record.id)) + print("") \end{verbatim} \noindent giving: @@ -3997,7 +4000,7 @@ \subsection{Converting between sequence alignment file formats} \begin{verbatim} from Bio import AlignIO count = AlignIO.convert("PF05371_seed.sth", "stockholm", "PF05371_seed.aln", "clustal") -print "Converted %i alignments" % count +print("Converted %i alignments" % count) \end{verbatim} Or, using \verb|Bio.AlignIO.parse()| and \verb|Bio.AlignIO.write()|: @@ -4006,7 +4009,7 @@ \subsection{Converting between sequence alignment file formats} from Bio import AlignIO alignments = AlignIO.parse("PF05371_seed.sth", "stockholm") count = AlignIO.write(alignments, "PF05371_seed.aln", "clustal") -print "Converted %i alignments" % 
count +print("Converted %i alignments" % count) \end{verbatim} The \verb|Bio.AlignIO.write()| function expects to be given multiple alignment objects. In the example above we gave it the alignment iterator returned by \verb|Bio.AlignIO.parse()|. @@ -4079,7 +4082,7 @@ \subsection{Converting between sequence alignment file formats} for i, record in enumerate(alignment): name_mapping[i] = record.id record.id = "seq%i" % i -print name_mapping +print(name_mapping) AlignIO.write([alignment], "PF05371_seed.phy", "phylip") \end{verbatim} @@ -4120,7 +4123,7 @@ \subsection{Getting your alignment objects as formatted strings} \begin{verbatim} from Bio import AlignIO alignment = AlignIO.read("PF05371_seed.sth", "stockholm") -print alignment.format("clustal") +print(alignment.format("clustal")) \end{verbatim} As described in Section~\ref{sec:SeqRecord-format}, the \verb|SeqRecord| object has a similar method using output formats supported by \verb|Bio.SeqIO|. @@ -4139,7 +4142,7 @@ \subsection{Getting your alignment objects as formatted strings} AlignIO.write(alignments, out_handle, "clustal") clustal_data = out_handle.getvalue() -print clustal_data +print(clustal_data) \end{verbatim} \section{Manipulating Alignments} @@ -4158,10 +4161,10 @@ \subsection{Slicing alignments} \begin{verbatim} >>> from Bio import AlignIO >>> alignment = AlignIO.read("PF05371_seed.sth", "stockholm") ->>> print "Number of rows: %i" % len(alignment) +>>> print("Number of rows: %i" % len(alignment)) Number of rows: 7 >>> for record in alignment: -... print "%s - %s" % (record.seq, record.id) +... 
print("%s - %s" % (record.seq, record.id)) AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIRLFKKFSSKA - COATB_BPIKE/30-81 AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIKLFKKFVSRA - Q9T0Q8_BPIKE/1-52 DGTSTATSYATEAMNSLKTQATDLIDQTWPVVTSVAVAGLAIRLFKKFSSKA - COATB_BPI22/32-83 @@ -4178,7 +4181,7 @@ \subsection{Slicing alignments} %cont-doctest \begin{verbatim} ->>> print alignment +>>> print(alignment) SingleLetterAlphabet() alignment with 7 rows and 52 columns AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIRL...SKA COATB_BPIKE/30-81 AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIKL...SRA Q9T0Q8_BPIKE/1-52 @@ -4187,7 +4190,7 @@ \subsection{Slicing alignments} AEGDDP---AKAAFDSLQASATEYIGYAWAMVVVIVGATIGIKL...SKA COATB_BPZJ2/1-49 AEGDDP---AKAAFDSLQASATEYIGYAWAMVVVIVGATIGIKL...SKA Q9T0Q9_BPFD/1-49 FAADDATSQAKAAFDSLTAQATEMSGYAWALVVLVVGATVGIKL...SRA COATB_BPIF1/22-73 ->>> print alignment[3:7] +>>> print(alignment[3:7]) SingleLetterAlphabet() alignment with 4 rows and 52 columns AEGDDP---AKAAFNSLQASATEYIGYAWAMVVVIVGATIGIKL...SKA COATB_BPM13/24-72 AEGDDP---AKAAFDSLQASATEYIGYAWAMVVVIVGATIGIKL...SKA COATB_BPZJ2/1-49 @@ -4200,7 +4203,7 @@ \subsection{Slicing alignments} %cont-doctest \begin{verbatim} ->>> print alignment[2,6] +>>> print(alignment[2,6]) T \end{verbatim} @@ -4208,7 +4211,7 @@ \subsection{Slicing alignments} %cont-doctest \begin{verbatim} ->>> print alignment[2].seq[6] +>>> print(alignment[2].seq[6]) T \end{verbatim} @@ -4216,7 +4219,7 @@ \subsection{Slicing alignments} %cont-doctest \begin{verbatim} ->>> print alignment[:,6] +>>> print(alignment[:,6]) TTT---T \end{verbatim} @@ -4225,7 +4228,7 @@ \subsection{Slicing alignments} %cont-doctest \begin{verbatim} ->>> print alignment[3:6,:6] +>>> print(alignment[3:6,:6]) SingleLetterAlphabet() alignment with 3 rows and 6 columns AEGDDP COATB_BPM13/24-72 AEGDDP COATB_BPZJ2/1-49 @@ -4236,7 +4239,7 @@ \subsection{Slicing alignments} %cont-doctest \begin{verbatim} ->>> print alignment[:,:6] +>>> print(alignment[:,:6]) SingleLetterAlphabet() 
alignment with 7 rows and 6 columns AEPNAA COATB_BPIKE/30-81 AEPNAA Q9T0Q8_BPIKE/1-52 @@ -4252,7 +4255,7 @@ \subsection{Slicing alignments} %cont-doctest \begin{verbatim} ->>> print alignment[:,6:9] +>>> print(alignment[:,6:9]) SingleLetterAlphabet() alignment with 7 rows and 3 columns TNY COATB_BPIKE/30-81 TNY Q9T0Q8_BPIKE/1-52 @@ -4267,7 +4270,7 @@ \subsection{Slicing alignments} %cont-doctest \begin{verbatim} ->>> print alignment[:,9:] +>>> print(alignment[:,9:]) SingleLetterAlphabet() alignment with 7 rows and 43 columns ATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIRLFKKFSSKA COATB_BPIKE/30-81 ATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIKLFKKFVSRA Q9T0Q8_BPIKE/1-52 @@ -4284,7 +4287,7 @@ \subsection{Slicing alignments} %cont-doctest \begin{verbatim} >>> edited = alignment[:,:6] + alignment[:,9:] ->>> print edited +>>> print(edited) SingleLetterAlphabet() alignment with 7 rows and 49 columns AEPNAAATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIRLFKKFSSKA COATB_BPIKE/30-81 AEPNAAATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIKLFKKFVSRA Q9T0Q8_BPIKE/1-52 @@ -4304,7 +4307,7 @@ \subsection{Slicing alignments} %cont-doctest \begin{verbatim} >>> edited.sort() ->>> print edited +>>> print(edited) SingleLetterAlphabet() alignment with 7 rows and 49 columns DGTSTAATEAMNSLKTQATDLIDQTWPVVTSVAVAGLAIRLFKKFSSKA COATB_BPI22/32-83 FAADDAAKAAFDSLTAQATEMSGYAWALVVLVVGATVGIKLFKKFVSRA COATB_BPIF1/22-73 @@ -4418,7 +4421,7 @@ \subsection{ClustalW} \begin{verbatim} >>> from Bio.Align.Applications import ClustalwCommandline >>> cline = ClustalwCommandline("clustalw2", infile="opuntia.fasta") ->>> print cline +>>> print(cline) clustalw2 -infile=opuntia.fasta \end{verbatim} @@ -4488,7 +4491,7 @@ \subsection{ClustalW} \begin{verbatim} >>> from Bio import AlignIO >>> align = AlignIO.read("opuntia.aln", "clustal") ->>> print align +>>> print(align) SingleLetterAlphabet() alignment with 7 rows and 906 columns TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA 
gi|6273284|gb|AF191658.1|AF191 @@ -4549,7 +4552,7 @@ \subsection{MUSCLE} \begin{verbatim} >>> from Bio.Align.Applications import MuscleCommandline >>> cline = MuscleCommandline(input="opuntia.fasta", out="opuntia.txt") ->>> print cline +>>> print(cline) muscle -in opuntia.fasta -out opuntia.txt \end{verbatim} @@ -4566,7 +4569,7 @@ \subsection{MUSCLE} \begin{verbatim} >>> from Bio.Align.Applications import MuscleCommandline >>> cline = MuscleCommandline(input="opuntia.fasta", out="opuntia.aln", clw=True) ->>> print cline +>>> print(cline) muscle -in opuntia.fasta -out opuntia.aln -clw \end{verbatim} @@ -4577,7 +4580,7 @@ \subsection{MUSCLE} \begin{verbatim} >>> from Bio.Align.Applications import MuscleCommandline >>> cline = MuscleCommandline(input="opuntia.fasta", out="opuntia.aln", clwstrict=True) ->>> print cline +>>> print(cline) muscle -in opuntia.fasta -out opuntia.aln -clwstrict \end{verbatim} @@ -4607,7 +4610,7 @@ \subsection{MUSCLE using stdout} \begin{verbatim} >>> from Bio.Align.Applications import MuscleCommandline >>> muscle_cline = MuscleCommandline(input="opuntia.fasta") ->>> print muscle_cline +>>> print(muscle_cline) muscle -in opuntia.fasta \end{verbatim} @@ -4622,7 +4625,7 @@ \subsection{MUSCLE using stdout} >>> from StringIO import StringIO >>> from Bio import AlignIO >>> align = AlignIO.read(StringIO(stdout), "fasta") ->>> print align +>>> print(align) SingleLetterAlphabet() alignment with 7 rows and 906 columns TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191663 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191665 @@ -4648,7 +4651,7 @@ \subsection{MUSCLE using stdout} ... 
shell=(sys.platform!="win32")) >>> from Bio import AlignIO >>> align = AlignIO.read(child.stdout, "fasta") ->>> print align +>>> print(align) SingleLetterAlphabet() alignment with 7 rows and 906 columns TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191663 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191665 @@ -4684,7 +4687,7 @@ \subsection{MUSCLE using stdin and stdout} \begin{verbatim} >>> from Bio.Align.Applications import MuscleCommandline >>> muscle_cline = MuscleCommandline(clwstrict=True) ->>> print muscle_cline +>>> print(muscle_cline) muscle -clwstrict \end{verbatim} @@ -4717,7 +4720,7 @@ \subsection{MUSCLE using stdin and stdout} \begin{verbatim} >>> from Bio import AlignIO >>> align = AlignIO.read(child.stdout, "clustal") ->>> print align +>>> print(align) SingleLetterAlphabet() alignment with 6 rows and 900 columns TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF19166 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF19166 @@ -4762,7 +4765,7 @@ \subsection{MUSCLE using stdin and stdout} >>> stdout, stderr = muscle_cline(stdin=data) >>> from Bio import AlignIO >>> align = AlignIO.read(StringIO(stdout), "clustal") ->>> print align +>>> print(align) SingleLetterAlphabet() alignment with 6 rows and 900 columns TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF19166 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF19166 @@ -4808,7 +4811,7 @@ \subsection{EMBOSS needle and water} >>> from Bio.Emboss.Applications import NeedleCommandline >>> needle_cline = NeedleCommandline(asequence="alpha.faa", bsequence="beta.faa", ... 
gapopen=10, gapextend=0.5, outfile="needle.txt") ->>> print needle_cline +>>> print(needle_cline) needle -outfile=needle.txt -asequence=alpha.faa -bsequence=beta.faa -gapopen=10 -gapextend=0.5 \end{verbatim} @@ -4854,9 +4857,9 @@ \subsection{EMBOSS needle and water} >>> needle_cline.gapopen=10 >>> needle_cline.gapextend=0.5 >>> needle_cline.outfile="needle.txt" ->>> print needle_cline +>>> print(needle_cline) needle -outfile=needle.txt -asequence=alpha.faa -bsequence=beta.faa -gapopen=10 -gapextend=0.5 ->>> print needle_cline.outfile +>>> print(needle_cline.outfile) needle.txt \end{verbatim} @@ -4866,7 +4869,7 @@ \subsection{EMBOSS needle and water} \begin{verbatim} >>> stdout, stderr = needle_cline() ->>> print stdout + stderr +>>> print(stdout + stderr) Needleman-Wunsch global alignment of two sequences \end{verbatim} @@ -4876,7 +4879,7 @@ \subsection{EMBOSS needle and water} \begin{verbatim} >>> from Bio import AlignIO >>> align = AlignIO.read("needle.txt", "emboss") ->>> print align +>>> print(align) SingleLetterAlphabet() alignment with 2 rows and 149 columns MV-LSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTY...KYR HBA_HUMAN MVHLTPEEKSAVTALWGKV--NVDEVGGEALGRLLVVYPWTQRF...KYH HBB_HUMAN @@ -5143,7 +5146,7 @@ \subsection{Standalone NCBI BLAST+} >>> blastx_cline NcbiblastxCommandline(cmd='blastx', out='opuntia.xml', outfmt=5, query='opuntia.fasta', db='nr', evalue=0.001) ->>> print blastx_cline +>>> print(blastx_cline) blastx -out opuntia.xml -outfmt 5 -query opuntia.fasta -db nr -evalue 0.001 >>> stdout, stderr = blastx_cline() \end{verbatim} @@ -5324,13 +5327,13 @@ \section{The BLAST record class} >>> for alignment in blast_record.alignments: ... for hsp in alignment.hsps: ... if hsp.expect < E_VALUE_THRESH: -... print '****Alignment****' -... print 'sequence:', alignment.title -... print 'length:', alignment.length -... print 'e value:', hsp.expect -... print hsp.query[0:75] + '...' -... print hsp.match[0:75] + '...' -... print hsp.sbjct[0:75] + '...' +... 
print('****Alignment****') +... print('sequence:', alignment.title) +... print('length:', alignment.length) +... print('e value:', hsp.expect) +... print(hsp.query[0:75] + '...') +... print(hsp.match[0:75] + '...') +... print(hsp.sbjct[0:75] + '...') \end{verbatim} This will print out summary reports like the following: @@ -5423,13 +5426,13 @@ \subsection{Parsing plain-text BLAST output} >>> for alignment in blast_record.alignments: ... for hsp in alignment.hsps: ... if hsp.expect < E_VALUE_THRESH: -... print '****Alignment****' -... print 'sequence:', alignment.title -... print 'length:', alignment.length -... print 'e value:', hsp.expect -... print hsp.query[0:75] + '...' -... print hsp.match[0:75] + '...' -... print hsp.sbjct[0:75] + '...' +... print('****Alignment****') +... print('sequence:', alignment.title) +... print('length:', alignment.length) +... print('e value:', hsp.expect) +... print(hsp.query[0:75] + '...') +... print(hsp.match[0:75] + '...') +... print(hsp.sbjct[0:75] + '...') \end{verbatim} If you also read the section~\ref{sec:parsing-blast} on parsing BLAST XML output, you'll notice that the above code is identical to what is found in that section. Once you parse something into a record class you can deal with it independent of the format of the original BLAST info you were parsing. Pretty snazzy! @@ -5469,17 +5472,17 @@ \subsection{Parsing a plain-text BLAST file full of BLAST runs} ... for alignment in blast_record.alignments: ... for hsp in alignment.hsps: ... if hsp.expect < E_VALUE_THRESH: -... print '****Alignment****' -... print 'sequence:', alignment.title -... print 'length:', alignment.length -... print 'e value:', hsp.expect +... print('****Alignment****') +... print('sequence:', alignment.title) +... print('length:', alignment.length) +... print('e value:', hsp.expect) ... if len(hsp.query) > 75: ... dots = '...' ... else: ... dots = '' -... print hsp.query[0:75] + dots -... print hsp.match[0:75] + dots -... 
print hsp.sbjct[0:75] + dots +... print(hsp.query[0:75] + dots) +... print(hsp.match[0:75] + dots) +... print(hsp.sbjct[0:75] + dots) \end{verbatim} %Notice that \verb|b_iterator.next()| will return \verb|None| when it runs out of records to parse, so it is easy to iterate through the entire file with a while loop that checks for the existence of a record. @@ -5524,7 +5527,7 @@ \subsection{Finding a bad record somewhere in a huge plain-text BLAST file} >>> try: ... next_record = iterator.next() ... except NCBIStandalone.LowQualityBlastError as info: -... print "LowQualityBlastError detected in id %s" % info[1] +... print("LowQualityBlastError detected in id %s" % info[1]) \end{verbatim} The \verb|.next()| method is normally called indirectly via a \verb|for|-loop. @@ -5701,7 +5704,7 @@ \subsection{QueryResult} \begin{verbatim} >>> from Bio import SearchIO >>> blast_qresult = SearchIO.read('my_blast.xml', 'blast-xml') ->>> print blast_qresult +>>> print(blast_qresult) Program: blastn (2.2.27+) Query: 42291 (61) mystery_seq @@ -5766,7 +5769,7 @@ \subsection{QueryResult} %cont-doctest \begin{verbatim} >>> blat_qresult = SearchIO.read('my_blat.psl', 'blat-psl') ->>> print blat_qresult +>>> print(blat_qresult) Program: blat () Query: mystery_seq (61) @@ -5807,9 +5810,9 @@ \subsection{QueryResult} %cont-doctest \begin{verbatim} ->>> print "%s %s" % (blast_qresult.program, blast_qresult.version) +>>> print("%s %s" % (blast_qresult.program, blast_qresult.version)) blastn 2.2.27+ ->>> print "%s %s" % (blat_qresult.program, blat_qresult.version) +>>> print("%s %s" % (blat_qresult.program, blat_qresult.version)) blat >>> blast_qresult.param_evalue_threshold # blast-xml specific 10.0 @@ -5870,7 +5873,7 @@ \subsection{QueryResult} %cont-doctest \begin{verbatim} >>> blast_slice = blast_qresult[:3] # slices the first three hits ->>> print blast_slice +>>> print(blast_slice) Program: blastn (2.2.27+) Query: 42291 (61) mystery_seq @@ -5944,7 +5947,7 @@ \subsection{QueryResult} 
%cont-doctest \begin{verbatim} >>> for hit in blast_qresult[:5]: # id and sequence length of the first five hits -... print hit.id, hit.seq_len +... print(hit.id, hit.seq_len) ... gi|262205317|ref|NR_030195.1| 61 gi|301171311|ref|NR_035856.1| 60 @@ -5955,7 +5958,7 @@ \subsection{QueryResult} >>> sort_key = lambda hit: hit.seq_len >>> sorted_qresult = blast_qresult.sort(key=sort_key, reverse=True, in_place=False) >>> for hit in sorted_qresult[:5]: -... print hit.id, hit.seq_len +... print(hit.id, hit.seq_len) ... gi|397513516|ref|XM_003827011.1| 6002 gi|390332045|ref|XM_776818.2| 4082 @@ -6008,7 +6011,7 @@ \subsection{QueryResult} >>> len(filtered_qresult) # no. of hits after filtering 37 >>> for hit in filtered_qresult[:5]: # quick check for the hit lengths -... print hit.id, len(hit.hsps) +... print(hit.id, len(hit.hsps)) gi|301171322|ref|NR_035857.1| 2 gi|262205330|ref|NR_030198.1| 2 gi|301171447|ref|NR_035871.1| 2 @@ -6035,7 +6038,7 @@ \subsection{QueryResult} ... >>> mapped_qresult = blast_qresult.hit_map(map_func) >>> for hit in mapped_qresult[:5]: -... print hit.id +... 
print(hit.id) NR_030195.1 NR_035856.1 NR_032573.1 @@ -6064,7 +6067,7 @@ \subsection{Hit} \end{verbatim} %HACK: because Py2.5 in windows output floating points slightly different \begin{verbatim} ->>> print blast_hit +>>> print(blast_hit) Query: 42291 mystery_seq Hit: gi|301171322|ref|NR_035857.1| (86) @@ -6099,7 +6102,7 @@ \subsection{Hit} \begin{verbatim} >>> blat_qresult = SearchIO.read('my_blat.psl', 'blat-psl') >>> blat_hit = blat_qresult[0] # the only hit ->>> print blat_hit +>>> print(blat_hit) Query: mystery_seq Hit: chr19 (59128983) @@ -6177,7 +6180,7 @@ \subsection{Hit} >>> sliced_hit = blat_hit[4:9] # retrieve multiple items >>> len(sliced_hit) 5 ->>> print sliced_hit +>>> print(sliced_hit) Query: mystery_seq Hit: chr19 (59128983) @@ -6222,9 +6225,9 @@ \subsection{HSP} >>> blast_qresult = SearchIO.read('my_blast.xml', 'blast-xml') >>> blast_hsp = blast_qresult[0][0] # first hit, first hsp \end{verbatim} -%HACK: because Py2.5 in windows output floating points slightly different +%HACK: because Python 2.5 in windows output floating points slightly different \begin{verbatim} ->>> print blast_hsp +>>> print(blast_hsp) Query: 42291 mystery_seq Hit: gi|262205317|ref|NR_030195.1| Homo sapiens microRNA 520b (MIR520... 
Query range: [0:61] (1) @@ -6328,7 +6331,7 @@ \subsection{HSP} %cont-doctest \begin{verbatim} ->>> print blast_hsp.aln +>>> print(blast_hsp.aln) DNAAlphabet() alignment with 2 rows and 61 columns CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAG...GGG 42291 CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAG...GGG gi|262205317|ref|NR_030195.1| @@ -6342,7 +6345,7 @@ \subsection{HSP} \begin{verbatim} >>> blat_qresult = SearchIO.read('my_blat.psl', 'blat-psl') >>> blat_hsp = blat_qresult[0][0] # first hit, first hsp ->>> print blat_hsp +>>> print(blat_hsp) Query: mystery_seq Hit: chr19 Query range: [0:61] (1) @@ -6405,7 +6408,7 @@ \subsection{HSP} %cont-doctest \begin{verbatim} >>> blat_hsp2 = blat_qresult[0][1] # first hit, second hsp ->>> print blat_hsp2 +>>> print(blat_hsp2) Query: mystery_seq Hit: chr19 Query range: [0:61] (1) @@ -6510,7 +6513,7 @@ \subsection{HSPFragment} >>> from Bio import SearchIO >>> blast_qresult = SearchIO.read('my_blast.xml', 'blast-xml') >>> blast_frag = blast_qresult[0][0][0] # first hit, first hsp, first fragment ->>> print blast_frag +>>> print(blast_frag) Query: 42291 mystery_seq Hit: gi|262205317|ref|NR_030195.1| Homo sapiens microRNA 520b (MIR520... Query range: [0:61] (1) @@ -6528,7 +6531,7 @@ \subsection{HSPFragment} \begin{verbatim} >>> blat_qresult = SearchIO.read('my_blat.psl', 'blat-psl') >>> blat_frag = blat_qresult[0][0][0] # first hit, first hsp, first fragment ->>> print blat_frag +>>> print(blat_frag) Query: mystery_seq Hit: chr19 Query range: [0:61] (1) @@ -6645,12 +6648,12 @@ \section{Reading search output files} >>> from Bio import SearchIO >>> qresults = SearchIO.parse('tab_2226_tblastn_001.txt', 'blast-tab') >>> for qresult in qresults: -... print qresult.id +... print(qresult.id) gi|16080617|ref|NP_391444.1| gi|11464971:4-101 >>> qresults2 = SearchIO.parse('tab_2226_tblastn_005.txt', 'blast-tab', comments=True) >>> for qresult in qresults2: -... print qresult.id +... 
print(qresult.id) random_s00 gi|16080617|ref|NP_391444.1| gi|11464971:4-101 @@ -6821,7 +6824,7 @@ \section{EInfo: Obtaining information about the Entrez databases} \end{verbatim} The variable \verb+result+ now contains a list of databases in XML format: \begin{verbatim} ->>> print result +>>> print(result) @@ -6906,7 +6909,7 @@ \section{EInfo: Obtaining information about the Entrez databases} \begin{verbatim} >>> for field in record["DbInfo"]["FieldList"]: -... print "%(Name)s, %(FullName)s, %(Description)s" % field +... print("%(Name)s, %(FullName)s, %(Description)s" % field) ALL, All Fields, All terms from all searchable fields UID, UID, Unique number assigned to publication FILT, Filter, Limits the records @@ -6995,7 +6998,7 @@ \section{EPost: Uploading a list of identifiers} >>> from Bio import Entrez >>> Entrez.email = "A.N.Other@example.com" # Always tell NCBI who you are >>> id_list = ["19304878", "18606172", "16403221", "16377612", "14871861", "14630660"] ->>> print Entrez.epost("pubmed", id=",".join(id_list)).read() +>>> print(Entrez.epost("pubmed", id=",".join(id_list)).read()) @@ -7045,7 +7048,7 @@ \section{EFetch: Downloading full records from Entrez} >>> from Bio import Entrez >>> Entrez.email = "A.N.Other@example.com" # Always tell NCBI who you are >>> handle = Entrez.efetch(db="nucleotide", id="186972394", rettype="gb", retmode="text") ->>> print handle.read() +>>> print(handle.read()) LOCUS EU490707 1302 bp DNA linear PLN 05-MAY-2008 DEFINITION Selenipedium aequinoctiale maturase K (matK) gene, partial cds; chloroplast. @@ -7135,7 +7138,7 @@ \section{EFetch: Downloading full records from Entrez} >>> handle = Entrez.efetch(db="nucleotide", id="186972394",rettype="gb", retmode="text") >>> record = SeqIO.read(handle, "genbank") >>> handle.close() ->>> print record +>>> print(record) ID: EU490707.1 Name: EU490707 Description: Selenipedium aequinoctiale maturase K (matK) gene, partial cds; chloroplast. 
@@ -7159,11 +7162,11 @@ \section{EFetch: Downloading full records from Entrez} out_handle.write(net_handle.read()) out_handle.close() net_handle.close() - print "Saved" + print("Saved") -print "Parsing..." +print("Parsing...") record = SeqIO.read(filename, "genbank") -print record +print(record) \end{verbatim} To get the output in XML format, which you can parse using the \verb+Bio.Entrez.read()+ function, use \verb+retmode="xml"+: @@ -7213,7 +7216,7 @@ \section{ELink: Searching for related items in NCBI Entrez} >>> len(record[0]["LinkSetDb"]) 5 >>> for linksetdb in record[0]["LinkSetDb"]: -... print linksetdb["DbTo"], linksetdb["LinkName"], len(linksetdb["Link"]) +... print(linksetdb["DbTo"], linksetdb["LinkName"], len(linksetdb["Link"])) ... pubmed pubmed_pubmed 110 pubmed pubmed_pubmed_combined 6 @@ -7241,7 +7244,8 @@ \section{ELink: Searching for related items in NCBI Entrez} We can use a loop to print out all PubMed IDs: \begin{verbatim} ->>> for link in record[0]["LinkSetDb"][0]["Link"] : print link["Id"] +>>> for link in record[0]["LinkSetDb"][0]["Link"]: +... print(link["Id"]) 19304878 14630660 18689808 @@ -7267,7 +7271,8 @@ \section{EGQuery: Global Query - counts for search terms} >>> Entrez.email = "A.N.Other@example.com" # Always tell NCBI who you are >>> handle = Entrez.egquery(term="biopython") >>> record = Entrez.read(handle) ->>> for row in record["eGQueryResult"]: print row["DbName"], row["Count"] +>>> for row in record["eGQueryResult"]: +... print(row["DbName"], row["Count"]) ... pubmed 6 pmc 62 @@ -7317,7 +7322,7 @@ \section{Parsing huge Entrez XML files} ... continue ... geneid = record['Entrezgene_track-info']['Gene-track']['Gene-track_geneid'] ... genename = record['Entrezgene_gene']['Gene-ref']['Gene-ref_locus'] -... print geneid, genename +... 
print(geneid, genename) \end{verbatim} This will print: @@ -7521,7 +7526,7 @@ \subsection{Parsing Medline records} >>> input = open("pubmed_result2.txt") >>> records = Medline.parse(input) >>> for record in records: -... print record["TI"] +... print(record["TI"]) A high level interface to SCOP and ASTRAL implemented in python. GenomeDiagram: a python package for the visualization of large-scale genomic data. Open source clustering software. @@ -7547,7 +7552,7 @@ \subsection{Parsing Medline records} >>> from Bio import Medline >>> records = Medline.parse(handle) >>> for record in records: -... print record["AU"] +... print(record["AU"]) ['Cock PJ', 'Antao T', 'Chang JT', 'Chapman BA', 'Cox CJ', 'Dalke A', ..., 'de Hoon MJ'] ['Munteanu CR', 'Gonzalez-Diaz H', 'Magalhaes AL'] ['Casbon JA', 'Crooks GE', 'Saqi MA'] @@ -7563,7 +7568,7 @@ \subsection{Parsing Medline records} >>> handle = Entrez.efetch(db="pubmed",id=idlist,rettype="medline",retmode="xml") >>> records = Entrez.read(handle) >>> for record in records: -... print record["MedlineCitation"]["Article"]["ArticleTitle"] +... print(record["MedlineCitation"]["Article"]["ArticleTitle"]) Biopython: freely available Python tools for computational molecular biology and bioinformatics. Enzymes/non-enzymes classification model complexity based on composition, sequence, @@ -7595,7 +7600,7 @@ \subsection{Parsing GEO records} >>> handle = open("GSE16.txt") >>> records = Geo.parse(handle) >>> for record in records: -... print record +... print(record) \end{verbatim} You can search the ``gds'' database (GEO datasets) with ESearch: @@ -7699,7 +7704,7 @@ \subsection{Parsing UniGene records} >>> input = open("unigenerecords.data") >>> records = UniGene.parse(input) >>> for record in records: -... print record.ID +... print(record.ID) \end{verbatim} \section{Using a proxy} @@ -7740,7 +7745,7 @@ \subsection{PubMed and Medline} >>> record = Entrez.read(handle) >>> for row in record["eGQueryResult"]: ... 
if row["DbName"]=="pubmed": -... print row["Count"] +... print(row["Count"]) 463 \end{verbatim} @@ -7749,7 +7754,7 @@ \subsection{PubMed and Medline} >>> handle = Entrez.esearch(db="pubmed", term="orchid", retmax=463) >>> record = Entrez.read(handle) >>> idlist = record["IdList"] ->>> print idlist +>>> print(idlist) \end{verbatim} @@ -7776,12 +7781,13 @@ \subsection{PubMed and Medline} \end{verbatim} Let's now iterate over the records to print out some information about each record: +%TODO - Replace the print blank line with print()? \begin{verbatim} >>> for record in records: -... print "title:", record.get("TI", "?") -... print "authors:", record.get("AU", "?") -... print "source:", record.get("SO", "?") -... print +... print("title:", record.get("TI", "?")) +... print("authors:", record.get("AU", "?")) +... print("source:", record.get("SO", "?")) +... print("") \end{verbatim} The output for this looks like: @@ -7802,7 +7808,7 @@ \subsection{PubMed and Medline} ... if not "AU" in record: ... continue ... if search_author in record["AU"]: -... print "Author %s found: %s" % (search_author, record["SO"]) +... print("Author %s found: %s" % (search_author, record["SO"])) \end{verbatim} Hopefully this section gave you an idea of the power and flexibility of the Entrez and Medline interfaces and how they can be used together. @@ -7820,7 +7826,7 @@ \subsection{Searching, downloading, and parsing Entrez Nucleotide records} >>> record = Entrez.read(handle) >>> for row in record["eGQueryResult"]: ... if row["DbName"]=="nuccore": -... print row["Count"] +... print(row["Count"]) 814 \end{verbatim} @@ -7833,22 +7839,22 @@ \subsection{Searching, downloading, and parsing Entrez Nucleotide records} Here, \verb+record+ is a Python dictionary containing the search results and some auxiliary information. 
Just for information, let's look at what is stored in this dictionary: \begin{verbatim} ->>> print record.keys() +>>> print(record.keys()) [u'Count', u'RetMax', u'IdList', u'TranslationSet', u'RetStart', u'QueryTranslation'] \end{verbatim} First, let's check how many results were found: \begin{verbatim} ->>> print record["Count"] +>>> print(record["Count"]) '814' \end{verbatim} which is the number we expected. The 814 results are stored in \verb+record['IdList']+: \begin{verbatim} ->>> print len(record["IdList"]) +>>> len(record["IdList"]) 814 \end{verbatim} Let's look at the first five results: \begin{verbatim} ->>> print record["IdList"][:5] +>>> record["IdList"][:5] ['187237168', '187372713', '187372690', '187372688', '187372686'] \end{verbatim} @@ -7859,16 +7865,16 @@ \subsection{Searching, downloading, and parsing Entrez Nucleotide records} \begin{verbatim} >>> idlist = ",".join(record["IdList"][:5]) ->>> print idlist +>>> print(idlist) 187237168,187372713,187372690,187372688,187372686 >>> handle = Entrez.efetch(db="nucleotide", id=idlist, retmode="xml") >>> records = Entrez.read(handle) ->>> print len(records) +>>> len(records) 5 \end{verbatim} Each of these records corresponds to one GenBank record. 
\begin{verbatim} ->>> print records[0].keys() +>>> print(records[0].keys()) [u'GBSeq_moltype', u'GBSeq_source', u'GBSeq_sequence', u'GBSeq_primary-accession', u'GBSeq_definition', u'GBSeq_accession-version', u'GBSeq_topology', u'GBSeq_length', u'GBSeq_feature-table', @@ -7876,17 +7882,17 @@ \subsection{Searching, downloading, and parsing Entrez Nucleotide records} u'GBSeq_taxonomy', u'GBSeq_references', u'GBSeq_update-date', u'GBSeq_organism', u'GBSeq_locus', u'GBSeq_strandedness'] ->>> print records[0]["GBSeq_primary-accession"] +>>> print(records[0]["GBSeq_primary-accession"]) DQ110336 ->>> print records[0]["GBSeq_other-seqids"] +>>> print(records[0]["GBSeq_other-seqids"]) ['gb|DQ110336.1|', 'gi|187237168'] ->>> print records[0]["GBSeq_definition"] +>>> print(records[0]["GBSeq_definition"]) Cypripedium calceolus voucher Davis 03-03 A maturase (matR) gene, partial cds; mitochondrial ->>> print records[0]["GBSeq_organism"] +>>> print(records[0]["GBSeq_organism"]) Cypripedium calceolus \end{verbatim} @@ -7909,7 +7915,7 @@ \subsection{Searching, downloading, and parsing GenBank records} >>> record = Entrez.read(handle) >>> for row in record["eGQueryResult"]: ... if row["DbName"]=="nuccore": -... print row["Count"] +... print(row["Count"]) ... 9 \end{verbatim} @@ -7934,7 +7940,7 @@ \subsection{Searching, downloading, and parsing GenBank records} \begin{verbatim} >>> text = handle.read() ->>> print text +>>> print(text) LOCUS AY851612 892 bp DNA linear PLN 10-APR-2007 DEFINITION Opuntia subulata rpl16 gene, intron; chloroplast. ACCESSION AY851612 @@ -7961,8 +7967,8 @@ \subsection{Searching, downloading, and parsing GenBank records} \noindent We can now step through the records and look at the information we are interested in: \begin{verbatim} >>> for record in records: ->>> ... print "%s, length %i, with %i features" \ ->>> ... % (record.name, len(record), len(record.features)) +>>> ... print("%s, length %i, with %i features" \ +>>> ... 
% (record.name, len(record), len(record.features))) AY851612, length 892, with 3 features AY851611, length 881, with 3 features AF191661, length 895, with 3 features @@ -8075,7 +8081,7 @@ \subsection{Searching for and downloading sequences using the history} out_handle = open("orchid_rpl16.fasta", "w") for start in range(0,count,batch_size): end = min(count, start+batch_size) - print "Going to download record %i to %i" % (start+1, end) + print("Going to download record %i to %i" % (start+1, end)) fetch_handle = Entrez.efetch(db="nucleotide", rettype="fasta", retmode="text", retstart=start, retmax=batch_size, webenv=webenv, query_key=query_key) @@ -8098,13 +8104,13 @@ \subsection{Searching for and downloading abstracts using the history} reldate=365, datetype="pdat", usehistory="y")) count = int(search_results["Count"]) -print "Found %i results" % count +print("Found %i results" % count) batch_size = 10 out_handle = open("recent_orchid_papers.txt", "w") for start in range(0,count,batch_size): end = min(count, start+batch_size) - print "Going to download record %i to %i" % (start+1, end) + print("Going to download record %i to %i" % (start+1, end)) fetch_handle = Entrez.efetch(db="pubmed", rettype="medline", retmode="text", retstart=start, retmax=batch_size, @@ -8210,17 +8216,18 @@ \subsection{Parsing Swiss-Prot records} This function should be used if the handle points to exactly one Swiss-Prot record. It raises a \verb|ValueError| if no Swiss-Prot record was found, and also if more than one record was found. We can now print out some information about this record: +%TODO - Check the single quotes when printing record.description here: \begin{verbatim} ->>> print record.description +>>> print(record.description) 'RecName: Full=Chalcone synthase 3; EC=2.3.1.74; AltName: Full=Naringenin-chalcone synthase 3;' >>> for ref in record.references: -... print "authors:", ref.authors -... print "title:", ref.title +... print("authors:", ref.authors) +... 
print("title:", ref.title) ... authors: Liew C.F., Lim S.H., Loh C.S., Goh C.J.; title: "Molecular cloning and sequence analysis of chalcone synthase cDNAs of Bromheadia finlaysoniana."; ->>> print record.organism_classification +>>> print(record.organism_classification) ['Eukaryota', 'Viridiplantae', 'Streptophyta', 'Embryophyta', ..., 'Bromheadia'] \end{verbatim} @@ -8320,8 +8327,8 @@ \subsection{Parsing the Swiss-Prot keyword and category list} >>> handle = open("keywlist.txt") >>> records = KeyWList.parse(handle) >>> for record in records: -... print record['ID'] -... print record['DE'] +... print(record['ID']) +... print(record['DE']) \end{verbatim} This prints @@ -8380,7 +8387,7 @@ \section{Parsing Prosite records} >>> n = 0 >>> for record in records: n+=1 ... ->>> print n +>>> n 2073 \end{verbatim} @@ -8518,7 +8525,7 @@ \subsection{Retrieving a Swiss-Prot record} ... try: ... record = SwissProt.read(handle) ... except ValueException: -... print "WARNING: Accession %s not found" % accession +... print("WARNING: Accession %s not found" % accession) ... 
records.append(record) \end{verbatim} @@ -8563,7 +8570,7 @@ \subsection{Retrieving Prosite and Prosite documentation records} >>> from Bio import ExPASy >>> handle = ExPASy.get_prosite_raw('PS00001') >>> text = handle.read() ->>> print text +>>> print(text) \end{verbatim} To retrieve a Prosite record and parse it into a \verb|Bio.Prosite.Record| object, use @@ -8893,7 +8900,7 @@ \section{Structure representation} \begin{verbatim} >>> full_id = residue.get_full_id() ->>> print full_id +>>> print(full_id) ("1abc", 0, "A", ("", 10, "A")) \end{verbatim} @@ -9175,10 +9182,10 @@ \subsection{Disordered atoms\label{disordered atoms}} \begin{verbatim} >>> atom.disordered_select('A') # select altloc A atom ->>> print atom.get_altloc() +>>> print(atom.get_altloc()) "A" >>> atom.disordered_select('B') # select altloc B atom ->>> print atom.get_altloc() +>>> print(atom.get_altloc()) "B" \end{verbatim} @@ -9276,7 +9283,7 @@ \subsubsection*{Iterating through all atoms of a structure} ... for chain in model: ... for residue in chain: ... for atom in residue: -... print atom +... print(atom) ... \end{verbatim} @@ -9284,7 +9291,7 @@ \subsubsection*{Iterating through all atoms of a structure} \begin{verbatim} >>> atoms = structure.get_atoms() >>> for atom in atoms: -... print atom +... print(atom) ... \end{verbatim} @@ -9292,7 +9299,7 @@ \subsubsection*{Iterating through all atoms of a structure} \begin{verbatim} >>> atoms = chain.get_atoms() >>> for atom in atoms: -... print atom +... print(atom) ... \end{verbatim} @@ -9302,7 +9309,7 @@ \subsubsection*{Iterating over all residues of a model} \begin{verbatim} >>> residues = model.get_residues() >>> for residue in residues: -... print residue +... print(residue) ... \end{verbatim} @@ -9339,7 +9346,7 @@ \subsubsection*{Print all hetero residues in chain} ... residue_id = residue.get_id() ... hetfield = residue_id[0] ... if hetfield[0]=="H": -... print residue_id +... print(residue_id) ... 
\end{verbatim} @@ -9352,7 +9359,7 @@ \subsubsection*{Print out the coordinates of all CA atoms in a structure with B ... if residue.has_id("CA"): ... ca = residue["CA"] ... if ca.get_bfactor() > 50.0: -... print ca.get_coord() +... print(ca.get_coord()) ... \end{verbatim} @@ -9367,7 +9374,7 @@ \subsubsection*{Print out all the residues that contain disordered atoms} ... resname = residue.get_resname() ... model_id = model.get_id() ... chain_id = chain.get_id() -... print model_id, chain_id, resname, resseq +... print(model_id, chain_id, resname, resseq) ... \end{verbatim} @@ -9395,7 +9402,7 @@ \subsubsection*{Extracting polypeptides from a \texttt{Structure} object\label{s >>> model_nr = 1 >>> polypeptide_list = build_peptides(structure, model_nr) >>> for polypeptide in polypeptide_list: -... print polypeptide +... print(polypeptide) ... \end{verbatim} @@ -9408,12 +9415,12 @@ \subsubsection*{Extracting polypeptides from a \texttt{Structure} object\label{s # Using C-N >>> ppb=PPBuilder() >>> for pp in ppb.build_peptides(structure): -... print pp.get_sequence() +... print(pp.get_sequence()) ... # Using CA-CA >>> ppb=CaPPBuilder() >>> for pp in ppb.build_peptides(structure): -... print pp.get_sequence() +... print(pp.get_sequence()) ... 
\end{verbatim} Note that in the above case only model 0 of the structure is considered @@ -9433,7 +9440,7 @@ \subsubsection*{Obtaining the sequence of a structure} \begin{verbatim} >>> seq = polypeptide.get_sequence() ->>> print seq +>>> print(seq) Seq('SNVVE...', ) \end{verbatim} @@ -9499,8 +9506,8 @@ \subsection{Superimposing two structures} # The moving atoms will be put on the fixed atoms >>> sup.set_atoms(fixed, moving) # Print rotation/translation/rmsd ->>> print sup.rotran ->>> print sup.rms +>>> print(sup.rotran) +>>> print(sup.rms) # Apply rotation/translation to the moving atoms >>> sup.apply(moving) \end{verbatim} @@ -9542,7 +9549,7 @@ \subsection{Calculating the Half Sphere Exposure} # Calculate classical coordination number >>> exp_fs = hse.calc_fs_exposure(model) # Print HSEalpha for a residue ->>> print exp_ca[some_residue] +>>> print(exp_ca[some_residue]) \end{verbatim} \subsection{Determining the secondary structure} @@ -10492,7 +10499,7 @@ \section{Demo: What's in a Tree?} Printing the tree object as a string gives us a look at the entire object hierarchy. \begin{verbatim} ->>> print tree +>>> print(tree) Tree(weight=1.0, rooted=False, name="") Clade(branch_length=1.0) @@ -10702,7 +10709,7 @@ \section{I/O functions} \begin{verbatim} >>> from Bio import Phylo >>> tree = Phylo.read("Tests/Nexus/int_node_labels.nwk", "newick") ->>> print tree +>>> print(tree) \end{verbatim} (Example files are available in the \texttt{Tests/Nexus/} and \texttt{Tests/PhyloXML/} @@ -10714,7 +10721,7 @@ \section{I/O functions} \begin{verbatim} >>> trees = Phylo.parse("Tests/PhyloXML/phyloxml_examples.xml", "phyloxml") >>> for tree in trees: -... print tree +... print(tree) \end{verbatim} Write a tree or iterable of trees back to file with the \verb|write| function: @@ -10754,9 +10761,10 @@ \section{View and export trees} The simplest way to get an overview of a \verb|Tree| object is to \verb|print| it: +%TODO - make this into a doctest? 
\begin{verbatim} >>> tree = Phylo.read("Tests/PhyloXML/example.xml", "phyloxml") ->>> print tree +>>> print(tree) Phylogeny(rooted='True', description='phyloXML allows to use either a "branch_length" attribute...', name='example from Prof. Joe Felsenstein's book "Inferring Phyl...') Clade() @@ -11215,7 +11223,7 @@ \subsection{Features of PhyloXML trees} % The object hierarchy still looks and behaves similarly: % \begin{verbatim} -% >>> print tree +% >>> print(tree) % Phylogeny(rooted=True, name="") % Clade(branch_length=1.0) @@ -11310,13 +11318,13 @@ \section{PAML integration} >>> ns_sites = results.get("NSsites") >>> m0 = ns_sites.get(0) >>> m0_params = m0.get("parameters") ->>> print m0_params.get("omega") +>>> print(m0_params.get("omega")) \end{verbatim} Existing output files may be parsed as well using a module's \texttt{read()} function: \begin{verbatim} >>> results = codeml.read("Tests/PAML/Results/codeml/codeml_NSsites_all.out") ->>> print results.get("lnL max") +>>> print(results.get("lnL max")) \end{verbatim} Detailed documentation for this new module currently lives on the Biopython wiki: @@ -11419,7 +11427,7 @@ \subsection{Creating a motif from instances} Printing out the Motif object shows the instances from which it was constructed: %cont-doctest \begin{verbatim} ->>> print m +>>> print(m) TACAA TACGC TACAC @@ -11439,7 +11447,7 @@ \subsection{Creating a motif from instances} nucleotide at each position. 
Printing this counts matrix shows it in an easily readable format: %cont-doctest \begin{verbatim} ->>> print m.counts +>>> print(m.counts) 0 1 2 3 4 A: 3.00 7.00 0.00 2.00 1.00 C: 0.00 0.00 5.00 2.00 6.00 @@ -11521,7 +11529,7 @@ \subsection{Creating a motif from instances} Seq('GCGTA', IUPACUnambiguousDNA()) >>> r.degenerate_consensus Seq('GBGTW', IUPACAmbiguousDNA()) ->>> print r +>>> print(r) TTGTA GCGTA GTGTA @@ -11601,10 +11609,10 @@ \subsubsection*{The JASPAR \texttt{sites} format} The instances from which this motif was created are stored in the \verb+.instances+ property: %cont-doctest \begin{verbatim} ->>> print arnt.instances[:3] +>>> print(arnt.instances[:3]) [Seq('CACGTG', IUPACUnambiguousDNA()), Seq('CACGTG', IUPACUnambiguousDNA()), Seq('CACGTG', IUPACUnambiguousDNA())] >>> for instance in arnt.instances: -... print instance +... print(instance) ... CACGTG CACGTG @@ -11630,7 +11638,7 @@ \subsubsection*{The JASPAR \texttt{sites} format} The counts matrix of this motif is automatically calculated from the instances: %cont-doctest \begin{verbatim} ->>> print arnt.counts +>>> print(arnt.counts) 0 1 2 3 4 5 A: 4.00 19.00 0.00 0.00 0.00 0.00 C: 16.00 0.00 20.00 0.00 0.00 0.00 @@ -11656,7 +11664,7 @@ \subsubsection*{The JASPAR \texttt{pfm} format} %cont-doctest \begin{verbatim} >>> srf = motifs.read(open("SRF.pfm"),"pfm") ->>> print srf.counts +>>> print(srf.counts) 0 1 2 3 4 5 6 7 8 9 10 11 A: 2.00 9.00 0.00 1.00 32.00 3.00 46.00 1.00 43.00 15.00 2.00 2.00 C: 1.00 33.00 45.00 45.00 1.00 1.00 0.00 0.00 0.00 1.00 0.00 1.00 @@ -11667,15 +11675,15 @@ \subsubsection*{The JASPAR \texttt{pfm} format} As this motif was created from the counts matrix directly, it has no instances associated with it: %cont-doctest \begin{verbatim} ->>> print srf.instances +>>> print(srf.instances) None \end{verbatim} We can now ask for the consensus sequence of these two motifs: %cont-doctest \begin{verbatim} ->>> print arnt.counts.consensus +>>> print(arnt.counts.consensus) CACGTG ->>> 
print srf.counts.consensus +>>> print(srf.counts.consensus) GCCCATATATGG \end{verbatim} @@ -11705,7 +11713,7 @@ \subsubsection*{The JASPAR format \texttt{jaspar}} \begin{verbatim} >>> fh = open("jaspar_motifs.txt") >>> for m in motifs.parse(fh, "jaspar")) -... print m +... print(m) TF name Arnt Matrix ID MA0004.1 Matrix: @@ -11765,7 +11773,7 @@ \subsubsection*{Accessing the JASPAR database} \end{verbatim} Printing the motif reveals that the JASPAR SQL database stores much more meta-information than the flat files: \begin{verbatim} ->>> print arnt +>>> print(arnt) TF name Arnt Matrix ID MA0004.1 Collection CORE @@ -11791,7 +11799,7 @@ \subsubsection*{Accessing the JASPAR database} We can also fetch motifs by name. The name must be an exact match (partial matches or database wildcards are not currently supported). Note that as the name is not guaranteed to be unique, the \verb+fetch_motifs_by_name+ method actually returns a list. \begin{verbatim} >>> motifs = jdb.fetch_motifs_by_name("Arnt") ->>> print motifs[0] +>>> print(motifs[0]) TF name Arnt Matrix ID MA0004.1 Collection CORE @@ -11852,6 +11860,7 @@ \subsubsection*{Compatibility with Perl TFBS modules} >>> rel_score = (abs_score - pssm.min) / (pssm.max - pssm.min) \end{verbatim} For example, using the Arnt motif before, let's search a sequence with a relative score threshold of 0.8. +%TODO - Check missing ... lines, make into a doctest? \begin{verbatim} >>> test_seq=Seq("TAAGCGTGCACGCGCAACACGTGCATTA", unambiguous_dna) >>> arnt.pseudocounts = motifs.jaspar.calculate_pseudocounts(arnt) @@ -11862,8 +11871,8 @@ \subsubsection*{Compatibility with Perl TFBS modules} >>> for position, score in pssm.search(test_seq, threshold=abs_score_threshold): ... rel_score = (score - min_score) / (max_score - min_score) -... print "Position %d: score = %5.3f, rel. score = %5.3f" % ( - position, score, rel_score) +... print("Position %d: score = %5.3f, rel. score = %5.3f" % ( + position, score, rel_score)) ... 
Position 2: score = 5.362, rel. score = 0.801 Position 8: score = 6.112, rel. score = 0.831 @@ -11958,9 +11967,9 @@ \subsection{MEME} >>> len(record) 2 >>> motif = record[0] ->>> print motif.consensus +>>> print(motif.consensus) TTCACATGCCGC ->>> print motif.degenerate_consensus +>>> print(motif.degenerate_consensus) TTCACATGSCNC \end{verbatim} In addition to these generic motif attributes, each motif also stores its @@ -11972,7 +11981,7 @@ \subsection{MEME} >>> motif.length 12 >>> evalue = motif.evalue ->>> print "%3.1g" % evalue +>>> print("%3.1g" % evalue) 0.2 >>> motif.name 'Motif 1' @@ -12005,7 +12014,7 @@ \subsection{MEME} \end{verbatim} %Sadly Python 2.5 on Windows gives 1.85e-008 breaking doctest: \begin{verbatim} ->>> print "%5.3g" % pvalue +>>> print("%5.3g" % pvalue) 1.85e-08 \end{verbatim} @@ -12139,7 +12148,7 @@ \subsection{TRANSFAC} Printing the motifs writes them out in their native TRANSFAC format: %cont-doctest \begin{verbatim} ->>> print record +>>> print(record) VV EXAMPLE January 15, 2013 XX // @@ -12192,7 +12201,7 @@ \section{Writing motifs} We can use the \verb+format+ method to write the motif in the simple JASPAR \verb+pfm+ format: %the tabs in the output confuse doctest; don't test \begin{verbatim} ->>> print arnt.format("pfm") +>>> print(arnt.format("pfm")) 4.00 19.00 0.00 0.00 0.00 0.00 16.00 0.00 20.00 0.00 0.00 0.00 0.00 1.00 0.00 20.00 0.00 20.00 @@ -12200,7 +12209,7 @@ \section{Writing motifs} \end{verbatim} Similarly, we can use \verb+format+ to write the motif in the JASPAR \verb+jaspar+ format: \begin{verbatim} ->>> print arnt.format("jaspar") +>>> print(arnt.format("jaspar")) >MA0004.1 Arnt A [ 4.00 19.00 0.00 0.00 0.00 0.00] C [ 16.00 0.00 20.00 0.00 0.00 0.00] @@ -12211,7 +12220,7 @@ \section{Writing motifs} To write the motif in a TRANSFAC-like matrix format, use %cont-doctest \begin{verbatim} ->>> print m.format("transfac") +>>> print(m.format("transfac")) P0 A C G T 01 3 0 0 4 W 02 7 0 0 0 A @@ -12228,7 +12237,7 @@ 
\section{Writing motifs} %cont-doctest \begin{verbatim} >>> two_motifs = [arnt, srf] ->>> print motifs.write(two_motifs, 'transfac') +>>> print(motifs.write(two_motifs, 'transfac')) P0 A C G T 01 4 16 0 0 C 02 19 0 1 0 A @@ -12259,7 +12268,7 @@ \section{Writing motifs} Or, to write multiple motifs in the \verb+jaspar+ format: \begin{verbatim} >>> two_motifs = [arnt, mef2a] ->>> print motifs.write(two_motifs, "jaspar") +>>> print(motifs.write(two_motifs, "jaspar")) >MA0004.1 Arnt A [ 4.00 19.00 0.00 0.00 0.00 0.00] C [ 16.00 0.00 20.00 0.00 0.00 0.00] @@ -12292,7 +12301,7 @@ \section{Position-Weight Matrices} %cont-doctest \begin{verbatim} >>> pwm = m.counts.normalize(pseudocounts=0.5) ->>> print pwm +>>> print(pwm) 0 1 2 3 4 A: 0.39 0.83 0.06 0.28 0.17 C: 0.06 0.06 0.61 0.28 0.72 @@ -12307,7 +12316,7 @@ \section{Position-Weight Matrices} %cont-doctest \begin{verbatim} >>> pwm = m.counts.normalize(pseudocounts={'A':0.6, 'C': 0.4, 'G': 0.4, 'T': 0.6}) ->>> print pwm +>>> print(pwm) 0 1 2 3 4 A: 0.40 0.84 0.07 0.29 0.18 C: 0.04 0.04 0.60 0.27 0.71 @@ -12339,7 +12348,7 @@ \section{Position-Weight Matrices} %cont-doctest \begin{verbatim} >>> rpwm = pwm.reverse_complement() ->>> print rpwm +>>> print(rpwm) 0 1 2 3 4 A: 0.07 0.07 0.29 0.07 0.51 C: 0.04 0.38 0.04 0.04 0.04 @@ -12358,7 +12367,7 @@ \section{Position-Specific Scoring Matrices} %cont-doctest \begin{verbatim} >>> pssm = pwm.log_odds() ->>> print pssm +>>> print(pssm) 0 1 2 3 4 A: 0.68 1.76 -1.91 0.21 -0.49 C: -2.49 -2.49 1.26 0.09 1.51 @@ -12379,7 +12388,7 @@ \section{Position-Specific Scoring Matrices} \begin{verbatim} >>> background = {'A':0.3,'C':0.2,'G':0.2,'T':0.3} >>> pssm = pwm.log_odds(background) ->>> print pssm +>>> print(pssm) 0 1 2 3 4 A: 0.42 1.49 -2.17 -0.05 -0.75 C: -2.17 -2.17 1.58 0.42 1.83 @@ -12392,9 +12401,9 @@ \section{Position-Specific Scoring Matrices} \verb+.max+ and \verb+.min+ properties: %cont-doctest \begin{verbatim} ->>> print "%4.2f" % pssm.max +>>> print("%4.2f" % pssm.max) 6.59 
->>> print "%4.2f" % pssm.min +>>> print("%4.2f" % pssm.min) -10.85 \end{verbatim} @@ -12404,7 +12413,7 @@ \section{Position-Specific Scoring Matrices} \begin{verbatim} >>> mean = pssm.mean(background) >>> std = pssm.std(background) ->>> print "mean = %0.2f, standard deviation = %0.2f" % (mean, std) +>>> print("mean = %0.2f, standard deviation = %0.2f" % (mean, std)) mean = 3.21, standard deviation = 2.59 \end{verbatim} A uniform background is used if \verb+background+ is not specified. @@ -12437,7 +12446,7 @@ \subsection{Searching for exact matches} %cont-doctest \begin{verbatim} >>> for pos,seq in m.instances.search(test_seq): -... print pos, seq +... print(pos, seq) ... 0 TACAC 10 TACAA @@ -12447,7 +12456,7 @@ \subsection{Searching for exact matches} %cont-doctest \begin{verbatim} >>> for pos,seq in r.instances.search(test_seq): -... print pos, seq +... print(pos, seq) ... 6 GCATT 20 GCATT @@ -12459,7 +12468,7 @@ \subsection{Searching for matches using the PSSM score} %cont-doctest \begin{verbatim} >>> for position, score in pssm.search(test_seq, threshold=3.0): -... print "Position %d: score = %5.3f" % (position, score) +... print("Position %d: score = %5.3f" % (position, score)) ... 
Position 0: score = 5.622 Position -20: score = 4.601 @@ -12520,21 +12529,21 @@ \subsection{Selecting a score threshold} %cont-doctest \begin{verbatim} >>> threshold = distribution.threshold_fpr(0.01) ->>> print "%5.3f" % threshold +>>> print("%5.3f" % threshold) 4.009 \end{verbatim} or the false-negative rate (probability of ``not finding'' an instance generated from the motif): %cont-doctest \begin{verbatim} >>> threshold = distribution.threshold_fnr(0.1) ->>> print "%5.3f" % threshold +>>> print("%5.3f" % threshold) -0.510 \end{verbatim} or a threshold (approximately) satisfying some relation between the false-positive rate and the false-negative rate ($\frac{\textrm{fnr}}{\textrm{fpr}}\simeq t$): %cont-doctest \begin{verbatim} >>> threshold = distribution.threshold_balanced(1000) ->>> print "%5.3f" % threshold +>>> print("%5.3f" % threshold) 6.241 \end{verbatim} or a threshold satisfying (roughly) the equality between the @@ -12543,7 +12552,7 @@ \subsection{Selecting a score threshold} %cont-doctest \begin{verbatim} >>> threshold = distribution.threshold_patser() ->>> print "%5.3f" % threshold +>>> print("%5.3f" % threshold) 0.346 \end{verbatim} @@ -12553,10 +12562,10 @@ \subsection{Selecting a score threshold} %cont-doctest \begin{verbatim} >>> threshold = distribution.threshold_fpr(0.01) ->>> print "%5.3f" % threshold +>>> print("%5.3f" % threshold) 4.009 >>> for position, score in pssm.search(test_seq,threshold=threshold): -... print "Position %d: score = %5.3f" % (position, score) +... print("Position %d: score = %5.3f" % (position, score)) ... 
Position 0: score = 5.622 Position -20: score = 4.601 @@ -12573,14 +12582,14 @@ \section{Each motif object has an associated Position-Specific Scoring Matrix} >>> from Bio import motifs >>> handle = open("Arnt.sites") >>> motif = motifs.read(handle, 'sites') ->>> print motif.counts +>>> print(motif.counts) 0 1 2 3 4 5 A: 4.00 19.00 0.00 0.00 0.00 0.00 C: 16.00 0.00 20.00 0.00 0.00 0.00 G: 0.00 1.00 0.00 20.00 0.00 20.00 T: 0.00 0.00 0.00 0.00 20.00 0.00 ->>> print motif.pwm +>>> print(motif.pwm) 0 1 2 3 4 5 A: 0.20 0.95 0.00 0.00 0.00 0.00 C: 0.80 0.00 1.00 0.00 0.00 0.00 @@ -12590,7 +12599,7 @@ \section{Each motif object has an associated Position-Specific Scoring Matrix} \end{verbatim} %Can't use next bit in doctest, Windows Python 2.5 and 2.6 put -1.$ not -inf \begin{verbatim} ->>> print motif.pssm +>>> print(motif.pssm) 0 1 2 3 4 5 A: -0.32 1.93 -inf -inf -inf -inf C: 1.68 -inf 2.00 -inf -inf -inf @@ -12602,7 +12611,7 @@ \section{Each motif object has an associated Position-Specific Scoring Matrix} %cont-doctest \begin{verbatim} >>> for letter in "ACGT": -... print "%s: %4.2f" % (letter, motif.pseudocounts[letter]) +... print("%s: %4.2f" % (letter, motif.pseudocounts[letter])) ... A: 0.00 C: 0.00 @@ -12614,7 +12623,7 @@ \section{Each motif object has an associated Position-Specific Scoring Matrix} \begin{verbatim} >>> motif.pseudocounts = 3.0 >>> for letter in "ACGT": -... print "%s: %4.2f" % (letter, motif.pseudocounts[letter]) +... print("%s: %4.2f" % (letter, motif.pseudocounts[letter])) ... A: 3.00 C: 3.00 @@ -12624,7 +12633,7 @@ \section{Each motif object has an associated Position-Specific Scoring Matrix} %Can't use this in doctest, Windows Python 2.5 and 2.6 give G/1 as 0.13 not 0.12 %TODO - Check why... 
\begin{verbatim} ->>> print motif.pwm +>>> print(motif.pwm) 0 1 2 3 4 5 A: 0.22 0.69 0.09 0.09 0.09 0.09 C: 0.59 0.09 0.72 0.09 0.09 0.09 @@ -12634,7 +12643,7 @@ \section{Each motif object has an associated Position-Specific Scoring Matrix} \end{verbatim} %cont-doctest \begin{verbatim} ->>> print motif.pssm +>>> print(motif.pssm) 0 1 2 3 4 5 A: -0.19 1.46 -1.42 -1.42 -1.42 -1.42 C: 1.25 -1.42 1.52 -1.42 -1.42 -1.42 @@ -12648,7 +12657,7 @@ \section{Each motif object has an associated Position-Specific Scoring Matrix} %cont-doctest \begin{verbatim} >>> for letter in "ACGT": -... print "%s: %4.2f" % (letter, motif.background[letter]) +... print("%s: %4.2f" % (letter, motif.background[letter])) ... A: 0.25 C: 0.25 @@ -12659,7 +12668,7 @@ \section{Each motif object has an associated Position-Specific Scoring Matrix} %cont-doctest \begin{verbatim} >>> motif.background = {'A': 0.2, 'C': 0.3, 'G': 0.3, 'T': 0.2} ->>> print motif.pssm +>>> print(motif.pssm) 0 1 2 3 4 5 A: 0.13 1.78 -1.09 -1.09 -1.09 -1.09 C: 0.98 -1.68 1.26 -1.68 -1.68 -1.68 @@ -12672,7 +12681,7 @@ \section{Each motif object has an associated Position-Specific Scoring Matrix} \begin{verbatim} >>> motif.background = None >>> for letter in "ACGT": -... print "%s: %4.2f" % (letter, motif.background[letter]) +... print("%s: %4.2f" % (letter, motif.background[letter])) ... A: 0.25 C: 0.25 @@ -12684,7 +12693,7 @@ \section{Each motif object has an associated Position-Specific Scoring Matrix} \begin{verbatim} >>> motif.background = 0.8 >>> for letter in "ACGT": -... print "%s: %4.2f" % (letter, motif.background[letter]) +... print("%s: %4.2f" % (letter, motif.background[letter])) ... 
A: 0.10 C: 0.40 @@ -12694,13 +12703,13 @@ \section{Each motif object has an associated Position-Specific Scoring Matrix} Note that you can now calculate the mean of the PSSM scores over the background against which it was computed: %cont-doctest \begin{verbatim} ->>> print "%f" % motif.pssm.mean(motif.background) +>>> print("%f" % motif.pssm.mean(motif.background)) 4.703928 \end{verbatim} as well as its standard deviation: %cont-doctest \begin{verbatim} ->>> print "%f" % motif.pssm.std(motif.background) +>>> print("%f" % motif.pssm.std(motif.background)) 3.290900 \end{verbatim} and its distribution: @@ -12708,7 +12717,7 @@ \section{Each motif object has an associated Position-Specific Scoring Matrix} \begin{verbatim} >>> distribution = motif.pssm.distribution(background=motif.background) >>> threshold = distribution.threshold_fpr(0.01) ->>> print "%f" % threshold +>>> print("%f" % threshold) 3.854375 \end{verbatim} @@ -12744,7 +12753,7 @@ \section{Comparing motifs} >>> m_reb1 = motifs.read(open("REB1.pfm"), "pfm") >>> m_reb1.consensus Seq('GTTACCCGG', IUPACUnambiguousDNA()) ->>> print m_reb1.counts +>>> print(m_reb1.counts) 0 1 2 3 4 5 6 7 8 A: 30.00 0.00 0.00 100.00 0.00 0.00 0.00 0.00 15.00 C: 10.00 0.00 0.00 0.00 100.00 100.00 100.00 0.00 15.00 @@ -12759,7 +12768,7 @@ \section{Comparing motifs} >>> m_reb1.pseudocounts = {'A':0.6, 'C': 0.4, 'G': 0.4, 'T': 0.6} >>> m_reb1.background = {'A':0.3,'C':0.2,'G':0.2,'T':0.3} >>> pssm_reb1 = m_reb1.pssm ->>> print pssm_reb1 +>>> print(pssm_reb1) 0 1 2 3 4 5 6 7 8 A: 0.00 -5.67 -5.67 1.72 -5.67 -5.67 -5.67 -5.67 -0.97 C: -0.97 -5.67 -5.67 -5.67 2.30 2.30 2.30 -5.67 -0.41 @@ -12773,9 +12782,9 @@ \section{Comparing motifs} %cont-doctest \begin{verbatim} >>> distance, offset = pssm.dist_pearson(pssm_reb1) ->>> print "distance = %5.3g" % distance +>>> print("distance = %5.3g" % distance) distance = 0.239 ->>> print offset +>>> print(offset) -2 \end{verbatim} This means that the best PCC between motif \verb|m| and \verb|m_reb1| 
is obtained with the following alignment: @@ -12956,7 +12965,7 @@ \subsection{Motif objects} >>> m.reverse_complement().consensus() Seq('TTATA', IUPACUnambiguousDNA()) >>> for i in m.reverse_complement().instances: -... print i +... print(i) TTATA TAATA TTATA @@ -12966,7 +12975,7 @@ \subsection{Motif objects} We can also calculate the information content of a motif with a simple call: %cont-doctest \begin{verbatim} ->>> print "%0.2f" % m.ic() +>>> print("%0.2f" % m.ic()) 5.27 \end{verbatim} This gives us a number of bits of information provided by the motif, @@ -13110,7 +13119,7 @@ \subsubsection{Reading and writing} Speaking of exporting, let's look at export functions. We can export to fasta: \begin{verbatim} ->>> print m.format("fasta") +>>> print(m.format("fasta")) >instance0 TATAA >instance1 @@ -13122,7 +13131,7 @@ \subsubsection{Reading and writing} \end{verbatim} or to TRANSFAC-like matrix format (used by some motif processing software) \begin{verbatim} ->>> print m.format("transfac") +>>> print(m.format("transfac")) XX TY Motif ID @@ -13155,7 +13164,7 @@ \subsection{Searching for instances} the true instances of the motif: \begin{verbatim} >>> for pos,seq in m.search_instances(test_seq): -... print pos,seq.tostring() +... print(pos, seq.tostring()) ... 10 TATAA 15 TATAA @@ -13164,7 +13173,7 @@ \subsection{Searching for instances} We can do the same with the reverse complement (to find instances on the complementary strand): \begin{verbatim} >>> for pos,seq in m.reverse_complement().search_instances(test_seq): -... print pos,seq.tostring() +... print(pos, seq.tostring()) ... 12 TAATA 20 TTATA @@ -13173,7 +13182,7 @@ \subsection{Searching for instances} It's just as easy to look for positions, giving rise to high log-odds scores against our motif: \begin{verbatim} >>> for pos,score in m.search_pwm(test_seq,threshold=5.0): -... print pos,score +... print(pos, score) ... 
10 8.44065060871 -12 7.06213898545 @@ -13225,7 +13234,7 @@ \subsection{Searching for instances} instances with balanced threshold with rate of $1000$. \begin{verbatim} >>> for pos,score in m.search_pwm(test_seq,threshold=sd.threshold_balanced(1000)): -... print pos,score +... print(pos, score) ... 10 8.44065060871 15 8.44065060871 @@ -13842,7 +13851,7 @@ \subsection*{Representing a hierarchical clustering solution} >>> from Bio.Cluster import Node, Tree >>> nodes = [Node(1,2,0.2), Node(0,3,0.5), Node(-2,4,0.6), Node(-1,-3,0.9)] >>> tree = Tree(nodes) ->>> print tree +>>> print(tree) (1, 2): 0.2 (0, 3): 0.5 (-2, 4): 0.6 @@ -13876,7 +13885,7 @@ \subsection*{Representing a hierarchical clustering solution} \begin{verbatim} >>> tree = Tree([Node(1,2,0.1), Node(0,-1,0.5), Node(-2,3,0.9)]) ->>> print tree +>>> print(tree) (1, 2): 0.1 (0, -1): 0.5 (-2, 3): 0.9 @@ -13884,7 +13893,7 @@ \subsection*{Representing a hierarchical clustering solution} >>> nodes[0] = Node(0,1,0.2) >>> nodes[1].left = 2 >>> tree = Tree(nodes) ->>> print tree +>>> print(tree) (0, 1): 0.2 (2, -1): 0.5 (-2, 3): 0.9 @@ -14465,7 +14474,7 @@ \subsection{Training the logistic regression model} \begin{verbatim} >>> def show_progress(iteration, loglikelihood): - print "Iteration:", iteration, "Log-likelihood function:", loglikelihood + print("Iteration:", iteration, "Log-likelihood function:", loglikelihood) >>> >>> model = LogisticRegression.train(xs, ys, update_fn=show_progress) Iteration: 0 Log-likelihood function: -11.7835020695 @@ -14536,9 +14545,9 @@ \subsection{Using the logistic regression model for classification} The logistic regression model classifies {\it yxcE}, {\it yxcD} as belonging to the same operon (class OP), while {\it yxiB}, {\it yxiA} are predicted to belong to different operons: \begin{verbatim} ->>> print "yxcE, yxcD:", LogisticRegression.classify(model, [6,-173.143442352]) +>>> print("yxcE, yxcD:", LogisticRegression.classify(model, [6,-173.143442352])) yxcE, yxcD: 1 ->>> print 
"yxiB, yxiA:", LogisticRegression.classify(model, [309, -271.005880394]) +>>> print("yxiB, yxiA:", LogisticRegression.classify(model, [309, -271.005880394])) yxiB, yxiA: 0 \end{verbatim} (which, by the way, agrees with the biological literature). @@ -14546,20 +14555,20 @@ \subsection{Using the logistic regression model for classification} To find out how confident we can be in these predictions, we can call the \verb+calculate+ function to obtain the probabilities (equations (\ref{eq:OP}) and \ref{eq:NOP}) for class OP and NOP. For {\it yxcE}, {\it yxcD} we find \begin{verbatim} >>> q, p = LogisticRegression.calculate(model, [6,-173.143442352]) ->>> print "class OP: probability =", p, "class NOP: probability =", q +>>> print("class OP: probability =", p, "class NOP: probability =", q) class OP: probability = 0.993242163503 class NOP: probability = 0.00675783649744 \end{verbatim} and for {\it yxiB}, {\it yxiA} \begin{verbatim} >>> q, p = LogisticRegression.calculate(model, [309, -271.005880394]) ->>> print "class OP: probability =", p, "class NOP: probability =", q +>>> print("class OP: probability =", p, "class NOP: probability =", q) class OP: probability = 0.000321211251817 class NOP: probability = 0.999678788748 \end{verbatim} To get some idea of the prediction accuracy of the logistic regression model, we can apply it to the training data: \begin{verbatim} >>> for i in range(len(ys)): - print "True:", ys[i], "Predicted:", LogisticRegression.classify(model, xs[i]) + print("True:", ys[i], "Predicted:", LogisticRegression.classify(model, xs[i])) True: 1 Predicted: 1 True: 1 Predicted: 0 True: 1 Predicted: 1 @@ -14582,7 +14591,7 @@ \subsection{Using the logistic regression model for classification} \begin{verbatim} >>> for i in range(len(ys)): model = LogisticRegression.train(xs[:i]+xs[i+1:], ys[:i]+ys[i+1:]) - print "True:", ys[i], "Predicted:", LogisticRegression.classify(model, xs[i]) + print("True:", ys[i], "Predicted:", LogisticRegression.classify(model, 
xs[i])) True: 1 Predicted: 1 True: 1 Predicted: 0 True: 1 Predicted: 1 @@ -14638,10 +14647,10 @@ \subsection{Using a $k$-nearest neighbors model for classification} For the example of the gene pairs {\it yxcE}, {\it yxcD} and {\it yxiB}, {\it yxiA}, we find: \begin{verbatim} >>> x = [6, -173.143442352] ->>> print "yxcE, yxcD:", kNN.classify(model, x) +>>> print("yxcE, yxcD:", kNN.classify(model, x)) yxcE, yxcD: 1 >>> x = [309, -271.005880394] ->>> print "yxiB, yxiA:", kNN.classify(model, x) +>>> print("yxiB, yxiA:", kNN.classify(model, x)) yxiB, yxiA: 0 \end{verbatim} In agreement with the logistic regression model, {\it yxcE}, {\it yxcD} are classified as belonging to the same operon (class OP), while {\it yxiB}, {\it yxiA} are predicted to belong to different operons. @@ -14656,7 +14665,7 @@ \subsection{Using a $k$-nearest neighbors model for classification} ... return distance ... >>> x = [6, -173.143442352] ->>> print "yxcE, yxcD:", kNN.classify(model, x, distance_fn = cityblock) +>>> print("yxcE, yxcD:", kNN.classify(model, x, distance_fn = cityblock)) yxcE, yxcD: 1 \end{verbatim} @@ -14669,7 +14678,7 @@ \subsection{Using a $k$-nearest neighbors model for classification} ... return exp(-abs(x1[0]-x2[0]) - abs(x1[1]-x2[1])) ... >>> x = [6, -173.143442352] ->>> print "yxcE, yxcD:", kNN.classify(model, x, weight_fn = weight) +>>> print("yxcE, yxcD:", kNN.classify(model, x, weight_fn = weight)) yxcE, yxcD: 1 \end{verbatim} By default, all neighbors are given an equal weight. @@ -14678,7 +14687,7 @@ \subsection{Using a $k$-nearest neighbors model for classification} \begin{verbatim} >>> x = [6, -173.143442352] >>> weight = kNN.calculate(model, x) ->>> print "class OP: weight =", weight[0], "class NOP: weight =", weight[1] +>>> print("class OP: weight =", weight[0], "class NOP: weight =", weight[1]) class OP: weight = 0.0 class NOP: weight = 3.0 \end{verbatim} which means that all three neighbors of \verb+x1+, \verb+x2+ are in the NOP class. 
As another example, for {\it yesK}, {\it yesL} we find @@ -14686,7 +14695,7 @@ \subsection{Using a $k$-nearest neighbors model for classification} \begin{verbatim} >>> x = [117, -267.14] >>> weight = kNN.calculate(model, x) ->>> print "class OP: weight =", weight[0], "class NOP: weight =", weight[1] +>>> print("class OP: weight =", weight[0], "class NOP: weight =", weight[1]) class OP: weight = 2.0 class NOP: weight = 1.0 \end{verbatim} which means that two neighbors are operon pairs and one neighbor is a non-operon pair. @@ -14694,7 +14703,7 @@ \subsection{Using a $k$-nearest neighbors model for classification} To get some idea of the prediction accuracy of the $k$-nearest neighbors approach, we can apply it to the training data: \begin{verbatim} >>> for i in range(len(ys)): - print "True:", ys[i], "Predicted:", kNN.classify(model, xs[i]) + print("True:", ys[i], "Predicted:", kNN.classify(model, xs[i])) True: 1 Predicted: 1 True: 1 Predicted: 0 True: 1 Predicted: 1 @@ -14717,7 +14726,7 @@ \subsection{Using a $k$-nearest neighbors model for classification} \begin{verbatim} >>> for i in range(len(ys)): model = kNN.train(xs[:i]+xs[i+1:], ys[:i]+ys[i+1:]) - print "True:", ys[i], "Predicted:", kNN.classify(model, xs[i]) + print("True:", ys[i], "Predicted:", kNN.classify(model, xs[i])) True: 1 Predicted: 1 True: 1 Predicted: 0 True: 1 Predicted: 1 @@ -15706,7 +15715,7 @@ \subsection{Simple Chromosomes} ("Chr V", "CHR_V/NC_003076.fna")] for (name, filename) in entries: record = SeqIO.read(filename,"fasta") - print name, len(record) + print(name, len(record)) \end{verbatim} \noindent This gave the lengths of the five chromosomes, which we'll now use in @@ -15887,12 +15896,12 @@ \subsection{Filtering a sequence file} id_file = "short_list.txt" output_file = "short_list.sff" wanted = set(line.rstrip("\n").split(None,1)[0] for line in open(id_file)) -print "Found %i unique identifiers in %s" % (len(wanted), id_file) +print("Found %i unique identifiers in %s" % (len(wanted), 
id_file)) records = (r for r in SeqIO.parse(input_file, "sff") if r.id in wanted) count = SeqIO.write(records, output_file, "sff") -print "Saved %i records from %s to %s" % (count, input_file, output_file) +print("Saved %i records from %s to %s" % (count, input_file, output_file)) if count < len(wanted): - print "Warning %i IDs not found in %s" % (len(wanted)-count, input_file) + print("Warning %i IDs not found in %s" % (len(wanted)-count, input_file)) \end{verbatim} Note that we use a Python \verb|set| rather than a \verb|list|, this makes @@ -16053,7 +16062,7 @@ \subsection{Making the sequences in a FASTA file upper case} from Bio import SeqIO records = (rec.upper() for rec in SeqIO.parse("mixed.fas", "fasta")) count = SeqIO.write(records, "upper.fas", "fasta") -print "Converted %i records to upper case" % count +print("Converted %i records to upper case" % count) \end{verbatim} How does this work? The first line is just importing the \verb|Bio.SeqIO| @@ -16178,7 +16187,7 @@ \subsection{Simple quality filtering for FASTQ files} count = 0 for rec in SeqIO.parse("SRR020192.fastq", "fastq"): count += 1 -print "%i reads" % count +print("%i reads" % count) \end{verbatim} \noindent Now let's do a simple filtering for a minimum PHRED quality of 20: @@ -16189,7 +16198,7 @@ \subsection{Simple quality filtering for FASTQ files} SeqIO.parse("SRR020192.fastq", "fastq") \ if min(rec.letter_annotations["phred_quality"]) >= 20) count = SeqIO.write(good_reads, "good_quality.fastq", "fastq") -print "Saved %i reads" % count +print("Saved %i reads" % count) \end{verbatim} \noindent This pulled out only $14580$ reads out of the $41892$ present. 
@@ -16220,7 +16229,7 @@ \subsection{Trimming off primer sequences} SeqIO.parse("SRR020192.fastq", "fastq") \ if rec.seq.startswith("GATGACGGTGT")) count = SeqIO.write(primer_reads, "with_primer.fastq", "fastq") -print "Saved %i reads" % count +print("Saved %i reads" % count) \end{verbatim} \noindent That should find $13819$ reads from \texttt{SRR020192.fastq} and save them to @@ -16237,7 +16246,7 @@ \subsection{Trimming off primer sequences} SeqIO.parse("SRR020192.fastq", "fastq") \ if rec.seq.startswith("GATGACGGTGT")) count = SeqIO.write(trimmed_primer_reads, "with_primer_trimmed.fastq", "fastq") -print "Saved %i reads" % count +print("Saved %i reads" % count) \end{verbatim} \noindent Again, that should pull out the $13819$ reads from \texttt{SRR020192.fastq}, @@ -16260,7 +16269,7 @@ \subsection{Trimming off primer sequences} trimmed_reads = (trim_primer(record, "GATGACGGTGT") for record in \ SeqIO.parse("SRR020192.fastq", "fastq")) count = SeqIO.write(trimmed_reads, "trimmed.fastq", "fastq") -print "Saved %i reads" % count +print("Saved %i reads" % count) \end{verbatim} This takes longer, as this time the output file contains all $41892$ reads. 
@@ -16286,7 +16295,7 @@ \subsection{Trimming off primer sequences} original_reads = SeqIO.parse("SRR020192.fastq", "fastq") trimmed_reads = trim_primers(original_reads, "GATGACGGTGT") count = SeqIO.write(trimmed_reads, "trimmed.fastq", "fastq") -print "Saved %i reads" % count +print("Saved %i reads" % count) \end{verbatim} This form is more flexible if you want to do something more complicated @@ -16325,7 +16334,7 @@ \subsection{Trimming off adaptor sequences} original_reads = SeqIO.parse("SRR020192.fastq", "fastq") trimmed_reads = trim_adaptors(original_reads, "GATGACGGTGT") count = SeqIO.write(trimmed_reads, "trimmed.fastq", "fastq") -print "Saved %i reads" % count +print("Saved %i reads" % count) \end{verbatim} Because we are using a FASTQ input file in this example, the \verb|SeqRecord| @@ -16365,7 +16374,7 @@ \subsection{Trimming off adaptor sequences} original_reads = SeqIO.parse("SRR020192.fastq", "fastq") trimmed_reads = trim_adaptors(original_reads, "GATGACGGTGT", 100) count = SeqIO.write(trimmed_reads, "trimmed.fastq", "fastq") -print "Saved %i reads" % count +print("Saved %i reads" % count) \end{verbatim} By changing the format names, you could apply this to FASTA files instead. @@ -16501,7 +16510,7 @@ \subsection{Converting FASTA and QUAL files into FASTQ files} \begin{verbatim} from Bio.SeqIO.QualityIO import PairedFastaQualIterator for record in PairedFastaQualIterator(open("example.fasta"), open("example.qual")): - print record + print(record) \end{verbatim} This function will check that the FASTA and QUAL files are consistent (e.g. @@ -16516,7 +16525,7 @@ \subsection{Converting FASTA and QUAL files into FASTQ files} records = PairedFastaQualIterator(open("example.fasta"), open("example.qual")) count = SeqIO.write(records, handle, "fastq") handle.close() -print "Converted %i records" % count +print("Converted %i records" % count) \end{verbatim} \subsection{Indexing a FASTQ file} @@ -16651,8 +16660,8 @@ \subsection{Identifying open reading frames} ... 
length = 3 * ((len(record)-frame) // 3) #Multiple of three ... for pro in nuc[frame:frame+length].translate(table).split("*"): ... if len(pro) >= min_pro_len: -... print "%s...%s - length %i, strand %i, frame %i" \ -... % (pro[:30], pro[-3:], len(pro), strand, frame) +... print("%s...%s - length %i, strand %i, frame %i" \ +... % (pro[:30], pro[-3:], len(pro), strand, frame)) GCLMKKSSIVATIITILSGSANAASSQLIP...YRF - length 315, strand 1, frame 0 KSGELRQTPPASSTLHLRLILQRSGVMMEL...NPE - length 285, strand 1, frame 1 GLNCSFFSICNWKFIDYINRLFQIIYLCKN...YYH - length 176, strand 1, frame 1 @@ -16716,8 +16725,8 @@ \subsection{Identifying open reading frames} orf_list = find_orfs_with_trans(record.seq, table, min_pro_len) for start, end, strand, pro in orf_list: - print "%s...%s - length %i, strand %i, %i:%i" \ - % (pro[:30], pro[-3:], len(pro), strand, start, end) + print("%s...%s - length %i, strand %i, %i:%i" \ + % (pro[:30], pro[-3:], len(pro), strand, start, end)) \end{verbatim} \noindent And the output: @@ -17001,7 +17010,7 @@ \subsection{Nucleotide dot plots} #Now find any sub-sequences found in both sequences #(Python 2.3 would require slightly different code here) matches = set(dict_one).intersection(dict_two) -print "%i unique matches" % len(matches) +print("%i unique matches" % len(matches)) \end{verbatim} \noindent In order to use the \verb|pylab.scatter()| we need separate lists for the $x$ and $y$ co-ordinates: \begin{verbatim} @@ -17080,7 +17089,7 @@ \subsection{Plotting the quality scores of sequencing read data} pylab.ylabel("PHRED quality score") pylab.xlabel("Position") pylab.savefig("SRR001666.png") -print "Done" +print("Done") \end{verbatim} You should note that we are using the \verb|Bio.SeqIO| format name \texttt{fastq} @@ -17210,7 +17219,9 @@ \subsection{Position Specific Score Matrices} \end{enumerate} -The command above returns a \verb|PSSM| object. 
To print out the PSSM as we showed above, we simply need to do a \verb|print my_pssm|, which gives: +The command above returns a \verb|PSSM| object. +To print out the PSSM as shown above, +we simply need to do a \verb|print(my_pssm)|, which gives: \begin{verbatim} A C G T @@ -17228,7 +17239,7 @@ \subsection{Position Specific Score Matrices} You can access any element of the PSSM by subscripting like \verb|your_pssm[sequence_number][residue_count_name]|. For instance, to get the counts for the 'A' residue in the second element of the above PSSM you would do: \begin{verbatim} ->>> print my_pssm[1]["A"] +>>> print(my_pssm[1]["A"]) 7.0 \end{verbatim} @@ -17655,12 +17666,13 @@ \subsection{Writing a print-and-compare test} module could look as follows: \begin{verbatim} +from __future__ import print_function from Bio import Biospam -print "2 + 3 =", Biospam.addition(2, 3) -print "9 - 1 =", Biospam.addition(9, -1) -print "2 * 3 =", Biospam.multiplication(2, 3) -print "9 * (- 1) =", Biospam.multiplication(9, -1) +print("2 + 3 =", Biospam.addition(2, 3)) +print("9 - 1 =", Biospam.addition(9, -1)) +print("2 * 3 =", Biospam.multiplication(2, 3)) +print("9 * (- 1) =", Biospam.multiplication(9, -1)) \end{verbatim} We generate the corresponding output with \verb|python run_tests.py -g test_Biospam.py|, and check the output file \verb|output/test_Biospam|: @@ -18363,7 +18375,7 @@ \section{What the heck is a handle?} \begin{verbatim} from Bio import SeqIO for record in SeqIO.parse("m_cold.fasta", "fasta"): - print record.id, len(record) + print(record.id, len(record)) \end{verbatim} On older versions of Biopython you had to use a handle, e.g. 
@@ -18372,7 +18384,7 @@ \section{What the heck is a handle?} from Bio import SeqIO handle = open("m_cold.fasta", "r") for record in SeqIO.parse(handle, "fasta"): - print record.id, len(record) + print(record.id, len(record)) handle.close() \end{verbatim} @@ -18384,7 +18396,7 @@ \section{What the heck is a handle?} from Bio import SeqIO handle = gzip.open("m_cold.fasta.gz") for record in SeqIO.parse(handle, "fasta"): - print record.id, len(record) + print(record.id, len(record)) handle.close() \end{verbatim} @@ -18400,17 +18412,17 @@ \subsection{Creating a handle from a string} %doctest \begin{verbatim} >>> my_info = 'A string\n with multiple lines.' ->>> print my_info +>>> print(my_info) A string with multiple lines. >>> from StringIO import StringIO >>> my_info_handle = StringIO(my_info) >>> first_line = my_info_handle.readline() ->>> print first_line +>>> print(first_line) A string >>> second_line = my_info_handle.readline() ->>> print second_line +>>> print(second_line) with multiple lines. \end{verbatim} diff --git a/Tests/test_Tutorial.py b/Tests/test_Tutorial.py index 8e8b4d9679c..b39caf3f700 100644 --- a/Tests/test_Tutorial.py +++ b/Tests/test_Tutorial.py @@ -3,6 +3,9 @@ # license. Please see the LICENSE file that should have been included # as part of this package. 
+# This will apply to all the doctests too: +from __future__ import print_function + import unittest import doctest import os @@ -14,9 +17,15 @@ if sys.version_info[0] >= 3: from lib2to3 import refactor - rt = refactor.RefactoringTool(refactor.get_fixers_from_package("lib2to3.fixes")) - assert rt.refactor_docstring(">>> print 2+2\n4\n", "example") == \ - ">>> print(2+2)\n4\n" + fixers = refactor.get_fixers_from_package("lib2to3.fixes") + fixers.remove("lib2to3.fixes.fix_print") # Already using print function + rt = refactor.RefactoringTool(fixers) + assert rt.refactor_docstring(">>> print(2+2)\n4\n", "example1") == \ + ">>> print(2+2)\n4\n" + assert rt.refactor_docstring('>>> print("Two plus two is", 2+2)\n' + 'Two plus two is 4\n', "example2") == \ + '>>> print("Two plus two is", 2+2)\nTwo plus two is 4\n' + tutorial = os.path.join(os.path.dirname(sys.argv[0]), "../Doc/Tutorial.tex") if not os.path.isfile(tutorial) and sys.version_info[0] >= 3: @@ -117,6 +126,7 @@ def check_deps(dependencies): continue if sys.version_info[0] >= 3: + example = ">>> from __future__ import print_function\n" + example example = rt.refactor_docstring(example, name) def funct(n, d, f):