Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Fetching contributors…

Cannot retrieve contributors at this time

637 lines (510 sloc) 23.223 kB
#!/usr/bin/env python
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Test the different representations of Genes.
This exercises the Motif, Schema and Signature methods of representing
genes, as well as generic Pattern methods.
"""
# standard library
import os
import unittest
# Biopython
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
# stuff we are testing
from Bio.NeuralNetwork.Gene import Schema
from Bio.NeuralNetwork.Gene import Motif
from Bio.NeuralNetwork.Gene import Signature
from Bio.NeuralNetwork.Gene import Pattern
# When set to a true value, the slow SchemaFinderTest searches are actually
# performed and the SchemaFactoryTest tests print the schemas they generate.
VERBOSE = 0
# --- Tests for Pattern
class PatternIOTest(unittest.TestCase):
    """Tests for reading and writing patterns to a file.

    Every test round-trips patterns through ``Pattern.PatternIO`` using a
    scratch file that is removed both before and after each test.
    """

    def setUp(self):
        """Build the PatternIO under test and clear any stale output file."""
        self.alphabet = IUPAC.ambiguous_dna
        self.test_file = os.path.join("NeuralNetwork", "patternio.txt")
        # Remove any existing copy of the output file,
        if os.path.isfile(self.test_file):
            os.remove(self.test_file)
        self.pattern_io = Pattern.PatternIO(self.alphabet)

    def tearDown(self):
        """Clean up by removing our output file."""
        if os.path.isfile(self.test_file):
            os.remove(self.test_file)

    def _write_patterns(self, patterns, as_seqs=False):
        """Write patterns to the scratch file, closing it even on failure.

        When as_seqs is true the patterns are written with ``write_seq``
        rather than ``write``.
        """
        if as_seqs:
            writer = self.pattern_io.write_seq
        else:
            writer = self.pattern_io.write
        with open(self.test_file, "w") as output_handle:
            writer(patterns, output_handle)

    def _read_patterns(self):
        """Read the scratch file back, closing it even on failure."""
        with open(self.test_file, "r") as input_handle:
            return self.pattern_io.read(input_handle)

    def test_motif(self):
        """Reading and writing motifs to a file
        """
        motifs = ["GAC", "AAA", "TTT", "GGG"]

        # write plain strings to a file and read 'em back
        self._write_patterns(motifs)
        read_motifs = self._read_patterns()
        self.assertEqual(read_motifs, motifs,
                         "Failed to get back expected motifs %s, got %s"
                         % (motifs, read_motifs))

        # write the same motifs as Seq objects and read them back
        seq_motifs = [Seq(motif, self.alphabet) for motif in motifs]
        self._write_patterns(seq_motifs, as_seqs=True)
        read_motifs = self._read_patterns()
        self.assertEqual(read_motifs, motifs,
                         "Failed to get back expected motifs %s from seqs, "
                         "got %s" % (motifs, read_motifs))

    def test_schema(self):
        """Reading and writing schemas to a file.
        """
        schemas = ["GTR", "GAC"]
        self._write_patterns(schemas)
        read_schemas = self._read_patterns()
        self.assertEqual(schemas, read_schemas,
                         "Read incorrect schemas %s, expected %s."
                         % (read_schemas, schemas))

        # --- make sure inappropriate alphabets are reported
        # '*' is not a letter of the ambiguous DNA alphabet used in setUp
        schemas = ["GTR", "G*C"]
        self._write_patterns(schemas)
        try:
            self._read_patterns()
        except ValueError:
            pass  # expected behavior
        else:
            # Reported outside the try block so this failure cannot be
            # caught and relabelled by an exception handler; any other
            # exception type propagates with its own traceback.
            self.fail("Did not report error on bad alphabet.")

    def test_signature(self):
        """Reading and writing signatures to a file.
        """
        signatures = [("GAC", "GAC"), ("AAA", "TTT")]
        self._write_patterns(signatures)
        read_sigs = self._read_patterns()
        self.assertEqual(read_sigs, signatures,
                         "Got back unexpected signatures %s, wanted %s"
                         % (read_sigs, signatures))
class PatternRepositoryTest(unittest.TestCase):
    """Tests for retrieving info from a repository of patterns.
    """

    def setUp(self):
        """Build a repository from five motifs with distinct counts."""
        self.motifs = {"GATC": 30,
                       "GGGG": 10,
                       "GTAG": 0,
                       "AAAA": -10,
                       "ATAT": -20}
        self.repository = Pattern.PatternRepository(self.motifs)

    def _check_patterns(self, found, wanted_count):
        """Check that found has wanted_count items, all of them known motifs."""
        assert len(found) == wanted_count, \
            "Got unexpected number of patterns %s, expected %s" \
            % (len(found), wanted_count)
        for item in found:
            assert item in self.motifs, \
                "Got unexpected pattern %s" % item

    def test_get_all(self):
        """Retrieve all patterns from a repository.
        """
        everything = self.repository.get_all()
        wanted = ["GATC", "GGGG", "GTAG", "AAAA", "ATAT"]
        assert everything == wanted, \
            "Unexpected motifs returned %s" % everything

    def test_get_random(self):
        """Retrieve random patterns from the repository.
        """
        for wanted_count in range(5):
            self._check_patterns(self.repository.get_random(wanted_count),
                                 wanted_count)

    def test_get_top_percentage(self):
        """Retrieve the top percentage of patterns from the repository.
        """
        # (number of patterns expected, fraction requested)
        cases = ((1, 0.2), (2, .4), (5, 1.0))
        for wanted_count, fraction in cases:
            found = self.repository.get_top_percentage(fraction)
            self._check_patterns(found, wanted_count)

    def test_get_top(self):
        """Retrieve a certain number of the top patterns.
        """
        for wanted_count in range(5):
            self._check_patterns(self.repository.get_top(wanted_count),
                                 wanted_count)

    def test_get_differing(self):
        """Retrieve patterns from both sides of the list (top and bottom).
        """
        found = self.repository.get_differing(2, 2)
        assert found == ["GATC", "GGGG", "AAAA", "ATAT"], \
            "Got unexpected patterns %s" % found

    def test_remove_polyA(self):
        """Test the ability to remove A rich patterns from the repository.
        """
        before = self.repository.get_all()
        assert len(before) == 5, "Unexpected starting: %s" % before

        self.repository.remove_polyA()

        after = self.repository.get_all()
        assert len(after) == 3, "Unexpected ending: %s" % after
        assert after == ["GATC", "GGGG", "GTAG"], \
            "Unexpected patterns: %s" % after

    def test_count(self):
        """Retrieve counts for particular patterns in the repository.
        """
        present = self.repository.count("GGGG")
        assert present == 10, \
            "Did not count item in the respository: %s" % present

        absent = self.repository.count("NOT_IN_THERE")
        assert absent == 0, \
            "Counted items not in repository: %s" % absent
# --- Tests for motifs
class MotifFinderTest(unittest.TestCase):
    """Tests for finding motifs from sequences.
    """

    def setUp(self):
        """Load the two FASTA record sets and create the MotifFinder."""
        test_file = os.path.join('NeuralNetwork', 'enolase.fasta')
        diff_file = os.path.join('NeuralNetwork', 'repeat.fasta')
        self.test_records = []
        self.diff_records = []

        # load the records; iterating the parser directly works on both
        # Python 2 and 3 (the iterator's .next() method is Python 2 only),
        # and the with-block closes the handle even if parsing fails
        for filename, records in ((test_file, self.test_records),
                                  (diff_file, self.diff_records)):
            with open(filename, 'r') as handle:
                records.extend(SeqIO.parse(handle, "fasta",
                                           alphabet=IUPAC.unambiguous_dna))

        self.motif_finder = Motif.MotifFinder()

    def test_find(self):
        """Find all motifs in a set of sequences.
        """
        motif_repository = self.motif_finder.find(self.test_records, 8)
        top_motif = motif_repository.get_top(1)
        self.assertEqual(top_motif[0], 'TTGGAAAG',
                         "Got unexpected motif %s" % top_motif[0])

    def test_find_differences(self):
        """Find the difference in motif counts between two sets of sequences.
        """
        motif_repository = \
            self.motif_finder.find_differences(self.test_records,
                                               self.diff_records, 8)
        top, bottom = motif_repository.get_differing(1, 1)
        self.assertEqual(top, "TTGGAAAG",
                         "Got unexpected top motif %s" % top)
        self.assertEqual(bottom, "AATGGCAT",
                         "Got unexpected bottom motif %s" % bottom)
class MotifCoderTest(unittest.TestCase):
    """Test the ability to encode sequences as a set of motifs.
    """

    def setUp(self):
        """Create a MotifCoder and the sequences it should encode."""
        self.coder = Motif.MotifCoder(["GAG", "GAT", "GCC", "ATA"])
        # (sequence to encode, representation expected from the coder)
        self.match_strings = (
            ("GATCGCC", [0.0, 1.0, 1.0, 0.0]),
            ("GATGATCGAGCC", [.5, 1.0, .5, 0.0]),
        )

    def test_representation(self):
        """Convert a sequence into its motif representation.
        """
        for sequence, wanted in self.match_strings:
            found = self.coder.representation(
                Seq(sequence, IUPAC.unambiguous_dna))
            assert found == wanted, \
                "Did not match representation, expected %s, got %s" \
                % (wanted, found)
# --- Tests for schemas
class SchemaTest(unittest.TestCase):
    """Matching ambiguous motifs with multiple ambiguity characters.
    """

    def setUp(self):
        """Create a Schema with two ambiguity characters ('R' and '*')."""
        ambiguity_chars = {"G": "G",
                           "A": "A",
                           "T": "T",
                           "C": "C",
                           "R": "AG",
                           "*": "AGTC"}
        self.motif_coder = Schema.Schema(ambiguity_chars)

        self.match_string = "GATAG"
        # (motif, matches expected when searching self.match_string)
        self.match_info = [("GA", ["GA"]),
                           ("GATAG", ["GATAG"]),
                           ("GA*AG", ["GATAG"]),
                           ("GATRG", ["GATAG"]),
                           ("*A", ["GA", "TA"])]

    def test_find_matches(self):
        """Find all matches in a sequence.
        """
        for motif, expected in self.match_info:
            found_matches = self.motif_coder.find_matches(motif,
                                                          self.match_string)
            self.assertEqual(found_matches, expected,
                             "Expected %s, got %s"
                             % (expected, found_matches))

    def test_num_matches(self):
        """Find how many matches are present in a sequence.
        """
        for motif, expected in self.match_info:
            num_matches = self.motif_coder.num_matches(motif,
                                                       self.match_string)
            # The expected count comes first so the message reads correctly.
            self.assertEqual(num_matches, len(expected),
                             "Expected %s, got %s"
                             % (len(expected), num_matches))

    def test_find_ambiguous(self):
        """Find the positions of ambiguous items in a sequence.
        """
        ambig_info = (("GATC", []),
                      ("G***", [1, 2, 3]),
                      ("GART", [2]),
                      ("*R*R", [0, 1, 2, 3]))
        for motif, expected in ambig_info:
            found_positions = self.motif_coder.find_ambiguous(motif)
            self.assertEqual(found_positions, expected,
                             "Expected %s, got %s for %s"
                             % (expected, found_positions, motif))

    def test_num_ambiguous(self):
        """Find the number of ambiguous items in a sequence.
        """
        ambig_info = (("GATC", 0),
                      ("G***", 3),
                      ("GART", 1),
                      ("*R*R", 4))
        for motif, expected in ambig_info:
            found_num = self.motif_coder.num_ambiguous(motif)
            self.assertEqual(found_num, expected,
                             "Expected %s, got %s for %s"
                             % (expected, found_num, motif))

    def test_motif_cache(self):
        """Make sure motif compiled regular expressions are cached properly.
        """
        test_motif = "GATC"

        self.motif_coder.find_matches(test_motif, "GATCGATC")
        self.assertTrue(test_motif in self.motif_coder._motif_cache,
                        "Did not find motif cached properly.")

        # make sure we don't bomb out if we use the same motif twice
        self.motif_coder.find_matches(test_motif, "GATCGATC")

    def test_all_unambiguous(self):
        """Return all unambiguous characters that can be in a motif.
        """
        found_unambig = self.motif_coder.all_unambiguous()
        expected = ["A", "C", "G", "T"]
        self.assertEqual(found_unambig, expected,
                         "Got %s, expected %s" % (found_unambig, expected))
class SchemaFinderTest(unittest.TestCase):
    """Test finding schemas from a set of sequences.

    Both tests are slow, so they only run when the module-level VERBOSE
    flag is true; otherwise they are reported as skipped.
    """

    def setUp(self):
        """Load the two FASTA record sets and configure the SchemaFinder."""
        test_file = os.path.join('NeuralNetwork', 'enolase.fasta')
        diff_file = os.path.join('NeuralNetwork', 'repeat.fasta')
        self.test_records = []
        self.diff_records = []

        # load the records, closing each handle even if parsing fails
        for filename, records in ((test_file, self.test_records),
                                  (diff_file, self.diff_records)):
            with open(filename, 'r') as handle:
                records.extend(SeqIO.parse(handle, "fasta",
                                           alphabet=IUPAC.unambiguous_dna))

        self.num_schemas = 2
        schema_ga = Schema.GeneticAlgorithmFinder()
        schema_ga.min_generations = 1
        self.finder = Schema.SchemaFinder(num_schemas=self.num_schemas,
                                          schema_finder=schema_ga)

    def test_find(self):
        """Find schemas from sequence inputs.
        """
        # this test takes too long; report it as skipped rather than
        # letting it pass without having checked anything
        if not VERBOSE:
            self.skipTest("slow test, only run when VERBOSE is set")
        repository = self.finder.find(self.test_records + self.diff_records)
        schemas = repository.get_all()
        self.assertTrue(len(schemas) >= self.num_schemas,
                        "Got too few schemas.")

    def test_find_differences(self):
        """Find schemas that differentiate between two sets of sequences.
        """
        # this test takes too long; report it as skipped rather than
        # letting it pass without having checked anything
        if not VERBOSE:
            self.skipTest("slow test, only run when VERBOSE is set")
        repository = self.finder.find_differences(self.test_records,
                                                  self.diff_records)
        schemas = repository.get_all()
        self.assertTrue(len(schemas) >= self.num_schemas,
                        "Got too few schemas.")
class SchemaCoderTest(unittest.TestCase):
    """Test encoding sequences as a grouping of motifs.
    """

    def setUp(self):
        """Create a SchemaCoder over five motifs plus the expected codings."""
        ambiguity_chars = {"G": "G",
                           "A": "A",
                           "T": "T",
                           "C": "C",
                           "R": "AG",
                           "*": "AGTC"}
        schema = Schema.Schema(ambiguity_chars)
        self.motif_coder = Schema.SchemaCoder(
            ("GA", "GATAG", "GA*AG", "GATRG", "*A"), schema)

        # (sequence to encode, representation expected from the coder)
        three_quarters = float(3) / float(4)
        one_quarter = float(1) / float(4)
        self.match_strings = [
            ("GATAG", [.5, .5, .5, .5, 1.0]),
            ("GAGAGATA", [three_quarters, 0, one_quarter, 0, 1]),
        ]

    def test_representation(self):
        """Convert a string into a representation of motifs.
        """
        for sequence, wanted in self.match_strings:
            found = self.motif_coder.representation(
                Seq(sequence, IUPAC.unambiguous_dna))
            assert found == wanted, \
                "Got %s, expected %s" % (found, wanted)
class SchemaMatchingTest(unittest.TestCase):
    """Matching schema to strings works correctly.
    """
    # NOTE: shortDescription() emits the class docstring above, so its text
    # is part of the test runner's output.

    def shortDescription(self):
        # Label the test with the class name and the class docstring.
        return "%s:%s" % (type(self).__name__, self.__doc__)

    def runTest(self):
        # (first argument, second argument, expected result, failure message)
        cases = (
            ("GATC", "AAAAA", 0,
             "Expected no match because of length differences"),
            ("GATC", "GAT*", 1, "Expected match"),
            ("GATC", "GATC", 1, "Expected match"),
            ("GATC", "C*TC", 0,
             "Expected no match because of char mismatch."),
            ("G*TC", "*TTC", 1, "Expected match because of ambiguity."),
        )
        for first, second, wanted, message in cases:
            outcome = Schema.matches_schema(first, second)
            assert outcome == wanted, message
class SchemaFactoryTest(unittest.TestCase):
    """Test the SchemaFactory for generating Schemas.
    """

    # A cached schema bank, so we don't have to load it multiple times.
    # It lives on the class because unittest creates a separate TestCase
    # instance for every test method, so a per-instance cache would never
    # be reused between tests.
    schema_bank = None

    def __init__(self, method):
        # Kept so the constructor signature stays the same for callers.
        unittest.TestCase.__init__(self, method)

    def setUp(self):
        """Create the factory, the FASTA path and the Schema used for coding."""
        self.factory = Schema.SchemaFactory()

        self.test_file = os.path.join(os.getcwd(), "NeuralNetwork",
                                      "enolase.fasta")

        ambiguity_chars = {"G": "G",
                           "A": "A",
                           "T": "T",
                           "C": "C",
                           "R": "AG",
                           "*": "AGTC"}
        self.schema = Schema.Schema(ambiguity_chars)

    def test_easy_from_motifs(self):
        """Generating schema from a simple list of motifs.
        """
        motifs = {"GATCGAA": 20,
                  "GATCGAT": 15,
                  "GATTGAC": 25,
                  "TTTTTTT": 10}
        motif_bank = Pattern.PatternRepository(motifs)

        schema_bank = self.factory.from_motifs(motif_bank, .5, 2)
        if VERBOSE:
            # single-argument print(...) behaves the same on Python 2 and 3
            print("\nSchemas:")
            for schema in schema_bank.get_all():
                print("%s: %s" % (schema, schema_bank.count(schema)))

    def test_hard_from_motifs(self):
        """Generating schema from a real life set of motifs.
        """
        schema_bank = self._load_schema_repository()

        if VERBOSE:
            print("\nSchemas:")
            for schema in schema_bank.get_top(5):
                print("%s: %s" % (schema, schema_bank.count(schema)))

    def _load_schema_repository(self):
        """Helper function to load a schema repository from a file.

        This also caches a schema bank, to prevent having to do this
        time consuming operation multiple times.
        """
        # if we already have a cached repository, return it
        if self.schema_bank is not None:
            return self.schema_bank

        # otherwise, we'll read in a new schema bank

        # read in the all of the motif records
        with open(self.test_file, 'r') as motif_handle:
            seq_records = list(SeqIO.parse(motif_handle, "fasta",
                                           alphabet=IUPAC.unambiguous_dna))

        # find motifs from the file
        motif_finder = Motif.MotifFinder()
        motif_size = 9
        motif_bank = motif_finder.find(seq_records, motif_size)
        schema_bank = self.factory.from_motifs(motif_bank, .1, 2)

        # cache the repository on the class so other test instances reuse it
        type(self).schema_bank = schema_bank
        return schema_bank

    def test_schema_representation(self):
        """Convert sequences into schema representations.
        """
        # get a set of schemas we want to code the sequence in
        schema_bank = self._load_schema_repository()
        top_schemas = schema_bank.get_top(25)
        schema_coder = Schema.SchemaCoder(top_schemas, self.schema)

        # get the sequences one at a time, and encode them
        with open(self.test_file, 'r') as fasta_handle:
            for seq_record in SeqIO.parse(fasta_handle, "fasta",
                                          alphabet=IUPAC.unambiguous_dna):
                schema_values = schema_coder.representation(seq_record.seq)
                if VERBOSE:
                    print("Schema values: %s" % (schema_values,))
# --- Tests for Signatures
class SignatureFinderTest(unittest.TestCase):
    """Test the ability to find signatures in a set of sequences.
    """

    def setUp(self):
        """Load the enolase FASTA records and create the SignatureFinder."""
        test_file = os.path.join('NeuralNetwork', 'enolase.fasta')

        # load the records, closing the handle even if parsing fails
        with open(test_file, 'r') as handle:
            self.test_records = list(
                SeqIO.parse(handle, "fasta",
                            alphabet=IUPAC.unambiguous_dna))

        self.sig_finder = Signature.SignatureFinder()

    def test_find(self):
        """Find signatures from sequence inputs.
        """
        repository = self.sig_finder.find(self.test_records, 6, 9)
        top_sig = repository.get_top(1)
        self.assertEqual(top_sig[0], ('TTGGAA', 'TGGAAA'),
                         "Unexpected signature %s" % (top_sig[0],))
class SignatureCoderTest(unittest.TestCase):
    """Test the ability to encode sequences as a set of signatures.
    """

    def setUp(self):
        """Create a SignatureCoder and the sequences it should encode."""
        self.coder = Signature.SignatureCoder(
            [("GAC", "GAC"), ("AAA", "TTT"), ("CAA", "TTG")], 9)
        # (sequence to encode, representation expected from the coder)
        one_third = 1.0 / 3.0
        self.test_seqs = [
            ("GACAAAGACTTT", [1.0, 1.0, 0.0]),
            ("CAAAGACGACTTTAAATTT", [0.5, 1.0, 0.0]),
            ("AAATTTAAAGACTTTGAC", [one_third, 1.0, 0.0]),
            ("GACGAC", [1.0, 0.0, 0.0]),
            ("GACAAAAAAAAAGAC", [1.0, 0.0, 0.0]),
            ("GACAAAAAAAAAAGAC", [0.0, 0.0, 0.0]),
        ]

    def test_representation(self):
        """Convert a sequence into its signature representation.
        """
        for sequence, wanted in self.test_seqs:
            found = self.coder.representation(
                Seq(sequence, IUPAC.unambiguous_dna))
            assert found == wanted, \
                "Non-expected representation %s for %s, wanted %s" \
                % (found, sequence, wanted)
if __name__ == "__main__":
    # Run every test in this module with per-test (verbosity 2) output.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))
Jump to Line
Something went wrong with that request. Please try again.