## Learning objectives


1. Understanding assert statements

2. Parsing FASTA files

3. Writing a Python class

4. Creating an iterator

5. Writing a k-mer counter

---




## Defensive programming




In [0]:
def test(a):
    assert a > 5, 'Value is too small'
    print('Value is big enough')

# A value above the threshold passes the assertion
test(10)

# A value below the threshold is expected to fail the assertion. Catch the
# error and print it so the demonstration stays visible without stopping
# "Run All" before the remaining cells execute.
try:
    test(2)
except AssertionError as error:
    print('AssertionError:', error)

## Parsing a FASTA sequence


FASTA format


<br><div style="background: #EEE"> \>sequence 1<br> AGATCTCCCTGAGAGAAGAGCTCTCTCTCGA<br> TCTCGGATTACGTAGGCTAGAGAGAGAGCTA<br> TTCAA<br> \>sequence 2<br> GATCTCGGGATAAAAAAACTGGGATCTGATC<br> ATCTAAAGAGAG </div><br>




In [0]:
# Let's have it accept an open file object
# That way it can be passed a file or standard input

def single_FASTAReader(file):
    """Read the first record from an open FASTA file object.

    Taking an already-open file object (rather than a path) lets the caller
    pass either a regular file or standard input.

    Args:
        file: Object with a ``readline()`` method returning text lines.

    Returns:
        A ``(seq_id, sequence)`` tuple. ``seq_id`` is the header line without
        its leading '>' and trailing newline characters; ``sequence`` is the
        following lines, each stripped of surrounding whitespace and joined
        into a single string.

    Raises:
        AssertionError: If the first line does not start with '>'.
    """
    header = file.readline()

    # A FASTA record must open with a '>' header line
    assert header.startswith('>'), "Not a FASTA file"
    seq_id = header[1:].rstrip('\r\n')

    # Collect sequence chunks until the file ends or the next header appears.
    # Note that the next header line, if any, is consumed from the file.
    chunks = []
    for row in iter(file.readline, ''):
        if row.startswith('>'):
            break
        chunks.append(row.strip())

    return (seq_id, ''.join(chunks))

In [0]:
# Open the file in a with-block so the handle is closed as soon as the
# first record has been read, instead of being left open.
with open('subset.fa') as fasta_file:
    name, seq = single_FASTAReader(fasta_file)
print(name, seq)

## Parsing a FASTA file




In [0]:
def FASTAReader(file):
    """Read every record from an open FASTA file object into a list.

    Args:
        file: Object with a ``readline()`` method returning text lines.

    Returns:
        A list of ``(seq_id, sequence)`` tuples in file order. Each
        ``seq_id`` is a header line without its leading '>' and trailing
        newline characters; each ``sequence`` is the lines that follow that
        header, stripped of surrounding whitespace and joined together.

    Raises:
        AssertionError: If the first line does not start with '>'.
    """
    header = file.readline()

    # A FASTA file must open with a '>' header line
    assert header.startswith('>'), "Not a FASTA file"

    records = []
    current_id = header[1:].rstrip('\r\n')
    chunks = []

    # readline() returns '' only at end of file, which ends the loop
    for row in iter(file.readline, ''):
        if not row.startswith('>'):
            # Ordinary line: one more piece of the current sequence
            chunks.append(row.strip())
            continue

        # New header: store the finished record, then start the next one
        records.append((current_id, ''.join(chunks)))
        current_id = row[1:].rstrip('\r\n')
        chunks = []

    # The final record has no following header to trigger its storage
    records.append((current_id, ''.join(chunks)))
    return records

In [0]:
# The whole file is parsed inside the with-block, so the handle can be
# closed before the results are inspected.
with open('subset.fa') as fasta_file:
    seqs = FASTAReader(fasta_file)
print(len(seqs))
print(seqs[0])
print(seqs[1])

## Python Classes




In [0]:
class OurClass:
    """Minimal class whose constructor prints a message."""

    def __init__(self):
        # __init__ runs automatically each time OurClass() is called
        print('created')


instance = OurClass()

In [0]:
class Rect:
    """Rectangle described by a width and a height."""

    def __init__(self, width, height):
        """Store the two side lengths on the instance."""
        self.width, self.height = width, height

    def area(self):
        """Return the width multiplied by the height."""
        w, h = self.width, self.height
        return w * h


R = Rect(5, 10)
print(R.area())

In [0]:
class Iterator:
    """Iterator over the values from ``start`` up to, but excluding, ``stop``.

    Implements the iterator protocol by hand: ``__iter__`` returns the
    object itself and ``__next__`` produces one value per call.
    """

    def __init__(self, start, stop):
        self.start, self.stop = start, stop
        # Begin one step before start, because __next__ advances the
        # counter before returning it
        self.current = start - 1

    def __iter__(self):
        return self

    def __next__(self):
        upcoming = self.current + 1
        self.current = upcoming
        if upcoming >= self.stop:
            # Signals the for-loop machinery that iteration is finished
            raise StopIteration
        return upcoming


I = Iterator(0, 10)
for i in I:
    print(i)

## FASTA iterator




In [0]:
class FASTAReader:
    """Iterator yielding one ``(seq_id, sequence)`` pair per FASTA record.

    Records are read lazily from an open file object, so only one record is
    held at a time. Reading a record consumes the header line of the next
    one; that header's ID is kept in ``last_id`` for the following call.

    Attributes:
        file: Open file object with a ``readline()`` method.
        last_id: ID from the most recently consumed look-ahead header, or
            None until such a header has been seen.
        eof: True once the end of the file has been reached.
    """

    def __init__(self, file):
        self.file = file
        self.last_id = None
        self.eof = False

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next ``(seq_id, sequence)`` tuple.

        Raises:
            StopIteration: When the end of the file was reached previously.
            AssertionError: If the first line read does not start with '>'.
        """
        if self.eof:
            raise StopIteration

        if self.last_id is not None:
            # The header was already consumed while reading the prior record
            seq_id = self.last_id
        else:
            # No header seen yet: the next line must be a FASTA header
            header = self.file.readline()
            assert header.startswith(">"), "Not a FASTA file"
            seq_id = header[1:].rstrip("\r\n")

        chunks = []
        # readline() returns "" only at end of file, which ends the loop
        for row in iter(self.file.readline, ""):
            if row.startswith(">"):
                # Next record begins: remember its ID for the following call
                self.last_id = row[1:].rstrip("\r\n")
                break
            chunks.append(row.strip())
        else:
            # Loop ran out of lines without meeting another header
            self.eof = True

        return seq_id, "".join(chunks)

# The reader pulls lines lazily, so the loop must run inside the with-block;
# the file is closed automatically once iteration has finished.
with open('subset.fa') as fasta_file:
    reader = FASTAReader(fasta_file)
    for seq_id, seq in reader:
        print(seq_id, seq)

## K-mer counting




In [0]:
# Length of the subsequences (k-mers) to count
k = 11

# Maps each k-mer to the number of times it occurs across all sequences
kmers = {}

# The reader pulls lines lazily, so counting happens inside the with-block;
# the file is closed automatically once every record has been processed.
with open('subset.fa') as fasta_file:
    reader = FASTAReader(fasta_file)
    for seq_id, sequence in reader:
        # A sequence of length n contains n - k + 1 windows of size k. The
        # "+ 1" includes the final k-mer, which range(len(sequence) - k)
        # would skip. Sequences shorter than k give an empty range.
        for i in range(len(sequence) - k + 1):
            kmer = sequence[i:i + k]
            kmers[kmer] = kmers.get(kmer, 0) + 1

for key in kmers:
    print(key, kmers[key])

## Importing functions and classes from other scripts


