In [None]:
import os
import sys
import itertools
from collections.abc import Sequence
from pprint import pprint as pp

path_to_fasta_file = '/home/alakhani/Coursera-Projects/PythonforGenomicDataScience/FASTA_Files/dna.example_copy.fasta'

In [None]:
# FASTA record descriptions begin with '>', 
# Simplest way of counting without doing anything else:

with open(path_to_fasta_file,'r') as fasta_file:
    count = fasta_file.read().count('>')

# print(count)


In [None]:
# Dictionary comprehension directly from file looks cool but
# does not seem amenable for collecting sequence values in the same iteration.

with open(path_to_fasta_file,'r') as fasta_file:
    seq_dict = {line.strip() : None for line in fasta_file if line[0] == ">"}
    
# print('Length of dictionary made "the python way"', len(seq_dict_2),'\n')
# pprint(seq_dict.items())


## Collecting sequences from a FASTA file into a dictionary

### The more familiar, "less pythonic" way:

In [109]:
def fasta_todict(filepath):
    with open(filepath,'r') as fasta_file:
        header = ""
        seq_accumulator = []
        sequences = {}
        for line in fasta_file:
            # For first line/header.
            if line[0] == ">" and not (header or seq_accumulator):     
                header = line.strip()
            elif line[0] == ">" and seq_accumulator:
                sequences[header] = ''.join(seq_accumulator)
                seq_accumulator.clear()
                header = line.strip()
            elif header:
                seq_accumulator.append(line.strip())
            else:
                print("Check file."
                    "Missing header on first line or two consecutive headers.")
        # Loop ends before the last sequence is paired with header.
        sequences[header] = ''.join(seq_accumulator)
        return sequences

# seqs = fasta_todict(path_to_fasta_file)    
# print('Length of sequence dictionary"', len(sequences))
# print(sequences)
# partial_list = [item[1] for item in enumerate(seqs.items()) if item[0] < 4]
# z = iter(seqs.items())
# partial_list = [next(z) for _ in range(4)]
# print(list(seqs.items())[:4])
# print(partial_list)

## That was clunky. Is there a cleaner way with itertools?

Iterators save memory with 'lazy execution', i.e. they don't start doing the thing until you ask them.
This will let us directly feed keys and values into a dictionary without reading into a list first.  \*\*\*\*

We can use a key function to tell `groupby()` to split the file into chunks of consecutive lines that either do or do not start with '>'.
`groupby()` returns tuples of `(bool, _grouper)`. `__grouper` is not the sequence chunk, it is itself an iterator that "knows" to go through the file and collect lines into chunks, when or if we ask.

We need to iterate through (call `__next__` on) the second item in each tuple to tell `__grouper` to actually start iterating through the file and retrieve the next chunk of lines.

Calling `next()` retrieves and 'expends' the next value(s) in an iterator (same thing a for loop does), so if `next()` is used in a loop those values or 'positions in the list' will get 'skipped' in the next loop.

\*\*\*\* This was my mistake! Groupby is more memory efficient than reading in the whole file at once and then making a second temporary list. But groupby is still constructing a temporary list of lines for each chunk under the hood, then we're iterating over those temporary lists to join lines. I.e., the same thing we did above.  

In [110]:
def joinlines(iterable):
    return ''.join(line.strip() for line in iterable)

def fasta_todict2(filepath):
    with open(filepath,'r') as fasta_file:
        if fasta_file.readline()[0] != '>':
            print("File needs to start with header preceded by '>'.") 
        else:
            fasta_file.seek(0)
            chunks = itertools.groupby(fasta_file, key=lambda x: x[0]=='>')
            seqdict = {joinlines(chunk[1]): joinlines(next(chunks)[1]) 
                       for chunk in chunks}
            return seqdict

# seqdict = fasta_todict2(path_to_fasta_file)

# print(list(seqdict.items())[:4])

In [111]:
# Similar to next(), we can zip an iterator with itself.
# This will also 'expend' multiple values in one iteration over the zip object.
# Note: zip() in Python 3 behaves the same as itertools.izip(). 
#       zip() in Python 2 returns the entire list at once.  
# Also, zipping with the iterator lead to unexpected behavior with the headers 
# being dropped. Had to turn chunks into a generator to work as expected.
#
# Zipping is unnecessary for this application since 
# we're generating tuples just to unpack them.


def fasta_todict3(filepath):
    with open(filepath,'r') as fasta_file:
        if fasta_file.readline()[0] != '>':
            print("File needs to start with header preceded by '>'.") 
        else:
            fasta_file.seek(0)
            chunks = (joinlines(chunk[1]) for chunk in itertools.groupby(fasta_file, key=lambda x: x[0]=='>'))
            seqdict = {chunk1: chunk2 for chunk1, chunk2 in zip(chunks,chunks)}
            return seqdict

# seqs = fasta_todict3(path_to_fasta_file)

# print(list(seqs.items())[:4])

### **Proof that all three methods yield equivalent dictionaries.**

In [112]:
seqs1 = fasta_todict(path_to_fasta_file)
seqs2 = fasta_todict2(path_to_fasta_file)
seqs3 = fasta_todict3(path_to_fasta_file)

seqs1 == seqs2 == seqs3

True

### **Did that optimization make any difference for this 59 kb file?**

In [113]:
from timeit import timeit

timeit("seqs1 = fasta_todict(path_to_fasta_file)", 
       "path_to_fasta_file = '/home/alakhani/Coursera-Projects/PythonforGenomicDataScience/FASTA_Files/dna.example_copy.fasta'\n"
"""def fasta_todict(filepath):
    with open(filepath,'r') as fasta_file:
        header = ""
        seq_accumulator = []
        sequences = {}
        for line in fasta_file:
            # For first line/header.
            if line[0] == ">" and not (header or seq_accumulator):
                header = line.strip()
            elif line[0] == ">" and seq_accumulator:
                sequences[header] = ''.join(seq_accumulator)
                seq_accumulator.clear()
                header = line.strip()
            elif header:
                seq_accumulator.append(line.strip())
            else:
                print("Check file.")
        # Loop ends before the last sequence is paired with header.
        sequences[header] = ''.join(seq_accumulator)
        return sequences""",
        number=10000)

6.436991600028705

In [114]:
from timeit import timeit

timeit("seqs2 = fasta_todict2(path_to_fasta_file)", 
       "path_to_fasta_file = '/home/alakhani/Coursera-Projects/PythonforGenomicDataScience/FASTA_Files/dna.example_copy.fasta'\n"
"""def joinlines(iterable):
    return ''.join(line.strip() for line in iterable)
def fasta_todict2(filepath):
    with open(filepath,'r') as fasta_file:
        if fasta_file.readline()[0] != '>':
            print("File needs to start with header preceded by '>'.") 
        else:
            fasta_file.seek(0)
            chunks = itertools.groupby(fasta_file, key=lambda x: x[0]=='>')
            seqdict = {joinlines(chunk[1]): joinlines(next(chunks)[1]) for 
                       chunk in chunks}
            return seqdict""",
        number=10000)

9.177164799999446

In [115]:
from timeit import timeit

timeit("seqs3 = fasta_todict3(path_to_fasta_file)", 
       "path_to_fasta_file = '/home/alakhani/Coursera-Projects/PythonforGenomicDataScience/FASTA_Files/dna.example_copy.fasta'\n"
"""def joinlines(iterable):
    return ''.join(line.strip() for line in iterable)
def fasta_todict3(filepath):
    with open(filepath,'r') as fasta_file:
        if fasta_file.readline()[0] != '>':
            print("File needs to start with header preceded by '>'.") 
        else:
            fasta_file.seek(0)
            chunks = (joinlines(chunk[1]) for chunk in 
                      itertools.groupby(fasta_file, key=lambda x: x[0]=='>'))
            seqdict = {chunk1: chunk2 for chunk1, chunk2 in zip(chunks,chunks)}
            return seqdict""",
        number=10000)

11.62209239997901

## Lol
Indeed, all that glitters is not gold. The 'dumb way' was faster and will be more readable (to me) in 6 months.

It seems the extra overhead from creating extra intermediate objects to perform disjointed nested loops, plus calling a lambda function on each line instead of a simple conditional outweighed the idea that "it's faster because it's implemented in C".

To be fair, these tools seem more oriented towards managing memory consumption, which is contradicted by copying everything into a dictionary anyway. 
If we truly needed to avoid copying data unless or until it was needed, it could make more sense to save the `__grouper` objects as dictionary values. However, since the philosophy of Python is to abstract away these types of concerns, maybe it would be unnecessary even then.
___

## Let's play with dictionaries some more

First we'll define `fasta_to_nesteddict` to create a nested dict, so we can associate additional properties with each sequence name.

Then we'll pretend we either did not plan ahead, or do not want to change the fuctionality of fasta_todict. In this case we need to replace dictionary values with a nested dictionary, while retaining the original dictionary value somewhere. 

#### This highlights one of the advantages of implementing a class rather than a dictionary. What if we want to say, store a property of a property? Class implementation will be explored later.

In [None]:
def fasta_to_nesteddict(filepath):
    '''Returns {$FASTA_HEADER: {'Sequence': $SEQUENCE}}'''
    with open(filepath,'r') as fasta_file:
        header = ""
        seq_accumulator = []
        sequences = {}
        for line in fasta_file:
            # For first line/header.
            if line[0] == ">" and not (header or seq_accumulator):     
                header = line.strip()
            elif line[0] == ">" and seq_accumulator:
                sequences[header] = {'Sequence': ''.join(seq_accumulator)}
                seq_accumulator.clear()
                header = line.strip()
            elif header:
                seq_accumulator.append(line.strip())
            else:
                print("Check file."
                    "Missing header on first line or two consecutive headers.")
        # Loop ends before the last sequence is paired with header.
        sequences[header] = {'Sequence': ''.join(seq_accumulator)}
        return sequences
    
# nestedseqdict = fasta_to_nesteddict(path_to_fasta_file)
# nestedseqdict_items = iter(nestedseqdict.items())
# print([next(nestedseqdict_items) for _ in range(4)])

## Avoid mutable default values

In the next function, two parameters are given default values of None, when the intended behavior is to have these parameters default to empty lists. 

This is due to Python's treatment of mutable default arguments:

> <https://stackoverflow.com/questions/1132941/least-astonishment-and-the-mutable-default-argument>

> <https://docs.python-guide.org/writing/gotchas/#mutable-default-arguments>

When functions are defined with default values in Python, objects are initialized with these default values. These default argument *objects*, not the defined default *values*, "stick with" the function object throughout the program. 

If an argument with a mutable default is mutated within a function, calling the function without explicitly declaring this argument will change the argument's "default value" for future function calls.
This implies a defensive programming practice of explicitly defining mutable arguments if we intend them to be empty.

In the function below we do not mutate addkeys or addvals, but it is advisable to avoid mutable defaults as a rule unless this state dependence is specifically intended.

In [None]:
# This function requires that addvals support indexing.
# It does not support 

def nestdict(currentdict, newvalname="Old val", addkeys=None, 
             addvals: Sequence=None):
    '''Replaces dict values with {newvalname: currentdict[key]}. 

    New key:val pairs can be added to each key's nested dictionary using addkeys and addvals.
    Members of addvals can be called on currentdict values.
    If addkeys is longer than addvals, unmatched members of addkeys are initialized to None.
    Unmatched members of addvals are ignored. 
    '''
    newdict = {}
    addkeys = [] if addkeys is None else addkeys
    addvals = [] if addvals is None else addvals 
    for key, val in currentdict.items():
        lvl2dict = {}
        lvl2dict[newvalname] = val
        for index, name in enumerate(addkeys):
            try:
                x = addvals[index]
            except IndexError:
                x = None
            except:
                print("addvals in nestdict must be None or a Sequence.")
                raise
            if callable(x):
                lvl2dict[name] = x(val)
            else:
                lvl2dict[name] = x
        newdict[key] = lvl2dict
    if len(addvals) > index + 1:
        print("More values than keys provided, excess values ignored.")
    return newdict

# z = nestdict(fasta_todict(path_to_fasta_file), 'sequence', ['length','ORFs'], [lambda x: len(x), None])
# z_items = iter(z.items())
# print([next(z_items) for _ in range(4)])

**Another approach**

`zip_longest()` is cleaner and allows compatibility with iterables that do not support indexing, although the latter seems to have limited benefit. 

Similar to the previous version, if addvals is longer than addkeys for some reason, key values will default to `None` once addkeys is exhausted.  
`zip_longest()` will then generate tuples with the pattern `((None, val_n-2), (None, val_n-1)...(None, val_n))`, each excess value being overwritten until only the last value in addvals is saved.  
Managing this will be left to the user. A warning cannot be based on `len` in this case because at no point is there a guarantee that addvals supports `len`.  

\#whenthedocstringisaslongasthefunction

In [None]:
def nestdict(currentdict, newvalname="Old val", addkeys=None, addvals=None):
    '''Replaces dict values with {newvalname: currentdict[key]}. 

    addkeys and addvals are zipped to generate new key:val pairs for each key's nested dictionary. 
    Members of addvals can be called on currentdict values.
    If addkeys is longer than addvals, unmatched members of addkeys are initialized to None.
    If addvals is longer than addkeys, unmatched members of addvals will be lost except for the last element. The last members of the nested dicts will be (None: addvals[n]).
    '''
    newdict = {}
    addkeys = [] if addkeys is None else addkeys
    addvals = [] if addvals is None else addvals
    for key, val in currentdict.items():
        lvl2dict = {}
        lvl2dict[newvalname] = val
        lvl2dict.update((k, v(val)) if callable(v) else (k, v)
                        for k, v in 
                        itertools.zip_longest(addkeys, addvals))
        newdict[key] = lvl2dict
    return newdict

# z = nestdict(fasta_todict(path_to_fasta_file), 'sequence', ['length','ORFs', 'more', 'keys'], {'dictionary': 'inception', 'this':'is', 'why':'classes', 'are':'probably', 'better': None}.items())
# z_items = iter(z.items())
# print([next(z_items) for _ in range(4)])

In [None]:
%whos

In [None]:
# this seems to overestimate memory of generators by iterating through generator objects.

import sys
import gc

def actualsize(input_obj):
    memory_size = 0
    ids = set()
    objects = [input_obj]
    while objects:
        new = []
        for obj in objects:
            if id(obj) not in ids:
                ids.add(id(obj))
                memory_size += sys.getsizeof(obj)
                new.append(obj)
        objects = gc.get_referents(*new)
    return memory_size
