In [3]:
import os
import sys
from pprint import pprint
import pprint as pp               # brought in for pformat, can remove or reformat later

# Location of the example FASTA file used by every cell below.
# Set the FASTA_FILE_PATH environment variable to point at a copy elsewhere;
# when it is unset, the original machine-specific path is used.
path_to_fasta_file = os.environ.get(
    'FASTA_FILE_PATH',
    '/home/alakhani/Coursera-Projects/PythonforGenomicDataScience/FASTA_Files/dna.example_copy.fasta',
)

In [4]:
# A `with` block closes the file automatically when the block is left.
# Bonus: every retry reopens the file, so there is no need to call
# file.seek(0) before reading it again.

with open(path_to_fasta_file, 'r') as fasta_file:
    fileaslist = fasta_file.readlines()   # one list element per line, newlines kept
# pprint(fileaslist)

In [5]:
# Leaving the `with` block closes the file, but the name `fasta_file`
# still refers to the (closed) file object.
print(type(fasta_file), sys.getsizeof(fasta_file), sep='\n')

# The name only goes away when it is deleted explicitly.
del fasta_file
# print(type(fasta_file))

<class '_io.TextIOWrapper'>
208


In [6]:
# Every FASTA record starts with a description line whose first character
# is '>'. Count those lines rather than every '>' character, because a
# description may itself contain '>'. Iterating over the file also avoids
# loading it into memory all at once.

with open(path_to_fasta_file, 'r') as fasta_file:
    count = sum(1 for line in fasta_file if line.startswith('>'))

# print(count)


In [7]:
# Building the dictionary straight from the file is compact, but it only
# gathers the header lines: there is no obvious way to collect the matching
# sequence lines in the same pass, so every value is left as None.

with open(path_to_fasta_file, 'r') as fasta_file:
    seq_dict_2 = dict.fromkeys(
        line.strip() for line in fasta_file if line.startswith(">")
    )

# print('Length of dictionary made "the python way"', len(seq_dict_2),'\n')
# pprint(seq_dict_2.items())


In [8]:
# Parse the FASTA file in a single pass: accumulate the sequence lines of the
# current record in a list and, when the next header (or end of file) is
# reached, join them and store the result under the current header.

with open(path_to_fasta_file,'r') as fasta_file:
    header = ""
    seq_accumulator = []
    sequences = {}
    for line in fasta_file:
        if line.startswith(">"):
            # A new header closes the previous record, even when that record
            # had no sequence lines. Without this, a header that directly
            # follows another header would be appended to the sequence text.
            if header:
                sequences[header] = ''.join(seq_accumulator)
            seq_accumulator.clear()
            # Drop only the leading '>' and the line ending; strip('>\n')
            # would also remove a '>' at the end of the description.
            header = line[1:].rstrip('\n')
        elif header:
            seq_accumulator.append(line.strip())
        else:
            # Data seen before any header line.
            print("Check file format. Missing header on first line")

    # The loop ends at EOF before the last record has been stored.
    # Skip the store when no header was ever seen (empty or malformed file),
    # so that no bogus '' key is created.
    if header:
        sequences[header] = ''.join(seq_accumulator)

print('Length of sequence dictionary"', len(sequences))
# print(sequences)
# print(list(sequences.items())[:1])
# print(list(sequences.items())[-2:])

Length of sequence dictionary" 25


In [11]:
# That was clunky. Is there a cleaner way with itertools?
# 1 - enumerate yields an index, so we know where the lines after a header start.
# 2 - takewhile collects lines until the next header is seen; the index from
#     enumerate is used to slice the list from the line after the header.
#
# Disadvantages: a (shallow) copy of the rest of the list is made every time a
# header is seen, and the outer loop still iterates over lines that have
# already been stored as part of a sequence.
# The lambda is also called once per collected line, which adds function-call overhead.
# The repeated copying will become a bottleneck with very large files.
#
# Note: the dictionary keys here are the full header lines, including the leading '>'.

import itertools
sequences = {}
with open(path_to_fasta_file,'r') as fasta_file:
    readlist = fasta_file.read().splitlines()
#    readlist = [line for line in fasta_file]

for index, line in enumerate(readlist):
    # startswith() is used instead of line[0] because splitlines() yields ''
    # for blank lines, and indexing an empty string raises IndexError.
    if line.startswith(">"):
        # Slice readlist from the line after this header, let takewhile yield
        # lines until the next header line, then join them into one string and
        # store it under the current header line.
        sequences[line] = ''.join(
            itertools.takewhile(lambda x: not x.startswith(">"), readlist[index + 1:])
        )

# pprint(readlist)
# pprint(sequences)

