In [65]:
import glob
import os
def read_aa_sequences(dir):
    """
    Read in all of the protein sequences from the given directory.

    Input: directory
    Output: list of protein sequences
    """
    files = glob.glob(dir + '/*.fa')

    aa_sequences = []
    # iterate over each .fa file in the given directory
    for filepath in glob.iglob(os.path.join(dir, "*.fa")):

        aa_sequences.append(read_aa_sequence(filepath))

    print("Read in %d amino acid sequences"%len(aa_sequences))

    return aa_sequences



In [66]:
import glob
import os
def read_aa_sequences_names(dir):
    """
    Read in all of the protein sequences from the given directory.

    Input: directory
    Output: list of protein sequences
    """
    files = glob.glob(dir + '/*.fa')

    aa_sequences = []
    # iterate over each .fa file in the given directory
    for filepath in glob.iglob(os.path.join(dir, "*.fa")):

        aa_sequences.append(read_aa_sequence_name(filepath))

    print("Read in %d amino acid sequences' names"%len(aa_sequences))

    return aa_sequences

In [67]:

class AASeq:
    """
    A simple class for an amino acid sequence
    """

    def __init__(self, name):
        self.name = name
        self.partialsequence = []

    # Overload the __repr__ operator to make printing simpler.
    def __repr__(self):
        return self.name

In [68]:
!pwd


/Users/lcech/BMI203/HW3_skeleton


In [69]:
import itertools
def read_aa_sequence(filepath):
    """
    Read in a single amino acid sequence given a fasta file

    Input: fasta file path
    Output: amino acid sequence instance
    """
    basename = os.path.basename(filepath)
    name = os.path.splitext(basename)

    if name[1] != ".fa":
        raise IOError("%s is not a fasta file"%filepath)

    aa_sequence = AASeq(name[0])


    # open .fa file
    with open(filepath, "r") as f:
        # iterate over each line in the file
        for line in itertools.islice(f, 0, 5):
            part_sequence = f.read() 
    part_sequence = ''.join(part_sequence.split())    
    aa_sequence.partialsequence.append(part_sequence)
       

    return aa_sequence.partialsequence 



In [70]:
import itertools
def read_aa_sequence_name(filepath):
    """
    Read in a single amino acid sequence given a fasta file

    Input: fasta file path
    Output: amino acid sequence instance
    """
    basename = os.path.basename(filepath)
    name = os.path.splitext(basename)

    if name[1] != ".fa":
        raise IOError("%s is not a fasta file"%filepath)

    aa_sequence = AASeq(name[0])


    # open .fa file
    with open(filepath, "r") as f:
        # iterate over each line in the file
        for line in itertools.islice(f, 0, 5):
            part_sequence = f.read() 
    part_sequence = ''.join(part_sequence.split())    
    aa_sequence.partialsequence.append(part_sequence)
       

    return aa_sequence

In [71]:
read_aa_sequence("/Users/lcech/BMI203/HW3_skeleton/sequences/prot-0004.fa")

['SLEAAQKSNVTSSWAKASAAWGTAGPEFFMALFDAHDDVFAKFSGLFSGAAKGTVKNTPEMAAQAQSFKGLVSNWVDNLDNAGALEGQCKTFAANHKARGISAGQLEAAFKVLSGFMKSYGGDEGAWTAVAGALMGEIEPDM']

In [81]:
seq1 = read_aa_sequence("/Users/lcech/BMI203/HW3_skeleton/sequences/prot-0050.fa")


In [85]:
read_aa_sequence("/Users/lcech/BMI203/HW3_skeleton/sequences/prot-0050.fa")

['STKKTQLQLEHLLLDLQMILNGINNYKNPKLTRMLTFKFYMPKKATELKHLQCLEEELKPLEEVLNLAQSKNFHLRPRDLISNINVIVLELKCEYADETATIVEFLNRWITFAQSIISTLT']

In [74]:
type(seq1)

list

In [75]:
type('SLEAAQKSNVTSSWAKASAAWGTAGPEFFMALFDAHDDVFAKFSGLFSGAAKGTVKNTPE'
    )

str

In [76]:
print(seq1)

['SLEAAQKSNVTSSWAKASAAWGTAGPEFFMALFDAHDDVFAKFSGLFSGAAKGTVKNTPEMAAQAQSFKGLVSNWVDNLDNAGALEGQCKTFAANHKARGISAGQLEAAFKVLSGFMKSYGGDEGAWTAVAGALMGEIEPDM']


In [77]:
!pwd


/Users/lcech/BMI203/HW3_skeleton


In [78]:
path_to_seq = "/Users/lcech/BMI203/HW3_skeleton/sequences/"

In [79]:
seq = read_aa_sequences(path_to_seq)

Read in 182 amino acid sequences


In [80]:
print(seq)

[['KVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL'], ['IINGYEAYTGLFPYQAGLDITLQDQRRVWCGGSLIDNKWILTAAHCVHDAVSVVVYLGSAVQYEGEAVVNSERIISHSMFNPDTYLNDVALIKIPHVEYTDNIQPIRLPSGEELNNKFENIWATVSGWGQSNTDTVILQYTYNLVIDNDRCAQEYPPGIIVESTICGDTSDGKSPCFGDSGGPFVLSDKNLLIGVVSFVSGAGCESGKPVGFSRVTSYMDWIQQNTGIKF'], ['FRNPTVAPTHDVTTDRSQRLTLRFIPVDREDTAYSYKARFTLAVGDNRVLDMASTYFDIRGVLDRGPTFKPYSGTAYNALAPKGAPNSCEWEQTEDSGRAVAEDEEEEDEDEEEEEEEQNARDQATKKTHVYAQAPLSGETITKSGLQIGSDNAETQAKPVYADPSYQPEPQIGESQWNEADANAAGGRVLKKTTPMKPCYGSYARPTNPFGGQSVLVPDEKGVPLPKVKPKVVLYSEDVNMETPDTHLSYKPGKGDENSKAMLGQQSMPNRPNYIAFRDNFIGLMYYNSTGNMGVLAGQASQLNAVVDLQDRNTELSYQLLLDSIGDRTRYFSMWNQAVDSYDPDVRIIENHGTEDELPNYCFPLGGIGVTDTYQAIKANGNGSGDNGDTTWTKDETFATRNEIGVGNNFAMEINLNANLWRNFLYSNIALYLPDKLKYNPTNVEISDNPNTYDYMNKRVVAPGLVDCYINLGARWSLDYMDNVNPFNHHRNAGLRYRSMLLGNGRYVPFHIQVPQKFFAIKNLLLLPGSYTYEWNFRKDVNMVLQSSLGNDLRVDGASIKFDSICLYATFFPMAHNTASTLEAMLRNDTNDQSFNDYLSAANMLYPIPANATNVPISIPSRNWA

In [64]:
type(seq)

list

In [86]:
seq_names = read_aa_sequences_names(path_to_seq)

Read in 182 amino acid sequences' names


In [87]:
print(seq_names)

[prot-0659, prot-0275, prot-0364, prot-0397, prot-0286, prot-0063, prot-0022, prot-0047, prot-0589, prot-0305, prot-0245, prot-0354, prot-0360, prot-0271, prot-0261, prot-0370, prot-0077, prot-0166, prot-0026, prot-0779, prot-0244, prot-0355, prot-0177, prot-0361, prot-0392, prot-0102, prot-0345, prot-0314, prot-0609, prot-0598, prot-0056, prot-0300, prot-0091, prot-0240, prot-0173, prot-0190, prot-0386, prot-0264, prot-0648, prot-0069, prot-0178, prot-0785, prot-0414, prot-0886, prot-0875, prot-0776, prot-0602, prot-0583, prot-0461, prot-0570, prot-0148, prot-0008, prot-0424, prot-0597, prot-0606, prot-0486, prot-0410, prot-0772, prot-0540, prot-0791, prot-0915, prot-0616, prot-0587, prot-0465, prot-0018, prot-0298, prot-0534, prot-0596, prot-0716, prot-0821, prot-0860, prot-0510, prot-0805, prot-0586, prot-0524, prot-0108, prot-0561, prot-0470, prot-0712, prot-0900, prot-0592, prot-0613, prot-0582, prot-0460, prot-0520, prot-0431, prot-0571, prot-0736, prot-0454, prot-0228, prot-0484

In [None]:
#now have both a list of sequences names and a list of strings with sequences
#however they are not in the order that would make sense, yet this will be remedied if you can figure out a way to "index" based on the name of the protein
#possibly use the same approach as for the previous homework where you indexed based on the dataframe column