In [5]:
import itertools
import sys
import re

# based on post here
# https://drj11.wordpress.com/2010/02/22/python-getting-fasta-with-itertools-groupby/

# define what a header looks like in FASTA format
def isheader(line):
    return line[0] == '>'


# this function reads in fasta file and returns pairs of data
# where the first item is the ID and the second is the sequence
# it isn't that efficient as it reads it all into memory
# but this is good enough for our project
def aspairs(filename):
    seq_id = ''
    sequence = ''
    with open(filename,"r") as f:
        for header,group in itertools.groupby(f, isheader):
            if header:
                line = next(group)
                seq_id = line[1:].split()[0]
            else:
                sequence = ''.join(line.strip() for line in group)
                yield seq_id, sequence

In [2]:
inputfile="E_coli_K12.pep"
numprots = 0
with open(inputfile,"r") as fh:
    for line in fh:
        if line.startswith('>'):
            numprots += 1
    print(f'There are {numprots} proteins')

There are 4213 proteins


In [12]:
seqdata = aspairs(inputfile)
numprots=0
for s in seqdata:
    numprots += 1
print(f'There are {numprots} proteins')

seqdata = aspairs(inputfile)
AA_counts = {}
for p in seqdata:
    numprots +=1
    pepid = p[0]
    pepseq = p[1]
    # loop through all the letters in this sequence
    for aa in pepseq:
        count = 1 + AA_counts.get(aa,0)
        AA_counts[aa] = count
total_len = sum(AA_counts.values())
print(f'There are {numprots} proteins')
print(AA_counts,"\n",total_len)
for aa in sorted(AA_counts):
    fraction = AA_counts[aa] / total_len
    print(f'{aa}\t{100*fraction:6.3f}%')

print("now sorted by most frequent AA")
for aa in sorted(AA_counts,reverse=True,key=lambda x: AA_counts[x]):
    print(f'{aa}\t{100*AA_counts[aa]/total_len:6.3f}%')

There are 4213 proteins
There are 8426 proteins
{'M': 37739, 'K': 58843, 'R': 74021, 'I': 80211, 'S': 77358, 'T': 72057, 'G': 98579, 'N': 52508, 'A': 127253, 'V': 94522, 'L': 142792, 'F': 51976, 'E': 77207, 'D': 68787, 'Q': 59505, 'P': 59237, 'H': 30423, 'C': 15485, 'Y': 37984, 'W': 20474, 'U': 3} 
 1336964
A	 9.518%
C	 1.158%
D	 5.145%
E	 5.775%
F	 3.888%
G	 7.373%
H	 2.276%
I	 5.999%
K	 4.401%
L	10.680%
M	 2.823%
N	 3.927%
P	 4.431%
Q	 4.451%
R	 5.536%
S	 5.786%
T	 5.390%
U	 0.000%
V	 7.070%
W	 1.531%
Y	 2.841%
now sorted by most frequent AA
L	10.680%
A	 9.518%
G	 7.373%
V	 7.070%
I	 5.999%
S	 5.786%
E	 5.775%
R	 5.536%
T	 5.390%
D	 5.145%
Q	 4.451%
P	 4.431%
K	 4.401%
N	 3.927%
F	 3.888%
Y	 2.841%
M	 2.823%
H	 2.276%
W	 1.531%
C	 1.158%
U	 0.000%
