In [54]:
import numpy as np
from Bio import SeqIO
import pandas as pd
from Bio.Seq import Seq
from itertools import chain
from collections import Counter
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [50]:
# Parse the FASTA file and collect one row of metadata plus sequences per record.
location_aliases = {"Viet Nam": "Vietnam"}  # spelling variants folded into a single label
v = []
for record in SeqIO.parse('data-sars-cov-2/sequences.fasta-2.txt', "fasta"):
    # The location is taken from the second '|'-separated field of the header line.
    header_fields = record.description.split('|')
    location = location_aliases.get(header_fields[1], header_fields[1])
    row = {
        'id': record.id,
        'name': record.name,
        'isolation_loc': location,
        'length': len(record.seq),
        'sequence_dna': record.seq,
        # Translated with Seq.translate() using its default settings.
        'sequence_amino': record.seq.translate(),
    }
    v.append(row)
df_sars_cov_2 = pd.DataFrame(v)

In [51]:
def convert_amino_to_prot(ams, min_length=20):
    """Split a translated sequence at ``*`` markers and keep the long fragments.

    Parameters
    ----------
    ams : str or Seq-like
        Amino-acid sequence in which ``*`` separates fragments. Any object
        with a ``split`` method whose pieces support ``len`` is accepted.
    min_length : int, optional
        Minimum number of residues a fragment needs in order to be kept.
        Defaults to 20, the threshold that was previously hard-coded, so
        existing one-argument calls behave exactly as before.

    Returns
    -------
    list
        The fragments between ``*`` markers, in their original order, whose
        length is at least ``min_length``.
    """
    return [fragment for fragment in ams.split("*") if len(fragment) >= min_length]


# Split each record's translated sequence into its list of fragments...
fragments_per_record = df_sars_cov_2["sequence_amino"].apply(convert_amino_to_prot)
# ...then flatten those per-record lists into one flat list of fragments.
prot = list(chain.from_iterable(fragments_per_record.values))


In [53]:
# Tally how many times each fragment appears in `prot`, then display the tally.
protein_count = Counter()
protein_count.update(prot)
protein_count

Counter({Seq('QDTSNSSIFCRLLTVSSVLQPIISTSRFRPGVTER', HasStopCodon(ExtendedIUPACProtein(), '*')): 80,
         Seq('DGEPCPWFQRENTRPTQFACFTGSRRARTWLWRLRGGGLIRGTSTS', HasStopCodon(ExtendedIUPACProtein(), '*')): 82,
         Seq('TALCVHQTFGCSNCTSWSCYG', HasStopCodon(ExtendedIUPACProtein(), '*')): 77,
         Seq('DTWCPCPSCGRNTSGLPQGSSS', HasStopCodon(ExtendedIUPACProtein(), '*')): 80,
         Seq('HLQWGMSKFCISLKFHNQDYSTKG', HasStopCodon(ExtendedIUPACProtein(), '*')): 80,
         Seq('RDRHYFGIFFCFHKCFCGNCERFGL', HasStopCodon(ExtendedIUPACProtein(), '*')): 82,
         Seq('TEINTESSLCICIRGCSCCTINFLPHS', HasStopCodon(ExtendedIUPACProtein(), '*')): 82,
         Seq('NCSKFCACFTEGRYNNTRWNFTVFTETH', HasStopCodon(ExtendedIUPACProtein(), '*')): 82,
         Seq('QSSCNGLHYRWCCSVDFAVAN', HasStopCodon(ExtendedIUPACProtein(), '*')): 82,
         Seq('IQRRNWPTHASKSPKRNYLLRGRNTSHRSVNRGSCLENW', HasStopCodon(ExtendedIUPACProtein(), '*')): 81,
         Seq('YDGNKQYLHTQRRCTNKGYFW', HasStopCodon(ExtendedIUP

In [55]:
# Display the full list of fragments currently held in `prot`.
prot

[Seq('QDTSNSSIFCRLLTVSSVLQPIISTSRFRPGVTER', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('DGEPCPWFQRENTRPTQFACFTGSRRARTWLWRLRGGGLIRGTSTS', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('TALCVHQTFGCSNCTSWSCYG', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('DTWCPCPSCGRNTSGLPQGSSS', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('HLQWGMSKFCISLKFHNQDYSTKG', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('RDRHYFGIFFCFHKCFCGNCERFGL', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('TEINTESSLCICIRGCSCCTINFLPHS', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('NCSKFCACFTEGRYNNTRWNFTVFTETH', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('QSSCNGLHYRWCCSVDFAVAN', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('IQRRNWPTHASKSPKRNYLLRGRNTSHRSVNRGSCLENW', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('YDGNKQYLHTQRRCTNKGYFW', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('PYTFFKSLCRYCSHKCLLSCL', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('NASESFEKSANRQLYNHLPGSGFKWLHC

In [62]:
# Protein analysis
p_list = []
for p in set(prot):
    p_analysis = ProteinAnalysis(str(p))
    if 'X' in str(p):
        continue
        print(str(p), 'Skipped, X, count of:', protein_count[p])
    if 'J' in str(p):
        continue
        print(str(p), 'Skipped, J, count of:', protein_count[p])
    p_list.append({'seq': p, 'count': protein_count[p], 'mw': p_analysis.molecular_weight(), 'aromaticity': p_analysis.aromaticity(), 'amino_acid_count': p_analysis.count_amino_acids(), 'isoelectric_point': p_analysis.isoelectric_point() })

In [63]:
# Show the analysis records as a table: one row per dict in `p_list`.
pd.DataFrame(p_list)

Unnamed: 0,seq,count,mw,aromaticity,amino_acid_count,isoelectric_point
0,"(W, S, S, C, C, R, F, L, L, F, I, V, N, A, Y, ...",4,3286.8249,0.178571,"{'A': 1, 'C': 4, 'D': 1, 'E': 0, 'F': 3, 'G': ...",7.884242
1,"(L, K, K, V, P, L, L, V, V, T, Y, P, K, M, L, ...",4,3291.1715,0.071429,"{'A': 0, 'C': 0, 'D': 0, 'E': 0, 'F': 1, 'G': ...",10.301548
2,"(T, Y, I, A, L, D, L, V, L, R, N, Y, L, C, M, ...",7,3437.2660,0.137931,"{'A': 1, 'C': 2, 'D': 1, 'E': 0, 'F': 0, 'G': ...",5.494705
3,"(N, Q, Q, W, L, L, M, Q, P, M, F, T, L, N, M, ...",4,2720.1467,0.090909,"{'A': 0, 'C': 0, 'D': 0, 'E': 3, 'F': 1, 'G': ...",4.050028
4,"(K, S, H, H, I, F, T, D, A, T, R, S, T, I, E, ...",3,4818.5171,0.069767,"{'A': 4, 'C': 3, 'D': 1, 'E': 2, 'F': 2, 'G': ...",8.657604
...,...,...,...,...,...,...
334,"(T, D, K, Y, Y, N, L, V, F, T, I, K, C, L, P, ...",1,3721.4185,0.233333,"{'A': 0, 'C': 3, 'D': 1, 'E': 0, 'F': 5, 'G': ...",9.353154
335,"(T, M, L, R, C, Y, F, P, K, C, S, E, K, N, N, ...",73,5483.2746,0.239130,"{'A': 0, 'C': 2, 'D': 1, 'E': 2, 'F': 8, 'G': ...",6.421863
336,"(T, V, F, V, L, I, I, C, L, I, S, L, L, Y, C, ...",4,4221.2910,0.083333,"{'A': 0, 'C': 4, 'D': 0, 'E': 2, 'F': 1, 'G': ...",6.412882
337,"(R, R, Q, R, Q, C, L, K, S, V, K, V, P, F, T, ...",4,5836.0500,0.104167,"{'A': 1, 'C': 3, 'D': 0, 'E': 2, 'F': 4, 'G': ...",10.243978
