In [6]:
#################################################################
## Extract best features for peptide detectability prediction
#################################################################

# Gather information from Biopython, Disprot and SeqComplex (Perl)
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO
from Bio import SeqRecord, Seq
from Bio.Alphabet import generic_protein
import re
import pandas as pd
import numpy as np
from sklearn import preprocessing
from tqdm import tqdm
import os.path

In [47]:
# Read the tab-separated peptide table and keep only the underlying NumPy array.
peptides = pd.read_csv(filepath_or_buffer="data/peptides.tsv", delimiter="\t").values
# Display the array dimensions as the cell output.
peptides.shape

(17736, 1)

In [111]:
# Build one ProteinAnalysis object per peptide and collect a feature row for each.
# `peptides` is a 2-D array with a single column, so the sequence string of
# row i is peptides[i][0].
X = [ProteinAnalysis(e[0]) for e in peptides]
result = [None] * len(X)
# Wrap the list itself (not the enumerate object) so tqdm knows the total
# and can render a real progress bar.
for i, e in enumerate(tqdm(X)):
    # Amino-acid composition values, in the key order of the returned dict.
    aa_percent = list(e.get_amino_acids_percent().values())
    mol_weight = e.molecular_weight()
    # Length of the sequence string. len(peptides[i]) would measure the
    # one-element row instead and always yield 1.
    seq_len = len(peptides[i][0])
    # Order matches the column names: MW, LEN, ML (weight per residue), AROM, INST, IP.
    rest = [mol_weight, seq_len, mol_weight / seq_len,
            e.aromaticity(), e.instability_index(), e.isoelectric_point()]
    result[i] = aa_percent + rest
    

17736it [00:02, 7250.62it/s]


In [123]:
# Column labels: the amino-acid keys reported by the first analysis object,
# followed by the six scalar descriptors appended to every feature row.
colnames = [*X[0].get_amino_acids_percent(), "MW", "LEN", "ML", "AROM", "INST", "IP"]
print(colnames)
# Stack the per-peptide feature rows into one 2-D array and label the columns.
feature_matrix = np.vstack(result)
result = pd.DataFrame(feature_matrix, columns=colnames)

['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'MW', 'LEN', 'ML', 'AROM', 'INST', 'IP']


In [124]:
# Persist the feature table as a tab-separated file (the row index is written too).
result.to_csv(path_or_buf="data/protein_analysis.tsv", sep="\t")