In [1]:
import numpy as np
import sys, time, pandas, itertools

Let's define our amino acids and a function that combines them into peptide labels.

In [2]:
# Maps a peptide length (1-12) to the prefix used when naming the output file.
# Length 1 is the special label "AminoAcids"; the rest are numeric prefixes.
Num2Word = dict(enumerate(
    (
        "AminoAcids",
        "Di",
        "Tri",
        "Tetra",
        "Penta",
        "Hexa",
        "Hepta",
        "Octa",
        "Nona",
        "Deca",
        "Undeca",
        "Dodeca",
    ),
    start=1,
))

# One-letter codes of the 20 amino acids; this order defines the residue indices 0-19.
letters_1 = np.array(list("ACDEFGHIKLMNPQRSTVWY"))

def GenerateDatasetIndex(AminoAcids):
    """Return every peptide label of length ``AminoAcids`` as a list of strings.

    Labels are built from ``letters_1`` in ``itertools.product`` order, i.e. the
    last position varies fastest ("AA", "AC", "AD", ...). The list has
    ``20 ** AminoAcids`` entries.
    """
    residue_tuples = itertools.product(letters_1, repeat=AminoAcids)
    return list(map("".join, residue_tuples))

Define the peptide length `L` (di = 2, tri = 3, etc.) and generate the list of peptides we will be using.

In [3]:
# Peptide length: 2 = dipeptides, 3 = tripeptides, and so on.
# NOTE(review): the outputs saved in this notebook (160000 peptides, four-letter
# labels) look like they were produced with L = 4 - re-run all cells to refresh them.
L = 2
peptides = GenerateDatasetIndex(L)

# Print the labels themselves only for short lists; otherwise just report the count.
n_peptides = len(peptides)
if n_peptides >= 500:
    print("Peptides:", n_peptides)
else:
    print(peptides)

Peptides: 160000


Now let's define the amino-acid-specific parameters.

In [4]:
# Per-residue property tables. Each array has 20 entries, one per amino acid, and is
# indexed with the residue indices (0-19) that follow the alphabet order
# A C D E F G H I K L M N P Q R S T V W Y.

# Descriptor names.
# NOTE(review): "pI" is listed here, but the pI array below is commented out.
features = ["SP2", "NH2", "MW", "S", "LogP WW", "Z", "MaxASA", "RotRatio", "Bulkiness", "OH", "pI"]
# SP2 / SP3: presumably per-residue counts of sp2 / sp3 side-chain atoms - TODO confirm.
# Their ratio (SP2 / SP3) is what the notebook calls "RotRatio".
SP2 =       np.array([0,    0,   1,   1,   6,   0,   3,   0,   0,   0,   0,   1,   0,   1,   1,   0,   0,   0,   8,   6], dtype=np.float32)
SP3 =       np.array([1,    1,   1,   2,   1,   0,   1,   4,   4,   4,   3,   1,   3,   2,   3,   1,   2,   3,   1,   1], dtype=np.float32)
# NH2: non-zero for K, N, Q (1) and R (2); presumably the number of NH2 groups - TODO confirm.
NH2 =       np.array([0,    0,   0,   0,   0,   0,   0,   0,   1,   0,   0,   1,   0,   1,   2,   0,   0,   0,   0,   0], dtype=np.float32)
# MW: presumably the molecular weight of each amino acid - TODO confirm units.
MW =        np.array([89.10, 121.16, 133.11, 147.13, 165.19, 75.07, 155.16, 131.18, 146.19, 131.18, 149.21, 132.12, 115.13, 146.15, 174.20, 105.09, 119.12, 117.15, 204.23, 181.19], dtype=np.float32)
# S: 1 for C and M, 0 otherwise; presumably the sulfur count - TODO confirm.
S =         np.array([0,    1,   0,   0,   0,   0,   0,   0,   0,   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0], dtype=np.float32)
# charge: summed per peptide to give the "Z" descriptor.
charge =    np.array([0,    0,  -1,  -1,  0,    0,   0,   0,   1,   0,   0,   0,   0,   0,   1,   0,   0,   0,   0,   0], dtype=np.float32)
# ASP and GLU are entered as -1, LYS and ARG as +1 (charged side chains); all others are 0
# Gwif / Gwoct: presumably water->interface and water->octanol transfer free energies
# (Wimley-White scales) - TODO confirm source. "LogP WW" is computed as Gwif - Gwoct.
Gwif = np.array([0.17, -0.24, 1.23, 2.02, -1.13, 0.01, 0.17, -0.31, 0.99, -0.56, -0.23, 0.42, 0.45, 0.58, 0.81, 0.13, 0.14, 0.07, -1.85, -0.94, ], dtype=np.float32) 
Gwoct = np.array([0.5, -0.02, 3.64, 3.63, -1.71, 1.15, 0.11, -1.12, 2.8, -1.25, -0.67, 0.85, 0.14, 0.77, 1.81, 0.46, 0.25, -0.46, -2.09, -0.71, ], dtype=np.float32) 
# MaxASA values: Tien et al. 2013 (theory)
MaxASA =    np.array([129, 167, 193, 223, 240, 104, 224, 197, 236, 201, 224, 195, 159, 225, 274, 155, 172, 174, 285, 263], dtype=np.float32)
# Bulkiness values: Zimmerman J.M., Eliezer N., Simha R. J. Theor. Biol. 21:170-201(1968).
bulky =     np.array([11.50, 13.46, 11.68, 13.57, 19.80, 3.4, 13.69, 21.40, 15.71, 21.4, 16.25, 12.82, 17.43, 14.45, 14.28, 9.47, 15.77, 21.57, 21.67, 18.03], dtype=np.float32)
# OH: 1 for S, T and Y, 0 otherwise; presumably the hydroxyl-group count - TODO confirm.
OH =        np.array([0,  0,   0,   0,    0,     0,    0,   0,   0,  0,   0,  0,  0,    0,   0,  1,  1,    0,   0,  1], dtype=np.float32)
# pI is currently disabled (kept for reference):
#pI =        np.array([6.11,5.15,2.98,3.08,5.76,6.06, 7.64, 6.04, 9.47, 6.04, 5.71, 5.43, 6.30, 5.65, 11.5,5.07, 5.60,6.02,5.88,5.63], dtype=np.float32)


Define the max and min values possible within the peptide dataset so that the entire dataset may be normalized.

In [5]:
# Bounds of every descriptor for a peptide of length L; they are used to rescale the
# descriptor columns. Descriptors that are rescaled without an offset only store
# half of their maximum; the others store an explicit minimum and maximum.

# Homopeptide index lists used as extreme cases:
# W (index 18) for the SP2/SP3 ratio, D (index 2) and I (index 7) for Gwif - Gwoct.
polytryptophan_index = [18] * L
polyasparticacid_index = [2] * L
polyisoleucine_index = [7] * L

# Offset-free descriptors: half of the largest possible per-peptide sum.
SP2_max = np.float32(SP2.max() * L / 2.0)
NH2_max = np.float32(NH2.max() * L / 2.0)
S_max = np.float32(S.max() * L / 2.0)
OH_max = np.float32(OH.max() * L / 2.0)

# Half of the SP2/SP3 ratio of poly-tryptophan.
trp_ratio = SP2[polytryptophan_index].sum() / SP3[polytryptophan_index].sum()
RotRatio_max = np.float32(trp_ratio / 2.0)

# Descriptors with an explicit range: smallest / largest per-residue value times L.
MW_min = np.float32(MW.min() * L)
MW_max = np.float32(MW.max() * L)
Z_min = np.float32(charge.min() * L)
Z_max = np.float32(charge.max() * L)
MaxASA_min = np.float32(MaxASA.min() * L)
MaxASA_max = np.float32(MaxASA.max() * L)
bulky_min = np.float32(bulky.min() * L)
bulky_max = np.float32(bulky.max() * L)

# Gwif - Gwoct summed over poly-aspartic acid (minimum) and poly-isoleucine (maximum).
LogP_WW_min = np.sum(Gwif[polyasparticacid_index] - Gwoct[polyasparticacid_index])
LogP_WW_max = np.sum(Gwif[polyisoleucine_index] - Gwoct[polyisoleucine_index])

# pI bounds are disabled together with the pI array:
#pI_min = (min(pI)).astype(np.float32)
#pI_max = (max(pI)).astype(np.float32)

We need a function that finds the index of a particular peptide. Applied to each letter in turn it gives the per-residue indices, e.g. AA = (0, 0), YY = (19, 19).

In [6]:
def pep2index(peptide):
    """Return the position of ``peptide`` in the product-ordered peptide list.

    The peptide is read as a base-20 number whose digits are the positions of
    its residues in "ACDEFGHIKLMNPQRSTVWY", first residue most significant:
    "AA" -> 0, "AC" -> 1, "YY" -> 399. A single letter yields its own position
    (0-19) and the empty string yields 0.

    Args:
        peptide: string (or other sequence) of one-letter amino-acid codes.

    Returns:
        int: the zero-based index of the peptide.

    Raises:
        ValueError: if a residue is not one of the 20 one-letter codes.
    """
    alphabet = "ACDEFGHIKLMNPQRSTVWY"
    digit_of = {letter: position for position, letter in enumerate(alphabet)}
    base = len(alphabet)

    solution = 0
    for residue in peptide:
        if residue not in digit_of:
            raise ValueError(
                "Unknown amino acid %r in peptide %r" % (residue, peptide)
            )
        # Horner's scheme on Python ints stays exact for any peptide length,
        # unlike true division, which goes through floating point.
        solution = solution * base + digit_of[residue]
    return solution

# Residue-index table: row k holds the alphabet positions (0-19) of the letters
# of peptides[k], one column per position in the peptide.
peptide_numbers = np.zeros((len(peptides), L), dtype=np.uint8)
for row, pep in enumerate(peptides):
    peptide_numbers[row, :] = [pep2index(letter) for letter in pep]

print("Example:", peptides[45], peptide_numbers[45])

Example: AADG [0 0 2 5]


Set up a pandas DataFrame and write the values for each peptide to its columns.

In [7]:
# Build the descriptor table, one column per descriptor and one row per peptide.
# Each descriptor is the sum of the per-residue values over the peptide, rescaled
# with the bounds defined earlier:
#   offset-free descriptors:  total / (max / 2) - 1
#   ranged descriptors:       (total - min) / ((max - min) / 2) - 1
pd_table = pandas.DataFrame()
one = np.float32(1.0)  # float32 constant subtracted from every rescaled column

print("Judred_NH2")
nh2_total = NH2[peptide_numbers].sum(axis=1)
pd_table["Judred_NH2"] = (nh2_total / NH2_max) - one

print("Judred_MW")
mw_half_range = ((MW_max - MW_min) / 2).astype(np.float32)
mw_total = MW[peptide_numbers].sum(axis=1)
pd_table["Judred_MW"] = (mw_total - MW_min) / mw_half_range - one

print("Judred_S")
s_total = S[peptide_numbers].sum(axis=1)
pd_table["Judred_S"] = (s_total / S_max) - one

print("Judred_LogP WW")
logp_half_range = ((LogP_WW_max - LogP_WW_min) / 2.0).astype(np.float32)
logp_total = (Gwif[peptide_numbers] - Gwoct[peptide_numbers]).sum(axis=1)
pd_table["Judred_LogP WW"] = (logp_total - LogP_WW_min) / logp_half_range - one

print("Judred_Z")
z_half_range = ((Z_max - Z_min) / 2.0).astype(np.float32)
z_total = charge[peptide_numbers].sum(axis=1)
pd_table["Judred_Z"] = (z_total - Z_min) / z_half_range - one

print("Judred_MaxASA")
asa_half_range = ((MaxASA_max - MaxASA_min) / 2.0).astype(np.float32)
asa_total = MaxASA[peptide_numbers].sum(axis=1)
pd_table["Judred_MaxASA"] = (asa_total - MaxASA_min) / asa_half_range - one

print("Judred_SP2 & Judred_RotRatio")
sp2_total = SP2[peptide_numbers].sum(axis=1)
sp3_total = SP3[peptide_numbers].sum(axis=1)
pd_table["Judred_SP2"] = (sp2_total / SP2_max) - one
# A peptide whose SP3 total is zero gives 0/0 here; the resulting NaN is turned
# into 0 by nan_to_num, and the division itself is kept silent.
with np.errstate(all="ignore"):
    rot_ratio = sp2_total / sp3_total
rot_ratio = np.nan_to_num(rot_ratio, copy=True)
pd_table["Judred_RotRatio"] = (rot_ratio / RotRatio_max) - one

print("Judred_Bulkiness")
bulky_half_range = ((bulky_max - bulky_min) / 2.0).astype(np.float32)
bulky_total = bulky[peptide_numbers].sum(axis=1)
pd_table["Judred_Bulkiness"] = (bulky_total - bulky_min) / bulky_half_range - one

print("Judred_OH")
oh_total = OH[peptide_numbers].sum(axis=1)
pd_table["Judred_OH"] = (oh_total / OH_max) - one

# Label the rows with the peptide strings
pd_table.index = peptides

Judred_NH2
Judred_MW
Judred_S
Judred_LogP WW
Judred_Z
Judred_MaxASA
Judred_SP2 & Judred_RotRatio
Judred_Bulkiness
Judred_OH


Save to file

In [8]:
# Show the table, then write it to "<prefix>peptides_Judred.csv", where the prefix
# comes from Num2Word for the current peptide length L.
print(pd_table)
output_csv = "{}peptides_Judred.csv".format(Num2Word[L])
pd_table.to_csv(output_csv)

      Judred_NH2  Judred_MW  Judred_S  Judred_LogP WW  Judred_Z  \
AAAA        -1.0  -0.782750      -1.0        0.291926      0.00   
AAAC        -1.0  -0.658641      -0.5        0.309006      0.00   
AAAD        -1.0  -0.612380      -1.0       -0.031056     -0.25   
AAAE        -1.0  -0.558106      -1.0        0.093168     -0.25   
AAAF        -1.0  -0.488193      -1.0        0.433230      0.00   
...          ...        ...       ...             ...       ...   
YYYS        -1.0   0.348637      -1.0        0.338509      0.00   
YYYT        -1.0   0.402950      -1.0        0.372671      0.00   
YYYV        -1.0   0.395324      -1.0        0.472050      0.00   
YYYW        -1.0   0.732425      -1.0        0.427019      0.00   
YYYY        -1.0   0.643233      -1.0        0.354037      0.00   

      Judred_MaxASA  Judred_SP2  Judred_RotRatio  Judred_Bulkiness  Judred_OH  
AAAA      -0.723757     -1.0000          -1.0000         -0.113301       -1.0  
AAAC      -0.618785     -1.0000    