In [1]:
from __future__ import print_function, division
import tempfile
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from tensorflow import keras
from keras.preprocessing.sequence import pad_sequences

In [2]:
# Read the processed peptide sequences and expose them as a plain dict.
# NOTE(review): DATA_DIR is an absolute local Windows path; change it here to
# run the notebook on another machine.
DATA_DIR = r'D:\data'
# Build the CSV path from DATA_DIR so the directory is defined in one place.
df = pd.read_csv(DATA_DIR + r'\processed_sequences.csv')
# Display options: show every DataFrame column and never truncate printed arrays.
pd.set_option('display.max_columns',None)
np.set_printoptions(threshold=np.inf)
# DataFrame.to_dict() returns {column: {row_label: value}}; train1 is therefore
# the {row_label: sequence} mapping of the 'Sequence' column.
train = df.to_dict()
train1 = train['Sequence']

In [3]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences

# Integer code (1-20) for each one-letter amino-acid symbol listed below.
# Code 0 is not assigned here; OE1 uses it for any symbol missing from the table.
char_to_num = {
    'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5,
    'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10,
    'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15,
    'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20,
}

def OE1(seq_temp1):
    """Encode a sequence as a list of single-element ``[code]`` lists.

    Args:
        seq_temp1: iterable of one-letter symbols (typically a string).

    Returns:
        A list with one ``[code]`` entry per symbol, where ``code`` comes
        from ``char_to_num`` and is 0 for symbols not in that table.
    """
    encoded = []
    for residue in seq_temp1:
        code = char_to_num.get(residue, 0)
        encoded.append([code])
    return encoded

# Integer-encode every sequence held in the train1 mapping, in mapping order.
train1_oe1 = [OE1(sequence) for sequence in train1.values()]

In [4]:
#Amino acid composition (AAC)
# One row per encoded sequence and one column per amino-acid code 1..20
# (column = code - 1). Every residue adds 1/len(seq) to its column, so a row
# holds the fraction of the sequence made up by each amino acid.
handcraft_AAC_test = [[0] * 20 for _ in range(len(train1_oe1))]
for row, seq in enumerate(train1_oe1):
    if not seq:
        # Empty sequence: leave the row at zero (and avoid dividing by zero).
        continue
    weight = 1/len(seq)
    for code in seq:
        col = code[0]-1
        # Code 0 marks a residue that is not in char_to_num. Without this
        # guard col would be -1 and, through negative indexing, the residue
        # would be counted in the last column.
        # NOTE(review): rows made only of the 20 listed residues are unchanged;
        # unknown residues still count towards len(seq).
        if col < 0:
            continue
        handcraft_AAC_test[row][col] += weight
hc_AAC_test = np.array(handcraft_AAC_test)
print(hc_AAC_test.shape)
print(hc_AAC_test)

(85, 20)
[[0.2        0.         0.         0.         0.         0.2
  0.         0.         0.         0.         0.         0.
  0.2        0.         0.         0.         0.         0.2
  0.         0.2       ]
 [0.2        0.         0.         0.         0.         0.2
  0.         0.         0.         0.         0.         0.
  0.2        0.         0.         0.         0.         0.2
  0.         0.2       ]
 [0.2        0.         0.         0.         0.         0.2
  0.         0.         0.         0.         0.         0.
  0.2        0.         0.         0.         0.         0.2
  0.         0.2       ]
 [0.2        0.         0.         0.         0.         0.2
  0.         0.         0.         0.         0.         0.
  0.2        0.         0.         0.         0.         0.2
  0.         0.2       ]
 [0.2        0.         0.         0.         0.         0.2
  0.         0.         0.         0.         0.         0.
  0.2        0.         0.         0.     

In [5]:
#Dipeptide composition (DPC)
from collections import Counter
import numpy as np

def compute_dpc_pairs(sequence, k):
    """Return every two-residue pair in ``sequence`` with ``k`` residues between them.

    Args:
        sequence: string of one-letter residue symbols.
        k: number of residues skipped between the two members of a pair
            (0 gives adjacent residues).

    Returns:
        List of two-character strings, one per start position; empty when the
        sequence has fewer than ``k + 2`` residues.
    """
    return [sequence[i] + sequence[i + k + 1] for i in range(len(sequence) - k - 1)]

def calculate_amino_acid_pairs_frequency(sequence, max_k):
    """Compute gapped dipeptide frequencies for gaps ``0..max_k``.

    Args:
        sequence: string of one-letter residue symbols.
        max_k: largest gap to include.

    Returns:
        A flat list of ``(max_k + 1) * 400`` floats. For each gap the 400
        values follow the order ``a + b`` for ``a`` and ``b`` in
        'ACDEFGHIKLMNPQRSTVWY', and each value is the pair count divided by
        the number of pairs found for that gap. Pairs containing a symbol
        outside that alphabet count towards the total but have no column.
        A gap for which the sequence yields no pairs contributes 400 zeros.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    feature_vector = []

    for k in range(max_k + 1):
        dpc_pairs = compute_dpc_pairs(sequence, k)
        total_pairs = len(dpc_pairs)

        if total_pairs == 0:
            # Sequence too short for this gap: emit zeros instead of dividing
            # by zero.
            feature_vector.extend([0.0] * (len(amino_acids) ** 2))
            continue

        pair_counter = Counter(dpc_pairs)
        feature_vector.extend(
            pair_counter[a + b] / total_pairs for a in amino_acids for b in amino_acids
        )

    return feature_vector

# Gap 0 only, i.e. pairs of adjacent residues.
max_k = 0
# One frequency vector per sequence in train1, stacked into a 2-D array.
dpc_group_pairs = list(
    calculate_amino_acid_pairs_frequency(peptide, max_k)
    for peptide in train1.values()
)
DPC = np.array(dpc_group_pairs)

print(DPC.shape)
print("Length of feature vector:", len(dpc_group_pairs[0]))

(85, 400)
Length of feature vector: 400


In [6]:
#The One-Hot descriptor for sequences
import pandas as pd
import numpy as np

# Residue alphabet for the one-hot layout and each symbol's position in it (0-19).
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
amino_index = dict(zip(amino_acids, range(len(amino_acids))))
# Length of the longest entry in the 'Sequence' column.
# NOTE(review): the encoder in this cell caps sequences at 14 positions on its
# own and does not read max_length -- confirm whether it is still needed.
max_length = max(df['Sequence'].apply(len))

def sequence_to_one_hot(seq, max_len=14, aa_index=None):
    """One-hot encode the first ``max_len`` residues of ``seq`` into a flat vector.

    Args:
        seq: iterable of one-letter residue symbols (typically a string).
        max_len: number of leading positions to encode; later residues are
            ignored and shorter sequences leave the remaining slots at zero.
            Defaults to 14, the value previously hard-coded.
        aa_index: mapping of symbol -> slot within a position, expected to
            cover ``0..len(aa_index) - 1``. Defaults to the module-level
            ``amino_index`` (20 symbols).

    Returns:
        1-D float array of length ``max_len * len(aa_index)``; position ``i``
        occupies slots ``i * len(aa_index)`` onwards. Symbols missing from
        ``aa_index`` leave their position all zero.
    """
    if aa_index is None:
        aa_index = amino_index
    n_symbols = len(aa_index)
    one_hot = np.zeros(max_len * n_symbols)

    for i, aa in enumerate(seq):
        if i >= max_len:
            break
        if aa in aa_index:
            one_hot[aa_index[aa] + i * n_symbols] = 1

    return one_hot


# Encode every entry of the 'Sequence' column and stack the vectors row-wise.
sequences = df['Sequence']

one_hot_rows = list(map(sequence_to_one_hot, sequences))
one_hot_encoded = np.array(one_hot_rows)

print(one_hot_encoded.shape)

(85, 280)


In [7]:
#Hand-crafted features
def summarized_featuresPC(peptide_sequences):
    """Flag sequences whose character at string index 0 is 'P' or 'C'.

    Args:
        peptide_sequences: mapping (e.g. dict) of key -> sequence string;
            it is iterated by key and each sequence is looked up by that key.

    Returns:
        One ``[is_P, is_C]`` list per sequence: ``[1, 0]`` when index 0 is
        'P', ``[0, 1]`` when it is 'C', otherwise ``[0, 0]`` (also for an
        empty sequence).
    """
    # NOTE(review): earlier comments called index 0 the "last" amino acid --
    # confirm the orientation in which the sequences are stored.
    flags = []
    for key in peptide_sequences:
        # Slice instead of [0] so an empty sequence gives '' rather than
        # raising IndexError.
        head = peptide_sequences[key][:1]
        if head == 'P':
            flags.append([1, 0])
        elif head == 'C':
            flags.append([0, 1])
        else:
            flags.append([0, 0])
    return flags
def summarized_featuresF(peptide_sequences):
    """Build four flags for sequences whose character at string index 0 is 'F'.

    Args:
        peptide_sequences: mapping (e.g. dict) of key -> sequence string;
            it is iterated by key and each sequence is looked up by that key.

    Returns:
        One four-element list per sequence. When index 0 is 'F':
        ``[1, 0, 1, 1]`` if the string ends with 'CRG', ``[0, 1, 1, 1]`` if it
        ends with 'YRG', ``[0, 0, 1, 1]`` if it otherwise ends with 'G', else
        ``[0, 0, 0, 1]``. Any other sequence (including an empty one) gives
        ``[0, 0, 0, 0]``.
    """
    # NOTE(review): the original variable names called index 0 the "last" and
    # index -1 the "first" residue -- confirm the stored sequence orientation.
    flags = []
    for key in peptide_sequences:
        seq = peptide_sequences[key]
        # Slices never raise, so an empty sequence falls through to all zeros.
        head = seq[:1]
        tail = seq[-1:]
        tail3 = seq[-3:]
        if head != 'F':
            flags.append([0, 0, 0, 0])
        elif tail3 == 'CRG':
            flags.append([1, 0, 1, 1])
        elif tail3 == 'YRG':
            flags.append([0, 1, 1, 1])
        elif tail == 'G':
            flags.append([0, 0, 1, 1])
        else:
            flags.append([0, 0, 0, 1])
    return flags
def summarized_featuresM(peptide_sequences):
    """Build three flags for sequences whose character at string index 0 is 'M'.

    Args:
        peptide_sequences: mapping (e.g. dict) of key -> sequence string;
            it is iterated by key and each sequence is looked up by that key.

    Returns:
        One three-element list per sequence. When index 0 is 'M':
        ``[1, 1, 1]`` if the string ends with 'PNSFEG', ``[1, 0, 1]`` if it
        otherwise ends with 'G', else ``[0, 0, 1]``. Any other sequence
        (including an empty one) gives ``[0, 0, 0]``.
    """
    # NOTE(review): the original variable names called index 0 the "last" and
    # index -1 the "first" residue -- confirm the stored sequence orientation.
    flags = []
    for key in peptide_sequences:
        seq = peptide_sequences[key]
        # Slices never raise, so an empty sequence falls through to all zeros.
        head = seq[:1]
        tail = seq[-1:]
        tail6 = seq[-6:]
        if head != 'M':
            flags.append([0, 0, 0])
        elif tail6 == 'PNSFEG':
            flags.append([1, 1, 1])
        elif tail == 'G':
            flags.append([1, 0, 1])
        else:
            flags.append([0, 0, 1])
    return flags
def summarized_featuresT(peptide_sequences):
    """Build six flags for sequences whose character at string index 0 is 'T'.

    Args:
        peptide_sequences: mapping (e.g. dict) of key -> sequence string;
            it is iterated by key and each sequence is looked up by that key.

    Returns:
        One six-element list per sequence, chosen by the first matching case:

        * starts with 'TD' and ends with 'GG' -> ``[1, 1, 0, 0, 1, 1]``
        * starts with 'TD' and ends with 'G'  -> ``[1, 0, 0, 0, 1, 1]``
        * starts with 'TD' otherwise          -> ``[0, 0, 0, 0, 1, 1]``
        * starts with 'T' and ends with 'DG'  -> ``[0, 0, 1, 0, 0, 1]``
        * starts with 'T' and ends with 'FG'  -> ``[0, 0, 0, 1, 0, 1]``
        * starts with 'T' and ends with 'G'   -> ``[1, 0, 0, 0, 0, 1]``
        * starts with 'T' otherwise           -> ``[0, 0, 0, 0, 0, 1]``
        * anything else, including empty      -> ``[0, 0, 0, 0, 0, 0]``
    """
    # NOTE(review): the original variable names called index 0 the "last" and
    # index -1 the "first" residue -- confirm the stored sequence orientation.
    flags = []
    for key in peptide_sequences:
        seq = peptide_sequences[key]
        # Slices never raise, so an empty sequence falls through to all zeros.
        head = seq[:1]
        head2 = seq[0:2]
        tail = seq[-1:]
        tail2 = seq[-2:]
        if head != 'T':
            flags.append([0, 0, 0, 0, 0, 0])
        elif head2 == 'TD':
            if tail2 == 'GG':
                flags.append([1, 1, 0, 0, 1, 1])
            elif tail == 'G':
                flags.append([1, 0, 0, 0, 1, 1])
            else:
                flags.append([0, 0, 0, 0, 1, 1])
        elif tail2 == 'DG':
            flags.append([0, 0, 1, 0, 0, 1])
        elif tail2 == 'FG':
            flags.append([0, 0, 0, 1, 0, 1])
        elif tail == 'G':
            flags.append([1, 0, 0, 0, 0, 1])
        else:
            flags.append([0, 0, 0, 0, 0, 1])
    return flags
def summarized_featuresG(peptide_sequences):
    """Flag sequences whose character at string index 0 is 'G'.

    Args:
        peptide_sequences: mapping (e.g. dict) of key -> sequence string;
            it is iterated by key and each sequence is looked up by that key.

    Returns:
        A flat list with one int per sequence: 1 when index 0 is 'G',
        otherwise 0 (also for an empty sequence).
    """
    # NOTE(review): the original variable name called index 0 the "last"
    # amino acid -- confirm the stored sequence orientation.
    flags = []
    for key in peptide_sequences:
        # Slice instead of [0] so an empty sequence gives '' rather than
        # raising IndexError.
        head = peptide_sequences[key][:1]
        flags.append(1 if head == 'G' else 0)
    return flags

# Compute every hand-crafted flag group from the train1 mapping.
featuresPC = summarized_featuresPC(train1)
featuresF = summarized_featuresF(train1)
featuresM = summarized_featuresM(train1)
featuresT = summarized_featuresT(train1)
featuresG = summarized_featuresG(train1)

# Show each group as generated.
print("Generated featuresPC:", featuresPC)
print("Generated featuresF:", featuresF)
print("Generated featuresM:", featuresM)
print("Generated featuresT:", featuresT)
print("Generated featuresG:", featuresG)

# Place the groups side by side, one row per sequence; the flat featuresG
# list becomes a single column.
feature_groups = (featuresPC, featuresF, featuresM, featuresT, featuresG)
Generated_features = np.column_stack(feature_groups)
print(Generated_features.shape)

Generated featuresPC: [[0, 0], [1, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0], [1, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0], [0, 0], [1, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0], [0, 0], [0, 0], [0, 1], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0], [0, 0], [0, 0], [0, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0], [0, 0], [0, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]
Generated featuresF: [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 1], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0,

In [9]:
#Concatenation
from sklearn.preprocessing import MinMaxScaler

import joblib
# Join all feature groups column-wise, in this order: hand-crafted flags,
# AAC, DPC, one-hot. The recorded output for this cell shows shape (85, 716).
X_train = np.c_[Generated_features, hc_AAC_test, DPC, one_hot_encoded]
print(X_train.shape)
# SECURITY: joblib.load unpickles the file and can execute arbitrary code --
# only load 'scaler307.pkl' from a trusted source.
# NOTE(review): presumably the scaler fitted when the model was trained; it
# must expect the same column layout as X_train -- TODO confirm.
scaler = joblib.load('scaler307.pkl')
X_scaled = scaler.transform(X_train)
# Positional indices of the scaled columns to drop.
# NOTE(review): how this list was derived is not shown; the name
# df2_non_zero suggests columns that were all zero -- TODO confirm.
columns_to_remove = [12, 39, 42, 45, 47, 48, 50, 51, 52, 54, 57, 58, 59, 60, 62, 63, 67, 68, 69, 71, 72, 73, 79, 83, 84, 86, 87, 89, 90, 91, 93, 94, 97, 99, 103, 104, 105, 107, 109, 113, 114, 115, 118, 122, 126, 127, 131, 133, 135, 137, 139, 147, 151, 153, 154, 156, 159, 162, 163, 165, 166, 167, 168, 169, 171, 174, 176, 179, 182, 183, 184, 186, 187, 190, 191, 193, 194, 195, 202, 203, 205, 206, 207, 208, 209, 211, 215, 216, 217, 219, 220, 222, 223, 224, 225, 226, 227, 229, 231, 232, 233, 235, 237, 238, 239, 240, 242, 246, 249, 252, 253, 254, 256, 257, 258, 259, 262, 265, 267, 268, 269, 270, 272, 273, 274, 276, 277, 279, 284, 285, 289, 290, 294, 295, 297, 298, 299, 300, 302, 304, 305, 307, 309, 310, 311, 312, 316, 319, 320, 323, 325, 326, 327, 329, 330, 332, 334, 335, 336, 337, 338, 342, 343, 344, 345, 349, 350, 351, 354, 359, 366, 367, 369, 371, 372, 373, 374, 376, 377, 382, 384, 385, 387, 388, 390, 391, 392, 393, 395, 397, 398, 399, 400, 402, 404, 405, 406, 409, 411, 412, 413, 414, 418, 419, 423, 426, 428, 431, 434, 519, 542, 549, 553, 573, 579, 582, 590, 591, 593, 594, 597, 602, 603, 606, 609, 613, 614, 616, 619, 622, 623, 624, 625, 626, 628, 629, 633, 634, 635, 642, 643, 644, 646, 647, 648, 649, 650, 653, 654, 655, 656, 657, 662, 663, 664, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 680, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715]
# Wrap the scaled matrix in a DataFrame (default integer column labels) so
# the listed columns can be dropped by position.
df2 = pd.DataFrame(X_scaled)
df2_non_zero = df2.drop(df2.columns[columns_to_remove], axis=1)

# Back to a NumPy array, then keep a fixed subset of the remaining columns in
# the listed order. Indices here are positions in X_filtered (after the drop).
# NOTE(review): presumably the features selected for model_307 -- TODO confirm.
X_filtered=np.array(df2_non_zero)
X_new = X_filtered[:, [20, 93, 28, 26, 0, 325, 21, 161, 298, 36, 367, 1, 149, 318, 82, 100, 62, 6, 12, 35, 208, 172, 44, 71, 101, 188, 225, 321, 374, 405, 104, 123, 198, 113, 68, 222, 116, 59, 119, 143, 7, 168, 175, 55, 194, 42, 3, 380, 138, 351, 65, 344, 8, 52, 58, 197, 18, 56]]

import pandas as pd

(85, 716)


In [10]:
from joblib import dump, load

# Loading model
# SECURITY: joblib's load unpickles the file and can execute arbitrary code --
# only load 'model_307.pkl' from a trusted source.
model_307 = load('model_307.pkl')
# Despite the "y_test" name, this holds the model's predictions for X_new
# (one value per row), not ground-truth labels.
y_test_307 = model_307.predict(X_new)
print(y_test_307)

[0. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 1. 1.
 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 1.
 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1.]
