In [1]:
!kaggle datasets download -d alfrandom/protein-secondary-structure
!unzip protein-secondary-structure.zip
!rm -rf protein-secondary-structure.zip
!mkdir dataset
!mv 2018-06-06-pdb-intersect-pisces.csv dataset/2018-06-06-pdb-intersect-pisces.csv
!mv 2018-06-06-ss.cleaned.csv dataset/2018-06-06-ss.cleaned.csv

Downloading protein-secondary-structure.zip to /Users/bdboy/Desktop/Projects/PSSP/data
100%|██████████████████████████████████████| 38.8M/38.8M [00:06<00:00, 6.22MB/s]
100%|██████████████████████████████████████| 38.8M/38.8M [00:06<00:00, 6.04MB/s]
Archive:  protein-secondary-structure.zip
  inflating: 2018-06-06-pdb-intersect-pisces.csv  
  inflating: 2018-06-06-ss.cleaned.csv  


In [2]:
import numpy as np                                     
import pandas as pd                                    
import copy                                            
from sklearn.model_selection import train_test_split   
from sklearn.svm import SVC                            
from sklearn.metrics import classification_report      
from sklearn import metrics                            
from sklearn.model_selection import GridSearchCV   

In [4]:
df = pd.read_csv('./dataset/2018-06-06-ss.cleaned.csv')
df.head()

Unnamed: 0,pdb_id,chain_code,seq,sst8,sst3,len,has_nonstd_aa
0,1A30,C,EDL,CBC,CEC,3,False
1,1B05,B,KCK,CBC,CEC,3,False
2,1B0H,B,KAK,CBC,CEC,3,False
3,1B1H,B,KFK,CBC,CEC,3,False
4,1B2H,B,KAK,CBC,CEC,3,False


In [17]:
maxlen_seq = 256
input_seqs, target_seqs = df[['seq', 'sst8']][(df.len <= maxlen_seq) & (~df.has_nonstd_aa)].values.T
print(input_seqs[0:5])
print(input_seqs.size)

['EDL' 'KCK' 'KAK' 'KFK' 'KAK']
226733


In [18]:
print(target_seqs[0:5])
print(target_seqs.size)

['CBC' 'CBC' 'CBC' 'CBC' 'CBC']
226733


In [19]:
def split(sequence): 
    return [char for char in sequence]

In [27]:
primary_split = []
secondary_split = []
for row in range(int(len(target_seqs)/40)):
    primary_split.append(split(input_seqs[row]))
    secondary_split.append(split(target_seqs[row]))

In [28]:
print(primary_split[0:5])
print(len(primary_split))

[['E', 'D', 'L'], ['K', 'C', 'K'], ['K', 'A', 'K'], ['K', 'F', 'K'], ['K', 'A', 'K']]
5668


In [29]:
print(secondary_split[0:5])
print(len(secondary_split))

[['C', 'B', 'C'], ['C', 'B', 'C'], ['C', 'B', 'C'], ['C', 'B', 'C'], ['C', 'B', 'C']]
5668


In [31]:
def orthogonal_primary(arg):
    switch = {
        'A' : np.array([1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]),  
        'C' : np.array([0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]),
        'E' : np.array([0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]),
        'D' : np.array([0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]),
        'G' : np.array([0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]),
        'F' : np.array([0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0]),
        'I' : np.array([0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0]),
        'H' : np.array([0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0]),
        'K' : np.array([0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0]),
        'M' : np.array([0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0]),
        'L' : np.array([0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0]),
        'N' : np.array([0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0]),
        'Q' : np.array([0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]),
        'P' : np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0]),
        'S' : np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0]),
        'R' : np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0]),
        'T' : np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0]),
        'W' : np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0]),
        'V' : np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]),
        'Y' : np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1])
    }
    
    return switch.get(arg)


In [32]:
def orthogonal_secondary(arg):
    switch = {
        'H' : 0,                    
        'C' : 1,
        'E' : 2,  
        'B' : 3,                    
        'G' : 4,                    
        'I' : 5,                    
        'T' : 6,                    
        'S' : 7                     
    }
    
    return switch.get(arg)

In [33]:
for row in range(len(primary_split)):  
    sequence = primary_split[row]
    for col in range(len(sequence)):
        sequence[col] = orthogonal_primary(sequence[col])
primary_split[0:5]

In [34]:
for row in range(len(secondary_split)):  
    sequenceS = secondary_split[row]
    for col in range(len(sequenceS)):
        sequenceS[col] = orthogonal_secondary(sequenceS[col])

In [35]:
def graph_sum2(seq1,seq2):
    result = [None]*len(seq1)
    for col in range(len(seq1)):
        result[col] =  seq1[col]+seq2[col]
    return result


def graph_sum3(seq1,seq2,seq3):
    result = [None]*len(seq1)
    for col in range(len(seq1)):
        result[col] =  seq1[col]+seq2[col]+seq3[col]
    return result

In [36]:
graph_input = copy.deepcopy(primary_split)
for row in range(len(primary_split)):
    sequence = primary_split[row]
    graph_input[row][0]=graph_sum2(sequence[0],sequence[1])
    graph_input[row][len(sequence)-1]=graph_sum2(sequence[len(sequence)-1],sequence[len(sequence)-2])
    for col in range(1,len(sequence)-1):
        graph_input[row][col] = graph_sum3(sequence[col-1],sequence[col],sequence[col+1])
        
graph_input[0:5]

[[[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 [[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 [[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 [[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 [[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]

In [37]:
def targetY(data_list):
    Y = []
    for i in range(len(data_list)):
        for j  in range(len(data_list[i])):
            Y.append(data_list[i][j])
    return Y
y_label = targetY(secondary_split)
print(y_label[0:5])
print(len(y_label))

[1, 3, 1, 1, 3]
55942


In [38]:
def window_padding_data(size, sequence):
    num = int(size/2)
    zeros = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    for i in range(len(sequence)):
        for j in range(num):
            sequence[i].append(zeros)
            sequence[i].insert(0, zeros)
            
    X = []
    temp = []

    for k in range(len(sequence)):
        for l in range(len(sequence[k])-(size-1)):
            temp = sequence[k][l:l+size]
            X.append(temp)
            temp = []

    return X

In [39]:
X = window_padding_data(11,graph_input)
len(X)

55942

In [40]:
np.set_printoptions(threshold=np.inf)
X = np.array(X)
y_label = np.array(y_label)
X = X.reshape(len(X),11*20)
print(X[0:5])

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1
  0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0
  0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y_label, test_size = 0.20,random_state=54)

In [42]:
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
             'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
             'kernel': ['rbf']}  
 
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3) 

grid.fit(X_train, y_train) 

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.709 total time=18.0min
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.713 total time=14.8min
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.710 total time=15.3min
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.709 total time=18.1min
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.714 total time=29.6min
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.733 total time= 6.9min
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.739 total time= 5.5min
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.738 total time= 5.6min
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.733 total time= 5.5min
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.737 total time= 5.5min
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.708 total time= 2.3min
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf