In [56]:
import gzip
import shutil
import numpy as np
import gzip
import os
import h5py

def load_cb513_dataset():
    """Load the CB513 test set from the current working directory.

    The archive ``cb513+profile_split1_2.npy.gz`` is decompressed to
    ``cb513+profile_split1_2.npy`` (written next to it), loaded with NumPy and
    reshaped to ``(-1, 700, 57)``. The last axis is then split into the
    feature groups used downstream.

    Returns:
        tuple: ``(test_hot_, test_pssm, test_labels)``

        * ``test_hot_`` -- float array of shape ``(n, 700)``. Each entry is the
          argmax over the sequence columns ``0:21`` of that position, or ``1``
          when those columns sum to zero.
        * ``test_pssm`` -- profile feature, columns ``35:56``, shape ``(n, 700, 21)``.
        * ``test_labels`` -- label columns ``22:30``, shape ``(n, 700, 8)``.

    Raises:
        OSError: if the archive is missing or cannot be read.
    """
    filename = 'cb513+profile_split1_2.npy.gz'
    new_filename = 'cb513+profile_split1_2.npy'

    # Decompress the archive into a plain .npy file. The previous code used
    # shutil.copyfile on the two paths, which copies the *compressed* bytes
    # verbatim, so it only worked when the ".gz" file was not really gzipped.
    # Keep that case working by checking the gzip magic number first.
    with open(filename, 'rb') as probe:
        is_gzipped = probe.read(2) == b'\x1f\x8b'
    opener = gzip.open if is_gzipped else open
    with opener(filename, 'rb') as f_in, open(new_filename, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    # load .npy dataset and reshape to (sequences, positions, features)
    cb513_data = np.load(new_filename)
    cb513_data = np.reshape(cb513_data, (-1, 700, 57))

    # protein sequence feature
    test_hot = cb513_data[:, :, 0:21]
    # protein profile feature
    test_pssm = cb513_data[:, :, 35:56]
    # protein labels
    test_labels = cb513_data[:, :, 22:30]

    # Collapse the sequence columns to one index per position. Positions whose
    # sequence columns sum to zero keep the fill value 1.
    # NOTE(review): the fill value 1 is indistinguishable from a genuine
    # argmax of 1 -- kept as-is because callers may depend on it; confirm.
    occupied = np.sum(test_hot, axis=2) != 0
    test_hot_ = np.ones((test_hot.shape[0], test_hot.shape[1]))
    test_hot_[occupied] = np.argmax(test_hot, axis=2)[occupied]

    return test_hot_, test_pssm, test_labels

In [50]:
# Load the CB513 arrays: encoded sequence, profile feature and labels.
test_hot, test_pssm, test_labels = load_cb513_dataset()

In [51]:
# Shape of the per-position sequence index array (first value returned above).
test_hot.shape

(514, 700)

In [52]:
# Shape of the profile feature slice (columns 35:56 of the reshaped data).
test_pssm.shape

(514, 700, 21)

In [53]:
# Shape of the label slice (columns 22:30 of the reshaped data).
test_labels.shape

(514, 700, 8)

In [33]:
def load_cullpdb_6133_dataset(n_train=5278, n_valid=256, seed=None):
    """Load the filtered CullPDB-6133 set and split it into train/validation.

    The archive ``cullpdb+profile_6133_filtered.npy.gz`` in the current working
    directory is decompressed to ``cullpdb+profile_6133_filtered.npy``, loaded,
    reshaped to ``(-1, 700, 57)`` and cast to ``float32``. The 8 label columns
    (``22:30``) are collapsed to one integer class id per position via a dot
    product with ``[0, 1, ..., 7]``. Sequences are then shuffled and sliced.

    Args:
        n_train: number of shuffled sequences placed in the training split
            (default 5278, the value previously hard-coded).
        n_valid: number of sequences that follow them in the validation split
            (default 256, i.e. the previous ``5278:5534`` slice).
        seed: optional integer seed for a private ``RandomState`` so the split
            is reproducible. With ``None`` (default) the global NumPy random
            state is used, exactly as before.

    Returns:
        tuple: ``(cullpdb_data_train, cullpdb_data_valid, labels_train,
        labels_valid, num_seq_train)`` -- ``float32`` feature arrays of shape
        ``(k, 700, 57)``, ``int32`` label arrays of shape ``(k, 700)`` and the
        number of training sequences.

    Raises:
        OSError: if the archive is missing or cannot be read.
    """
    filename = 'cullpdb+profile_6133_filtered.npy.gz'
    new_filename = 'cullpdb+profile_6133_filtered.npy'

    # Decompress the archive into a plain .npy file. The previous code used
    # shutil.copyfile on the two paths, which copies the *compressed* bytes
    # verbatim, so it only worked when the ".gz" file was not really gzipped.
    # Keep that case working by checking the gzip magic number first.
    with open(filename, 'rb') as probe:
        is_gzipped = probe.read(2) == b'\x1f\x8b'
    opener = gzip.open if is_gzipped else open
    with opener(filename, 'rb') as f_in, open(new_filename, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    # load .npy dataset and reshape to (sequences, positions, features)
    cullpdb_data = np.load(new_filename)
    cullpdb_data = np.reshape(cullpdb_data, (-1, 700, 57))

    # secondary structure label, 8-D / 8 classes
    one_hot_labels = cullpdb_data[:, :, 22:30]
    num_seqs = cullpdb_data.shape[0]
    num_classes = one_hot_labels.shape[2]

    # Dot product with [0..num_classes-1] turns each label row into a single
    # class id; rows that are all zero map to 0.
    class_ids = np.arange(num_classes)
    train_labels = np.dot(one_hot_labels, class_ids).astype('int32')

    # convert features to float32 (labels above were derived beforehand)
    cullpdb_data = cullpdb_data.astype('float32')

    # shuffle sequence indices, then slice train / validation
    seq_names = np.arange(num_seqs)
    if seed is None:
        np.random.shuffle(seq_names)
    else:
        np.random.RandomState(seed).shuffle(seq_names)

    train_idx = seq_names[:n_train]
    valid_idx = seq_names[n_train:n_train + n_valid]

    cullpdb_data_train = cullpdb_data[train_idx]
    cullpdb_data_valid = cullpdb_data[valid_idx]
    labels_train = train_labels[train_idx]
    labels_valid = train_labels[valid_idx]
    num_seq_train = cullpdb_data_train.shape[0]

    return cullpdb_data_train, cullpdb_data_valid, labels_train, labels_valid, num_seq_train

In [37]:
# Load CullPDB-6133 and unpack the training/validation features and labels.
# NOTE(review): the loader contains no active print calls, so the numeric
# output recorded under this cell looks stale -- re-run the cell to refresh it.
X_train, X_valid, train_labels, valid_labels, num_seq_train = load_cullpdb_6133_dataset()

5534
700
8
<function size at 0x7fb7fde949d8>




In [41]:
# Shape of the training feature array.
X_train.shape

(5278, 700, 57)

In [43]:
# Shape of the validation feature array.
X_valid.shape

(256, 700, 57)

In [45]:
# Shape of the training labels (one integer class id per position).
train_labels.shape

(5278, 700)

In [46]:
# Shape of the validation labels (one integer class id per position).
valid_labels.shape

(256, 700)

In [57]:
def load_casp10_data():
    """Load the CASP10 test set from ``casp10.h5`` in the working directory.

    Reads the ``features`` and ``labels`` datasets, slices them into feature
    groups and collapses the sequence columns to one index per position.

    Returns:
        tuple: ``(casp10_data_test_hot, casp10_data_pssm, test_labels)``

        * ``casp10_data_test_hot`` -- float array of shape ``(n, L)``. Each
          entry is the argmax over feature columns ``0:21`` of that position,
          or ``1`` when those columns sum to zero.
        * ``casp10_data_pssm`` -- feature columns ``21:42`` (profile feature).
        * ``test_labels`` -- label columns ``0:8``.

    Raises:
        OSError: if ``casp10.h5`` is missing or cannot be opened.
        KeyError: if the file lacks a ``features`` or ``labels`` dataset.
    """
    # Open read-only and close the handle once the slices are in memory.
    # The original call passed no mode and never closed the file.
    with h5py.File("casp10.h5", "r") as casp10_data:
        # protein sequence and profile feature data
        casp10_data_hot = casp10_data['features'][:, :, 0:21]
        casp10_data_pssm = casp10_data['features'][:, :, 21:42]
        # protein label data
        test_labels = casp10_data['labels'][:, :, 0:8]

    # Collapse the sequence columns to one index per position. Positions whose
    # sequence columns sum to zero keep the fill value 1.
    # NOTE(review): the fill value 1 is indistinguishable from a genuine
    # argmax of 1 -- kept as-is because callers may depend on it; confirm.
    occupied = np.sum(casp10_data_hot, axis=2) != 0
    casp10_data_test_hot = np.ones((casp10_data_hot.shape[0], casp10_data_hot.shape[1]))
    casp10_data_test_hot[occupied] = np.argmax(casp10_data_hot, axis=2)[occupied]

    return casp10_data_test_hot, casp10_data_pssm, test_labels

In [58]:
# Load the CASP10 arrays: encoded sequence, profile feature and labels.
# NOTE(review): this rebinds test_labels, replacing the CB513 labels assigned
# by the earlier load_cb513_dataset() cell.
casp10_data_test_hot, casp10_data_pssm, test_labels = load_casp10_data()

  after removing the cwd from sys.path.


In [59]:
# Shape of the CASP10 per-position sequence index array.
casp10_data_test_hot.shape

(123, 700)

In [60]:
# Shape of the CASP10 profile feature slice (feature columns 21:42).
casp10_data_pssm.shape

(123, 700, 21)

In [61]:
# Shape of the labels most recently bound to test_labels.
test_labels.shape

(123, 700, 8)

In [63]:
def load_casp11_data():
    """Load the CASP11 test set from ``casp11.h5`` in the working directory.

    Reads the ``features`` and ``labels`` datasets, slices them into feature
    groups and collapses the sequence columns to one index per position.

    Returns:
        tuple: ``(casp11_data_test_hot, casp11_data_pssm, test_labels)``

        * ``casp11_data_test_hot`` -- float array of shape ``(n, L)``. Each
          entry is the argmax over feature columns ``0:21`` of that position,
          or ``1`` when those columns sum to zero.
        * ``casp11_data_pssm`` -- feature columns ``21:42`` (profile feature).
        * ``test_labels`` -- label columns ``0:8``.

    Raises:
        OSError: if ``casp11.h5`` is missing or cannot be opened.
        KeyError: if the file lacks a ``features`` or ``labels`` dataset.
    """
    # Open read-only and close the handle once the slices are in memory.
    # The original call passed no mode and never closed the file.
    with h5py.File("casp11.h5", "r") as casp11_data:
        # protein sequence and profile feature data
        casp11_data_hot = casp11_data['features'][:, :, 0:21]
        casp11_data_pssm = casp11_data['features'][:, :, 21:42]
        # protein label data
        test_labels = casp11_data['labels'][:, :, 0:8]

    # Collapse the sequence columns to one index per position. Positions whose
    # sequence columns sum to zero keep the fill value 1.
    # NOTE(review): the fill value 1 is indistinguishable from a genuine
    # argmax of 1 -- kept as-is because callers may depend on it; confirm.
    occupied = np.sum(casp11_data_hot, axis=2) != 0
    casp11_data_test_hot = np.ones((casp11_data_hot.shape[0], casp11_data_hot.shape[1]))
    casp11_data_test_hot[occupied] = np.argmax(casp11_data_hot, axis=2)[occupied]

    # Return the profile slice as the second element. The original returned
    # casp11_data_test_hot twice, so the PSSM features were never handed back.
    return casp11_data_test_hot, casp11_data_pssm, test_labels

In [64]:
# Unpack the three values returned by load_casp11_data() into distinct names.
# The original repeated casp11_data_test_hot as the first two targets, so the
# first returned value was immediately overwritten by the second.
# NOTE(review): this rebinds test_labels, replacing the labels assigned by the
# earlier loader cells.
casp11_data_test_hot, casp11_data_pssm, test_labels = load_casp11_data()

  after removing the cwd from sys.path.


In [65]:
# Shape of the array bound to casp11_data_test_hot by the load cell above.
casp11_data_test_hot.shape

(105, 700)

In [66]:
# NOTE(review): same expression as the previous cell -- presumably the shape
# of the CASP11 profile (PSSM) array was meant to be inspected here; confirm.
casp11_data_test_hot.shape

(105, 700)

In [67]:
# Shape of the labels most recently bound to test_labels.
test_labels.shape

(105, 700, 8)