# Data preprocessing utilities

In [1]:
import glob
import os
from collections import Counter
import string
import os
import random
import numpy as np
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from tempfile import TemporaryFile

Using TensorFlow backend.


### Load dataset

In [11]:
# Location of the dataset and the names of the family sub-directories in it.
ds_path = "../dataset/families"
families = [
    "fam_1", "fam_2", "fam_3",
    "fam_4", "fam_5", "fam_6",
    "fam_7", "fam_8", "fam_9",
]

# Flag consulted before the output directories are created.
prepare_dirs = True

# Family name -> path of that family's directory under ds_path.
family_paths = {name: os.path.join(ds_path, name) for name in families}

In [12]:
# prepare dirs
# Creates the output directory tree used by the rest of the notebook:
#   <root>/<family> for every root below, plus
#   data_serialized_protein/<family>/{train,val,test}.
# os.makedirs(..., exist_ok=True) replaces the exists()/mkdir() pairs: it is
# idempotent and has no window between the check and the creation.
if prepare_dirs:
    output_roots = (
        "models",
        "models_proteins",
        "logs",
        "data_serialized",
        "data_serialized_protein",
    )
    # Create the roots explicitly so they exist even when `families` is empty.
    for root in output_roots:
        os.makedirs(root, exist_ok=True)

    for f in families:
        for root in output_roots:
            os.makedirs(os.path.join(root, f), exist_ok=True)

        # Only the per-protein serialized data is divided into splits.
        for split in ("train", "val", "test"):
            os.makedirs(
                os.path.join("data_serialized_protein", f, split),
                exist_ok=True,
            )

In [14]:
# Case-insensitive alphabet position: 'a'/'A' -> 0, 'b'/'B' -> 1, ..., 'z'/'Z' -> 25.
# ord(letter) % 32 is the 1-based alphabet position for both cases.
letters_di = {letter: ord(letter) % 32 - 1 for letter in string.ascii_letters}

In [16]:
letters_di

{'A': 0,
 'B': 1,
 'C': 2,
 'D': 3,
 'E': 4,
 'F': 5,
 'G': 6,
 'H': 7,
 'I': 8,
 'J': 9,
 'K': 10,
 'L': 11,
 'M': 12,
 'N': 13,
 'O': 14,
 'P': 15,
 'Q': 16,
 'R': 17,
 'S': 18,
 'T': 19,
 'U': 20,
 'V': 21,
 'W': 22,
 'X': 23,
 'Y': 24,
 'Z': 25,
 'a': 0,
 'b': 1,
 'c': 2,
 'd': 3,
 'e': 4,
 'f': 5,
 'g': 6,
 'h': 7,
 'i': 8,
 'j': 9,
 'k': 10,
 'l': 11,
 'm': 12,
 'n': 13,
 'o': 14,
 'p': 15,
 'q': 16,
 'r': 17,
 's': 18,
 't': 19,
 'u': 20,
 'v': 21,
 'w': 22,
 'x': 23,
 'y': 24,
 'z': 25}

In [5]:
# Lookup table used by to_angles(): maps each uppercase letter 'A'..'Y'
# (25 entries) to a list of three floats.
# There is no entry for 'Z' or for lowercase letters, so looking those up
# raises KeyError (unlike letters_di, which accepts both cases and 'Z').
# NOTE(review): the values look like angles in degrees, but the meaning of
# each of the three components is not documented here -- confirm against the
# source of this table.
angles = {'A': [122.4, 119.4,-164.2],
          'B': [129.8, 135.6, -176.6],
          'C': [117.1, 111.0, -142.2],
          'D': [118.4, 126.9, -146.1],
          'E': [116.7, 138.6, 168.7],
          'F': [115.6, 112.9, -117.9],
          'G': [135.3, 118.6, -148.5],
          'H': [120.1, 114.3, -90.7],
          'I': [133.6, 117.1, -120.8],
          'J': [115.9, 91.4, -134.6],
          'K': [119.7, 90.4, -105.9],
          'L': [110.0, 90.8, -158.8],
          'M': [110.0, 100.8, 177.0],
          'N': [90.1, 138.2, 19.6],
          'O': [92.4, 91.2, -127.4],
          'P': [91.8, 96.7, -104.8],
          'Q': [95.9, 117.7, 136.0],
          'R': [94.5, 112.6, 115.0],
          'S': [96.3, 94.7, 112.0],
          'T': [93.0, 92.8, 83.1],
          'U': [91.4, 90.7, 49.8],
          'V': [93.3, 89.1, 68.3],
          'W': [93.8, 105.2, 32.3],
          'X': [111.4, 94.6, 21.8],
          'Y': [89.0, 95.1, -54.4]
          }

In [6]:
def to_angles(conf):
    """Return the `angles` entry for every element of `conf`, in order.

    Each element is converted with str() before the lookup. The returned
    list holds the table's own value lists (not copies). An element with no
    entry in `angles` raises KeyError.
    """
    triples = []
    for symbol in conf:
        key = str(symbol)
        triples.append(angles[key])
    return triples

In [7]:
def process_conf(configurations, use_angles, categorical, padding, max_length, normalize, flatten, num_classes=25):
    """Encode sequences of conformation letters as a NumPy array.

    Exactly one encoding is applied, chosen in this order:

    * ``categorical`` -- every letter becomes a one-hot row of width
      ``num_classes`` (``letters_di`` index fed to ``to_categorical``);
    * ``use_angles`` -- every letter becomes its 3-value ``angles`` entry;
    * otherwise -- every letter becomes its integer ``letters_di`` index.

    ``padding``, ``flatten`` and ``normalize`` only affect the one-hot and
    angle encodings; the plain index encoding is returned as built.

    Args:
        configurations: iterable of letter sequences (e.g. strings).
        use_angles: select the angle encoding (ignored if ``categorical``).
        categorical: select the one-hot encoding.
        padding: zero-pad every sequence to ``max_length`` steps.
        max_length: steps per sequence after padding; also used to compute
            the row width when flattening.
        normalize: rescale the encoded values (see notes in the body).
        flatten: reshape to 2-D with one row per sequence. This assumes
            every sequence has exactly ``max_length`` steps.
        num_classes: width of the one-hot rows.

    Returns:
        numpy.ndarray holding the encoded (and optionally padded, flattened,
        normalized) sequences. Its shape is printed before flattening and
        again before returning.

    Raises:
        KeyError: a letter is missing from the lookup table in use.
        ValueError: a sequence is longer than ``max_length`` while padding.
    """
    # Encode every sequence separately and keep the result as a list for
    # now: sequences may differ in length, and recent NumPy versions refuse
    # to build an array from ragged input.
    if categorical:
        # Transform letters to class indices, then to one-hot rows.
        encoded = [
            to_categorical([letters_di[letter] for letter in seq], num_classes=num_classes)
            for seq in configurations
        ]
        feature_dim = num_classes
    elif use_angles:
        encoded = [to_angles(seq) for seq in configurations]
        feature_dim = 3
    else:
        encoded = [[letters_di[letter] for letter in seq] for seq in configurations]
        feature_dim = None  # plain indices: no feature axis, never padded

    if padding and feature_dim is not None:
        # Zero-pad sequences shorter than max_length.
        padded = np.zeros((len(encoded), max_length, feature_dim))
        for i, seq in enumerate(encoded):
            seq = np.asarray(seq)
            if seq.size == 0:
                # Empty sequence: nothing to copy, the row stays all zeros.
                continue
            padded[i, :seq.shape[0], :seq.shape[1]] += seq
        configurations = padded
    else:
        configurations = np.array(encoded)
    print(configurations.shape)

    if flatten:
        if categorical:
            configurations = configurations.reshape(-1, num_classes * max_length)
        elif use_angles:
            configurations = configurations.reshape(-1, 3 * max_length)

    if normalize:
        if categorical:
            # NOTE(review): this divides one-hot values by letters_di['Y']
            # (24), which looks intended for integer indices rather than
            # one-hot rows. Kept unchanged -- confirm the intent.
            configurations = configurations.astype('float32') / (letters_di['Y'] * 1.0)
        elif use_angles:
            # Shift by 180 and divide by 360, mapping [-180, 180] onto
            # [0, 1]. The parentheses matter: without them only the constant
            # 180 / 360 was added to the data.
            configurations = (configurations.astype('float32') + 180.0) / 360.0
    print(configurations.shape)
    return configurations

In [8]:
def load_family(f):
    """Read every line of a family's ``*.out`` files.

    Args:
        f: family name; must be a key of ``family_paths``.

    Returns:
        list of str: one whitespace-stripped entry per line of every
        matching file, with files visited in sorted path order.

    Side effects:
        Prints the family name, each file path, and a Counter of the entry
        lengths.
    """
    pattern = os.path.join(family_paths[f], "*.out")
    # glob's ordering depends on the file system; sorting keeps the order of
    # the returned entries reproducible across runs and machines.
    proteins = sorted(glob.glob(pattern))
    print("Proteins for family %s" % f)
    for p in proteins:
        print(p)

    proteins_conf = []
    for p in proteins:
        with open(p) as in_file:
            proteins_conf.extend(line.strip() for line in in_file)

    # Length histogram, useful for choosing a padding length.
    print(Counter(len(conf) for conf in proteins_conf))
    return proteins_conf