# Data preprocessing utilities

In [18]:
import glob
import os
from collections import Counter
import string
import os
import random
import numpy as np
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from tempfile import TemporaryFile

### Load dataset

In [30]:
# Names of the nine family sub-directories, "fam_1" through "fam_9".
families = ["fam_%d" % idx for idx in range(1, 10)]
# Root directory that holds one sub-directory per family.
ds_path = "../dataset/families"

# Family name -> path of that family's directory under ds_path.
family_paths = {name: os.path.join(ds_path, name) for name in families}

In [31]:
# Map each ASCII letter, in either case, to its zero-based alphabet position
# ('a'/'A' -> 0, ..., 'z'/'Z' -> 25). ord(c) % 32 is the one-based position
# for both cases because 'A' is 65 and 'a' is 97.
letters_di = {ch: ord(ch) % 32 - 1 for ch in string.ascii_letters}

In [69]:
# Three angle values for each of the 25 configuration letters 'A'..'Y'.
# NOTE(review): presumably degrees in [-180, 180], given the (x + 180) / 360
# rescaling applied to this table afterwards -- confirm against the source of
# these numbers.
angles = {
    "A": [122.4, 119.4, -164.2],
    "B": [129.8, 135.6, -176.6],
    "C": [117.1, 111.0, -142.2],
    "D": [118.4, 126.9, -146.1],
    "E": [116.7, 138.6, 168.7],
    "F": [115.6, 112.9, -117.9],
    "G": [135.3, 118.6, -148.5],
    "H": [120.1, 114.3, -90.7],
    "I": [133.6, 117.1, -120.8],
    "J": [115.9, 91.4, -134.6],
    "K": [119.7, 90.4, -105.9],
    "L": [110.0, 90.8, -158.8],
    "M": [110.0, 100.8, 177.0],
    "N": [90.1, 138.2, 19.6],
    "O": [92.4, 91.2, -127.4],
    "P": [91.8, 96.7, -104.8],
    "Q": [95.9, 117.7, 136.0],
    "R": [94.5, 112.6, 115.0],
    "S": [96.3, 94.7, 112.0],
    "T": [93.0, 92.8, 83.1],
    "U": [91.4, 90.7, 49.8],
    "V": [93.3, 89.1, 68.3],
    "W": [93.8, 105.2, 32.3],
    "X": [111.4, 94.6, 21.8],
    "Y": [89.0, 95.1, -54.4],
}

In [70]:
# Rescale every stored value with (x + 180) / 360, which sends -180 to 0 and
# 180 to 1.
# NOTE: this rewrites `angles` in place, so executing it a second time rescales
# the already-rescaled values again.
angles.update({
    letter: [(value + 180) / 360 for value in triple]
    for letter, triple in angles.items()
})

In [78]:
def to_angles(conf):
    """Translate a configuration into its sequence of angle triples.

    Each element of *conf* is converted with ``str`` and looked up in the
    module-level ``angles`` table.

    Returns a list with one entry per element of *conf*. The entries are the
    very list objects stored in ``angles``, not copies.

    Raises ``KeyError`` for an element that is not a key of ``angles``.
    """
    triples = []
    for symbol in conf:
        triples.append(angles[str(symbol)])
    return triples

In [72]:
def _pad_sequences(encoded, max_length, width):
    """Zero-pad per-sequence (length, width) arrays to (n, max_length, width)."""
    padded = np.zeros((len(encoded), max_length, width))
    for i, seq in enumerate(encoded):
        # reshape keeps an empty sequence as a (0, width) array so that the
        # slice assignment below is well-formed for it as well.
        seq = np.asarray(seq, dtype=padded.dtype).reshape(-1, width)
        if seq.shape[0] > max_length:
            raise ValueError(
                "sequence %d has length %d, which exceeds max_length=%d"
                % (i, seq.shape[0], max_length)
            )
        padded[i, :seq.shape[0]] = seq
    return padded


def process_conf(configurations, use_angles ,categorical, padding, max_length, flatten, num_classes=25):
    """Encode configuration strings as a NumPy array.

    Encoding (first matching flag wins):
      * ``categorical``: every letter becomes a one-hot row of ``num_classes``
        entries (via ``letters_di`` and ``to_categorical``).
      * ``use_angles``: every letter becomes its 3-value entry from
        ``to_angles``.
      * otherwise: every letter becomes its integer index from ``letters_di``.

    Parameters:
      configurations: iterable of letter sequences (e.g. strings).
      use_angles, categorical: encoding selectors, see above.
      padding: if true, zero-pad every sequence along the length axis to
        ``max_length``. Only available for the one-hot and angle encodings.
      max_length: target sequence length for padding, and the assumed sequence
        length when flattening.
      flatten: if true, reshape the one-hot / angle result to two dimensions,
        ``(-1, width * max_length)``. Index-encoded data is left unchanged.
      num_classes: width of the one-hot rows.

    Returns the encoded ``numpy.ndarray``. Its shape is printed once before
    and once after the optional flattening step.

    Raises:
      ValueError: if ``padding`` is requested for the index encoding, or if a
        sequence is longer than ``max_length`` while padding.
      KeyError: if a letter is missing from ``letters_di`` / the angle table.
    """
    # Padding is only defined for the two fixed-width encodings. Fail early
    # with a clear message rather than deep inside the padding code.
    if padding and not (categorical or use_angles):
        raise ValueError(
            "padding is only supported with categorical=True or use_angles=True"
        )

    # Encode each sequence on its own and keep the results in a plain list:
    # sequences may differ in length, and building one ndarray from ragged
    # data before padding is not supported by current NumPy releases.
    if categorical:
        # transform data to one-hot encodings
        indices = [[letters_di[l] for l in p] for p in configurations]
        encoded = [to_categorical(p, num_classes=num_classes) for p in indices]
        width = num_classes
    elif use_angles:
        # use angles
        encoded = [to_angles(p) for p in configurations]
        width = 3
    else:
        encoded = [[letters_di[l] for l in p] for p in configurations]
        width = None

    if padding:
        # pad sequences if less than max length
        configurations = _pad_sequences(encoded, max_length, width)
    else:
        configurations = np.array(encoded)
    print(configurations.shape)

    if flatten:
        if categorical:
            configurations = configurations.reshape(-1, num_classes * max_length)
        elif use_angles:
            configurations = configurations.reshape(-1, 3 * max_length)
    print(configurations.shape)
    return configurations

In [73]:
def load_family(f):
    """Read every configuration line of one family's ``*.out`` files.

    *f* is a key of the module-level ``family_paths``. All ``*.out`` files
    directly inside that directory are read; each line, stripped of
    surrounding whitespace, becomes one entry of the result.

    The file paths and a ``Counter`` of the entry lengths are printed.

    Returns the list of stripped lines from all files, concatenated in sorted
    path order.
    """
    # glob returns paths in arbitrary order; sort them so the returned list,
    # and any seeded split made from it later, is the same on every machine.
    proteins = sorted(glob.glob(os.path.join(family_paths[f], "*.out")))
    print("Proteins for family %s" %f)
    for p in proteins:
        print(p)
    proteins_conf = []
    for p in proteins:
        with open(p) as in_file:
            proteins_conf.extend(line.strip() for line in in_file)
    # Length distribution of the loaded configurations, as a quick sanity check.
    print(Counter(len(conf) for conf in proteins_conf))
    return proteins_conf

### Preprocess data and serialize train/val/test sets

In [None]:
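# NOTE: disabled cell. The process_conf call below passes `angles=` and
# `normalize=`, which do not match the process_conf signature defined above
# (`use_angles`, no `normalize`); update the call before re-enabling.
# for f in families: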
#     families_conf = load_family(f)
#     families_conf = process_conf(families_conf, categorical=categorical, angles=angles, padding=padding, max_length=max_length,normalize=normalize, flatten=flatten)  
#     test_size = int(0.25 * families_conf.shape[0])
#     val_size = int(0.15 * families_conf.shape[0])
#     train_all, test = train_test_split(families_conf, test_size=test_size, random_state=42)
#     train, val = train_test_split(train_all, test_size = val_size, random_state=42)
#     del families_conf
#     del train_all
#     print("train: " + repr(train.shape))
#     print("val: " + repr(val.shape))
#     print("test: " + repr(test.shape))
#     train_filename = os.path.join("data_serialized_angles", f, "train.npy")
#     val_filename = os.path.join("data_serialized_angles", f, "val.npy")
#     test_filename = os.path.join("data_serialized_angles", f, "test.npy")
#     np.save(train_filename, train)
#     np.save(val_filename, val)
#     np.save(test_filename, test)
#     del train
#     del test
#     del val

In [None]:
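# NOTE: disabled cell (per-protein variant of the split above). Its
# process_conf calls pass `angles=` and `normalize=`, which do not match the
# process_conf signature defined above; update them before re-enabling.
# for f in families: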
#     proteins = glob.glob(os.path.join(family_paths[f], "*.out"))
#     print("Proteins for family %s" %f)
    
#     for p in proteins:
#         print(p)
#         proteins_conf = []
#         with open(p) as in_file:
#             for line in in_file:
#                 proteins_conf.append(line.strip())
#         print(len(proteins_conf))
#         test_size = int(0.25 * len(proteins_conf))
#         val_size = int(0.15 * len(proteins_conf))
#         train_all_p, test_p = train_test_split(proteins_conf, test_size=test_size, random_state=42)
#         train_p, val_p = train_test_split(train_all_p, test_size = val_size, random_state=42)
        
#         #preprocess
#         train_p = process_conf(train_p, categorical=categorical, angles=angles, padding=padding, max_length=max_length,normalize=normalize, flatten=flatten)
#         val_p = process_conf(val_p, categorical=categorical, angles=angles, padding=padding, max_length=max_length,normalize=normalize, flatten=flatten)
#         test_p = process_conf(test_p, categorical=categorical, angles=angles, padding=padding, max_length=max_length,normalize=normalize, flatten=flatten)
#         print("train: " + repr(train_p.shape))
#         print("val: " + repr(val_p.shape))
#         print("test: " + repr(test_p.shape))
#         del train_all_p
#         del proteins_conf

#         p_name = os.path.basename(p).split('.')[0]
#         train_filename = os.path.join("data_serialized_angles_protein", f, "train", "train_"+ p_name +".npy")
#         val_filename = os.path.join("data_serialized_angles_protein", f, "val", "val_"+ p_name +".npy")
#         test_filename = os.path.join("data_serialized_angles_protein", f, "test", "test_"+ p_name +".npy")
#         np.save(train_filename, train_p)
#         np.save(val_filename, val_p)
#         np.save(test_filename, test_p)

#         del train_p
#         del test_p
#         del val_p