In [11]:
import shallow_nn as nn
import optimized_tree as trr
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn import preprocessing

In [3]:
def read_unirep(file_path):
    """
    Parse a Unirep file.
    """
    data_matrix = []
    with open(file_path) as fp:
        data = fp.readlines()
    for i, line in enumerate(data):
        if line[0] == '>':
            data_matrix.append(np.array(
                data[i + 1].strip().split(sep=' '),
                dtype=np.float64))
    return np.array(data_matrix)

In [4]:
# Imports unirep files.
raw_cyt = read_unirep("cytoxplasmUniRef50(1).unirep")
raw_peri = read_unirep("periplasmUniRef50(1).unirep")
# Shuffle imported data.
np.random.shuffle(raw_cyt)
np.random.shuffle(raw_peri)
# Creates mixed dataset (from shuffled = random selection).
all_raw = np.concatenate((raw_cyt[1:3000], raw_peri[1:3000]))

In [5]:
scaled_transfrom = preprocessing.scale(all_raw, axis=0)
# Check for successfull scaling.
print(np.std(scaled_transfrom[:, 25]))

0.9999999999999998


In [6]:
# Gives value 1 to cytoplasmic proteins, 0 to others.
labels = np.zeros(len(all_raw), dtype=int)
labels[0:3000] = 1

In [7]:
# One hot encoding for neural network.
def one_hot(labels):
    encoded_labels = []
    number_classes = len(set(labels))
    for item in labels:
        encoded = np.zeros(number_classes, dtype=int)
        encoded[item] = 1
        encoded_labels.append(encoded)
    return np.array(encoded_labels)

In [8]:
f_labels = one_hot(labels)

In [9]:
# Prepares input for neural network.
def nn_input_format(dataset, labels):
    encoded_data = []
    for data, lab in zip(dataset, labels):
        value = (data.reshape((-1,1)), lab.reshape((-1,1)))
        encoded_data.append(value)
    return encoded_data

In [10]:
x = nn_input_format(scaled_transfrom, f_labels)
len(x)

5998

In [11]:
# 10-fold cross validation.
def nn_10fold(arch):
    # Shuffle dataset.
    np.random.shuffle(x)
    n = len(x) / 10  # size of fold.
    tot_result = 0
    # trains and test on each fold.
    for i in range(10):
        network = nn.ShallowNetwork(arch)  # Initialize network.
        # Test fold.
        x_test_1 = x[int(i * n):int((i+1) * n)]
        # Train fold.
        x_train_1 = x[0:int(i * n)] + x[int((i+1) * n):]
        # Training step.
        network.stochastic_gradient_descent(x_train_1, 10, 10, 0.5, 0.5)
        # Current model score init.
        result = 0
        # Test phase using test set.
        for elt in x_test_1:
            result += network.predict(elt[0]) == np.where(elt[1] == 1)[0][0]
            result /= len(x_test_1)
            tot_result += result  # Adds fold result to total result.
    # Averages the results.
    final_res = tot_result / 10
    return (final_res)

# Parameter = architecture: 64 inputs, 32hn, 2 outputs(0 or 1).
nn_10fold([64, 32, 2])

0.9914796706917377

In [10]:
# Shuffles data and lables the same way.
def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

In [14]:
# 10-fold cross-validation for decision trees.
def dt_10fold():
    # Shuffles dataset.
    new_scaled, new_labels = unison_shuffled_copies(scaled_transfrom, labels)
    n = len(new_scaled) / 10  # Size of fold.
    tot_result = 0
    # Trains and test on each fold.
    for i in range(10):
        # Training data and labels.
        x_train_2 = np.concatenate((new_scaled[0:int(i * n)], new_scaled[int((i+1) * n):]), axis=0)
        x_labs_train = np.concatenate((new_labels[0:int(i * n)], new_labels[int((i+1) * n):]), axis=0)
        # Test data and labels.
        x_test_2 = new_scaled[int(i * n):int((i+1) * n)]
        x_labs_test = new_labels[int(i * n):int((i+1) * n)]
        # Model generation.
        dt = trr.DecisionTree(x_train_2, x_labs_train, trr.CostFunction.gini, 5)
        result = 0  # Fold result init.
        for elt, lab in zip(x_test_2, x_labs_test):
            result += dt.predict(elt) == lab
            result /= len(x_test_2)
            tot_result += result  # Adds fold result to total result.
    # Averages the results.
    final_res = tot_result / 10
    return (final_res)

dt_10fold()

0


  while not np.issubdtype(type(current_node.split), int):


1
2
3
4
5
6
7
8
9


0.9787896242941093

In [12]:
# Single model training for visualisation of the architecture.
new_scaled, new_labels = unison_shuffled_copies(scaled_transfrom, labels)
dt = trr.DecisionTree(new_scaled, new_labels, trr.CostFunction.gini, 5)
dt

`- Split(feature=33, value=0.317, score=0.8785), Classes: {1: 3000, 0: 2998}
   |- Split(feature=32, value=-0.4969, score=0.9341), Classes: {0: 2828, 1: 221}
   |  |- Split(feature=17, value=1.1483, score=0.8999), Classes: {1: 132, 0: 16}
   |  |  |- Split(feature=54, value=-1.4686, score=0.9699), Classes: {1: 127, 0: 4}
   |  |  |  |- 0, Classes: {0: 2}
   |  |  |  `- Split(feature=22, value=2.2496, score=0.9846), Classes: {1: 127, 0: 2}
   |  |  |     |- 1, Classes: {1: 127, 0: 1}
   |  |  |     `- 0, Classes: {0: 1}
   |  |  `- Split(feature=15, value=0.1882, score=0.902), Classes: {0: 12, 1: 5}
   |  |     |- Split(feature=54, value=-0.3698, score=1.0), Classes: {1: 5, 0: 1}
   |  |     |  |- 0, Classes: {0: 1}
   |  |     |  `- 1, Classes: {1: 5}
   |  |     `- 0, Classes: {0: 11}
   |  `- Split(feature=47, value=0.7915, score=0.9583), Classes: {0: 2812, 1: 89}
   |     |- Split(feature=46, value=-1.1056, score=0.977), Classes: {0: 2770, 1: 38}
   |     |  |- Split(feature=56, val


Decision tree trained with 5998 samples, 2 classes