In [30]:
import shallow_nn as nn
import optimized_tree as tr
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from collections import Counter

## Data importation

### Parser for the unirep files

In [31]:
def read_unirep(file_path):
    """
    Parse a UniRep file into a 2-D array of embeddings.

    The file is read as a sequence of records: a header line starting
    with '>' followed, on the very next line, by the whitespace-separated
    values of the embedding. Lines that are neither a header nor the line
    right after a header are ignored.

    Parameters
    ----------
    file_path : str or path-like
        Path of the UniRep file to read.

    Returns
    -------
    numpy.ndarray
        One row per header found in the file, values stored as float64.

    Raises
    ------
    IndexError
        If a header is the last line of the file (no embedding follows).
    ValueError
        If the line after a header is blank or holds a value that cannot
        be converted to a float.
    """
    data_matrix = []
    with open(file_path) as fp:
        data = fp.readlines()
    for i, line in enumerate(data):
        if not line.startswith('>'):
            continue
        if i + 1 >= len(data):
            raise IndexError(
                "header on line {} of {!r} is not followed by an "
                "embedding line".format(i + 1, file_path))
        # split() without a separator collapses runs of spaces/tabs, so
        # doubled or trailing separators do not yield empty tokens.
        values = data[i + 1].split()
        if not values:
            raise ValueError(
                "empty embedding line after the header on line {} of "
                "{!r}".format(i + 1, file_path))
        data_matrix.append(np.array(values, dtype=np.float64))
    return np.array(data_matrix)

### Dataset creation

Here, we will generate a dataset composed of 6000 samples: 3000 cytoplasmic proteins and 3000 periplasmic proteins. The data for each class is first shuffled, then the first 3000 elements of each set are extracted into the final dataset. This process is equivalent to a random selection of samples.

In [32]:
# Number of samples kept per class, and the seed that makes the shuffles
# below (and any later draw from NumPy's global generator) repeatable
# across "Restart & Run All".
N_PER_CLASS = 3000
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)
# Imports unirep files.
raw_cyt = read_unirep("cytoxplasmUniRef50(1).unirep")
raw_peri = read_unirep("periplasmUniRef50(1).unirep")
# A slice past the end would silently return fewer rows, which would leave
# the dataset unbalanced; fail loudly instead.
for class_name, class_data in (("cytoplasmic", raw_cyt),
                               ("periplasmic", raw_peri)):
    if len(class_data) < N_PER_CLASS:
        raise ValueError(
            "expected at least {} {} samples, found {}".format(
                N_PER_CLASS, class_name, len(class_data)))
# Shuffle imported data (in place, along the first axis).
np.random.shuffle(raw_cyt)
np.random.shuffle(raw_peri)
# Creates mixed dataset (from shuffled = random selection):
# cytoplasmic rows first, periplasmic rows after.
all_raw = np.concatenate((raw_cyt[:N_PER_CLASS], raw_peri[:N_PER_CLASS]))

Then, we scale the data so that each feature has a mean of 0 and a standard deviation of 1. This makes the learning process easier for certain types of algorithms, for example neural networks.

In [33]:
# sklearn's StandardScaler remembers the per-feature mean and scaling
# factor it computed, so both can be reused on other data later.

# Build the scaler, fit it on the dataset, then standardise that same data.
scaler = StandardScaler()
scaled_transfrom = scaler.fit(all_raw).transform(all_raw)
# Pull out the fitted parameters and write them to disk.
sc, mn = scaler.scale_, scaler.mean_
np.save('scale_factor.npy', sc)
np.save('scale_mean.npy', mn)

In [16]:
# Sanity check: undo the scaling by hand and compare with the raw data.
back_transform = scaled_transfrom * sc + mn
# For the first sample only, keep the differences of the features whose
# round-tripped value is not bit-for-bit equal to the raw one.
equality_check = list(
    (back_transform[0] - all_raw[0])[back_transform[0] != all_raw[0]]
)
print(equality_check)
""" All comparisons considered as not equal are actually really close """

[-3.469446951953614e-18, 1.734723475976807e-18]


' All comparisons considered as not equal are actually really close '

We then generate the labels for the dataset. The labels are kept in an array of the size of the dataset, and are given the value 1 for cytoplasmic proteins, or 0 for periplasmic ones.

In [17]:
# The first 3000 rows of the dataset are the cytoplasmic proteins: they
# get label 1, every other row gets label 0.
labels = (np.arange(len(all_raw)) < 3000).astype(int)

## Neural network training

### Data preprocessing

The neural network we designed needs a one-hot encoding for the labels. One-hot encoding means that the value is stored in a sparse array, where all the values are 0 except the one corresponding to the class. For example, if there are 5 classes (numbered from 0), an item with class 3 will be encoded as follows: [0, 0, 0, 1, 0]. We designed a function that transforms an array of integers to that format.

In [18]:
# One hot encoding for neural network.
def one_hot(labels, number_classes=None):
    """
    One-hot encode a 1-D sequence of integer class labels.

    Parameters
    ----------
    labels : sequence of int
        Class index of each sample, counted from 0.
    number_classes : int, optional
        Width of the encoding. When omitted it is ``max(labels) + 1``, so
        a class that happens to be absent from ``labels`` cannot push a
        larger label out of bounds.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(labels), number_classes)`` in which
        row ``i`` is all zeros except for a 1 in column ``labels[i]``.
        Empty input gives an array with zero rows.

    Raises
    ------
    IndexError
        If a label is not an integer or does not fit in
        ``number_classes`` columns.
    """
    labels = np.asarray(labels)
    n_samples = len(labels)
    if number_classes is None:
        # Counting distinct labels would undersize the encoding whenever a
        # class is missing, so size it from the largest label instead.
        number_classes = int(labels.max()) + 1 if n_samples else 0
    encoded_labels = np.zeros((n_samples, number_classes), dtype=int)
    if n_samples:
        # Set, for every row at once, the column named by that row's label.
        encoded_labels[np.arange(n_samples), labels] = 1
    return encoded_labels

In [19]:
# One-hot encode the integer class labels: one encoded row per sample.
f_labels = one_hot(labels)

Then, the way the network is designed requires the data corresponding to a given sample to be stored together with its label, giving a dataset of the form [[sample1_values, sample1_label], [sample2_values, sample2_label], ...]. The following function adapts the data to this requirement.

In [20]:
# Prepares input for neural network.
def nn_input_format(dataset, labels):
    """
    Pair every sample with its label, both reshaped to column vectors.

    Parameters
    ----------
    dataset : iterable of numpy.ndarray
        Feature values, one array per sample.
    labels : iterable of numpy.ndarray
        Encoded label of each sample, in the same order as ``dataset``.

    Returns
    -------
    list of tuple
        One ``(sample, label)`` tuple per pair, each element reshaped to
        ``(-1, 1)``. Pairing stops at the shorter of the two inputs.
    """
    return [
        (sample.reshape((-1, 1)), target.reshape((-1, 1)))
        for sample, target in zip(dataset, labels)
    ]

In [21]:
# Pair each scaled sample with its one-hot label, as column vectors.
x = nn_input_format(scaled_transfrom, f_labels)

In [22]:
# Network initialisation and training.
# NOTE(review): [64, 22, 2] is presumably the layer sizes (input features,
# hidden units, one output per class) — confirm against shallow_nn.
final_model = nn.ShallowNetwork([64, 22, 2])
# NOTE(review): the positional arguments after the data look like epochs,
# mini-batch size, learning rate and a regularisation factor — confirm
# against shallow_nn.ShallowNetwork.stochastic_gradient_descent.
final_model.stochastic_gradient_descent(x, 100, 8, 0.2, 0.5)

In [23]:
# Write the trained model's weights and biases to disk.
# NOTE(review): if these attributes are lists of arrays with different
# shapes (one per layer), np.save may not be able to stack them into a
# regular array — confirm against shallow_nn.
np.save('weights.npy' , final_model.weights)
np.save('biases.npy', final_model.biases)