<a href="https://colab.research.google.com/github/Zhengro/DL-Identification/blob/jaume/Map_noisyObs_to_indexes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
__author__ = ''

## Import all necessary libraries

In [0]:
# Load main libraries for the project
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np

## Get users' data

### Option 1 - Establish connection with Google Drive and read users' data from txt/npy files

In [0]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# All the data needed for this notebook is stored in MAIN_PATH
MAIN_PATH = "/content/drive/My Drive/Colab Notebooks/"

In [0]:
# Path to all the binary data
DATA_FILENAME = MAIN_PATH + 'data/userData_500_256.npy'

# Read data
userData = np.load(DATA_FILENAME)

# Create main parameters
NUM_USERS, NUM_FEATURES = userData.shape

print(NUM_USERS)
print(NUM_FEATURES)

### Option 2 - Generate users' data

In [66]:
# Main parameters
NUM_USERS = 1000
NUM_FEATURES = 64
NUM_HIDDEN_NODES = 2*NUM_FEATURES
bsc_e = 0.20

# Check that NUM_HIDDEN_NODES < NUM_USERS
while NUM_HIDDEN_NODES > NUM_USERS:
    NUM_HIDDEN_NODES = int( NUM_HIDDEN_NODES / 2 ) 

# Generate user data
userData = np.random.choice([0,1],(NUM_USERS,NUM_FEATURES))

# Information about the data and design of the NN
print("Number of nodes in the input layer: %d nodes" % NUM_FEATURES)
print("Number of nodes in the hidden layer: %d nodes" % NUM_HIDDEN_NODES)
print("Number of nodes in the output layer: %d nodes" % NUM_USERS)

Number of nodes in the input layer: 64 nodes
Number of nodes in the hidden layer: 128 nodes
Number of nodes in the output layer: 1000 nodes


## Binary Simmetric Channel

In [0]:
def bsc(bit_sequence, bsc_e):
    """
    Binary Symmetric Channel.
    
    Parameters
    ----------
    bit_sequence : 1D ndarray containing {0, 1}
        Input arrary of bits to the channel.
    bsc_e : float in [0, 1]
        Transition/Error probability of the channel.
    
    Returns
    -------
    output_bits : 1D ndarray containing {0, 1}
        Output bits from the channel.
    """
    flipped_bits = np.random.choice([0,1],bit_sequence.shape,p=[1.0-bsc_e,bsc_e])
    output_bits = np.absolute(np.subtract(bit_sequence,flipped_bits))
    return output_bits

In [68]:
# Test bsc function
input_bits = np.zeros((100,1))
output_bits = bsc(input_bits,bsc_e)
print(np.count_nonzero(output_bits)/output_bits.shape[0])


0.25


## Neural Network Identification algorithm

### Training

In [72]:
# Construct train set and test set
NUM_SAMPLES = 100 # Number of noisy samples for each user
X = np.zeros((NUM_USERS*NUM_SAMPLES,NUM_FEATURES))
Y = np.zeros((NUM_USERS*NUM_SAMPLES, 1))

for k in range(NUM_USERS):
    for n in range(NUM_SAMPLES):
        X[n + k*NUM_SAMPLES, :] = bsc(np.int_(userData[k, :]), bsc_e)
        Y[n + k*NUM_SAMPLES, :] = k

train_data, test_data, train_labels, test_labels = train_test_split(X,Y,test_size=0.2,shuffle=True)


# Information about the training and the testing data
print(train_data.shape)
print(test_data.shape)
print(train_labels.shape)
print(test_labels.shape)

(64000, 64)
(16000, 64)
(64000, 1)
(16000, 1)


In [73]:
# Build MLP model and train the NN
clf = MLPClassifier(hidden_layer_sizes=(NUM_HIDDEN_NODES), activation='relu',
                    solver='adam', verbose=True, alpha=0.001, early_stopping=True) 
clf.fit(train_data,train_labels)

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 5.59534706
Validation score: 0.582500
Iteration 2, loss = 2.22711481
Validation score: 0.836250
Iteration 3, loss = 0.82355605
Validation score: 0.875625
Iteration 4, loss = 0.48754321
Validation score: 0.892969
Iteration 5, loss = 0.34737798
Validation score: 0.903125
Iteration 6, loss = 0.26731112
Validation score: 0.906250
Iteration 7, loss = 0.21437999
Validation score: 0.910469
Iteration 8, loss = 0.17561350
Validation score: 0.910937
Iteration 9, loss = 0.14609349
Validation score: 0.911875
Iteration 10, loss = 0.12371117
Validation score: 0.910625
Iteration 11, loss = 0.10392714
Validation score: 0.910469
Iteration 12, loss = 0.08903636
Validation score: 0.913281
Iteration 13, loss = 0.07782672
Validation score: 0.907969
Iteration 14, loss = 0.06909702
Validation score: 0.910625
Iteration 15, loss = 0.06006335
Validation score: 0.910156
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


MLPClassifier(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=128, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=True, warm_start=False)

In [63]:
# Number of output layers
print(clf.n_outputs_)

1000


In [74]:
# Test MLP algorithm
estimated_labels = clf.predict(test_data).reshape(test_labels.shape)
error_rate = np.count_nonzero( estimated_labels - test_labels ) / len(estimated_labels)
print("Error rate = %.5f" % error_rate)

Error rate = 0.08519


### Results obtained so far


Copy and paste the message that is automatiacally created below together with the setup of the MLP algorithm



Error rate = 0.003675, NUM_USERS = 500, NUM_FEATURES = 256, NUM_SAMPLES = 400, bsc_e = 0.3, hidden_layer_sizes=(128, 64, 32)

Error rate = 0.014750, NUM_USERS = 500, NUM_FEATURES = 256, NUM_SAMPLES = 400, bsc_e = 0.3, hidden_layer_sizes=(128, 64, 32, 16)

Error rate = 0.320275, NUM_USERS = 500, NUM_FEATURES = 256, NUM_SAMPLES = 400, bsc_e = 0.3, hidden_layer_sizes=(128, 64, 32)

Error rate = 0.2544625, NUM_USERS = 1000, NUM_FEATURES = 64, NUM_SAMPLES = 400, bsc_e = 0.2

Error rate = 0.2443, NUM_USERS = 1500, NUM_FEATURES = 128, NUM_SAMPLES = 200, bsc_e = 0.2

In [65]:
msg = "Error rate = {0}, NUM_USERS = {1}, NUM_FEATURES = {2}, " \
       "NUM_SAMPLES = {3}, bsc_e = {4}".format(error_rate,
                                             NUM_USERS,
                                             NUM_FEATURES,
                                             NUM_SAMPLES,
                                             bsc_e)
print(msg)

Error rate = 0.0011333333333333334, NUM_USERS = 1000, NUM_FEATURES = 128, NUM_SAMPLES = 150, bsc_e = 0.2
