<a href="https://colab.research.google.com/github/Zhengro/DL-Identification/blob/jaume/SOM_algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
__author__ = 'Jaume Anguera Peris'

## Read txt files from Google Drive

In [31]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# After executing the cell above, all the data needed for this project 
# will be present in "/content/drive/My Drive/Project - DL identification systems/Data/"
MAIN_PATH = "/content/drive/My Drive/Project - DL identification systems/Data/"
DEFAULT_MAIN_FOLDER = MAIN_PATH + '500 users/data_len_64/'

## Import necessary libraries

In [0]:
import numpy as np

## Import classes

### SOM_algorithm.py

In [0]:
class SOM:

    DEFAULT_NUM_EPOCHS = 200
    DEFAULT_INIT_LEARN_RATE = 0.4

    # Initialization
    def __init__(self, net_dim_x, net_dim_y, num_features,
                 num_epochs=DEFAULT_NUM_EPOCHS,
                 init_learning_rate=DEFAULT_INIT_LEARN_RATE):
        self.net_dimensions = np.array([net_dim_x,net_dim_y])
        self.num_features = num_features
        self.num_epochs = num_epochs
        self.init_learning_rate = init_learning_rate
        self.init_radius = min(self.net_dimensions[0],self.net_dimensions[1])
        self.time_constant = num_epochs / np.log(self.init_radius)
        self.generate_weight_matrix()

    # Functions
    def generate_weight_matrix(self):
        self.net_weights = np.random.random((self.num_features,
                                             self.net_dimensions[0],
                                             self.net_dimensions[1]))

    def train(self, inputData, verbose=True):

        if verbose is True:
            msg_interval = 10

        for i in range(self.num_epochs):
            radius = self.decay_radius(i)
            learning_rate = self.decay_learning_rate(i)

            if verbose and (i % msg_interval == 0):
                print("Iteration %d out of %d" % (i,self.num_epochs))
                print("Radius = %.2f" % radius)
                print("Learning rate = %.2e\n" % learning_rate)

            bmu_ind = self.predict_cluster(inputData)

            for x in range(self.net_dimensions[0]):
                for y in range(self.net_dimensions[1]):
                    node_position = np.array([x, y])
                    weight_k = self.net_weights[:, x, y].reshape(1, self.num_features)
                    for sample_ind in range(inputData.shape[0]):
                        distance_between_nodes = SOM.euclidean_dist(node_position, bmu_ind[sample_ind])
                        if distance_between_nodes <= radius ** 2:
                            input_vec = inputData[sample_ind,:]
                            step_size = learning_rate * SOM.neighborhood_influence(distance_between_nodes, radius)
                            updated_weight = weight_k + (step_size * (input_vec - weight_k))
                            self.net_weights[:, x, y] = updated_weight.reshape(self.num_features,)


    def predict_cluster(self, inputData, saveData=False, fileName='cluster_ind.txt'):
        cluster_ind = []

        for user in range(inputData.shape[0]):
            input_vec = inputData[user, :]
            cluster_ind.append(self.find_bmu(input_vec))

        if saveData is True:
            np.savetxt(fileName, cluster_ind, fmt='[%d,%d]', delimiter=',')

        return cluster_ind

    def decay_radius(self, iteration):
        return( self.init_radius * np.exp(-iteration/self.time_constant) )

    def decay_learning_rate(self, iteration):
        return ( self.init_learning_rate * np.exp(-iteration/self.num_epochs) )

    @staticmethod
    def euclidean_dist(first_vector, second_vector):
        return ( np.sum((first_vector - second_vector) ** 2) )

    @staticmethod
    def neighborhood_influence(distance, radius):
        return ( np.exp(-distance/(2 * (radius ** 2))) )

    def find_bmu(self, input_vec):
        bmu_ind = np.array([0, 0])
        bmu_init = self.net_weights[:, 0, 0].reshape(1, self.num_features)
        min_dist = self.euclidean_dist(input_vec, bmu_init)

        for x in range(self.net_dimensions[0]):
            for y in range(self.net_dimensions[1]):
                weight_k = self.net_weights[:, x, y].reshape(1, self.num_features)
                distance_between_vectors = SOM.euclidean_dist(input_vec, weight_k)
                if min_dist > distance_between_vectors:
                    min_dist = distance_between_vectors
                    bmu_ind = np.array([x, y])

        return bmu_ind


### read_Data.py

In [0]:
class dataReader:
	"""
	This class is responsible for reading the data that will be used for
	the identification system.
	---------------------------------------------------------------------
	Attributes
		1. fileData_str - indicates where the data is stored. The file is
		   generated using the using the Matlab code from Linghui
		2. fileData - matrix with all the features of all the users in the DB
		3. num_users - number of users in the DB
		4. num_features - number of features of each user

	Functions
		1. read_fileData_fromFile - creates a list of with all the user's data.
		   Each element in the list is a feature vector of one user
		2. convert_str2int_matrix - converts the list of feature vectors to a
		   matrix that can be used as a mathematical object
	---------------------------------------------------------------------
	"""

	# Initialization
	def __init__(self, file_path):
		self.file_path = file_path
		self.initialize()

	def initialize(self):
		fileData_str = self.read_data_fromFile()
		self.fileData = self.convert_str2int_matrix(fileData_str)
		self.num_entries = self.fileData.shape[0]
		self.num_features = self.fileData.shape[1]

	# Functions
	def read_data_fromFile(self):
		fileID = open(self.file_path,'r')
		fileData = []

		for line in fileID.readlines():
			line = line.strip()
			fileData.append(line.split(','))

		fileID.close()
		return(fileData)


	def convert_str2int_matrix(self, fileData_str):
		fileData_int = []

		for vec in fileData_str:
			fileData_int.append(list(map(int,vec)))

		return(np.array(fileData_int))


	def get_info(self):
		print("The data is stored in %s" % self.file_path)
		print("There is a total of %d entries" % self.num_entries)
		print("Each entry has %d features" % self.num_features)


### evaluate_performance.py

In [0]:
def findIndex_obs(cluster_ind_train, cluster_ind_test, testData):
	decoded_db_path = DEFAULT_MAIN_FOLDER + 'decompressed_db_500.txt'
	decoded_db = dataReader(decoded_db_path)
	decoded_db_data = decoded_db.fileData

	estimated_ind = []

	for ind_test, obs_test in enumerate(cluster_ind_test):

		isEstimated = False
		best_ind = 0
		best_mutual_info = 0
		obs_vec = testData[ind_test, :]

		for ind_train, obs_train in enumerate(cluster_ind_train):
			if obs_test[0] == obs_train[0] and obs_test[1] == obs_train[1]:
				entropy_obs = find_entropy(obs_vec)
				entropy_usr = find_entropy(decoded_db_data[ind_train, :])
				cross_entropy = find_cross_entropy(obs_vec, decoded_db_data[ind_train, :])
				mutual_info = entropy_obs + entropy_usr - cross_entropy

				if isEstimated is False:
					best_ind = ind_train
					best_mutual_info = mutual_info
					isEstimated = True

				if (isEstimated is True) and (mutual_info > best_mutual_info):
					best_ind = ind_train
					best_mutual_info = mutual_info

		estimated_ind.append(best_ind)

	return estimated_ind


def missclassification_error(original_ind, estimated_ind):
	error_rate = 0

	for k, est_ind in enumerate(estimated_ind):
		if original_ind[k] != est_ind:
			error_rate += 1

	return float(error_rate / len(estimated_ind))


def find_entropy(sequence):
	prob = np.zeros(2)
	prob[1] = float(len(np.nonzero(sequence)) / len(sequence))
	prob[0] = 1. - prob[1]
	return -1 * np.matmul(prob, np.log2(prob))


def find_cross_entropy(seq_p, seq_g):
	prob = np.zeros(4)
	length_seq_p = len(seq_p)
	for l in range(length_seq_p):
		if seq_p[l] == 1 and seq_g[l] == 1:
			prob[0] += 1
		elif seq_p[l] == 0 and seq_g[l] == 1:
			prob[1] += 1
		elif seq_p[l] == 1 and seq_g[l] == 0:
			prob[2] += 1
		else:
			prob[3] += 1

	for k in range(len(prob)):
		if prob[k] != 0:
			prob[k] = prob[k] / length_seq_p
		else:
			prob[k] = 1

	return -1 * np.matmul(prob, np.log2(prob))


def save_errorRate(error_rate, net_dimension_x, net_dimension_y, num_users, num_features, num_epochs,
				   learn_rate, fileName=MAIN_PATH+'errorRate_SOM.txt', print_config=False):
	
    msg = "error={0} [dim_x={1},dim_y={2},num_users={3},num_features={4},epochs={5},learn_rate={6}]\n".format(
			error_rate,
			net_dimension_x,
			net_dimension_y,
			num_users,
			num_features,
			num_epochs,
			learn_rate)
    
    with open(fileName, 'a') as errorFile:
        errorFile.write(msg)
    
    if print_config is True:
        print(msg)
            

## Test classes

### test_read_Data.py

In [24]:
# Read user data
fileData_path = DEFAULT_MAIN_FOLDER + '500 users/data_len_64/users_data_500.txt'
print(fileData_path)
fileData_test = dataReader(fileData_path)

# Print different attributes
fileData_test.get_info()

/content/drive/My Drive/Project - DL identification systems/Training data/500 users/data_len_64/users_data_500.txt
The data is stored in /content/drive/My Drive/Project - DL identification systems/Training data/500 users/data_len_64/users_data_500.txt
There is a total of 500 entries
Each entry has 64 features


### test_evaluate_performance.py

In [26]:
p = np.random.randint(2, size=10)
g = np.random.randint(2, size=10)
print('Entropy of p = %.3f' % find_entropy(p))
print('Entropy of g = %.3f' % find_entropy(g))
print('Cross entropy p and g = %.3f' % find_cross_entropy(p,g))

a = np.arange(10)
b = np.arange(10)
c = np.arange(10) + np.ones(10)
d = np.arange(10)
d[0] = 2
print('Error rate a and b = %.3f' % missclassification_error(a,b))
print('Error rate a and c = %.3f' % missclassification_error(a,c))
print('Error rate a and d = %.3f' % missclassification_error(a,d))


Entropy of p = 0.469
Entropy of g = 0.469
Cross entropy p and g = 1.685
Error rate a and b = 0.000
Error rate a and c = 1.000
Error rate a and d = 0.100


## Main

In [0]:
# Generate train and test data
trainData_path = DEFAULT_MAIN_FOLDER + 'users_data_500.txt'
testData_path = DEFAULT_MAIN_FOLDER + 'observations_500.txt'
usersData_train = dataReader(trainData_path)
usersData_test = dataReader(testData_path)

In [0]:
# Parameters unsupervised algorithm
net_dimension_x = 14
net_dimension_y = 14
num_nodes = net_dimension_x * net_dimension_y
num_features = usersData_train.num_features

In [0]:
# Train cluster algorithm
trainData = usersData_train.fileData
unspv_alg = SOM(net_dimension_x, net_dimension_y, num_features)
unspv_alg.train(usersData_train.fileData)
cluster_ind_train = unspv_alg.predict_cluster(trainData)

In [0]:
# Test cluster algorithm
testData = usersData_test.fileData
cluster_ind_test = unspv_alg.predict_cluster(testData)
estimated_ind = findIndex_obs(cluster_ind_train,cluster_ind_test,testData)

In [62]:
# Evaluate performance
error_rate = missclassification_error(np.arange(usersData_test.num_entries),estimated_ind)
save_errorRate(error_rate,unspv_alg.net_dimensions[0],unspv_alg.net_dimensions[1],
			   usersData_train.num_entries,usersData_train.num_features,
			   unspv_alg.DEFAULT_NUM_EPOCHS,unspv_alg.DEFAULT_INIT_LEARN_RATE,print_config=True)


error=0.998 [dim_x=14,dim_y=14,num_users=500,num_features=64,epochs=200,learn_rate=0.4]

