## import numpy as np

In [1]:
import findspark
findspark.init()
from pyspark import *
import numpy as np

In [2]:
def readFile(fileName):
    """Load a comma-separated numeric text file into a NumPy array.

    Parameters
    ----------
    fileName : str or path-like
        Location of the file to read. Every field must be numeric, because
        the file is parsed with ``np.loadtxt``.

    Returns
    -------
    numpy.ndarray
        The parsed contents, as returned by ``np.loadtxt``.
    """
    # Read from the argument itself; the previous body referenced the
    # differently-cased global ``filename`` and silently ignored the parameter.
    loaded_data = np.loadtxt(fileName, delimiter=',')
    return loaded_data

In [3]:
def normalize(Xy):
    """Standardize the feature columns of ``Xy`` and re-attach the labels.

    Column index 11 is treated as the label column; every other column is
    treated as a feature and rescaled with ``(X - mean) / standard deviation``,
    both statistics being computed per column.

    Parameters
    ----------
    Xy : numpy.ndarray
        2-D array whose column 11 holds the labels.

    Returns
    -------
    numpy.ndarray
        Array with the standardized feature columns followed by the label
        column (labels are cast to ``int8`` before being concatenated back).
    """
    # Separate data and labels.
    data = np.delete(Xy, 11, 1)
    labels = Xy[:, 11].reshape(-1, 1).astype(np.int8)

    # Per-column statistics.
    mean = np.average(data, axis=0)
    standard_deviation = np.std(data, axis=0)

    # A constant column has zero deviation; dividing by it would produce
    # NaN (0/0). Divide by 1 instead so such columns simply become all zeros.
    safe_deviation = np.where(standard_deviation == 0, 1.0, standard_deviation)

    normalized_data = (data - mean) / safe_deviation
    Xy_normalized = np.concatenate((normalized_data, labels), axis=1)

    return Xy_normalized

In [4]:
def train(Xy, iterations, learning_rate):
    """Fit a weight vector and a bias by gradient descent.

    Column index 11 of ``Xy`` is used as the target; the remaining columns
    are the features. Each iteration prints the loss (computed from the
    predictions made before the update) and the accuracy (computed after
    the update).

    Parameters
    ----------
    Xy : numpy.ndarray
        2-D array of features with the labels stored in column 11.
    iterations : int
        Number of update steps to run.
    learning_rate : float
        Multiplier applied to each derivative when updating.

    Returns
    -------
    tuple
        ``(weight, bias)`` after the final update.
    """
    # Split the features from the target column.
    features = np.delete(Xy, 11, axis=1)
    targets = Xy[:, 11]
    n_samples = len(features)

    # Random starting point: one weight per feature, bias drawn from 0.00-0.99.
    weight = np.random.rand(features.shape[1])
    bias = np.random.randint(0, 100) / 100

    for epoch in range(iterations):
        probs = predict(weight, bias, features)

        # Cross-entropy loss; the module-level epsilon keeps log() away from 0.
        per_sample = (
            targets * np.log(probs + epsilon)
            + (1 - targets) * np.log(1 - probs + epsilon)
        )
        loss = -np.mean(per_sample)

        # Both derivatives are built from the same prediction error.
        error = probs - targets
        weight_grad = np.dot(features.T, error) / n_samples
        bias_grad = np.mean(error)

        # Step against the derivatives.
        weight = weight - learning_rate * weight_grad
        bias = bias - learning_rate * bias_grad

        acc = accuracy(weight, bias, Xy)
        print("EPOCH",epoch," ---->  LOSS: ",loss," ACCURACY: ",acc)

    return weight, bias
    

In [5]:
def accuracy(w, b, Xy):
    """Return the percentage of rows in ``Xy`` whose label is predicted correctly.

    Column index 11 of ``Xy`` holds the expected labels; the other columns
    are passed to ``predict`` together with ``w`` and ``b``. A prediction
    above 0.5 counts as class 1, anything else as class 0.

    Parameters
    ----------
    w : numpy.ndarray
        Weight vector handed to ``predict``.
    b : float
        Bias handed to ``predict``.
    Xy : numpy.ndarray
        2-D array of features with the labels stored in column 11.

    Returns
    -------
    float
        Share of matching predictions, scaled to the 0-100 range.
    """
    features = np.delete(Xy, 11, axis=1)
    expected = Xy[:, 11]

    # Threshold the predictions into hard 0/1 class labels.
    predicted = (predict(w, b, features) > 0.5).astype(int)

    # Mean of the element-wise matches, expressed as a percentage.
    return (predicted == expected).mean() * 100

In [6]:
def predict(w,b,X):
    """Return ``sigmoid(w . X^T + b)`` for the samples in ``X``.

    Parameters
    ----------
    w : numpy.ndarray
        Weight vector multiplied against the transpose of ``X``.
    b : float
        Bias added to every product.
    X : numpy.ndarray
        Sample matrix; it is transposed before the multiplication.

    Returns
    -------
    numpy.ndarray
        Output of ``sigmoid`` applied to the linear scores.
    """
    linear_scores = np.matmul(w, X.T) + b
    return sigmoid(linear_scores)

In [7]:
def sigmoid(x):
    """Compute the logistic function ``1 / (1 + exp(-x))`` element-wise.

    The value is evaluated through ``exp(-|x|)``, which never exceeds 1, so
    large-magnitude negative inputs no longer overflow ``exp`` (the direct
    formula raised an overflow RuntimeWarning for them).

    Parameters
    ----------
    x : array_like or float
        Input value(s).

    Returns
    -------
    numpy.ndarray or numpy.float64
        Sigmoid of ``x``, with the same shape as the input.
    """
    x = np.asarray(x)
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)).
    # Both share the denominator 1 + z, only the numerator differs.
    y = np.where(x >= 0, 1.0, z) / (1.0 + z)
    return y

In [8]:
# ****** EXECUTION ********

In [9]:
# Run configuration shared by the cells below.

# Hyper-parameters handed to train(): step size and number of iterations.
learning_r = 1.5
epochs = 10

# Tiny offset added inside the log terms of the loss so log(0) is never taken.
epsilon = 1e-12

# CSV file passed to readFile().
filename = "../alumno/Downloads/botnet_tot_syn_l.csv"

In [10]:
#----- PROVIDED EXECUTION CODE ----------

In [11]:
# Load the CSV and standardize its feature columns in one step.
data = normalize(readFile(filename))

# Fit the model, then score it on the same data it was trained on.
w, b = train(data, epochs, learning_r)
acc = accuracy(w, b, data)

print("acc: ", acc)


EPOCH 0  ---->  LOSS:  1.3675626410922301  ACCURACY:  62.4452
EPOCH 1  ---->  LOSS:  0.7508346401734097  ACCURACY:  75.8562
EPOCH 2  ---->  LOSS:  0.47946609523787526  ACCURACY:  85.2479
EPOCH 3  ---->  LOSS:  0.36076558042049817  ACCURACY:  89.20439999999999
EPOCH 4  ---->  LOSS:  0.3032118002104034  ACCURACY:  90.7397
EPOCH 5  ---->  LOSS:  0.27108370351017186  ACCURACY:  91.5951
EPOCH 6  ---->  LOSS:  0.25080338701140775  ACCURACY:  92.1775
EPOCH 7  ---->  LOSS:  0.23682582096944002  ACCURACY:  92.5012
EPOCH 8  ---->  LOSS:  0.22657811816764872  ACCURACY:  92.7507
EPOCH 9  ---->  LOSS:  0.2187201274512405  ACCURACY:  92.9412
acc:  92.9412
