# Centralized version

In [1]:
import numpy as np
import math

## Functions

### Auxiliary functions

In [2]:
def sigmoid(x):
    """Return the logistic function ``1 / (1 + e**-x)`` of a scalar.

    The two branches are algebraically identical. Each one only ever
    exponentiates a non-positive number, so ``math.exp`` can underflow to
    0.0 but can never raise ``OverflowError`` (the single-expression form
    does once ``-x`` exceeds roughly 709).

    Args:
        x: A real scalar (Python float or NumPy scalar).

    Returns:
        float: The sigmoid of ``x``, in the closed interval [0, 1].
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # x < 0: multiply numerator and denominator by e**x so the exponent
    # stays negative.
    e = math.exp(x)
    return e / (1 + e)

In [3]:
def cost_function(w, x, y, y_hat):
    """Mean binary cross-entropy over all the training samples.

    Computes ``-(1/m) * sum(y*log(y_hat) + (1-y)*log(1-y_hat))`` where
    ``m`` is the number of rows of ``x``.

    The arguments of both logarithms are floored at the smallest positive
    normal float. Without that floor a saturated prediction (``y_hat``
    exactly 0 or 1) produces ``0 * log(0) = nan`` even when the prediction
    is correct. As a consequence a saturated *wrong* prediction contributes
    a large finite penalty (about 708) instead of ``inf``.

    Args:
        w: Weight vector. Not used by the computation; kept so existing
            callers do not have to change.
        x: Sample matrix; only ``x.shape[0]`` (the sample count) is read.
        y: Array of true labels (0 or 1).
        y_hat: Array of predicted probabilities, same length as ``y``.

    Returns:
        The scalar mean cross-entropy (no regularisation term is added).
    """
    m = x.shape[0]
    tiny = np.finfo(float).tiny
    log_pos = np.log(np.maximum(y_hat, tiny))
    log_neg = np.log(np.maximum(1 - y_hat, tiny))
    return -(1 / m) * np.sum(y * log_pos + (1 - y) * log_neg)

### Mandatory functions

In [4]:
def readFile(filename):
    """Parse a comma-delimited text source into a NumPy array.

    Args:
        filename: Path (or any other source ``np.genfromtxt`` accepts)
            of the comma-separated data to read.

    Returns:
        The array produced by ``np.genfromtxt`` with ``delimiter=','``;
        every other option is left at NumPy's default.
    """
    parsed = np.genfromtxt(filename, delimiter=",")
    return parsed

In [5]:
def normalize(RDD_Xy):
    """Standardise every feature column to zero mean and unit variance.

    The last column is treated as the label and passed through untouched;
    all preceding columns are treated as features.

    Args:
        RDD_Xy: 2-D array whose last column is the label.

    Returns:
        A new array of the same shape: the standardised feature columns
        followed by the original label column. The input is not modified.
    """
    x = RDD_Xy[:, 0:-1]
    y = RDD_Xy[:, -1]
    means = x.mean(axis=0)
    stds = x.std(axis=0)
    # A constant column has std 0, and 0/0 would fill it with NaN (which
    # then poisons every weight during training). Dividing by 1 instead
    # leaves such a column at its centred value, 0.
    safe_stds = np.where(stds == 0, 1.0, stds)
    return np.c_[(x - means) / safe_stds, y]

In [6]:
def train(RDD_Xy, iterations, learning_rate, lambda_reg, seed=None):
    """Fit logistic-regression weights with batch gradient descent.

    A column of ones is prepended to the features so that ``w[0]`` acts as
    the bias. The cost of the current weights is printed before every
    update.

    Args:
        RDD_Xy: 2-D array; the last column is the label, the rest are
            features.
        iterations: Number of gradient-descent steps to run.
        learning_rate: Step size applied to the gradient.
        lambda_reg: Coefficient of the ``lambda_reg * w`` term added to the
            gradient.
        seed: Optional seed for the weight initialisation. With the default
            ``None`` the weights come from NumPy's global random state,
            exactly as before; pass an integer for a reproducible run.

    Returns:
        1-D weight array of length ``number_of_features + 1`` (bias first).
    """
    m = RDD_Xy.shape[0]
    # shape[1] counts the features plus the label column, which equals the
    # number of features plus the bias column added below.
    n = RDD_Xy.shape[1]

    X = np.c_[np.ones(m), RDD_Xy[:, 0:-1]]
    y = RDD_Xy[:, -1]

    if seed is None:
        w = np.random.rand(n)
    else:
        w = np.random.default_rng(seed).random(n)

    for it in range(iterations):
        y_hat = np.array([sigmoid(x) for x in np.matmul(X, w)])

        print(f"Cost for it {it}:", cost_function(w, X, y, y_hat))

        # X.T @ (y_hat - y) is the same sum as broadcasting X.T against the
        # residual and reducing over samples, without the n-by-m temporary.
        # NOTE(review): the lambda_reg term is applied to every weight,
        # including the bias w[0] -- confirm that this is intended.
        dw = (np.matmul(X.T, y_hat - y) + lambda_reg * w) / m
        w -= learning_rate * dw

    return w

In [7]:
def accuracy(w, RDD_Xy):
    """Fraction of samples whose predicted label equals the true label.

    Args:
        w: Weight vector, bias first.
        RDD_Xy: 2-D array; the last column is the label, the rest are
            features.

    Returns:
        Number of matching predictions divided by the number of samples.
    """
    labels = RDD_Xy[:, -1]
    n_samples = RDD_Xy.shape[0]

    # Prepend the bias column of ones before handing the features to predict.
    design = np.c_[np.ones(n_samples), RDD_Xy[:, :-1]]
    predicted = predict(w, design)

    hits = predicted == labels
    return np.sum(hits) / len(labels)

In [8]:
def predict(w, X):
    """Predict a hard 0/1 label for every row of ``X``.

    Rounding ``sigmoid(z)`` to the nearest integer yields 1 when the
    probability is above 0.5 and 0 otherwise (a tie at exactly 0.5 rounds
    to 0 under round-half-to-even). The sigmoid exceeds 0.5 exactly when
    the score ``z = X @ w`` is positive, so the score is thresholded
    directly: this is vectorised and cannot overflow for strongly negative
    scores the way evaluating ``exp(-z)`` per element can.

    Args:
        w: Weight vector, bias first.
        X: 2-D design matrix that already contains the bias column.

    Returns:
        1-D float array of 0.0 / 1.0 labels, one per row of ``X``. A NaN
        score yields NaN rather than being silently mapped to a class.
    """
    scores = np.matmul(X, w)
    y_hat = (scores > 0).astype(float)
    # Keep invalid scores visible to the caller instead of labelling them 0.
    y_hat[np.isnan(scores)] = np.nan

    return y_hat

## Testing

In [9]:
# Load the comma-separated dataset through readFile (np.genfromtxt).
data = readFile("../data/botnet_tot_syn_l.csv")

In [10]:
# Standardise the feature columns; the last (label) column is passed through.
# This rebinds `data`, so the raw array is no longer reachable afterwards.
data = normalize(data)

In [11]:
# 15 iterations, learning_rate=0.1, lambda_reg=0.1; train prints the cost at
# every iteration.
w = train(data, 15, 0.1, 0.1)

Cost for it 0: 1.320980271581351
Cost for it 1: 1.2654769566221535
Cost for it 2: 1.2125469551723587
Cost for it 3: 1.1621492294970766
Cost for it 4: 1.1142312209744811
Cost for it 5: 1.0687300895196385
Cost for it 6: 1.0255742150367007
Cost for it 7: 0.9846848681695745
Cost for it 8: 0.9459779426637646
Cost for it 9: 0.9093656425428364
Cost for it 10: 0.8747580331113404
Cost for it 11: 0.8420643908530959
Cost for it 12: 0.811194317330363
Cost for it 13: 0.7820586104633183
Cost for it 14: 0.7545699091561351


In [12]:
# Accuracy is measured on the same array that was used for training (there is
# no held-out split), so this is a training-set score.
acc = accuracy(w, data)
acc

0.615927