# Parallelized version

In [1]:
import math
import numpy as np
np.random.seed(33)

## Initializing spark

In [2]:
from pyspark import SparkConf, SparkContext

In [3]:
conf = SparkConf().setAppName("appName").setMaster("local")
sc = SparkContext()

22/11/16 18:22:40 WARN Utils: Your hostname, Alexs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.156.1.130 instead (on interface en0)
22/11/16 18:22:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/16 18:22:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Functions

### Auxiliar functions

In [4]:
def sigmoid(x):
    return 1 / (1 + math.exp(-x))

In [5]:
def rdd_cost_function(RDD_Xyyhat):
    y = RDD_Xyyhat[1]
    y_hat = RDD_Xyyhat[2]
    return (
        y * np.log(y_hat) + (1 - y)
        * np.log(1 - y_hat)
)

In [6]:
def cost_function(w, y, y_hat):
    # Computes the cost function for all the training samples
    return -(1 / m) * np.sum(
        y * np.log(y_hat) + (1 - y)
        * np.log(1 - y_hat)
    )

In [7]:
def str_to_number_list(line):
    numbers = line.replace(" ","").split(",")
    numbers = [float(number) for number in numbers]
    
    # Separate X and y
    numbers = (numbers[:-1], numbers[-1])
    
    return numbers

In [8]:
def sum_list_values(list1, list2):
    return [n1 + n2 for n1, n2 in zip(list1, list2)]

In [9]:
def sum_minus_mean_squared(numbers):
    return [(number-means.value[x])**2 for x,number in enumerate(numbers)]

In [10]:
def rdd_mean_by_column(RDD_Xy, m):
    means = RDD_Xy.map(lambda x: x[0]).reduce(sum_list_values)
    means = [x/m for x in means]
    return means

In [11]:
def rdd_std_by_column(RDD_Xy, m):
    stds = RDD_Xy.map(lambda x: x[0]).map(sum_minus_mean_squared).reduce(sum_list_values)
    stds = [math.sqrt(x/m) for x in stds]
    return stds

In [12]:
def multiply_RDDXy_by_w(Xy):
    sum = 0
    for xi, wi in zip(Xy[0], w.value):
        sum += xi * wi
    return (Xy[0], Xy[1], sigmoid(sum))

In [13]:
def calculate_dw(RDD_Xyyhat):
    dw = []
    for x in RDD_Xyyhat[0]:
        dw.append(x*(RDD_Xyyhat[1]-RDD_Xyyhat[2]))
    return dw

### Mandatory functions

In [14]:
def readFile(filename):
    dataset = sc.textFile(filename)
    dataset = dataset.map(str_to_number_list)
    return dataset.sample(False, 0.00001)

In [15]:
def normalize(RDD_Xy):
    m = RDD_Xy.count()
    global means
    means = sc.broadcast(rdd_mean_by_column(RDD_Xy, m))
    stds = sc.broadcast(rdd_std_by_column(RDD_Xy, m))
    norm_rdd = RDD_Xy.map(lambda values: ([(x - mean) / std for x, mean, std in zip(values[0], means.value, stds.value)], values[1]))
    return norm_rdd

In [25]:
def train(RDD_Xy, iterations, learning_rate, lambda_reg):
    
    global m
    m = RDD_Xy.count()
    n = len(RDD_Xy.take(1)[0][0])
    
    global w
    w = sc.broadcast(np.random.rand(n))
    
    for it in range(iterations):
        RDD_Xyyhat = RDD_Xy.map(multiply_RDDXy_by_w)
        print(f"Cost for it {it}:", RDD_Xyyhat.map(rdd_cost_function).reduce(lambda x,y: x+y) * (-1/m))
        dw = RDD_Xyyhat.map(calculate_dw).map(lambda x: [(xi + lambda_reg*wi)/m for xi, wi in zip(x, w.value)])
        w = sc.broadcast([wi - (learning_rate * dwi) for wi, dwi in zip(w.value, dw.collect()[0])])
        
        #y_hat = np.array([sigmoid(x) for x in np.matmul(X, w)])
        #print(f"Cost for it {it}:", cost_function(w, y, y_hat))
        #dw = ((X.transpose() * (y_hat - y)).sum(axis=1) + lambda_reg * w)/m
        #w -= learning_rate * dw
        
        # n,m*m = n
              
    return w

In [17]:
def accuracy(w, RDD_Xy):
    m = RDD_Xy.shape[0]
    
    X = np.c_[np.ones(m), RDD_Xy[:,0:-1]]
    y = RDD_Xy[:,-1]
    
    y_hat = predict(w, X)
    
    return np.sum(y_hat == y)/len(y)

In [18]:
def predict(w, X):
    y_hat = np.array([np.around(sigmoid(x),0) for x in np.matmul(X, w)])
    
    return y_hat

## Testing

In [19]:
RDD_Xy = readFile("../data/botnet_tot_syn_l.csv")

In [20]:
RDD_Xy = normalize(RDD_Xy)

                                                                                

In [21]:
w = train(RDD_Xy, 15, 0.1, 0.1)

                                                                                

Cost for it 0: 0.7592536405736146


                                                                                

Cost for it 1: 0.7633896014722554


                                                                                

Cost for it 2: 0.7677038989366809


                                                                                

Cost for it 3: 0.7722006611869321


                                                                                

Cost for it 4: 0.7768838076490949


                                                                                

Cost for it 5: 0.7817570210679515


                                                                                

Cost for it 6: 0.7868237202316093


                                                                                

Cost for it 7: 0.7920870337235476


                                                                                

Cost for it 8: 0.7975497751254514


                                                                                

Cost for it 9: 0.8032144200911473


                                                                                

Cost for it 10: 0.8090830856971771


                                                                                

Cost for it 11: 0.8151575124486794


                                                                                

Cost for it 12: 0.8214390492805527


                                                                                

Cost for it 13: 0.8279286418440851


                                                                                

Cost for it 14: 0.8346268243096682


                                                                                

In [22]:
w = train(RDD_Xy, 15, 0.1, 0.1)

                                                                                

Cost for it 0: 0.902003273818732


                                                                                

Cost for it 1: 0.9081218150375743


                                                                                

Cost for it 2: 0.9144434995944324


                                                                                

Cost for it 3: 0.9209687320256672


                                                                                

Cost for it 4: 0.9276975128885218


                                                                                

Cost for it 5: 0.9346294388682642


                                                                                

Cost for it 6: 0.9417637065482168


                                                                                

Cost for it 7: 0.9490991197671641


                                                                                

Cost for it 8: 0.9566341004166263


                                                                                

Cost for it 9: 0.9643667024636238


                                                                                

Cost for it 10: 0.9722946289250691


                                                                                

Cost for it 11: 0.9804152514697342


                                                                                

Cost for it 12: 0.9887256322841383


                                                                                

Cost for it 13: 0.9972225478106638


                                                                                

Cost for it 14: 1.0059025139499138


                                                                                

In [23]:
w.value

[0.26256856924080557,
 -0.018192875160363563,
 0.33577223586602123,
 0.09401836932278308,
 1.210163569691844,
 0.8266753015864874,
 0.5330772498386562,
 0.33603744659156415,
 0.5591437682628511,
 0.6656855125663429,
 0.20658846779175521]

In [24]:
acc = accuracy(w, data)
acc

NameError: name 'data' is not defined