# Parallelized version

In [2]:
import math
import random
import numpy as np
np.random.seed(33)
global b  
b = 0

## Initializing spark

In [3]:
from pyspark import SparkConf, SparkContext

In [4]:
conf = SparkConf().setAppName("appName").setMaster("local")
sc = SparkContext()

## Functions

### Auxiliar functions

In [5]:
def trainTestSplit(dataset):
    train = dataset.filter(lambda x: x[2]==1)
    train = train.map(lambda x: (x[0], x[1]))
    test = dataset.filter(lambda x: x[2]==0)
    test = test.map(lambda x: (x[0], x[1]))
    return (train, test)

In [6]:
def sigmoid(x):
    return 1 / (1 + math.exp(-x))

In [7]:
def rdd_cost_function(RDD_Xyyhat, lambda_ref, m, w):
    y = RDD_Xyyhat[1]
    y_hat = RDD_Xyyhat[2]
    temp = (-1/m) * (
        y * np.log(y_hat) + (1 - y)
        * np.log(1 - y_hat)
    )
    cost = temp + (lambda_ref/(2*m))*sum([i*i for i in w.value])
    return cost

In [8]:
def str_to_number_list(line):
    numbers = line.replace(" ","").split(",")
    numbers = [float(number) for number in numbers]
    
    # Separate X and y
    numbers = (numbers[:-1], numbers[-1])
    
    return numbers

In [9]:
def sum_list_values(list1, list2):
    return [n1 + n2 for n1, n2 in zip(list1, list2)]

In [10]:
def sum_minus_mean_squared(numbers):
    return [(number-means.value[x])**2 for x,number in enumerate(numbers)]

In [11]:
def rdd_mean_by_column(RDD_Xy, m):
    means = RDD_Xy.map(lambda x: x[0]).reduce(sum_list_values)
    means = [x/m for x in means]
    return means

In [12]:
def rdd_mean_by_column(RDD_Xy, m):
    means = RDD_Xy.map(lambda x: x[0]).reduce(sum_list_values)
    means = [x/m for x in means]
    return means

In [22]:
def rdd_std_by_column(RDD_Xy, m):
    stds = RDD_Xy.map(lambda x: x[0]).map(sum_minus_mean_squared).reduce(sum_list_values)
    stds = [math.sqrt(x/m) for x in stds]
    return stds

In [13]:
def multiply_RDDXy_by_w(Xy):
    tot = 0
    for xi, wi in zip(Xy[0], w.value):
        tot += xi * wi
    return (Xy[0], Xy[1], sigmoid(tot))

In [14]:
def calculate_dw(RDD_Xyyhat):
    dw = []
    for x in RDD_Xyyhat[0]:
        dw.append((RDD_Xyyhat[1]-RDD_Xyyhat[2])*x)
    return dw

### Mandatory functions

In [15]:
def readFile(filename):
    dataset = sc.textFile(filename)
    dataset = dataset.map(str_to_number_list)
    return dataset.sample(False, 0.00001)

In [16]:
def normalize(RDD_Xy):
    m = RDD_Xy.count()
    global means
    means = sc.broadcast(rdd_mean_by_column(RDD_Xy, m))
    stds = sc.broadcast(rdd_std_by_column(RDD_Xy, m))
    norm_rdd = RDD_Xy.map(lambda values: ([(x - mean) / std for x, mean, std in zip(values[0], means.value, stds.value)], values[1]))
    return norm_rdd

In [26]:
def train(RDD_Xy, iterations, learning_rate, lambda_reg):
    global b
    global m
    m = RDD_Xy.count()
    n = len(RDD_Xy.take(1)[0])
    
    global w
    w = sc.broadcast(np.random.rand(n))
    
    
    for it in range(iterations):
        RDD_Xyyhat = RDD_Xy.map(multiply_RDDXy_by_w)
        print(f"Cost for it {it}:", RDD_Xyyhat.map(lambda x: rdd_cost_function(x, lambda_reg, m, w)).reduce(lambda x,y: x+y))
        dw = RDD_Xyyhat.map(calculate_dw).reduce(lambda x, y: [xi+yi for xi, yi in zip(x, y)])        
        dw_final = []
        for dwi, wi in zip(dw, w.value):
            dw_final.append( ((dwi+lambda_reg)/m)+wi )
        w = sc.broadcast([wi - (learning_rate * dwi) for wi, dwi in zip(w.value, dw_final)])
        
        b -= learning_rate * (1/m)*RDD_Xyyhat.map(lambda x: x[1]-x[2]).sum()
    
    return w, b


In [18]:
def accuracy(ws, b, RDD_Xy):
    total = RDD_Xy.count()
    y_and_y_hat = RDD_Xy.map(lambda x: (x[1], predict(ws, x[0], b)))
    correct = y_and_y_hat.map(lambda x: 1 if x[0]==x[1] else 0).reduce(lambda x, y: x+y)
    return correct/total

In [19]:
def predict(w, X, b):
    tot = 0
    for xi, wi in zip(X, w.value):
        tot += xi * wi
    tot += b
    val = sigmoid(tot)
    if (val>=0.5):
        return (1.0)
    return (0.0)

## Testing

In [20]:
RDD_Xy = readFile("../data/botnet_tot_syn_l.csv")
RDD_Xy.count()

1000000

In [23]:
RDD_Xy_normalized = normalize(RDD_Xy)

In [29]:
RDD_Xy_presplit = RDD_Xy_normalized.map(lambda x: (x[0], x[1], 1 if random.random() < 0.75 else 0))
train_data, test_data = trainTestSplit(RDD_Xy_presplit)
print("train_data")
print(train_data.count())
print("test_data")
print(test_data.count())

train_data
749931
test_data
249703


In [30]:
w_final, b = train(train_data, 50, 0.1, 0.1)
b

Cost for it 0: 0.9370956298141058
Cost for it 1: 0.9178398735874175
Cost for it 2: 0.9013849430174388
Cost for it 3: 0.8876796917323783
Cost for it 4: 0.8748163785647725
Cost for it 5: 0.861958252749179
Cost for it 6: 0.8529016289874114
Cost for it 7: 0.8443233733219585
Cost for it 8: 0.8367569879089303
Cost for it 9: 0.8304348274213116
Cost for it 10: 0.8232560202310979
Cost for it 11: 0.8181133083193959
Cost for it 12: 0.8134692580417783
Cost for it 13: 0.8093832968469038
Cost for it 14: 0.8060949268603597
Cost for it 15: 0.8013409475253304
Cost for it 16: 0.798378092484574
Cost for it 17: 0.7958818705228232
Cost for it 18: 0.7931298346478657
Cost for it 19: 0.7909231296283342
Cost for it 20: 0.7878267371960703
Cost for it 21: 0.7878961646612034
Cost for it 22: 0.7847545207257696
Cost for it 23: 0.7837593213436056
Cost for it 24: 0.782959149496358
Cost for it 25: 0.7816045487691992
Cost for it 26: 0.7801771229123448
Cost for it 27: 0.779317113042042
Cost for it 28: 0.7788517530307646

-0.006812335766407407

In [31]:
acc = accuracy(w_final, b, test_data)
acc

0.3038730605698195