# Cross-validation version

In [10]:
import math
import random
import numpy as np
np.random.seed(33)

## Initializing spark

In [11]:
from pyspark import SparkConf, SparkContext

In [30]:
conf = SparkConf().setAppName("appName").setMaster("local")
sc = SparkContext()

## Functions

### Auxiliar functions

In [12]:
def trainTestSplit(dataset):
    train = dataset.filter(lambda x: x[2]==1)
    train = train.map(lambda x: (x[0], x[1]))
    test = dataset.filter(lambda x: x[2]==0)
    test = test.map(lambda x: (x[0], x[1]))
    return (train, test)

In [13]:
def sigmoid(x):
    return 1 / (1 + math.exp(-x))

In [14]:
def rdd_cost_function(RDD_Xyyhat, lambda_ref, m, w):
    y = RDD_Xyyhat[1]
    y_hat = RDD_Xyyhat[2]
    temp = (-1/m) * (
        y * np.log(y_hat) + (1 - y)
        * np.log(1 - y_hat)
    )
    cost = temp + (lambda_ref/(2*m))*sum([i*i for i in w.value])
    return cost

In [15]:
def str_to_number_list(line):
    numbers = line.replace(" ","").split(",")
    numbers = [float(number) for number in numbers]
    
    # Separate X and y
    numbers = (numbers[:-1], numbers[-1])
    
    return numbers

In [16]:
def sum_list_values(list1, list2):
    return [n1 + n2 for n1, n2 in zip(list1, list2)]

In [17]:
def sum_minus_mean_squared(numbers):
    return [(number-means.value[x])**2 for x,number in enumerate(numbers)]

In [18]:
def rdd_mean_by_column(RDD_Xy, m):
    means = RDD_Xy.map(lambda x: x[0]).reduce(sum_list_values)
    means = [x/m for x in means]
    return means

In [19]:
def rdd_std_by_column(RDD_Xy, m):
    stds = RDD_Xy.map(lambda x: x[0]).map(sum_minus_mean_squared).reduce(sum_list_values)
    stds = [math.sqrt(x/m) for x in stds]
    return stds

In [20]:
def multiply_RDDXy_by_w(Xy):
    global b
    tot = 0
    for xi, wi in zip(Xy[0], w.value):
        tot += xi * wi
    tot += b
    return (Xy[0], Xy[1], sigmoid(tot))

In [21]:
def calculate_dw(RDD_Xyyhat):
    dw = []
    for x in RDD_Xyyhat[0]:
        dw.append((RDD_Xyyhat[2]-RDD_Xyyhat[1])*x)
    return dw

### Mandatory functions

In [53]:
def readFile(filename):
    dataset = sc.textFile(filename)
    dataset = dataset.map(str_to_number_list)
    return dataset.sample(False, 0.001)

In [23]:
def normalize(RDD_Xy):
    m = RDD_Xy.count()
    global means
    means = sc.broadcast(rdd_mean_by_column(RDD_Xy, m))
    stds = sc.broadcast(rdd_std_by_column(RDD_Xy, m))
    norm_rdd = RDD_Xy.map(lambda values: ([(x - mean) / std for x, mean, std in zip(values[0], means.value, stds.value)], values[1]))
    return norm_rdd

In [44]:
def train(RDD_Xy, iterations, learning_rate, lambda_reg):
    global b
    global m
    m = RDD_Xy.count()
    n = len(RDD_Xy.take(1)[0][0])
    global w
    w_temp = w.value.copy()
    for it in range(iterations):
        RDD_Xyyhat = RDD_Xy.map(multiply_RDDXy_by_w)
        print(f"Cost for it {it}:", RDD_Xyyhat.map(lambda x: rdd_cost_function(x, lambda_reg, m, w)).reduce(lambda x,y: x+y))
        dw=[0 for i in range(0,n)]
        for cl in range(n):
                X_cl = RDD_Xyyhat.map(lambda x: (x[2]-x[1])*x[0][cl]).reduce(lambda x, y: x+y)
                dw[cl] = (1/m)*(X_cl)+(lambda_reg/m)*w_temp[cl]
                w_temp[cl] -= learning_rate * dw[cl]
        w = sc.broadcast(w_temp)
        db = (1/m)*RDD_Xyyhat.map(lambda x: x[2]-x[1]).reduce(lambda x, y: x+y)
        b -= learning_rate * db
    return w, b

In [25]:
def accuracy(ws, b, RDD_Xy):
    total = RDD_Xy.count()
    y_and_y_hat = RDD_Xy.map(lambda x: (x[1], predict(ws, x[0], b)))
    correct = y_and_y_hat.map(lambda x: 1 if x[0]==x[1] else 0).reduce(lambda x, y: x+y)
    return correct/total

In [26]:
def predict(w, X, b):
    tot = 0
    for xi, wi in zip(X, w.value):
        tot += xi * wi
    tot += b
    val = sigmoid(tot)
    if (val>=0.5):
        return 1.0
    return 0.0

### Make folds for cross-validation

In [27]:
def shuffleRDD(RDD):
    RDD_with_rand = RDD.map(lambda x: (x[0], x[1], random.random()))
    RDD_sorted = RDD_with_rand.sortBy(lambda x: x[2])
    RDD_cleaned = RDD_sorted.map(lambda x: (x[0], x[1]))
    return RDD_cleaned

In [51]:
def kFoldsCV(k_fold, iterations, rdd_data, learning_rate, lambda_reg):
    
    n = len(rdd_data.take(1)[0][0])
    
    fold_length = rdd_data.count()/k_fold
    train_errors_fold = []
    test_errors_fold = []
    folds_accuracy = []
    
    for i_fold in range(k_fold):
        global w
        w = sc.broadcast(np.random.rand(n))
        
        global b
        b = 0
        
        starting_fold = fold_length * i_fold
        end_fold = starting_fold + fold_length
        
        train_fold = rdd_data.zipWithIndex().filter(lambda x: (x[1] >= starting_fold and x[1]<end_fold)).map(lambda x: x[0])
        test_fold = rdd_data.zipWithIndex().filter(lambda x: (x[1] < starting_fold or x[1] >= end_fold)).map(lambda x: x[0])
        train_size = train_fold.count()
        
        w, b = train(train_fold, iterations, learning_rate, lambda_reg) 
        fold_accuracy = accuracy(w, b, test_fold)
        folds_accuracy.append(fold_accuracy)
        print(f"Accuracy for fold {i_fold}: {fold_accuracy}")
    
    print(folds_accuracy)
    return sum(folds_accuracy)/len(folds_accuracy)

## Testing

In [54]:
RDD_Xy = readFile("../data/botnet_tot_syn_l.csv")
RDD_Xy.count()

1001

In [55]:
RDD_Xy_normalized = normalize(RDD_Xy)

In [56]:
RDD_shuffled = shuffleRDD(RDD_Xy_normalized)

In [57]:
avg_accuracy = kFoldsCV(10, 10, RDD_shuffled, 1.5, 0)
avg_accuracy

Cost for it 0: 1.3449537602258286
Cost for it 1: 0.5672418181659159
Cost for it 2: 0.3330206004818843
Cost for it 3: 0.2491964354246278
Cost for it 4: 0.20803593721269384
Cost for it 5: 0.183432449283634
Cost for it 6: 0.16685484440094192
Cost for it 7: 0.15477541799900202
Cost for it 8: 0.14548417861225468
Cost for it 9: 0.1380535307141174
Accuacy 87.55555555555556%
Accuracy for fold 0: 0.8755555555555555
Cost for it 0: 1.1942010161029064
Cost for it 1: 0.6135288625550839
Cost for it 2: 0.4236821669195999
Cost for it 3: 0.34185116800708976
Cost for it 4: 0.2952588324487856
Cost for it 5: 0.2646390248214121
Cost for it 6: 0.2427576900610568
Cost for it 7: 0.22621621504131179
Cost for it 8: 0.21320446763716383
Cost for it 9: 0.20266760057079575
Accuacy 91.00998890122086%
Accuracy for fold 1: 0.9100998890122086
Cost for it 0: 1.00419105148884
Cost for it 1: 0.42979441873600494
Cost for it 2: 0.3035672865430616
Cost for it 3: 0.2530230229816308
Cost for it 4: 0.22574397320193845
Cost for 

0.9146365766432358

## Experiments

In [None]:
elapsed_times = {}
start = time.time()

for cores in range(1,9,1):
    sc.stop()
    conf = SparkConf().set("spark.executor.cores", cores).setAppName("appName").setMaster("local")
    sc = SparkContext(conf=conf)
    
    print(f"---------- Starting execution with {cores} cores ----------")
    
    # ---------- Execution ----------
    RDD_Xy = readFile("../data/botnet_tot_syn_l.csv")
    RDD_Xy_normalized = normalize(RDD_Xy)
    RDD_shuffled = shuffleRDD(RDD_Xy_normalized)
    avg_accuracy = kFoldsCV(10, 3, RDD_shuffled, 1.5, 0)
    print("Average accuracy for cores {cores}: {avg_accuracy}")
    
    # ---------- Execution ----------

    end = time.time()
    print(f"---------- Finished execution with {cores} cores ----------")
    elapsed_time = end - start
    elapsed_times[cores] = elapsed_time
    print(f"Elapsed time for cores {cores} is {elapsed_time} seconds")