# Cross-validation version

In [1]:
import math
import random
import numpy as np
np.random.seed(33)
import time

## Initializing spark

In [2]:
from pyspark import SparkConf, SparkContext

In [3]:
conf = SparkConf().setAppName("appName").setMaster("local[*]")
sc = SparkContext()

## Functions

### Auxiliar functions

In [4]:
def trainTestSplit(dataset):
    train = dataset.filter(lambda x: x[2]==1)
    train = train.map(lambda x: (x[0], x[1]))
    test = dataset.filter(lambda x: x[2]==0)
    test = test.map(lambda x: (x[0], x[1]))
    return (train, test)

In [5]:
def sigmoid(x):
    return 1 / (1 + math.exp(-x))

In [6]:
def rdd_cost_function(RDD_Xyyhat, lambda_ref, m, w):
    y = RDD_Xyyhat[1]
    y_hat = RDD_Xyyhat[2]
    temp = (-1/m) * (
        y * np.log(y_hat) + (1 - y)
        * np.log(1 - y_hat)
    )
    cost = temp + (lambda_ref/(2*m))*sum([i*i for i in w.value])
    return cost

In [7]:
def str_to_number_list(line):
    numbers = line.replace(" ","").split(",")
    numbers = [float(number) for number in numbers]
    
    # Separate X and y
    numbers = (numbers[:-1], numbers[-1])
    
    return numbers

In [8]:
def sum_list_values(list1, list2):
    return [n1 + n2 for n1, n2 in zip(list1, list2)]

In [9]:
def sum_minus_mean_squared(numbers):
    return [(number-means.value[x])**2 for x,number in enumerate(numbers)]

In [10]:
def rdd_mean_by_column(RDD_Xy, m):
    means = RDD_Xy.map(lambda x: x[0]).reduce(sum_list_values)
    means = [x/m for x in means]
    return means

In [11]:
def rdd_std_by_column(RDD_Xy, m):
    stds = RDD_Xy.map(lambda x: x[0]).map(sum_minus_mean_squared).reduce(sum_list_values)
    stds = [math.sqrt(x/m) for x in stds]
    return stds

In [12]:
def multiply_RDDXy_by_w(Xy):
    global b
    tot = 0
    for xi, wi in zip(Xy[0], w.value):
        tot += xi * wi
    tot += b
    return (Xy[0], Xy[1], sigmoid(tot))

In [13]:
def calculate_dw(RDD_Xyyhat):
    dw = []
    for x in RDD_Xyyhat[0]:
        dw.append((RDD_Xyyhat[2]-RDD_Xyyhat[1])*x)
    return dw

### Mandatory functions

In [14]:
def readFile(filename):
    dataset = sc.textFile(filename)
    dataset = dataset.map(str_to_number_list)
    return dataset.sample(False, 1)

In [15]:
def normalize(RDD_Xy):
    m = RDD_Xy.count()
    global means
    means = sc.broadcast(rdd_mean_by_column(RDD_Xy, m))
    stds = sc.broadcast(rdd_std_by_column(RDD_Xy, m))
    norm_rdd = RDD_Xy.map(lambda values: ([(x - mean) / std for x, mean, std in zip(values[0], means.value, stds.value)], values[1]))
    return norm_rdd

In [16]:
def train(RDD_Xy, iterations, learning_rate, lambda_reg):
    global b
    global m
    m = RDD_Xy.count()
    n = len(RDD_Xy.take(1)[0][0])
    global w
    w_temp = w.value.copy()
    for it in range(iterations):
        RDD_Xyyhat = RDD_Xy.map(multiply_RDDXy_by_w)
        print(f"Cost for it {it}:", RDD_Xyyhat.map(lambda x: rdd_cost_function(x, lambda_reg, m, w)).reduce(lambda x,y: x+y))
        dw=[0 for i in range(0,n)]
        for cl in range(n):
                X_cl = RDD_Xyyhat.map(lambda x: (x[2]-x[1])*x[0][cl]).reduce(lambda x, y: x+y)
                dw[cl] = (1/m)*(X_cl)+(lambda_reg/m)*w_temp[cl]
                w_temp[cl] -= learning_rate * dw[cl]
        w = sc.broadcast(w_temp)
        db = (1/m)*RDD_Xyyhat.map(lambda x: x[2]-x[1]).reduce(lambda x, y: x+y)
        b -= learning_rate * db
    return w, b

In [17]:
def checkPrediction(y, y_hat):
    if (y == y_hat and y_hat==0):
        return (0, 1, 0, 0)
    if (y == y_hat and y_hat==1):
        return (1, 0, 0, 0)
    if (y != y_hat and y_hat==1):
        return (0, 0, 1, 0)
    if (y != y_hat and y_hat==0):
        return (0,0,0,1)

In [18]:
def metrics(ws, b, RDD_Xy):
    total = RDD_Xy.count()
    y_and_y_hat = RDD_Xy.map(lambda x: (x[1], predict(ws, x[0], b)))
    result = y_and_y_hat.map(lambda x: checkPrediction(x[0], x[1]))
    print(result.take(1))
    tp = result.map(lambda x: x[0]).reduce(lambda x,y: x+y)
    tn = result.map(lambda x: x[1]).reduce(lambda x,y: x+y)
    fp = result.map(lambda x: x[2]).reduce(lambda x,y: x+y)
    fn = result.map(lambda x: x[3]).reduce(lambda x,y: x+y)
    acc = (tp+tn)/total
    
    tpr = tp/(tp+fn)
    fpr = fp/(fp+tn)
    
    return acc, tp, tn, fp, fn, tpr, fpr

In [19]:
def accuracy(ws, b, RDD_Xy):
    total = RDD_Xy.count()
    y_and_y_hat = RDD_Xy.map(lambda x: (x[1], predict(ws, x[0], b)))
    result = y_and_y_hat.map(lambda x: checkPrediction(x[0], x[1]))
    print(result.take(1))
    tp = result.map(lambda x: x[0]).reduce(lambda x,y: x+y)
    tn = result.map(lambda x: x[1]).reduce(lambda x,y: x+y)
    fp = result.map(lambda x: x[2]).reduce(lambda x,y: x+y)
    fn = result.map(lambda x: x[3]).reduce(lambda x,y: x+y)
    acc = (tp+tn)/total
    
    return acc

In [20]:
def predict(w, X, b):
    tot = 0
    for xi, wi in zip(X, w.value):
        tot += xi * wi
    tot += b
    val = sigmoid(tot)
    if (val>=0.5):
        return 1.0
    return 0.0

In [21]:
def predict_with_teshold(w, X, b, tresh):
    tot = 0
    for xi, wi in zip(X, w.value):
        tot += xi * wi
    tot += b
    val = sigmoid(tot)
    if (val>=tresh):
        return 1.0
    return 0.0

### Make folds for cross-validation

In [22]:
def shuffleRDD(RDD):
    RDD_with_rand = RDD.map(lambda x: (x[0], x[1], random.random()))
    RDD_sorted = RDD_with_rand.sortBy(lambda x: x[2])
    RDD_cleaned = RDD_sorted.map(lambda x: (x[0], x[1]))
    return RDD_cleaned

In [38]:
def kFoldsCV(k_fold, iterations, rdd_data, learning_rate, lambda_reg):
    
    n = len(rdd_data.take(1)[0][0])
    
    fold_length = rdd_data.count()/k_fold
    train_errors_fold = []
    test_errors_fold = []
    folds_accuracy = []
    
    for i_fold in range(k_fold):
        print(f"---Starting with fold {i_fold}---")
        global w
        w = sc.broadcast(np.random.rand(n))
        
        global b
        b = 0
        
        starting_fold = fold_length * i_fold
        end_fold = starting_fold + fold_length
        
        train_fold = rdd_data.zipWithIndex().filter(lambda x: (x[1] >= starting_fold and x[1]<end_fold)).map(lambda x: x[0])
        test_fold = rdd_data.zipWithIndex().filter(lambda x: (x[1] < starting_fold or x[1] >= end_fold)).map(lambda x: x[0])
        train_size = train_fold.count()
        
        w, b = train(train_fold, iterations, learning_rate, lambda_reg) 
        fold_accuracy = accuracy(w, b, test_fold)
        folds_accuracy.append(fold_accuracy)
        print(f"Accuracy for fold {i_fold}: {fold_accuracy}")
        
    print(folds_accuracy)
    return sum(folds_accuracy)/len(folds_accuracy)

## Testing

In [34]:
RDD_Xy = readFile("../data/botnet_tot_syn_l.csv")
RDD_Xy.count()

1000000

In [35]:
RDD_Xy_normalized = normalize(RDD_Xy)

In [36]:
RDD_shuffled = shuffleRDD(RDD_Xy_normalized)

## Experiments

In [39]:
elapsed_times = {}

for cores in range(1,13,1):
    sc.stop()
    conf = SparkConf().setAppName("appName").setMaster(f"local[{cores}]")
    sc = SparkContext(conf=conf)
    
    print(f"---------- Starting execution with {cores} cores ----------")
    
    start = time.time()
    # ---------- Execution ----------
    RDD_Xy = readFile("../data/botnet_tot_syn_l.csv")
    RDD_Xy_normalized = normalize(RDD_Xy)
    RDD_shuffled = shuffleRDD(RDD_Xy_normalized)
    avg_accuracy = kFoldsCV(10, 10, RDD_shuffled, 1.5, 0)
    print(f"Average accuracy for cores {cores}: {avg_accuracy}")
    
    # ---------- Execution ----------

    end = time.time()
    print(f"---------- Finished execution with {cores} cores ----------")
    elapsed_time = end - start
    elapsed_times[cores] = elapsed_time
    print(f"Elapsed time for cores {cores} is {elapsed_time} seconds")

---------- Starting execution with 1 cores ----------
---Starting with fold 0---
Cost for it 0: 1.2483736465050832
Cost for it 1: 0.624001479284665
Cost for it 2: 0.41292193800569604
Cost for it 3: 0.3273497801344928
Cost for it 4: 0.28420081868371955
Cost for it 5: 0.25881803274774323
Cost for it 6: 0.24222077124343622
Cost for it 7: 0.23052513475714045
Cost for it 8: 0.22181894806193914
Cost for it 9: 0.21506375412819753
[(0, 1, 0, 0)]
Accuracy for fold 0: 0.9285555555555556
---Starting with fold 1---
Cost for it 0: 1.159546325475648
Cost for it 1: 0.6481483084992861
Cost for it 2: 0.43484911858112407
Cost for it 3: 0.343488767181838
Cost for it 4: 0.29714658371011554
Cost for it 5: 0.26959891098441496
Cost for it 6: 0.25130185015988116
Cost for it 7: 0.2382126611477019
Cost for it 8: 0.2283552157770755
Cost for it 9: 0.22064926951269753
[(1, 0, 0, 0)]
Accuracy for fold 1: 0.9276888888888889
---Starting with fold 2---
Cost for it 0: 1.1569595071411272
Cost for it 1: 0.604897256889035

Cost for it 2: 0.3925923313000537
Cost for it 3: 0.32481665614135075
Cost for it 4: 0.2883868528194663
Cost for it 5: 0.26571132368624784
Cost for it 6: 0.2501286992747068
Cost for it 7: 0.23867738492826393
Cost for it 8: 0.22985468578500262
Cost for it 9: 0.22281730468045913
[(1, 0, 0, 0)]
Accuracy for fold 8: 0.9285944444444444
---Starting with fold 9---
Cost for it 0: 1.5437026994563912
Cost for it 1: 0.7537078742142684
Cost for it 2: 0.4591335312993782
Cost for it 3: 0.3450700856973611
Cost for it 4: 0.2914187397254151
Cost for it 5: 0.26172111997167236
Cost for it 6: 0.24314929265224827
Cost for it 7: 0.23046516601389555
Cost for it 8: 0.22122891172793036
Cost for it 9: 0.21417523051211246
[(1, 0, 0, 0)]
Accuracy for fold 9: 0.9272233333333333
[0.9251133333333333, 0.9273022222222222, 0.9274577777777778, 0.9280922222222222, 0.9285222222222222, 0.92723, 0.93199, 0.9266877777777778, 0.9285944444444444, 0.9272233333333333]
Average accuracy for cores 2: 0.9278213333333334
---------- Fi

Cost for it 8: 0.22129966588459246
Cost for it 9: 0.2147408788194774
[(0, 1, 0, 0)]
Accuracy for fold 5: 0.9281133333333333
---Starting with fold 6---
Cost for it 0: 1.550085272331485
Cost for it 1: 0.7954445003158895
Cost for it 2: 0.49746778488179166
Cost for it 3: 0.3727864792155655
Cost for it 4: 0.3119127690286684
Cost for it 5: 0.27749535817387116
Cost for it 6: 0.2556722132795602
Cost for it 7: 0.2406524724101429
Cost for it 8: 0.22968619813413513
Cost for it 9: 0.2213196702142903
[(0, 1, 0, 0)]
Accuracy for fold 6: 0.9274933333333333
---Starting with fold 7---
Cost for it 0: 1.4446258799639267
Cost for it 1: 0.6412964418480864
Cost for it 2: 0.39969287677716625
Cost for it 3: 0.32148330959568555
Cost for it 4: 0.28351704719986254
Cost for it 5: 0.2605925250222873
Cost for it 6: 0.24502000749501326
Cost for it 7: 0.23365180197304072
Cost for it 8: 0.22494068561484823
Cost for it 9: 0.21802808343140093
[(0, 1, 0, 0)]
Accuracy for fold 7: 0.9299744444444444
---Starting with fold 8

Cost for it 0: 1.266596536440698
Cost for it 1: 0.6085338808088991
Cost for it 2: 0.3983849708940725
Cost for it 3: 0.317664216557617
Cost for it 4: 0.2776108291157477
Cost for it 5: 0.2540438698489165
Cost for it 6: 0.23852994622718032
Cost for it 7: 0.22750577638022926
Cost for it 8: 0.21923462056143536
Cost for it 9: 0.21277487967622513
[(1, 0, 0, 0)]
Accuracy for fold 3: 0.9304655555555555
---Starting with fold 4---
Cost for it 0: 0.7979620870263037
Cost for it 1: 0.44970668688362564
Cost for it 2: 0.33465727743373963
Cost for it 3: 0.28507973166256195
Cost for it 4: 0.2584041811323411
Cost for it 5: 0.2417071250114975
Cost for it 6: 0.230163209931357
Cost for it 7: 0.2216263224564811
Cost for it 8: 0.21500624944312408
Cost for it 9: 0.20969093398583427
[(1, 0, 0, 0)]
Accuracy for fold 4: 0.9324466666666666
---Starting with fold 5---
Cost for it 0: 1.403243807658162
Cost for it 1: 0.693262726878156
Cost for it 2: 0.4430034292312002
Cost for it 3: 0.3460465357635528
Cost for it 4: 0

Cost for it 5: 0.2773214897168879
Cost for it 6: 0.25837079360522686
Cost for it 7: 0.24459799176101846
Cost for it 8: 0.2340967646675661
Cost for it 9: 0.22580338941000194
[(1, 0, 0, 0)]
Accuracy for fold 0: 0.9265811111111111
---Starting with fold 1---
Cost for it 0: 0.9032872674101556
Cost for it 1: 0.5256222564712237
Cost for it 2: 0.37630990035697043
Cost for it 3: 0.31022298328395703
Cost for it 4: 0.27525375710579914
Cost for it 5: 0.25381400843474344
Cost for it 6: 0.23924246247592273
Cost for it 7: 0.22861829745541132
Cost for it 8: 0.22048089906303645
Cost for it 9: 0.21401987783913676
[(1, 0, 0, 0)]
Accuracy for fold 1: 0.9290311111111111
---Starting with fold 2---
Cost for it 0: 0.9879294186741622
Cost for it 1: 0.49229096031194364
Cost for it 2: 0.3603584478116717
Cost for it 3: 0.30467475798991267
Cost for it 4: 0.2738859784593772
Cost for it 5: 0.254168638006511
Cost for it 6: 0.24034793141528418
Cost for it 7: 0.23005864846718027
Cost for it 8: 0.2220639982196691
Cost f

Cost for it 9: 0.2205283025294722
[(0, 1, 0, 0)]
Accuracy for fold 8: 0.9277322222222222
---Starting with fold 9---
Cost for it 0: 1.5187935108066226
Cost for it 1: 0.7233657041489602
Cost for it 2: 0.4261230242751235
Cost for it 3: 0.324265972787432
Cost for it 4: 0.27911526502567324
Cost for it 5: 0.25420839581814
Cost for it 6: 0.23836277458406963
Cost for it 7: 0.2272952307772303
Cost for it 8: 0.21905761366211485
Cost for it 9: 0.21264325495170638
[(0, 1, 0, 0)]
Accuracy for fold 9: 0.9301611111111111
[0.9315655555555555, 0.9288444444444445, 0.9287666666666666, 0.9288466666666667, 0.9315522222222222, 0.9297877777777778, 0.9241711111111112, 0.9295255555555556, 0.9277322222222222, 0.9301611111111111]
Average accuracy for cores 9: 0.9290953333333333
---------- Finished execution with 9 cores ----------
Elapsed time for cores 9 is 1721.8863859176636 seconds
---------- Starting execution with 10 cores ----------
---Starting with fold 0---
Cost for it 0: 1.1337662688061596
Cost for it 1

Cost for it 1: 0.8004604396204578
Cost for it 2: 0.5046002863502551
Cost for it 3: 0.37659280279146057
Cost for it 4: 0.3138930101279551
Cost for it 5: 0.27915543770228635
Cost for it 6: 0.2575441622257593
Cost for it 7: 0.24280896699713134
Cost for it 8: 0.23206562153853683
Cost for it 9: 0.22384090210042784
[(1, 0, 0, 0)]
Accuracy for fold 6: 0.9264811111111111
---Starting with fold 7---
Cost for it 0: 1.29832185166662
Cost for it 1: 0.5873045264063365
Cost for it 2: 0.40070384150559873
Cost for it 3: 0.33163529991825846
Cost for it 4: 0.2943680332973006
Cost for it 5: 0.2705255013705193
Cost for it 6: 0.2537917360573106
Cost for it 7: 0.24133787696557946
Cost for it 8: 0.23168124571611926
Cost for it 9: 0.22396091061840412
[(1, 0, 0, 0)]
Accuracy for fold 7: 0.9266022222222222
---Starting with fold 8---
Cost for it 0: 1.1039871424331544
Cost for it 1: 0.5144575330987311
Cost for it 2: 0.3545323302493976
Cost for it 3: 0.2957998861703379
Cost for it 4: 0.2660135608562655
Cost for it 

In [40]:
elapsed_times

{1: 5801.66516661644,
 2: 3351.4654548168182,
 3: 2529.703380346298,
 4: 2075.378036737442,
 5: 1851.9260251522064,
 6: 1833.1730937957764,
 7: 1807.4542572498322,
 8: 1786.9305987358093,
 9: 1721.8863859176636,
 10: 1733.3411436080933,
 11: 1765.4818713665009,
 12: 1791.3741402626038}