# Cross-validation version

In [29]:
import math
import random
import numpy as np
np.random.seed(33)
import time

## Initializing spark

In [30]:
from pyspark import SparkConf, SparkContext

In [31]:
conf = SparkConf().setAppName("appName").setMaster("local[*]")
sc = SparkContext()

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=appName, master=local[8]) created by __init__ at <ipython-input-24-135e8171e276>:6 

## Functions

### Auxiliar functions

In [32]:
def trainTestSplit(dataset):
    train = dataset.filter(lambda x: x[2]==1)
    train = train.map(lambda x: (x[0], x[1]))
    test = dataset.filter(lambda x: x[2]==0)
    test = test.map(lambda x: (x[0], x[1]))
    return (train, test)

In [33]:
def sigmoid(x):
    return 1 / (1 + math.exp(-x))

In [34]:
def rdd_cost_function(RDD_Xyyhat, lambda_ref, m, w):
    y = RDD_Xyyhat[1]
    y_hat = RDD_Xyyhat[2]
    temp = (-1/m) * (
        y * np.log(y_hat) + (1 - y)
        * np.log(1 - y_hat)
    )
    cost = temp + (lambda_ref/(2*m))*sum([i*i for i in w.value])
    return cost

In [35]:
def str_to_number_list(line):
    numbers = line.replace(" ","").split(",")
    numbers = [float(number) for number in numbers]
    
    # Separate X and y
    numbers = (numbers[:-1], numbers[-1])
    
    return numbers

In [36]:
def sum_list_values(list1, list2):
    return [n1 + n2 for n1, n2 in zip(list1, list2)]

In [37]:
def sum_minus_mean_squared(numbers):
    return [(number-means.value[x])**2 for x,number in enumerate(numbers)]

In [38]:
def rdd_mean_by_column(RDD_Xy, m):
    means = RDD_Xy.map(lambda x: x[0]).reduce(sum_list_values)
    means = [x/m for x in means]
    return means

In [39]:
def rdd_std_by_column(RDD_Xy, m):
    stds = RDD_Xy.map(lambda x: x[0]).map(sum_minus_mean_squared).reduce(sum_list_values)
    stds = [math.sqrt(x/m) for x in stds]
    return stds

In [40]:
def multiply_RDDXy_by_w(Xy):
    global b
    tot = 0
    for xi, wi in zip(Xy[0], w.value):
        tot += xi * wi
    tot += b
    return (Xy[0], Xy[1], sigmoid(tot))

In [41]:
def calculate_dw(RDD_Xyyhat):
    dw = []
    for x in RDD_Xyyhat[0]:
        dw.append((RDD_Xyyhat[2]-RDD_Xyyhat[1])*x)
    return dw

### Mandatory functions

In [42]:
def readFile(filename):
    dataset = sc.textFile(filename)
    dataset = dataset.map(str_to_number_list)
    return dataset.sample(False, 1)

In [43]:
def normalize(RDD_Xy):
    m = RDD_Xy.count()
    global means
    means = sc.broadcast(rdd_mean_by_column(RDD_Xy, m))
    stds = sc.broadcast(rdd_std_by_column(RDD_Xy, m))
    norm_rdd = RDD_Xy.map(lambda values: ([(x - mean) / std for x, mean, std in zip(values[0], means.value, stds.value)], values[1]))
    return norm_rdd

In [44]:
def train(RDD_Xy, iterations, learning_rate, lambda_reg):
    global b
    global m
    m = RDD_Xy.count()
    n = len(RDD_Xy.take(1)[0][0])
    global w
    w_temp = w.value.copy()
    for it in range(iterations):
        RDD_Xyyhat = RDD_Xy.map(multiply_RDDXy_by_w)
        print(f"Cost for it {it}:", RDD_Xyyhat.map(lambda x: rdd_cost_function(x, lambda_reg, m, w)).reduce(lambda x,y: x+y))
        dw=[0 for i in range(0,n)]
        for cl in range(n):
                X_cl = RDD_Xyyhat.map(lambda x: (x[2]-x[1])*x[0][cl]).reduce(lambda x, y: x+y)
                dw[cl] = (1/m)*(X_cl)+(lambda_reg/m)*w_temp[cl]
                w_temp[cl] -= learning_rate * dw[cl]
        w = sc.broadcast(w_temp)
        db = (1/m)*RDD_Xyyhat.map(lambda x: x[2]-x[1]).reduce(lambda x, y: x+y)
        b -= learning_rate * db
    return w, b

In [55]:
def checkPrediction(y, y_hat):
    if (y == y_hat and y_hat==0):
        return (0, 1, 0, 0)
    if (y == y_hat and y_hat==1):
        return (1, 0, 0, 0)
    if (y != y_hat and y_hat==1):
        return (0, 0, 1, 0)
    if (y != y_hat and y_hat==0):
        return (0,0,0,1)

In [56]:
def accuracy(ws, b, RDD_Xy):
    total = RDD_Xy.count()
    y_and_y_hat = RDD_Xy.map(lambda x: (x[1], predict(ws, x[0], b)))
    result = y_and_y_hat.map(lambda x: checkPrediction(x[0], x[1]))
    print(result.take(1))
    tp = result.map(lambda x: x[0]).reduce(lambda x,y: x+y)
    tn = result.map(lambda x: x[1]).reduce(lambda x,y: x+y)
    fp = result.map(lambda x: x[2]).reduce(lambda x,y: x+y)
    fn = result.map(lambda x: x[3]).reduce(lambda x,y: x+y)
    acc = (tp+tn)/total
    
    return acc, tp, tn, fp, fn

In [47]:
def predict(w, X, b):
    tot = 0
    for xi, wi in zip(X, w.value):
        tot += xi * wi
    tot += b
    val = sigmoid(tot)
    if (val>=0.5):
        return 1.0
    return 0.0

### Make folds for cross-validation

In [48]:
def shuffleRDD(RDD):
    RDD_with_rand = RDD.map(lambda x: (x[0], x[1], random.random()))
    RDD_sorted = RDD_with_rand.sortBy(lambda x: x[2])
    RDD_cleaned = RDD_sorted.map(lambda x: (x[0], x[1]))
    return RDD_cleaned

In [61]:
def kFoldsCV(k_fold, iterations, rdd_data, learning_rate, lambda_reg):
    
    n = len(rdd_data.take(1)[0][0])
    
    fold_length = rdd_data.count()/k_fold
    train_errors_fold = []
    test_errors_fold = []
    folds_accuracy = []
    
    for i_fold in range(k_fold):
        print(f"---Starting with fold {i_fold}---")
        global w
        w = sc.broadcast(np.random.rand(n))
        
        global b
        b = 0
        
        starting_fold = fold_length * i_fold
        end_fold = starting_fold + fold_length
        
        train_fold = rdd_data.zipWithIndex().filter(lambda x: (x[1] >= starting_fold and x[1]<end_fold)).map(lambda x: x[0])
        test_fold = rdd_data.zipWithIndex().filter(lambda x: (x[1] < starting_fold or x[1] >= end_fold)).map(lambda x: x[0])
        train_size = train_fold.count()
        
        w, b = train(train_fold, iterations, learning_rate, lambda_reg) 
        fold_accuracy, tp, tn, fp, fn = accuracy(w, b, test_fold)
        total = tp+tn+fp+fn
        precision = tp/(tp+fp)
        recall = tp/(tp+fn)
        f1 = (2*precision*recall)/(precision+recall)
        folds_accuracy.append(fold_accuracy)
        print(f"Accuracy for fold {i_fold}: {fold_accuracy}")
        print(f"The tp percentage is: {tp/total}")
        print(f"The tf percentage is: {tn/total}")
        print(f"The fp percentage is: {fp/total}")
        print(f"The fn percentage is: {fn/total}")
        print(f"The precision is: {precision}")
        print(f"The recall is: {recall}")
        print(f"The f1 score is: {f1}")
    
    print(folds_accuracy)
    return sum(folds_accuracy)/len(folds_accuracy)

## Testing

In [50]:
RDD_Xy = readFile("../data/botnet_tot_syn_l.csv")
RDD_Xy.count()

1000000

In [51]:
RDD_Xy_normalized = normalize(RDD_Xy)

In [52]:
RDD_shuffled = shuffleRDD(RDD_Xy_normalized)

## Experiments

In [64]:
elapsed_times = {}

for cores in range(1,13,1):
    sc.stop()
    conf = SparkConf().setAppName("appName").setMaster(f"local[{cores}]")
    sc = SparkContext(conf=conf)
    
    print(f"---------- Starting execution with {cores} cores ----------")
    
    start = time.time()
    # ---------- Execution ----------
    RDD_Xy = readFile("../data/botnet_tot_syn_l.csv")
    RDD_Xy_normalized = normalize(RDD_Xy)
    RDD_shuffled = shuffleRDD(RDD_Xy_normalized)
    avg_accuracy = kFoldsCV(10, 10, RDD_shuffled, 1.5, 0)
    print(f"Average accuracy for cores {cores}: {avg_accuracy}")
    
    # ---------- Execution ----------

    end = time.time()
    print(f"---------- Finished execution with {cores} cores ----------")
    elapsed_time = end - start
    elapsed_times[cores] = elapsed_time
    print(f"Elapsed time for cores {cores} is {elapsed_time} seconds")

---------- Starting execution with 1 cores ----------
---Starting with fold 0---
Cost for it 0: 1.520855916006981
Cost for it 1: 0.7430759003476427
Cost for it 2: 0.44666541444194224
Cost for it 3: 0.34103792824931783
Cost for it 4: 0.29252374359890865
Cost for it 5: 0.26494051769702387
Cost for it 6: 0.24700247960563618
Cost for it 7: 0.23429489398120198
Cost for it 8: 0.22476070912007348
Cost for it 9: 0.21731095177499152
[(1, 0, 0, 0)]
Accuracy for fold 0: 0.9288011111111111
The tp percentage is: 0.47013555555555553
The tf percentage is: 0.45866555555555555
The fp percentage is: 0.041188888888888886
The fn percentage is: 0.03001
The precision is: 0.9194466657395174
The recall is: 0.9399974674039335
The f1 score is: 0.9296085015977846
---Starting with fold 1---
Cost for it 0: 1.3005384357602978
Cost for it 1: 0.6171789803230214
Cost for it 2: 0.40192202933534493
Cost for it 3: 0.31924520449383637
Cost for it 4: 0.278200242525103
Cost for it 5: 0.2541216821868584
Cost for it 6: 0.2383

Cost for it 3: 0.31320202332982416
Cost for it 4: 0.2781424825699476
Cost for it 5: 0.25690524393804337
Cost for it 6: 0.24244938894405535
Cost for it 7: 0.23183234667365085
Cost for it 8: 0.2236244371516899
Cost for it 9: 0.21704525955591156
[(1, 0, 0, 0)]
Accuracy for fold 1: 0.9290444444444445
The tp percentage is: 0.47105555555555556
The tf percentage is: 0.4579888888888889
The fp percentage is: 0.042014444444444445
The fn percentage is: 0.028941111111111112
The precision is: 0.918111672004903
The recall is: 0.9421173918937238
The f1 score is: 0.92995963850136
---Starting with fold 2---
Cost for it 0: 1.1811304094016142
Cost for it 1: 0.6425475576566106
Cost for it 2: 0.4353265150232255
Cost for it 3: 0.3462024114496465
Cost for it 4: 0.299855221367975
Cost for it 5: 0.27184223995682527
Cost for it 6: 0.2530799563961593
Cost for it 7: 0.2396055898068856
Cost for it 8: 0.2294387107120091
Cost for it 9: 0.2214812695170612
[(1, 0, 0, 0)]
Accuracy for fold 2: 0.9261466666666667
The tp 

Cost for it 8: 0.22469092788425887
Cost for it 9: 0.21660670224471823
[(1, 0, 0, 0)]
Accuracy for fold 2: 0.9288377777777778
The tp percentage is: 0.47108555555555554
The tf percentage is: 0.4577522222222222
The fp percentage is: 0.042076666666666665
The fn percentage is: 0.029085555555555555
The precision is: 0.9180051359111046
The recall is: 0.9418487895253624
The f1 score is: 0.9297741228070175
---Starting with fold 3---
Cost for it 0: 1.2568436992103649
Cost for it 1: 0.5528802696594497
Cost for it 2: 0.3649695027780177
Cost for it 3: 0.30234983960565753
Cost for it 4: 0.2712052797422163
Cost for it 5: 0.2521340849541628
Cost for it 6: 0.2390081333932034
Cost for it 7: 0.22929407005696464
Cost for it 8: 0.2217463837162457
Cost for it 9: 0.21567522965727925
[(1, 0, 0, 0)]
Accuracy for fold 3: 0.9283855555555556
The tp percentage is: 0.47159666666666666
The tf percentage is: 0.4567888888888889
The fp percentage is: 0.04325888888888889
The fn percentage is: 0.028355555555555554
The pr

Cost for it 0: 0.8895370010119309
Cost for it 1: 0.48632379680544235
Cost for it 2: 0.35096044346651295
Cost for it 3: 0.2927873690792687
Cost for it 4: 0.2618708251484502
Cost for it 5: 0.24289550551945144
Cost for it 6: 0.23004504968063616
Cost for it 7: 0.22072288300613724
Cost for it 8: 0.21361561771568355
Cost for it 9: 0.2079921357075556
[(0, 1, 0, 0)]
Accuracy for fold 4: 0.9314977777777778
The tp percentage is: 0.47180444444444447
The tf percentage is: 0.45969333333333334
The fp percentage is: 0.0403
The fn percentage is: 0.028202222222222222
The precision is: 0.9213051157099029
The recall is: 0.9435963076047875
The f1 score is: 0.932317488198485
---Starting with fold 5---
Cost for it 0: 1.3058247061208343
Cost for it 1: 0.6756100771291231
Cost for it 2: 0.44654663328331834
Cost for it 3: 0.35166351466711937
Cost for it 4: 0.30305520040959005
Cost for it 5: 0.2739513463306052
Cost for it 6: 0.2546146095727906
Cost for it 7: 0.24081973275816382
Cost for it 8: 0.23046405033101275

Cost for it 6: 0.24799646121710775
Cost for it 7: 0.23608791319540412
Cost for it 8: 0.22695445142592185
Cost for it 9: 0.2196982701318011
[(0, 1, 0, 0)]
Accuracy for fold 5: 0.9291788888888889
The tp percentage is: 0.4695311111111111
The tf percentage is: 0.45964777777777777
The fp percentage is: 0.04048
The fn percentage is: 0.03034111111111111
The precision is: 0.9206291801921527
The recall is: 0.9393022661346788
The f1 score is: 0.9298719874132875
---Starting with fold 6---
Cost for it 0: 1.3802491258756922
Cost for it 1: 0.6989075654849465
Cost for it 2: 0.446723352788423
Cost for it 3: 0.34871220157077293
Cost for it 4: 0.3011100399120128
Cost for it 5: 0.2731521801593051
Cost for it 6: 0.2546071505927808
Cost for it 7: 0.24130854486278502
Cost for it 8: 0.2312554815253937
Cost for it 9: 0.2233631943259173
[(0, 1, 0, 0)]
Accuracy for fold 6: 0.9280211111111111
The tp percentage is: 0.4714333333333333
The tf percentage is: 0.45658777777777776
The fp percentage is: 0.04347777777777

Cost for it 9: 0.21915757662025154
[(1, 0, 0, 0)]
Accuracy for fold 6: 0.9289488888888889
The tp percentage is: 0.47102666666666665
The tf percentage is: 0.45792222222222223
The fp percentage is: 0.041688888888888886
The fn percentage is: 0.029362222222222224
The precision is: 0.9186900252251627
The recall is: 0.9413211946264017
The f1 score is: 0.9298679306948718
---Starting with fold 7---
Cost for it 0: 1.3688444982633912
Cost for it 1: 0.6147722607776762
Cost for it 2: 0.3916294144367045
Cost for it 3: 0.3159627997569563
Cost for it 4: 0.27909475276107587
Cost for it 5: 0.2570033659004452
Cost for it 6: 0.24207926389798648
Cost for it 7: 0.23120783007262335
Cost for it 8: 0.22287510065541488
Cost for it 9: 0.21625135255087058
[(1, 0, 0, 0)]
Accuracy for fold 7: 0.9291222222222222
The tp percentage is: 0.47109555555555555
The tf percentage is: 0.4580266666666667
The fp percentage is: 0.041884444444444446
The fn percentage is: 0.028993333333333333
The precision is: 0.9183507262574673


Cost for it 0: 1.3283171118785309
Cost for it 1: 0.6754464265215567
Cost for it 2: 0.4146489946246748
Cost for it 3: 0.31932840603010004
Cost for it 4: 0.27625676220749773
Cost for it 5: 0.25229973403986716
Cost for it 6: 0.23696829558025292
Cost for it 7: 0.2262076329491887
Cost for it 8: 0.2181646845843158
Cost for it 9: 0.2118784945305078
[(0, 1, 0, 0)]
Accuracy for fold 8: 0.9314844444444444
The tp percentage is: 0.4715877777777778
The tf percentage is: 0.4598966666666667
The fp percentage is: 0.04027888888888889
The fn percentage is: 0.028236666666666667
The precision is: 0.9213098029000608
The recall is: 0.9435068312874298
The f1 score is: 0.9322762107889759
---Starting with fold 9---
Cost for it 0: 1.1645384632295261
Cost for it 1: 0.5267029940811501
Cost for it 2: 0.36170363781045606
Cost for it 3: 0.30164708000107887
Cost for it 4: 0.27026282311346483
Cost for it 5: 0.25069311468024913
Cost for it 6: 0.23717484464603664
Cost for it 7: 0.22719637939635348
Cost for it 8: 0.21948

Cost for it 4: 0.2811772138080416
Cost for it 5: 0.2572073489784322
Cost for it 6: 0.24120297822987202
Cost for it 7: 0.2297356703731643
Cost for it 8: 0.22109398000729438
Cost for it 9: 0.2143312422989752
[(0, 0, 1, 0)]
Accuracy for fold 9: 0.9282477777777778
The tp percentage is: 0.4709833333333333
The tf percentage is: 0.4572644444444444
The fp percentage is: 0.04302444444444444
The fn percentage is: 0.02872777777777778
The precision is: 0.9162961217621004
The recall is: 0.9425112287099213
The f1 score is: 0.9292188169632826
[0.9301144444444445, 0.9309077777777778, 0.9321966666666667, 0.9271811111111111, 0.9296277777777778, 0.9308711111111111, 0.9296455555555555, 0.9292822222222222, 0.9337866666666667, 0.9282477777777778]
Average accuracy for cores 8: 0.930186111111111
---------- Finished execution with 8 cores ----------
Elapsed time for cores 8 is 1730.224130153656 seconds
---------- Starting execution with 9 cores ----------
---Starting with fold 0---
Cost for it 0: 1.28158766837

---------- Starting execution with 10 cores ----------
---Starting with fold 0---
Cost for it 0: 0.9867730191826536
Cost for it 1: 0.49263371099704095
Cost for it 2: 0.36128868668514985
Cost for it 3: 0.30573522438202133
Cost for it 4: 0.27496370957669897
Cost for it 5: 0.25523419359500243
Cost for it 6: 0.24139375780040698
Cost for it 7: 0.23108398041483144
Cost for it 8: 0.22307028861795372
Cost for it 9: 0.2166410543938096
[(1, 0, 0, 0)]
Accuracy for fold 0: 0.9288911111111111
The tp percentage is: 0.4719055555555556
The tf percentage is: 0.45698555555555553
The fp percentage is: 0.04290888888888889
The fn percentage is: 0.0282
The precision is: 0.9166517385983731
The recall is: 0.9436119041535675
The f1 score is: 0.9299364591407313
---Starting with fold 1---
Cost for it 0: 1.6972119944639654
Cost for it 1: 0.7466102346492138
Cost for it 2: 0.42052912563489697
Cost for it 3: 0.3226522032718703
Cost for it 4: 0.2802115489384533
Cost for it 5: 0.2565185172163171
Cost for it 6: 0.24119

Cost for it 3: 0.3368621785083488
Cost for it 4: 0.29269253521295485
Cost for it 5: 0.26728011263784957
Cost for it 6: 0.25050196630570537
Cost for it 7: 0.23841533734501286
Cost for it 8: 0.2291958524234101
Cost for it 9: 0.2218796968668536
[(0, 1, 0, 0)]
Accuracy for fold 1: 0.9287755555555556
The tp percentage is: 0.4716633333333333
The tf percentage is: 0.4571122222222222
The fp percentage is: 0.042654444444444446
The fn percentage is: 0.02857
The precision is: 0.9170659730473594
The recall is: 0.9428866528953155
The f1 score is: 0.929797085958103
---Starting with fold 2---
Cost for it 0: 1.4501190545793756
Cost for it 1: 0.6574278155421952
Cost for it 2: 0.40338572961326724
Cost for it 3: 0.31712107433705006
Cost for it 4: 0.27666436064776145
Cost for it 5: 0.25332939931756177
Cost for it 6: 0.23806889102235124
Cost for it 7: 0.22723752324044988
Cost for it 8: 0.21910049098310463
Cost for it 9: 0.21272950178817762
[(0, 1, 0, 0)]
Accuracy for fold 2: 0.9310266666666667
The tp perce

Cost for it 8: 0.22992520245931425
Cost for it 9: 0.22275877230916266
[(0, 1, 0, 0)]
Accuracy for fold 2: 0.9271966666666667
The tp percentage is: 0.47024
The tf percentage is: 0.4569566666666667
The fp percentage is: 0.04328111111111111
The fn percentage is: 0.02952222222222222
The precision is: 0.9157169779885713
The recall is: 0.9409274632825388
The f1 score is: 0.9281510600852017
---Starting with fold 3---
Cost for it 0: 1.2465787906726122
Cost for it 1: 0.6949921789675017
Cost for it 2: 0.4661214863163274
Cost for it 3: 0.3649673620339682
Cost for it 4: 0.31248188607825267
Cost for it 5: 0.2809816929706154
Cost for it 6: 0.26000291462660724
Cost for it 7: 0.2450014309394593
Cost for it 8: 0.23372207792759495
Cost for it 9: 0.2249210042729386
[(0, 1, 0, 0)]
Accuracy for fold 3: 0.9231222222222222
The tp percentage is: 0.4707388888888889
The tf percentage is: 0.45238333333333336
The fp percentage is: 0.04774
The fn percentage is: 0.029137777777777776
The precision is: 0.907922962683

In [65]:
elapsed_times

{1: 6729.4781584739685,
 2: 3685.620101451874,
 3: 2442.6064653396606,
 4: 2206.490614414215,
 5: 1978.4220292568207,
 6: 1830.3048017024994,
 7: 1878.293996334076,
 8: 1730.224130153656,
 9: 1713.4700286388397,
 10: 1708.12730550766,
 11: 1709.0504715442657,
 12: 1818.922093629837}