# Parallelized version

In [1]:
import math

## Initializing spark

In [2]:
from pyspark import SparkConf, SparkContext

In [3]:
conf = SparkConf().setAppName("appName").setMaster("local")
sc = SparkContext()

22/11/11 12:13:10 WARN Utils: Your hostname, Alexs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.1.131 instead (on interface en0)
22/11/11 12:13:10 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/11 12:13:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Functions

### Auxiliar functions

In [4]:
def sigmoid(x):
    return 1 / (1 + math.exp(-x))

In [5]:
def cost_function(w, y, y_hat):
    # Computes the cost function for all the training samples
    m = x.shape[0]
    return -(1 / m) * np.sum(
        y * np.log(y_hat) + (1 - y)
        * np.log(1 - y_hat)
    )

In [6]:
def str_to_number_list(line):
    numbers = line.replace(" ","").split(",")
    numbers = [float(number) for number in numbers]
    
    # Separate X and y
    numbers = (numbers[:-1], numbers[-1])
    
    return numbers

In [7]:
def sum_list_values(list1, list2):
    return [n1 + n2 for n1, n2 in zip(list1, list2)]

In [8]:
def sum_minus_mean_squared(numbers):
    return [(number-means.value[x])**2 for x,number in enumerate(numbers)]

In [9]:
def rdd_mean_by_column(RDD_Xy, m):
    means = RDD_Xy.map(lambda x: x[0]).reduce(sum_list_values)
    means = [x/m for x in means]
    return means

In [10]:
def rdd_std_by_column(RDD_Xy, m):
    stds = RDD_Xy.map(lambda x: x[0]).map(sum_minus_mean_squared).reduce(sum_list_values)
    stds = [math.sqrt(x/m) for x in stds]
    return stds

### Mandatory functions

In [11]:
def readFile(filename):
    dataset = sc.textFile(filename)
    dataset = dataset.map(str_to_number_list)
    return dataset.sample(False, 0.00001)

In [12]:
def normalize(RDD_Xy):
    m = RDD_Xy.count()
    global means
    means = sc.broadcast(rdd_mean_by_column(RDD_Xy, m))
    stds = sc.broadcast(rdd_std_by_column(RDD_Xy, m))
    norm_rdd = RDD_Xy.map(lambda values: ([(x - mean) / std for x, mean, std in zip(values[0], means.value, stds.value)], values[1]))
    return norm_rdd

In [13]:
def train(RDD_Xy, iterations, learning_rate, lambda_reg):
    
    m = RDD_Xy.shape[0]
    n = RDD_Xy.shape[1]
    
    X = np.c_[np.ones(m), RDD_Xy[:,0:-1]]
    y = RDD_Xy[:,-1]
    
    w = np.random.rand(n)
    
    for it in range(iterations):
        y_hat = np.array([sigmoid(x) for x in np.matmul(X, w)])
        
        print(f"Cost for it {it}:", cost_function(w, y, y_hat))
        
        dw = ((X.transpose() * (y_hat - y)).sum(axis=1) + lambda_reg * w)/m
        w -= learning_rate * dw
              
    return w

In [14]:
def accuracy(w, RDD_Xy):
    m = RDD_Xy.shape[0]
    
    X = np.c_[np.ones(m), RDD_Xy[:,0:-1]]
    y = RDD_Xy[:,-1]
    
    y_hat = predict(w, X)
    
    return np.sum(y_hat == y)/len(y)

In [15]:
def predict(w, X):
    y_hat = np.array([np.around(sigmoid(x),0) for x in np.matmul(X, w)])
    
    return y_hat

## Testing

In [16]:
RDD_Xy = readFile("../data/botnet_tot_syn_l.csv")

In [17]:
RDD_Xy = normalize(RDD_Xy)

                                                                                

In [20]:
w = train(data, 15, 0.1, 0.1)

AttributeError: 'PipelinedRDD' object has no attribute 'shape'

In [None]:
acc = accuracy(w, data)
acc