In [3]:
import numpy as np
from math import sqrt
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("KNN")
sc = SparkContext.getOrCreate()

# getting the data in a (y, (x1,...xn)) format where y is the class label 
# and x1,...,xn are the predictive attribute values
# mydata = sc.textFile().split().map(lambda x: (x[0], (x[1:])))

# example of mydata
mydata = ((0,(2,1)), (1,(4,5)), (0,(1,3)), (0,(-2,1)), (1,(5,3)), (0,(1,1)), (1, (2,2.5)), (1, (3,5)))
# we use parallelize to partition the tuple so spark can work on it in parallel
data_rdd = sc.parallelize(mydata)

def getDistance(x1, x2):
  """
  calculates the distance between two points
  """
  n = len(x1)
  distance = 0.0
  for i in range(n):
    distance += (x1[i] - x2[i])**2
  return sqrt(distance)


# (k, (x1,...,xn)) where k is the number of nearest neighbors
# and (x1,...,xn) is the point of interest, in this case it have 2 attributes
k = 3
parameters = (3, (2,3))
# Broadcast the parameters to all nodes 
bc = sc.broadcast(parameters)

# map the data to (class, (distance, 1)) => sort it from smallest to largest
# => and take the ones with the k shortest "smallest" distances
kNeighbors = data_rdd.map(lambda x: (x[0], ((getDistance(x[1], bc.value[1]), 1)))) \
                     .sortBy(lambda k: k[1][0], ascending=True) \
                     .take(bc.value[0])
kNeighbors = sc.parallelize(kNeighbors)
print(kNeighbors.collect())

# map to (class, 1) => sum the counts => sort from largest to smallest this time
# => and take the largset one
pred = kNeighbors.map(lambda x: (x[0], x[1][1])) \
                 .reduceByKey(lambda a,b: a+b) \
                 .sortBy(lambda x:x[1], ascending=False) \
                 .take(1)

print(f"predicted class is: {pred[0][0]}")

[(1, (0.5, 1)), (0, (1.0, 1)), (0, (2.0, 1))]
predicted class is: 0


In [40]:
# number of folds
K = 3
fold_size = int(len(mydata) / K)

CVdata = tuple((mydata[k:k + fold_size],k) for k in range(0, len(mydata), fold_size))
CVdata = sc.parallelize(CVdata)

error = sc.emptyRDD()
p = (2,3)
for i in range(0, len(mydata), fold_size):
    train = CVdata.filter(lambda x: x[1] != i).flatMap(lambda x: x[0])
    test  = CVdata.filter(lambda x: x[1] == i).flatMap(lambda x: x[0])
    testError = test.foreach(lambda x: (x[1][0] - LR(p,i))**2, 1)
    testError = sc.parallelize(testError).reduce(lambda a,b: (a[0]+b[0], a[1]+b[1]))
    error[i] = x[0]/x[1]

MSE = mean(error)

print(f"Average Test Error: {MSE}")

((0, (2, 1)), (1, (4, 5)), (0, (1, 3)), (0, (-2, 1)), (1, (5, 3)), (0, (1, 1)), (1, (2, 2.5)), (1, (3, 5)))
[(((0, (2, 1)), (1, (4, 5))), 0), (((0, (1, 3)), (0, (-2, 1))), 2), (((1, (5, 3)), (0, (1, 1))), 4), (((1, (2, 2.5)), (1, (3, 5))), 6)]
[(0, (2, 1)), (1, (4, 5)), (0, (1, 3)), (0, (-2, 1)), (1, (5, 3)), (0, (1, 1))]
[(1, (2, 2.5)), (1, (3, 5))]
