In [4]:
from math import sqrt
from pyspark import SparkContext, SparkConf
# Application configuration. It is handed to getOrCreate() so the app name is
# used if this call is the one that creates the context (an already-running
# context is returned as-is).
conf = SparkConf().setAppName("Moving_Window_Classifier")
sc = SparkContext.getOrCreate(conf=conf)

# Every record has the form ((x1, ..., xn), t), where t is the class label
# and x1, ..., xn are the predictive attribute values.
# When reading from a file, parse each line into that shape, e.g.
# mydata = sc.textFile(path).map(parse_line)  # parse_line is a placeholder

# Example of mydata
mydata = (((2,1), 0), ((4,5), 1), ((1,3), 0), ((-2,1), 1), ((5,3), 1), ((1,1), 0), ((2,2.5), 1), ((3,5), 1))
# parallelize() distributes the records so Spark can work on them in parallel
data_rdd = sc.parallelize(mydata)

def getDistance(x1, x2):
    """
    Return the Euclidean distance between the points x1 and x2.

    Coordinates are paired by position: for every coordinate of x1 the
    coordinate of x2 at the same index is used.
    """
    squared_total = 0.0
    for axis, coord in enumerate(x1):
        delta = coord - x2[axis]
        squared_total += delta ** 2
    return sqrt(squared_total)


# p = (h, (x1, ..., xn)): h is the radius of the moving window (a record is
# inside the window when its distance to the point of interest is <= h) and
# (x1, ..., xn) is the point of interest; in this case it has 2 attributes.
h = 2
p = (h, (3,1))
# Broadcast the parameters to all nodes
bc = sc.broadcast(p)

# Both the radius and the point of interest are read from the broadcast value,
# so the filter depends only on the broadcast parameters.
# The window feeds three separate actions below (sum, count, collect), so it
# is cached to avoid re-running the filter for each of them.
window = data_rdd.filter(
    lambda a: getDistance(a[0], bc.value[1]) <= bc.value[0]
).cache()
# Summing the labels counts the class-1 records; this assumes labels are 0/1.
# Equivalent for (key, value) records: window.values().sum()
class_1_count = window.map(lambda a: a[1]).sum()
total_count = window.count()

print(f"points in window: {window.collect()}")
print(f"count of class 1 is: {class_1_count}")
print(f"total count is: {total_count}")
# Majority vote: class 1 needs a strict majority, so a tie (including an
# empty window) is predicted as class 0.
if class_1_count > total_count - class_1_count:
    print("predicted class is: 1")
else:
    print("predicted class is: 0")


points in window: [((2, 1), 0), ((1, 1), 0), ((2, 2.5), 1)]
count of class 1 is: 1
total count is: 3
predicted class is: 0
