# In [None]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from numpy import array
from math import sqrt

sc = SparkContext("local", "KMeans")
sqlContext = SQLContext(sc)

# Load the data from the csv file; the first row is treated as a header and
# column types are inferred from the values.
data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('players_22.csv')

# Select the columns you want to use for clustering
data = data.select('overall', 'potential')

# Convert the DataFrame to an RDD of numpy arrays. The RDD is scanned once per
# k-means iteration, so cache it instead of recomputing it from the CSV on
# every pass.
parsedData = data.rdd.map(lambda row: array([row['overall'], row['potential']])).cache()

# Initialize two centroids from two *distinct* points of the data set.
# Hard-coded starting positions that are unrelated to the data can all sit on
# one side of it; every point is then assigned to the same centroid, the other
# cluster stays empty and its centroid never moves. Sampling from the distinct
# points guarantees each starting centroid has at least one point at distance
# zero and that the two centroids differ.
initialPoints = (
    parsedData.map(lambda point: tuple(float(v) for v in point))
    .distinct()
    .takeSample(False, 2, seed=42)
)
if len(initialPoints) < 2:
    raise ValueError("need at least two distinct data points to initialize two centroids")
centroids = [array(p) for p in initialPoints]

# Function to compute the closest centroid for a data point
def closestCentroid(point, centers=None):
    """Pair a point with the index of its nearest centroid.

    Args:
        point: numpy array holding one observation.
        centers: optional sequence of centroid arrays to search. When
            omitted, the module-level ``centroids`` list is used, so the
            function can still be passed directly to ``RDD.map``.

    Returns:
        ``(bestIndex, (point, 1))`` where ``bestIndex`` is the index of the
        centroid with the smallest Euclidean distance to ``point``. The
        point is paired with a count of 1 so that sums and counts can be
        accumulated per index. Ties go to the lowest index; if there are no
        centroids the index is 0.
    """
    if centers is None:
        centers = centroids
    bestIndex = 0
    closest = float("+inf")
    for i, center in enumerate(centers):
        tempDist = sqrt(sum(x ** 2 for x in (point - center)))
        # Strict comparison keeps the first centroid on ties.
        if tempDist < closest:
            closest = tempDist
            bestIndex = i
    return (bestIndex, (point, 1))

# Function to compute the new centroid
def computeNewCentroid(data):
    """Turn a per-cluster (sum, count) aggregate into a mean.

    Args:
        data: ``(key, (total, count))`` pair.

    Returns:
        ``(key, total / count)``.
    """
    aggregate = data[1]
    mean = aggregate[0] / aggregate[1]
    return (data[0], mean)

# Run a fixed number of refinement passes (no convergence test).
for _ in range(10):
    # Tag every point with the index of its nearest centroid; each record
    # becomes (index, (point, 1)).
    closest = parsedData.map(closestCentroid)

    # Add up the points and the counts per index, then divide to get the
    # mean of each cluster and bring the results back to the driver.
    clusterTotals = closest.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
    newCentroids = clusterTotals.map(computeNewCentroid).collect()

    # Overwrite only the centroids that received points in this pass; an
    # index missing from the result keeps its previous centroid.
    for clusterIndex, center in newCentroids:
        centroids[clusterIndex] = center

print("Final centroids: " + str(centroids))