In [2]:
import os
import posixpath

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

### Boilerplate - Spark Session & HDFS Access

In [3]:
# Derive the lab user id from the working directory: any path component that
# starts with "edureka" (case-insensitive) is taken and title-cased; if several
# components match, the last one wins.
cwd = os.getcwd()
user_id = None  # reset explicitly so a stale value from an earlier run is never reused
for part in cwd.split('/'):
    if part.lower().startswith('edureka'):
        user_id = part.title()
if user_id is None:
    # No matching directory component: fall back to the OS login name rather
    # than failing with a NameError when app_name is built below.
    user_id = os.environ.get('USER') or os.environ.get('USERNAME') or 'unknown'
app_name = '{0} : K-Means Clustering'.format(user_id)
app_name

'Edureka_121039 : K-Means Clustering'

In [6]:
# Reuse the running Spark session if there is one, otherwise start a new one
# registered under app_name.
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)
# Expose the session's SparkContext and wrap it in a SQLContext; later cells
# build DataFrames through sqlContext.
sparkContext = spark.sparkContext
sqlContext = SQLContext(sparkContext)

In [7]:
def get_hdfs_filepath(file_name, user=None):
    """Return the path of ``file_name`` inside a user's HDFS home directory.

    Args:
        file_name: Path relative to the home directory, e.g. ``'points.csv'``.
            An absolute ``file_name`` is returned unchanged (join semantics).
        user: Optional user id. Defaults to the notebook-level ``user_id``.
            It is lower-cased before being placed in the path.

    Returns:
        The string ``'/user/<lower-cased user>/<file_name>'``.
    """
    if user is None:
        user = user_id  # notebook-level value set in an earlier cell
    my_hdfs = '/user/{0}'.format(user.lower())
    # HDFS paths always use '/', so join with posixpath instead of os.path,
    # whose separator depends on the local operating system.
    return posixpath.join(my_hdfs, file_name)

In [8]:
# Small in-memory sample: an integer "other" column plus "lat"/"long"
# coordinate columns (presumably latitude and longitude).
df = sqlContext.createDataFrame(
    data=[
        [0, 33.3, -17.5],
        [1, 40.4, -20.5],
        [2, 28.0, -23.9],
        [3, 29.5, -19.0],
        [4, 32.8, -18.84],
    ],
    schema=["other", "lat", "long"],
)

In [9]:
# Print the sample DataFrame as a text table to confirm its rows and columns.
df.show()

+-----+----+------+
|other| lat|  long|
+-----+----+------+
|    0|33.3| -17.5|
|    1|40.4| -20.5|
|    2|28.0| -23.9|
|    3|29.5| -19.0|
|    4|32.8|-18.84|
+-----+----+------+



In [10]:
from pyspark.ml.feature import VectorAssembler

In [11]:
# Pack the "lat" and "long" columns into a single vector column named
# "features"; the K-Means fit further down reads that column.
vecAssembler = VectorAssembler(
    inputCols=["lat", "long"],
    outputCol="features",
)
new_df = vecAssembler.transform(df)
# Show the original columns alongside the new vector column.
new_df.show()

+-----+----+------+-------------+
|other| lat|  long|     features|
+-----+----+------+-------------+
|    0|33.3| -17.5| [33.3,-17.5]|
|    1|40.4| -20.5| [40.4,-20.5]|
|    2|28.0| -23.9| [28.0,-23.9]|
|    3|29.5| -19.0| [29.5,-19.0]|
|    4|32.8|-18.84|[32.8,-18.84]|
+-----+----+------+-------------+



In [12]:
from pyspark.ml.clustering import KMeans

In [13]:
# Configure a DataFrame-based K-Means estimator for 2 clusters with a fixed
# seed, then fit it on the "features" column only.
kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(new_df.select("features"))

In [14]:
# Apply the fitted model to the full DataFrame; the displayed result carries an
# extra "prediction" column holding each row's assigned cluster index.
transformed = model.transform(new_df)
transformed.show()  

+-----+----+------+-------------+----------+
|other| lat|  long|     features|prediction|
+-----+----+------+-------------+----------+
|    0|33.3| -17.5| [33.3,-17.5]|         0|
|    1|40.4| -20.5| [40.4,-20.5]|         1|
|    2|28.0| -23.9| [28.0,-23.9]|         0|
|    3|29.5| -19.0| [29.5,-19.0]|         0|
|    4|32.8|-18.84|[32.8,-18.84]|         0|
+-----+----+------+-------------+----------+



In [15]:
# Pull the two coordinate columns back as a list of Row objects via the
# DataFrame's underlying RDD.
(
    df.select('lat', 'long')
    .rdd
    .collect()
)

[Row(lat=33.3, long=-17.5),
 Row(lat=40.4, long=-20.5),
 Row(lat=28.0, long=-23.9),
 Row(lat=29.5, long=-19.0),
 Row(lat=32.8, long=-18.84)]

In [16]:
# Same selection, but each Row is reduced to a plain (lat, long) tuple before
# collecting.
(
    df.select('lat', 'long')
    .rdd
    .map(lambda row: (row[0], row[1]))
    .collect()
)

[(33.3, -17.5), (40.4, -20.5), (28.0, -23.9), (29.5, -19.0), (32.8, -18.84)]

In [17]:
# Import the RDD-based (MLlib) implementation under an alias so it does not
# rebind the name ``KMeans`` already imported from pyspark.ml.clustering;
# otherwise re-running the DataFrame-based K-Means cell after this one would
# pick up the wrong class.
from pyspark.mllib.clustering import KMeans as MLlibKMeans, KMeansModel

# MLlib trains on an RDD, so reduce each Row to a plain (lat, long) tuple.
rdd = df.select('lat', 'long').rdd.map(lambda x: (x[0], x[1]))
# Random initialization is seeded so the resulting centers are reproducible
# across runs (same seed value as the pyspark.ml model fitted earlier).
clusters = MLlibKMeans.train(rdd, 2, maxIterations=10,
                             initializationMode="random", seed=1)

In [18]:
# Inspect the fitted cluster centers (a list of NumPy arrays, one per cluster)
# through the model's public property.
clusters.clusterCenters

[array([ 40.4, -20.5]), array([ 30.9 , -19.81])]