[KMeans clustering in PySpark](https://stackoverflow.com/questions/47585723/kmeans-clustering-in-pyspark) -- StackOverflow

In [9]:
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-active session when there is one,
# otherwise it builds a fresh session.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

# Switch Spark logging off so the cells below only print their own results.
spark.sparkContext.setLogLevel("OFF")

Create the DataFrame

In [10]:
# One tuple per city: (city, latitude, longitude).
city_rows = [
    ("Delhi", 28.7, 77.1),
    ("Agra", 27.2, 78.0),
    ("Manipal", 13.4, 74.8),
    ("Udupi", 13.3, 74.7),
    ("Mumbai", 19.1, 72.9),
    ("Pune", 18.5, 73.9),
]

# DDL-style schema string: column names and types, in tuple order.
city_schema = "city string, latitude double, longitude double"

df = spark.createDataFrame(city_rows, schema=city_schema)
df.show()

+-------+--------+---------+
|   city|latitude|longitude|
+-------+--------+---------+
|  Delhi|    28.7|     77.1|
|   Agra|    27.2|     78.0|
|Manipal|    13.4|     74.8|
|  Udupi|    13.3|     74.7|
| Mumbai|    19.1|     72.9|
|   Pune|    18.5|     73.9|
+-------+--------+---------+



Assemble feature vectors

In [11]:
from pyspark.ml.feature import VectorAssembler

# Columns to combine into the single vector column "features".
coordinate_cols = ["latitude", "longitude"]

vecAssembler = VectorAssembler(
    inputCols=coordinate_cols,
    outputCol="features",
)

# transform() returns a new DataFrame with the "features" column appended;
# `df` itself is left unchanged.
new_df = vecAssembler.transform(df)
new_df.show()

+-------+--------+---------+-----------+
|   city|latitude|longitude|   features|
+-------+--------+---------+-----------+
|  Delhi|    28.7|     77.1|[28.7,77.1]|
|   Agra|    27.2|     78.0|[27.2,78.0]|
|Manipal|    13.4|     74.8|[13.4,74.8]|
|  Udupi|    13.3|     74.7|[13.3,74.7]|
| Mumbai|    19.1|     72.9|[19.1,72.9]|
|   Pune|    18.5|     73.9|[18.5,73.9]|
+-------+--------+---------+-----------+



Fit the model

In [12]:
from pyspark.ml.clustering import KMeans

# Three clusters, with a fixed seed so repeated runs start from the same state.
kmeans = KMeans(k=3, seed=1)

# Fit on the vector column only; the other columns are not passed to fit().
features_only = new_df.select('features')
model = kmeans.fit(features_only)

In [13]:
# Apply the fitted model to the full DataFrame (city and coordinates included);
# the result carries an extra integer "prediction" column, as shown below.
transformed = model.transform(new_df)
transformed.show()  

+-------+--------+---------+-----------+----------+
|   city|latitude|longitude|   features|prediction|
+-------+--------+---------+-----------+----------+
|  Delhi|    28.7|     77.1|[28.7,77.1]|         1|
|   Agra|    27.2|     78.0|[27.2,78.0]|         1|
|Manipal|    13.4|     74.8|[13.4,74.8]|         0|
|  Udupi|    13.3|     74.7|[13.3,74.7]|         0|
| Mumbai|    19.1|     72.9|[19.1,72.9]|         2|
|   Pune|    18.5|     73.9|[18.5,73.9]|         2|
+-------+--------+---------+-----------+----------+



Identify anomalies

For illustration, cluster 0 is arbitrarily chosen as the anomalous cluster; the cluster numbers themselves carry no meaning.

In [14]:
# Keep only the rows assigned to cluster 0, then list their city names.
cluster_zero_rows = transformed.where("prediction = 0")
cluster_zero_cities = cluster_zero_rows.select("city")
cluster_zero_cities.show()

+-------+
|   city|
+-------+
|Manipal|
|  Udupi|
+-------+

