In [9]:
#Code Snippet 34
#Step 1 - Importing the Data and Required Libraries
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

# getOrCreate() hands back the already-running session when there is one,
# so re-running this cell does not start a second Spark application.
spark = (
    SparkSession.builder
    .appName('K-MeansClustering')
    .getOrCreate()
)

# header=True: the first CSV row supplies the column names.
# inferSchema=True: Spark samples the file to pick column types instead of
# reading everything as strings.
data = spark.read.csv(
    'latitude_longitude.csv',
    header=True,
    inferSchema=True,
)
print("Initial Data")
data.show(4)

#Step 2 - Assembling the input columns into a single feature vector
# The two coordinate columns are packed into one vector column named
# 'features', which is the column the K-Means estimator below is pointed at.
assembler = VectorAssembler(
    inputCols=['latitude', 'longitude'],
    outputCol='features',
)
final_assembled_data = assembler.transform(data)
print("Consolidated Data with features")
final_assembled_data.show(4)
#Step 3 - Training our K-Means Model
#Since our Initial Data is well scaled, we can pass it directly to our K-Means
# A fixed seed pins the centroid initialisation, so the centroids, cluster
# sizes and scores printed below are the same on every Restart & Run All.
kmeans = KMeans(featuresCol='features', k=3, seed=42)
kmeans_model = kmeans.fit(final_assembled_data)

#Step 4 - Displaying the predictions
# transform() appends a 'prediction' column holding each row's cluster index.
predictions = kmeans_model.transform(final_assembled_data)
print("Prediction Data")
predictions.show(4)

centres = kmeans_model.clusterCenters() #Determining the centroids of the cluster
# Report the number of centroids actually returned rather than a hard-coded 3:
# Spark may return fewer than k clusters (e.g. fewer than k distinct points).
print("The company can setup {} of their towers at these locations- latitudes and longitudes for optimal network coverage".format(len(centres)))
# Keep the centroids in a plain list under the original name for later cells.
cluster_list = list(centres)
for i, centre in enumerate(cluster_list, start=1):
    print("{} - {}".format(i, centre))
print("\nDetermining the number of users that belongs to each clusters")
predictions.groupBy('prediction').count().show()
#Step 5 - Evaluating our model
from pyspark.ml.evaluation import ClusteringEvaluator
# ClusteringEvaluator scores the clustering with the silhouette measure by
# default; it must read the same feature column the model was trained on.
evaluator_object = ClusteringEvaluator(predictionCol='prediction', featuresCol='features')
Silhouette_Score = evaluator_object.evaluate(predictions)
print("The Silhouette Score when k=3 is {}".format(Silhouette_Score))
# KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in 3.0.
# The training summary exposes the same quantity (sum of squared distances of
# the training points to their nearest centroid) as `trainingCost`. Fall back
# to computeCost() only on older Spark versions whose summary lacks it.
try:
    wssse = kmeans_model.summary.trainingCost
except AttributeError:
    wssse = kmeans_model.computeCost(final_assembled_data)
print("\nWithin set Sum of Square Error {}\n".format(wssse))
print("-"*50)
#Additional Info Step - Performing K-Means with Scaled Features
# Example of Scaling the Data and performing K-Means
from pyspark.ml.feature import StandardScaler
# With its default settings StandardScaler divides each feature by its
# standard deviation (no mean-centring) and writes the result to 'ScaledFeatures'.
scalar_object = StandardScaler(inputCol='features',outputCol='ScaledFeatures')
scalar_model = scalar_object.fit(final_assembled_data)
final_scaled_data = scalar_model.transform(final_assembled_data)
print("\nConsolidated Data with Scaled Features")
final_scaled_data.show(4)
# Train on the scaled column. Pointing featuresCol at 'features' here would
# silently cluster the raw coordinates and make the scaling step a no-op.
# The fixed seed keeps the result repeatable across runs.
scaled_kmeans = KMeans(featuresCol='ScaledFeatures', k=4, seed=42)
scaled_kmeans_model = scaled_kmeans.fit(final_scaled_data)
scaled_predictions = scaled_kmeans_model.transform(final_scaled_data)
print("Prediction Data")
scaled_predictions.select('latitude','longitude','ScaledFeatures','prediction').show(4)
# NOTE: these centroids are expressed in the scaled feature space, not in
# degrees of latitude/longitude.
scaled_centres = scaled_kmeans_model.clusterCenters()
print("Scaled Tower Locations {}".format(scaled_centres))
# Score the clustering in the same space it was fitted in; an evaluator bound
# to 'features' would measure the silhouette on the unscaled coordinates.
scaled_evaluator_object = ClusteringEvaluator(predictionCol='prediction', featuresCol='ScaledFeatures')
Scaled_Silhouette_Score = scaled_evaluator_object.evaluate(scaled_predictions)
print("\nThe Silhouette Score when k=4 is {}".format(Scaled_Silhouette_Score))
# computeCost() was removed in Spark 3.0; prefer the training summary's cost
# and fall back to computeCost() only on older Spark versions.
try:
    scaled_wssse = scaled_kmeans_model.summary.trainingCost
except AttributeError:
    scaled_wssse = scaled_kmeans_model.computeCost(final_scaled_data)
print("\nWithin set Sum of Square Error {}".format(scaled_wssse))
print("\nDetermining the number of users that belongs to each clusters")
scaled_predictions.groupBy('prediction').count().show()

Initial Data
+-------------+--------------+
|     latitude|     longitude|
+-------------+--------------+
|          0.0|           0.0|
|32.8247811394|-116.870394352|
| 45.326414382|-117.807811103|
|39.4708861702|-119.659926097|
+-------------+--------------+
only showing top 4 rows

Consolidated Data with features
+-------------+--------------+--------------------+
|     latitude|     longitude|            features|
+-------------+--------------+--------------------+
|          0.0|           0.0|           (2,[],[])|
|32.8247811394|-116.870394352|[32.8247811394,-1...|
| 45.326414382|-117.807811103|[45.326414382,-11...|
|39.4708861702|-119.659926097|[39.4708861702,-1...|
+-------------+--------------+--------------------+
only showing top 4 rows

Prediction Data
+-------------+--------------+--------------------+----------+
|     latitude|     longitude|            features|prediction|
+-------------+--------------+--------------------+----------+
|          0.0|           0.0|      