# Demo
Set up spark context and SparkSession

In [56]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.feature import StringIndexer

def mapLibSVM(row): 
    return (row[5],Vectors.dense(row[:3]))
spark = SparkSession \
    .builder \
    .appName("Python Spark K-means example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

# Load dataset

In [57]:


df = spark.read.format('com.databricks.spark.csv').\
                       options(header='true', \
                       inferschema='true').\
            load("./data/iris.csv",header=True);
# check the data set

df.show(5,True)
df.printSchema()

df.describe().show()


+------------+-----------+------------+-----------+-------+
|sepal.length|sepal.width|petal.length|petal.width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
|         4.6|        3.1|         1.5|        0.2| Setosa|
|         5.0|        3.6|         1.4|        0.2| Setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows

root
 |-- sepal.length: double (nullable = true)
 |-- sepal.width: double (nullable = true)
 |-- petal.length: double (nullable = true)
 |-- petal.width: double (nullable = true)
 |-- variety: string (nullable = true)

+-------+------------------+-------------------+------------------+------------------+---------+
|summary|      sepal.length|        sepal.width|      petal.length|       petal.width|  variety|
+-------+-------------

# Convert the data to dense vector (features)

You are strongly encouraged to try my get_dummy function for dealing with the categorical data in complex dataset.

## Supervised learning version:
```python
def get_dummy(df,indexCol,categoricalCols,continuousCols,labelCol):

    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
    from pyspark.sql.functions import col

    indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
                 for c in categoricalCols ]

    # default setting: dropLast=True
    encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(),
                 outputCol="{0}_encoded".format(indexer.getOutputCol()))
                 for indexer in indexers ]

    assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders]
                                + continuousCols, outputCol="features")

    pipeline = Pipeline(stages=indexers + encoders + [assembler])

    model=pipeline.fit(df)
    data = model.transform(df)

    data = data.withColumn('label',col(labelCol))

    return data.select(indexCol,'features','label')
```
## Unsupervised learning version:
```python

def get_dummy(df,indexCol,categoricalCols,continuousCols):
    '''
    Get dummy variables and concat with continuous variables for unsupervised learning.
    :param df: the dataframe
    :param categoricalCols: the name list of the categorical data
    :param continuousCols:  the name list of the numerical data
    :return k: feature matrix

    :author: Wenqiang Feng
    :email:  von198@gmail.com
    '''

    indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
                 for c in categoricalCols ]

    # default setting: dropLast=True
    encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(),
                 outputCol="{0}_encoded".format(indexer.getOutputCol()))
                 for indexer in indexers ]

    assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders]
                                + continuousCols, outputCol="features")

    pipeline = Pipeline(stages=indexers + encoders + [assembler])

    model=pipeline.fit(df)
    data = model.transform(df)

    return data.select(indexCol,'features')
```
## Two in one:
```python
def get_dummy(df,indexCol,categoricalCols,continuousCols,labelCol,dropLast=False):

    '''
    Get dummy variables and concat with continuous variables for ml modeling.
    :param df: the dataframe
    :param categoricalCols: the name list of the categorical data
    :param continuousCols:  the name list of the numerical data
    :param labelCol:  the name of label column
    :param dropLast:  the flag of drop last column
    :return: feature matrix

    :author: Wenqiang Feng
    :email:  von198@gmail.com

    >>> df = spark.createDataFrame([
                  (0, "a"),
                  (1, "b"),
                  (2, "c"),
                  (3, "a"),
                  (4, "a"),
                  (5, "c")
              ], ["id", "category"])

    >>> indexCol = 'id'
    >>> categoricalCols = ['category']
    >>> continuousCols = []
    >>> labelCol = []

    >>> mat = get_dummy(df,indexCol,categoricalCols,continuousCols,labelCol)
    >>> mat.show()

    >>>
        +---+-------------+
        | id|     features|
        +---+-------------+
        |  0|[1.0,0.0,0.0]|
        |  1|[0.0,0.0,1.0]|
        |  2|[0.0,1.0,0.0]|
        |  3|[1.0,0.0,0.0]|
        |  4|[1.0,0.0,0.0]|
        |  5|[0.0,1.0,0.0]|
        +---+-------------+
    '''

    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
    from pyspark.sql.functions import col

    indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
                 for c in categoricalCols ]

    # default setting: dropLast=True
    encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(),
                 outputCol="{0}_encoded".format(indexer.getOutputCol()),dropLast=dropLast)
                 for indexer in indexers ]

    assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders]
                                + continuousCols, outputCol="features")

    pipeline = Pipeline(stages=indexers + encoders + [assembler])

    model=pipeline.fit(df)
    data = model.transform(df)

    if indexCol and labelCol:
        # for supervised learning
        data = data.withColumn('label',col(labelCol))
        return data.select(indexCol,'features','label')
    elif not indexCol and labelCol:
        # for supervised learning
        data = data.withColumn('label',col(labelCol))
        return data.select('features','label')
    elif indexCol and not labelCol:
        # for unsupervised learning
        return data.select(indexCol,'features')
    elif not indexCol and not labelCol:
        # for unsupervised learning
        return data.select('features')
```

In [58]:
df.columns

['sepal.length', 'sepal.width', 'petal.length', 'petal.width', 'variety']

In [59]:
indexer = StringIndexer(inputCol="variety", outputCol="labelIndex")
indexer = indexer.fit(df).transform(df)
indexer.show()
df = indexer.rdd.map(mapLibSVM).toDF(["variety", "features"])
df.show()

+------------+-----------+------------+-----------+-------+----------+
|sepal.length|sepal.width|petal.length|petal.width|variety|labelIndex|
+------------+-----------+------------+-----------+-------+----------+
|         5.1|        3.5|         1.4|        0.2| Setosa|       0.0|
|         4.9|        3.0|         1.4|        0.2| Setosa|       0.0|
|         4.7|        3.2|         1.3|        0.2| Setosa|       0.0|
|         4.6|        3.1|         1.5|        0.2| Setosa|       0.0|
|         5.0|        3.6|         1.4|        0.2| Setosa|       0.0|
|         5.4|        3.9|         1.7|        0.4| Setosa|       0.0|
|         4.6|        3.4|         1.4|        0.3| Setosa|       0.0|
|         5.0|        3.4|         1.5|        0.2| Setosa|       0.0|
|         4.4|        2.9|         1.4|        0.2| Setosa|       0.0|
|         4.9|        3.1|         1.5|        0.1| Setosa|       0.0|
|         5.4|        3.7|         1.5|        0.2| Setosa|       0.0|
|     

In [60]:
# Trains a k-means model (Estimator).
kmeans = KMeans().setK(3).setSeed(3)
model = kmeans.fit(df)



In [61]:
# Make predictions
from pyspark.ml.evaluation import ClusteringEvaluator

predictions = model.transform(df)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Silhouette with squared euclidean distance = 0.7353747891731895


0.7353747891731895

In [62]:
# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[5.86833333 2.74       4.38166667]
[5.006 3.428 1.462]
[6.8525 3.07   5.6925]


In [64]:
predictions = predictions.select(["prediction","variety"])
predictions.show()

+----------+-------+
|prediction|variety|
+----------+-------+
|         1|    0.0|
|         1|    0.0|
|         1|    0.0|
|         1|    0.0|
|         1|    0.0|
|         1|    0.0|
|         1|    0.0|
|         1|    0.0|
|         1|    0.0|
|         1|    0.0|
|         1|    0.0|
|         1|    0.0|
|         1|    0.0|
|         1|    0.0|
|         1|    0.0|
|         1|    0.0|
|         1|    0.0|
|         1|    0.0|
|         1|    0.0|
|         1|    0.0|
+----------+-------+
only showing top 20 rows

