In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import *
sc.stop()
from pyspark.sql import SQLContext
sc =SparkContext()
sqlContext = SQLContext(sc)
import pyspark.sql.functions as func
import sys
from pyspark.sql.functions import countDistinct

In [2]:
#other imports
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [3]:
#reading in sticker type data
feature_data = sqlContext.read.parquet("gs://ds-url-catag/stick_statistics/derived_features/")

In [32]:
feature_data.columns

['user_id_n',
 'numofdays',
 'avg_con_days',
 'sticker_packs_sent',
 'distinct_sticker_packs_sent',
 'sum_paid',
 'sum_free',
 'sum_subs',
 'sum_discont']

In [35]:
#basic statistics of each of the data
# feature_data.describe('sticker_packs_sent').show()

In [4]:
#convert to a vector column
from pyspark.mllib.linalg import Vectors

assembler = VectorAssembler(
    inputCols=["numofdays", "avg_con_days","sticker_packs_sent","distinct_sticker_packs_sent","sum_paid","sum_free","sum_subs","sum_discont"],
    outputCol="features")
output = assembler.transform(feature_data)
# output.select("features").show()


In [5]:
#normalize the features
from pyspark.mllib.util import MLUtils
from pyspark.ml.feature import Normalizer

# Normalize each Vector using $L^1$ norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1000000.0)
l1NormData = normalizer.transform(output)


In [6]:
dataset = l1NormData.select('normFeatures')
dataset = dataset.withColumn('features',col('normFeatures'))
dataset = dataset.select('features')

In [7]:
dataset.select('features').show(10,False)

+-------------------------------------------------------------------------------------------+
|features                                                                                   |
+-------------------------------------------------------------------------------------------+
|[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]                                                          |
|[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]                                                          |
|(8,[0,2,3,7],[0.0,0.0,0.0,0.0])                                                            |
|(8,[0,2,3,5],[0.0,0.0,0.0,0.0])                                                            |
|(8,[0,2,3,5],[0.0,0.0,0.0,0.0])                                                            |
|[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]                                                          |
|[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]                                                          |
|[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]                          

In [19]:
#clustering
from pyspark.ml.clustering import KMeans
kmeans = KMeans().setK(10).setSeed(1)
model = kmeans.fit(dataset)

In [20]:
#compute wssse error for those constructed model
wssse = model.computeCost(dataset)
print("Within Set Sum of Squared Errors = " + str(wssse))

Within Set Sum of Squared Errors = 3.9999832864


In [13]:
# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)
# $example off$

Cluster Centers: 
[ 0.  0.  0.  0.  0.  0.  0.  0.]
[  9.99998614e-01   0.00000000e+00   9.99998614e-01   9.99998614e-01
   0.00000000e+00   9.99997208e-01   1.40581532e-06   0.00000000e+00]
[ 0.99999861  0.          0.99999861  0.99999861  0.          0.          0.
  0.99999861]
[ 0.99999861  0.          0.99999861  0.99999861  0.99999861  0.          0.
  0.        ]


In [14]:
transformed = model.transform(dataset)

In [15]:
transformed.columns

['features', 'prediction']

In [16]:
transformed.groupby('prediction').count().show(100)

+----------+-------+
|prediction|  count|
+----------+-------+
|         1|1422660|
|         3|   1597|
|         2|  14885|
|         0|6055898|
+----------+-------+

